# Source code for macrosynergy.learning.forecasting.bootstrap.bootstrap

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
from abc import ABC


class BasePanelBootstrap(ABC):
    """
    Base class for constructing bootstrap datasets over a panel.

    The panel is accessed through a two-level index: level 0 identifies the
    cross-section and level 1 identifies the period.
    """

    # Resampling schemes accepted by the constructor.
    _BOOTSTRAP_METHODS = (
        "panel",
        "period",
        "cross",
        "cross_per_period",
        "period_per_cross",
    )

    def __init__(
        self,
        bootstrap_method="panel",
        resample_ratio=1,
        max_features=None,
    ):
        """
        Construct bootstrap datasets over a panel.

        Parameters
        ----------
        bootstrap_method : str
            Method to bootstrap the data. Current options are "panel",
            "period", "cross", "cross_per_period" and "period_per_cross".
            Default is "panel".
        resample_ratio : numbers.Number
            Ratio of resampling units comprised in each bootstrap dataset.
            This is a fraction of the quantity of the panel component to be
            resampled. Default value is 1.
        max_features: str or numbers.Number, optional
            The number of features to consider in each bootstrap dataset.
            This can be used to increase the variation between bootstrap datasets.
            Default is None and currently not implemented.

        Raises
        ------
        TypeError
            If `bootstrap_method` is not a string or `resample_ratio` is not an
            integer or a float.
        ValueError
            If `bootstrap_method` is not a supported option or `resample_ratio`
            lies outside the interval (0, 1].
        NotImplementedError
            If `max_features` is not None.

        Notes
        -----
        The non-parametric bootstrap is a method to generate datasets that follow the same
        distribution as the original dataset, as best as possible given the observed data.
        Mathematically, a bootstrap dataset is equivalent to sampling from the empirical
        distribution of the original dataset. A bootstrap dataset is constructed by
        sampling the observed data with replacement.

        Bootstrapping can be used to estimate the distribution of a statistic, for
        instance the sampling distribution of a model parameter. As an example, a
        regression model can be fit to each bootstrap dataset to estimate the distribution
        of the model parameters. The standard deviation of these distributions,
        consequently, can be used to estimate the sampling variation of the model
        parameters.

        Bootstrapping can also be used to create "bagged" models. Bagging is a method
        aimed at reducing the variance of a machine learning model. It involves training
        multiple models on different bootstrap datasets and averaging the predictions
        of these models. Often, additional variation is also introduced to each bootstrap
        dataset - for instance by randomly sampling a subset of features for each dataset.
        """
        # Checks
        self._check_boot_params(
            bootstrap_method=bootstrap_method,
            resample_ratio=resample_ratio,
            max_features=max_features,
        )

        # Set attributes
        self.bootstrap_method = bootstrap_method
        self.resample_ratio = resample_ratio
        self.max_features = max_features

    def create_bootstrap_dataset(
        self,
        X,
        y,
    ):
        """
        Generate a bootstrap dataset based on a panel of features and a
        dependent variable.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix
        y : pd.DataFrame or pd.Series
            Dependent variable.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.

        Raises
        ------
        ValueError
            If `self.bootstrap_method` is not a supported option, which can only
            happen when the attribute was modified after construction.
        """
        # The plain panel bootstrap needs no index information, so skip the
        # conversion of the whole index to Python objects.
        if self.bootstrap_method == "panel":
            return self._panel_bootstrap(X=X, y=y)

        # Store index information in numpy arrays
        index_array = np.array(X.index.tolist())
        unique_cross_sections = np.unique(index_array[:, 0])
        unique_real_dates = np.unique(index_array[:, 1])

        if self.bootstrap_method == "period":
            return self._period_bootstrap(
                X=X,
                y=y,
                unique_real_dates=unique_real_dates,
            )

        if self.bootstrap_method == "cross":
            return self._cross_bootstrap(
                X=X,
                y=y,
                unique_cross_sections=unique_cross_sections,
            )

        if self.bootstrap_method == "cross_per_period":
            return self._cross_per_period_bootstrap(
                X=X,
                y=y,
                unique_cross_sections=unique_cross_sections,
            )

        if self.bootstrap_method == "period_per_cross":
            return self._period_per_cross_bootstrap(
                X=X,
                y=y,
                unique_real_dates=unique_real_dates,
            )

        raise ValueError(
            "bootstrap_method must be one of 'panel', 'period', 'cross',"
            " 'cross_per_period', 'period_per_cross'."
            f" Got {self.bootstrap_method}."
        )

    def _panel_bootstrap(
        self,
        X,
        y,
    ):
        """
        Generate a bootstrap dataset by resampling the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # Draw row positions with replacement; the same positions are applied to
        # X and y, so both must share the same row order.
        bootstrap_idx = np.random.choice(
            np.arange(X.shape[0]),
            size=int(np.ceil(self.resample_ratio * X.shape[0])),
            replace=True,
        )
        X_resampled = X.iloc[bootstrap_idx]
        y_resampled = y.iloc[bootstrap_idx]

        return X_resampled, y_resampled

    def _period_bootstrap(
        self,
        X,
        y,
        unique_real_dates,
    ):
        """
        Generate a bootstrap dataset by resampling periods in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_real_dates : np.ndarray of pd.Timestamp
            Unique dates in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # Resample unique panel dates with replacement
        bootstrap_periods = np.random.choice(
            unique_real_dates,
            size=int(np.ceil(self.resample_ratio * len(unique_real_dates))),
            replace=True,
        )

        return self._resample_level_units(
            X=X,
            y=y,
            level=1,
            sampled_units=bootstrap_periods,
        )

    def _cross_bootstrap(
        self,
        X,
        y,
        unique_cross_sections,
    ):
        """
        Generate a bootstrap dataset by resampling cross-sections in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_cross_sections : np.ndarray of str
            Unique cross-sections in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # Resample unique panel cross-sections with replacement
        bootstrap_cross_sections = np.random.choice(
            unique_cross_sections,
            size=int(np.ceil(len(unique_cross_sections) * self.resample_ratio)),
            replace=True,
        )

        return self._resample_level_units(
            X=X,
            y=y,
            level=0,
            sampled_units=bootstrap_cross_sections,
        )

    def _resample_level_units(
        self,
        X,
        y,
        level,
        sampled_units,
    ):
        """
        Rebuild a panel from units (periods or cross-sections) sampled with
        replacement from one level of the index.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable. Rows are selected by position, so `y` must be in
            the same row order as `X`.
        level : int
            Index level the units were sampled from (0 for cross-sections,
            1 for periods).
        sampled_units : iterable
            Sampled index values, possibly containing repeats.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # Obtain a {count: [units]} dictionary so that the new panel can be
        # constructed by looping through the counts instead of the units.
        count_to_units = defaultdict(list)
        for unit, count in Counter(sampled_units).items():
            count_to_units[count].append(unit)

        # For each count, locate the rows belonging to the units with that count
        # and repeat that whole block of row positions `count` times. The empty
        # seed array keeps the concatenation valid when nothing was sampled.
        level_values = X.index.get_level_values(level)
        position_blocks = [np.empty(0, dtype=np.intp)]
        for count, units in count_to_units.items():
            unit_positions = np.flatnonzero(level_values.isin(units))
            position_blocks.append(np.tile(unit_positions, count))
        positions = np.concatenate(position_blocks)

        # Positional selection keeps dtypes, column names and index names intact
        # and works for both Series and DataFrame targets.
        X_resampled = X.iloc[positions]
        y_resampled = y.iloc[positions]

        return X_resampled, y_resampled

    def _cross_per_period_bootstrap(
        self,
        X,
        y,
        unique_cross_sections,
    ):
        """
        Generate a bootstrap dataset by resampling cross-sections within each
        period in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_cross_sections : np.ndarray of str
            Unique cross-sections in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # The draw size is derived from the number of unique cross-sections in the
        # whole panel, so every period contributes the same number of rows.
        n_resample = int(np.ceil(len(unique_cross_sections) * self.resample_ratio))
        X_resampled = X.groupby(level=1).sample(replace=True, n=n_resample)
        # Look up the matching targets by index label.
        y_resampled = y.loc[X_resampled.index]

        return X_resampled, y_resampled

    def _period_per_cross_bootstrap(
        self,
        X,
        y,
        unique_real_dates,
    ):
        """
        Generate a bootstrap dataset by resampling periods within each
        cross-section in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_real_dates : np.ndarray of pd.Timestamp
            Unique dates in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # The draw size is derived from the number of unique dates in the whole
        # panel, so every cross-section contributes the same number of rows.
        n_resample = int(np.ceil(len(unique_real_dates) * self.resample_ratio))
        X_resampled = X.groupby(level=0).sample(replace=True, n=n_resample)
        # Look up the matching targets by index label.
        y_resampled = y.loc[X_resampled.index]

        return X_resampled, y_resampled

    def _check_boot_params(
        self,
        bootstrap_method,
        resample_ratio,
        max_features,
    ):
        """
        Bootstrap class initialization checks.

        Parameters
        ----------
        bootstrap_method : str
            Method to bootstrap the data. Current options are "panel",
            "period", "cross", "cross_per_period" and "period_per_cross".
            Default is "panel".
        resample_ratio : numbers.Number
            Ratio of resampling units comprised in each bootstrap dataset.
            This is a fraction of the quantity of the panel component to be
            resampled. Default value is 1.
        max_features: str or numbers.Number, optional
            The number of features to consider in each bootstrap dataset.
            This can be used to increase the variation between bootstrap datasets.
            Default is None and currently not implemented.

        Raises
        ------
        TypeError
            If `bootstrap_method` is not a string or `resample_ratio` is not an
            integer or a float.
        ValueError
            If `bootstrap_method` is not a supported option or `resample_ratio`
            lies outside the interval (0, 1].
        NotImplementedError
            If `max_features` is not None.
        """
        # bootstrap_method
        if not isinstance(bootstrap_method, str):
            raise TypeError(
                f"bootstrap_method must be a string. Got {type(bootstrap_method)}."
            )
        if bootstrap_method not in self._BOOTSTRAP_METHODS:
            # The f-prefix must sit on the fragment containing the placeholder,
            # otherwise the offending value is never interpolated.
            raise ValueError(
                "bootstrap_method must be one of 'panel', 'period', 'cross',"
                f" 'cross_per_period', 'period_per_cross'. Got {bootstrap_method}."
            )

        # resample_ratio
        if not isinstance(resample_ratio, (int, float)):
            raise TypeError(
                f"resample_ratio must be an integer or a float. Got {type(resample_ratio)}."
            )
        if resample_ratio <= 0:
            raise ValueError("resample_ratio must be greater than 0.")
        if resample_ratio > 1:
            raise ValueError("resample_ratio must be less than or equal to 1.")

        # max_features
        if max_features is not None:
            raise NotImplementedError("max_features is not implemented yet.")


if __name__ == "__main__":
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm

    # Demo: simulate an unbalanced panel, multi-indexed by cross-section and
    # date, then print one bootstrap dataset per supported resampling method.
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Per cross-section simulation settings.
    cid_settings = {
        "AUD": ["2002-01-01", "2020-12-31", 0, 1],
        "CAD": ["2003-01-01", "2020-12-31", 0, 1],
        "GBP": ["2000-01-01", "2020-12-31", 0, 1],
        "USD": ["2000-01-01", "2020-12-31", 0, 1],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid, settings in cid_settings.items():
        df_cids.loc[cid] = settings

    # Per category simulation settings.
    xcat_settings = {
        "XR": ["2000-01-01", "2020-12-31", 0, 1, 0, 3],
        "CRY": ["2000-01-01", "2020-12-31", 0, 1, 0, 0],
        "GROWTH": ["2000-01-01", "2020-12-31", 0, 1, -0.9, 0],
        "INFL": ["2000-01-01", "2020-12-31", 0, 1, 0.8, 0],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    for xcat, settings in xcat_settings.items():
        df_xcats.loc[xcat] = settings

    panel = make_qdf(df_cids, df_xcats, back_ar=0.75)
    panel["grading"] = np.ones(len(panel))

    # Blacklisted date ranges are what make the panel unbalanced.
    blacklist = {
        "GBP": ["2009-01-01", "2012-06-30"],
        "CAD": ["2018-01-01", "2100-01-01"],
    }
    panel = msm.reduce_df(df=panel, cids=cids, xcats=xcats, blacklist=blacklist)

    # Wide format: one row per (cid, real_date), one column per category.
    wide = panel.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    X = wide.drop(columns=["XR"])
    y = wide["XR"]

    for bootstrap_method in (
        "panel",
        "period",
        "cross",
        "cross_per_period",
        "period_per_cross",
    ):
        # Initialize the BasePanelBootstrap class
        bootstrapper = BasePanelBootstrap(
            bootstrap_method=bootstrap_method,
            resample_ratio=0.8,
        )
        resampled = bootstrapper.create_bootstrap_dataset(X, y)
        print(resampled)