Source code for macrosynergy.learning.forecasting.bootstrap.bootstrap

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
from abc import ABC


class BasePanelBootstrap(ABC):
    """
    Base class providing non-parametric bootstrap resampling over a panel.

    The panel is expected to be indexed by a two-level ``pd.MultiIndex`` whose
    first level identifies the cross-section and whose second level identifies
    the period (date).
    """

    def __init__(
        self,
        bootstrap_method="panel",
        resample_ratio=1,
        max_features=None,
    ):
        """
        Construct bootstrap datasets over a panel.

        Parameters
        ----------
        bootstrap_method : str
            Method to bootstrap the data. Current options are "panel", "period",
            "cross", "cross_per_period" and "period_per_cross". Default is "panel".
        resample_ratio : numbers.Number
            Ratio of resampling units comprised in each bootstrap dataset. This is
            a fraction of the quantity of the panel component to be resampled.
            Default value is 1.
        max_features : str or numbers.Number, optional
            The number of features to consider in each bootstrap dataset. This can
            be used to increase the variation between bootstrap datasets. Default
            is None and currently not implemented.

        Raises
        ------
        TypeError
            If `bootstrap_method` is not a string or `resample_ratio` is not an
            int or float.
        ValueError
            If `bootstrap_method` is not a supported option or `resample_ratio`
            lies outside (0, 1].
        NotImplementedError
            If `max_features` is not None.

        Notes
        -----
        The non-parametric bootstrap is a method to generate datasets that follow
        the same distribution as the original dataset, as best as possible given
        the observed data. Mathematically, a bootstrap dataset is equivalent to
        sampling from the empirical distribution of the original dataset. A
        bootstrap dataset is constructed by sampling the observed data with
        replacement.

        Bootstrapping can be used to estimate the distribution of a statistic, for
        instance the sampling distribution of a model parameter. As an example, a
        regression model can be fit to each bootstrap dataset to estimate the
        distribution of the model parameters. The standard deviation of these
        distributions, consequently, can be used to estimate the sampling
        variation of the model parameters.

        Bootstrapping can also be used to create "bagged" models. Bagging is a
        method aimed at reducing the variance of a machine learning model. It
        involves training multiple models on different bootstrap datasets and
        averaging the predictions of these models. Often, additional variation is
        also introduced to each bootstrap dataset - for instance by randomly
        sampling a subset of features for each dataset.
        """
        # Checks
        self._check_boot_params(
            bootstrap_method=bootstrap_method,
            resample_ratio=resample_ratio,
            max_features=max_features,
        )

        # Set attributes
        self.bootstrap_method = bootstrap_method
        self.resample_ratio = resample_ratio
        self.max_features = max_features

    def create_bootstrap_dataset(
        self,
        X,
        y,
    ):
        """
        Generate a bootstrap dataset based on a panel of features and a
        dependent variable.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix, indexed by (cross-section, period).
        y : pd.DataFrame or pd.Series
            Dependent variable, row-aligned with `X`.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.

        Raises
        ------
        ValueError
            If `self.bootstrap_method` has been set to an unsupported value
            after construction.
        """
        # Sorted unique values of each index level. Only the (few) unique values
        # are pulled into Python objects, rather than every index tuple.
        unique_cross_sections = np.array(sorted(X.index.unique(level=0)), dtype=object)
        unique_real_dates = np.array(sorted(X.index.unique(level=1)), dtype=object)

        method = self.bootstrap_method
        if method == "panel":
            return self._panel_bootstrap(X=X, y=y)
        if method == "period":
            return self._period_bootstrap(
                X=X, y=y, unique_real_dates=unique_real_dates
            )
        if method == "cross":
            return self._cross_bootstrap(
                X=X, y=y, unique_cross_sections=unique_cross_sections
            )
        if method == "cross_per_period":
            return self._cross_per_period_bootstrap(
                X=X, y=y, unique_cross_sections=unique_cross_sections
            )
        if method == "period_per_cross":
            return self._period_per_cross_bootstrap(
                X=X, y=y, unique_real_dates=unique_real_dates
            )

        # Only reachable if the attribute was modified after __init__ validation.
        raise ValueError(
            "bootstrap_method must be one of 'panel', 'period', 'cross',"
            f" 'cross_per_period', 'period_per_cross'. Got {method}."
        )

    def _panel_bootstrap(
        self,
        X,
        y,
    ):
        """
        Generate a bootstrap dataset by resampling the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # Draw row positions with replacement and apply them to both objects so
        # that features and targets stay aligned.
        bootstrap_idx = np.random.choice(
            np.arange(X.shape[0]),
            size=int(np.ceil(self.resample_ratio * X.shape[0])),
            replace=True,
        )

        return X.iloc[bootstrap_idx], y.iloc[bootstrap_idx]

    def _period_bootstrap(
        self,
        X,
        y,
        unique_real_dates,
    ):
        """
        Generate a bootstrap dataset by resampling periods in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_real_dates : np.ndarray of pd.Timestamp
            Unique dates in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        return self._resample_level_units(
            X=X, y=y, level=1, unique_units=unique_real_dates
        )

    def _cross_bootstrap(
        self,
        X,
        y,
        unique_cross_sections,
    ):
        """
        Generate a bootstrap dataset by resampling cross-sections in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_cross_sections : np.ndarray of str
            Unique cross-sections in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        return self._resample_level_units(
            X=X, y=y, level=0, unique_units=unique_cross_sections
        )

    def _resample_level_units(
        self,
        X,
        y,
        level,
        unique_units,
    ):
        """
        Resample whole units (periods or cross-sections) of one index level.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable, row-aligned with `X`.
        level : int
            Index level whose units are resampled (0 for cross-sections, 1 for
            periods).
        unique_units : np.ndarray
            Unique values of that index level.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # Resample the unique units with replacement
        sampled_units = np.random.choice(
            unique_units,
            size=int(np.ceil(self.resample_ratio * len(unique_units))),
            replace=True,
        )

        # Obtain a {count: [units]} dictionary so that the new panel can be
        # constructed by looping through the (few) distinct counts instead of
        # through every sampled unit.
        count_to_units = defaultdict(list)
        for unit, count in Counter(sampled_units).items():
            count_to_units[count].append(unit)

        # For each count, locate the rows belonging to the units drawn that many
        # times and tile their positions by the count. Collecting positions and
        # indexing once keeps dtypes, works for a Series or DataFrame `y`, and
        # avoids repeatedly copying a growing array.
        level_values = X.index.get_level_values(level)
        position_blocks = [
            np.tile(np.flatnonzero(level_values.isin(units)), count)
            for count, units in count_to_units.items()
        ]
        if position_blocks:
            positions = np.concatenate(position_blocks)
        else:
            positions = np.empty(0, dtype=np.intp)

        # The level names are set explicitly to keep the output index naming of
        # earlier versions of this method.
        panel_names = ["cid", "real_date"]
        X_resampled = X.iloc[positions].rename_axis(index=panel_names)
        y_resampled = y.iloc[positions].rename_axis(index=panel_names)

        return X_resampled, y_resampled

    def _cross_per_period_bootstrap(
        self,
        X,
        y,
        unique_cross_sections,
    ):
        """
        Generate a bootstrap dataset by resampling cross-sections within each
        period in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_cross_sections : np.ndarray of str
            Unique cross-sections in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # The same number of rows is drawn in every period, based on the number
        # of cross-sections in the whole panel.
        n_resample = int(np.ceil(len(unique_cross_sections) * self.resample_ratio))
        X_resampled = X.groupby(level=1).sample(replace=True, n=n_resample)
        y_resampled = y.loc[X_resampled.index]

        return X_resampled, y_resampled

    def _period_per_cross_bootstrap(
        self,
        X,
        y,
        unique_real_dates,
    ):
        """
        Generate a bootstrap dataset by resampling periods within each
        cross-section in the panel.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Dependent variable.
        unique_real_dates : np.ndarray of pd.Timestamp
            Unique dates in the panel.

        Returns
        -------
        X_resampled : pd.DataFrame
            Bootstrap resampled feature matrix.
        y_resampled : pd.DataFrame or pd.Series
            Bootstrap resampled dependent variable.
        """
        # The same number of rows is drawn in every cross-section, based on the
        # number of dates in the whole panel.
        n_resample = int(np.ceil(len(unique_real_dates) * self.resample_ratio))
        X_resampled = X.groupby(level=0).sample(replace=True, n=n_resample)
        y_resampled = y.loc[X_resampled.index]

        return X_resampled, y_resampled

    def _check_boot_params(
        self,
        bootstrap_method,
        resample_ratio,
        max_features,
    ):
        """
        Bootstrap class initialization checks.

        Parameters
        ----------
        bootstrap_method : str
            Method to bootstrap the data. Current options are "panel", "period",
            "cross", "cross_per_period" and "period_per_cross". Default is "panel".
        resample_ratio : numbers.Number
            Ratio of resampling units comprised in each bootstrap dataset. This is
            a fraction of the quantity of the panel component to be resampled.
            Default value is 1.
        max_features : str or numbers.Number, optional
            The number of features to consider in each bootstrap dataset. This can
            be used to increase the variation between bootstrap datasets. Default
            is None and currently not implemented.

        Raises
        ------
        TypeError
            If `bootstrap_method` is not a string or `resample_ratio` is not an
            int or float.
        ValueError
            If `bootstrap_method` is not a supported option or `resample_ratio`
            lies outside (0, 1].
        NotImplementedError
            If `max_features` is not None.
        """
        # bootstrap_method
        if not isinstance(bootstrap_method, str):
            raise TypeError(
                f"bootstrap_method must be a string. Got {type(bootstrap_method)}."
            )
        if bootstrap_method not in [
            "panel",
            "period",
            "cross",
            "cross_per_period",
            "period_per_cross",
        ]:
            # The f-prefix sits on the fragment holding the placeholder so the
            # offending value actually appears in the message.
            raise ValueError(
                "bootstrap_method must be one of 'panel', 'period', 'cross',"
                f" 'cross_per_period', 'period_per_cross'. Got {bootstrap_method}."
            )

        # resample_ratio
        if not isinstance(resample_ratio, (int, float)):
            raise TypeError(
                f"resample_ratio must be an integer or a float. Got {type(resample_ratio)}."
            )
        if resample_ratio <= 0:
            raise ValueError("resample_ratio must be greater than 0.")
        if resample_ratio > 1:
            raise ValueError("resample_ratio must be less than or equal to 1.")

        # max_features
        if max_features is not None:
            raise NotImplementedError("max_features is not implemented yet.")
if __name__ == "__main__":
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm

    # Demo: simulate an unbalanced panel, multi-indexed by cross-section and
    # date, then draw one bootstrap dataset with each supported method.
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Per-cross-section simulation settings: availability window, mean, scale.
    cid_settings = {
        "AUD": ["2002-01-01", "2020-12-31", 0, 1],
        "CAD": ["2003-01-01", "2020-12-31", 0, 1],
        "GBP": ["2000-01-01", "2020-12-31", 0, 1],
        "USD": ["2000-01-01", "2020-12-31", 0, 1],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid, settings in cid_settings.items():
        df_cids.loc[cid] = settings

    # Per-category simulation settings, including autoregressive coefficients.
    xcat_settings = {
        "XR": ["2000-01-01", "2020-12-31", 0, 1, 0, 3],
        "CRY": ["2000-01-01", "2020-12-31", 0, 1, 0, 0],
        "GROWTH": ["2000-01-01", "2020-12-31", 0, 1, -0.9, 0],
        "INFL": ["2000-01-01", "2020-12-31", 0, 1, 0.8, 0],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    for xcat, settings in xcat_settings.items():
        df_xcats.loc[xcat] = settings

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])

    # Blacklisted date ranges make the panel unbalanced.
    black = {
        "GBP": ["2009-01-01", "2012-06-30"],
        "CAD": ["2018-01-01", "2100-01-01"],
    }
    dfd = msm.reduce_df(df=dfd, cids=cids, xcats=xcats, blacklist=black)

    # Long to wide: one row per (cid, real_date), one column per category.
    dfd = dfd.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    X = dfd.drop(columns=["XR"])
    y = dfd["XR"]

    bootstrap_methods = [
        "panel",
        "period",
        "cross",
        "cross_per_period",
        "period_per_cross",
    ]
    for method in bootstrap_methods:
        # Initialize the BasePanelBootstrap class and show one resampled panel
        bootstrapper = BasePanelBootstrap(
            bootstrap_method=method,
            resample_ratio=0.8,
        )
        resampled = bootstrapper.create_bootstrap_dataset(X, y)
        print(resampled)