Source code for macrosynergy.learning.splitters.walk_forward_splitters

"""
Classes for incremental expanding panel cross-validators. 
"""

import numpy as np
import pandas as pd

from macrosynergy.learning.splitters.base_splitters import WalkForwardPanelSplit


class ExpandingIncrementPanelSplit(WalkForwardPanelSplit):
    """
    Walk-forward cross-validator over a panel.

    Provides train/test indices to split data into train/test sets. The dataset
    is split so that subsequent training sets are expanded by a fixed number of
    time periods to incorporate the latest available information. Each training
    set is followed by a test set of fixed length.

    Parameters
    ----------
    train_intervals : int
        The number of time periods by which the previous training set is
        expanded. Default is 21.
    test_size : int
        The number of time periods forward of each training set to use in the
        associated test set. Default is 21.
    min_cids : int
        The minimum number of cross-sections required for the first training
        set. Default is 4. Either start_date or (min_cids, min_periods,
        min_xcats) must be provided. If both are provided, start_date takes
        precedence.
    min_periods : int
        The minimum number of time periods required for the first training set.
        Default is 500. Either start_date or (min_cids, min_periods, min_xcats)
        must be provided. If both are provided, start_date takes precedence.
    min_xcats : int
        The minimum number of xcats required for the first training set.
        Default is 1. Either start_date or (min_cids, min_periods, min_xcats)
        must be provided. If both are provided, start_date takes precedence.
    start_date : Optional[str]
        The targeted final date in the initial training set in ISO 8601 format.
        If the date is not itself a date of the panel, the latest panel date
        before it is used. Default is None. Either start_date or (min_cids,
        min_periods, min_xcats) must be provided. If both are provided,
        start_date takes precedence.
    max_periods : Optional[int]
        The maximum number of time periods in each training set. If the maximum
        is exceeded, the earliest periods are cut off. This effectively creates
        rolling training sets. Default is None.

    Notes
    -----
    The first training set is determined by the specification of either
    `start_date` or by the parameters `min_cids`, `min_periods`, and
    `min_xcats`.

    When `start_date` is provided, the initial training set comprises all
    available data before and including the `start_date`, unless `max_periods`
    is specified, in which case at most the last `max_periods` periods prior to
    the `start_date` are included.

    If `start_date` is not provided, the first training set is determined by
    the parameters `min_cids`, `min_periods`, and `min_xcats`. This set
    comprises at least `min_xcats` categories for at least `min_periods` time
    periods for at least `min_cids` cross-sections.
    """

    def __init__(
        self,
        train_intervals=21,
        test_size=21,
        min_cids=4,
        min_periods=500,
        min_xcats=1,
        start_date=None,
        max_periods=None,
    ):
        # Checks
        super().__init__(
            min_cids=min_cids,
            min_periods=min_periods,
            min_xcats=min_xcats,
            start_date=start_date,
            max_periods=max_periods,
        )
        self._check_init_params(
            train_intervals=train_intervals,
            test_size=test_size,
        )

        # Attributes
        self.train_intervals = train_intervals
        self.test_size = test_size

    def split(self, X, y, groups=None):
        """
        Generate indices to split data into training and test sets.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section,
            date). The dates must be in datetime format. Otherwise the
            dataframe must be in wide format: each feature is a column.
        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of a target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format. If a
            dataframe is provided, the target variable must be the sole column.
        groups : None
            Ignored. Exists for compatibility with scikit-learn.

        Yields
        ------
        train : np.ndarray
            The training set indices for that split.
        test : np.ndarray
            The testing set indices for that split.
        """
        # Checks
        self._check_split_params(X, y, groups)

        # Determine the unique dates added by each training split
        Xy = pd.concat([X, y], axis=1)
        real_dates = Xy.index.get_level_values(1)
        splits = self._determine_unique_training_times(X, real_dates)

        # Accumulate the increments into expanding (or, with max_periods,
        # rolling) sets of training dates.
        cap = self.max_periods
        current = splits[0][-cap:] if cap else splits[0]
        train_splits = [current]
        for increment in splits[1:]:
            current = np.concatenate([current, increment])
            if cap:
                # Drop beginning of training set if it exceeds max_periods.
                current = current[-cap:]
            train_splits.append(current)

        # Map every set of training dates to positional sample indices; the
        # test set is the `test_size` unique dates that follow directly.
        for train_dates in train_splits:
            train_indices = np.where(real_dates.isin(train_dates))[0]
            test_start = self.unique_dates.get_loc(train_dates.max()) + 1
            test_dates = self.unique_dates[test_start : test_start + self.test_size]
            test_indices = np.where(real_dates.isin(test_dates))[0]

            yield train_indices, test_indices

    def _determine_unique_training_times(self, X, real_dates):
        """
        Returns the unique dates in each training split.

        Parameters
        ----------
        X : pd.DataFrame
            A pandas dataframe of features, multi-indexed by (cross-section,
            date).
        real_dates : pd.Index
            The dates associated with each sample in X.

        Returns
        -------
        splits : list
            The first element holds the dates of the initial training set;
            every further element holds the dates by which the previous
            training set is expanded.

        Raises
        ------
        ValueError
            If `start_date` precedes every date in the panel, or if no date
            satisfies the `min_cids`, `min_periods` and `min_xcats`
            requirements.

        Notes
        -----
        This method is called by self.split(). As side effects it sets
        `self.unique_dates` and `self.n_splits`.
        """
        self.unique_dates = real_dates.unique().sort_values()

        # First determine the dates for the first training set
        if self.start_date:
            # Snap the targeted date to the latest panel date at or before it,
            # so that dates missing from the panel (e.g. weekends) do not make
            # the `get_loc` lookup below fail.
            eligible = self.unique_dates[
                self.unique_dates <= pd.Timestamp(self.start_date)
            ]
            if eligible.empty:
                raise ValueError(
                    f"start_date {self.start_date} precedes all dates in the "
                    "panel, so the first training set would be empty."
                )
            date_last_train = eligible[-1]
        else:
            # Number of features with min_periods of non-NaN data
            # for each cid/date pair
            num_xcats_available = (
                X.notna()
                .groupby("cid")
                .cumsum()
                .ge(self.min_periods)
                .sum(axis=1)
            )
            # Number of cids with at least min_xcats available
            cid_count = (
                num_xcats_available.ge(self.min_xcats).groupby("real_date").sum()
            )
            valid_dates = cid_count[cid_count >= self.min_cids].index
            date_last_train = valid_dates.min()
            # The minimum of an empty index is NaT, which is not a Timestamp.
            if not isinstance(date_last_train, pd.Timestamp):
                raise ValueError(
                    f"No splits that satisfy min_xcats {self.min_xcats}. "
                    "Try reducing value passed to min_xcats."
                )

        # Determine all remaining training splits
        splits = [self.unique_dates[self.unique_dates <= date_last_train]]
        i = self.unique_dates.get_loc(date_last_train)
        # Loop until no test sets can be created
        # NOTE(review): the final increment is sliced up to `next_loc`, which
        # may lie at or beyond the last panel date; the matching test set is
        # then empty - confirm whether that is intended.
        while i < len(self.unique_dates) - (self.test_size + 1):
            next_loc = i + self.train_intervals
            splits.append(self.unique_dates[i + 1 : next_loc + 1])
            i = next_loc

        self.n_splits = len(splits)

        return splits

    def get_n_splits(self, X, y, groups=None):
        """
        Calculates and returns the number of splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section,
            date). The dates must be in datetime format. Otherwise the
            dataframe must be in wide format: each feature is a column.
        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of a target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format. If a
            dataframe is provided, the target variable must be the sole column.
        groups : None
            Ignored. Exists for compatibility with scikit-learn.

        Returns
        -------
        n_splits : int
            The number of splits.
        """
        # NOTE(review): unlike split(), rows with missing values are dropped
        # here and the target column takes part in the availability count;
        # confirm both methods agree for panels that contain NaNs.
        Xy = pd.concat([X, y], axis=1).dropna()
        real_dates = Xy.index.get_level_values(1)
        self._determine_unique_training_times(Xy, real_dates)

        return self.n_splits

    def visualise_splits(
        self,
        X,
        y,
        figsize=(20, 5),
        show_title=True,
        tick_fontsize=None,
        label_fontsize=None,
        subtitle_fontsize=None,
    ):
        """
        Visualise the cross-validation splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features/quantamental indicators, multi-indexed
            by (cross-section, date). The dates must be in datetime format. The
            dataframe must be in wide format: each feature is a column.
        y : pd.DataFrame
            Pandas dataframe of target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format.
        figsize : Tuple[int, int]
            Tuple of integers specifying the splitter visualisation figure
            size.
        show_title : bool, optional
            Boolean specifying whether to show the title of the figure. Default
            is True.
        tick_fontsize : int, optional
            Integer specifying the size of the x-axis tick labels. Default is
            None.
        label_fontsize : int, optional
            Integer specifying the size of the y-axis labels. Default is None.
        subtitle_fontsize : int, optional
            Integer specifying the size of the subplot titles. Default is None.
        """
        # Plotting is delegated entirely to the parent class.
        super().visualise_splits(
            X,
            y,
            figsize,
            show_title,
            tick_fontsize,
            label_fontsize,
            subtitle_fontsize,
        )

    def _check_init_params(
        self,
        train_intervals,
        test_size,
    ):
        """
        Type and value checks for the class initialisation parameters.

        Parameters
        ----------
        train_intervals : int
            The number of time periods by which the previous training set is
            expanded.
        test_size : int
            The number of time periods forward of each training set to use in
            the associated test set.

        Raises
        ------
        TypeError
            If either argument is not an integer.
        ValueError
            If either argument is smaller than 1.
        """
        # train_intervals
        if not isinstance(train_intervals, int):
            raise TypeError(
                f"train_intervals must be an integer. Got {type(train_intervals)}."
            )
        if train_intervals < 1:
            raise ValueError(
                f"train_intervals must be an integer greater than 0. Got {train_intervals}."
            )
        # test_size
        if not isinstance(test_size, int):
            raise TypeError(f"test_size must be an integer. Got {type(test_size)}.")
        if test_size < 1:
            raise ValueError(
                f"test_size must be an integer greater than 0. Got {test_size}."
            )
class ExpandingFrequencyPanelSplit(WalkForwardPanelSplit):
    """
    Walk-forward cross-validator over a panel.

    Provides train/test indices to split data into train/test sets. The dataset
    is split so that subsequent training sets are expanded by a user-specified
    frequency to incorporate the latest available information. Each training
    set is followed by a test set spanning a user-defined frequency.

    Parameters
    ----------
    expansion_freq : str
        Frequency of training set expansion. For a given native dataset
        frequency, the training sets expand by the smallest number of dates to
        cover this frequency. Default is "D". Accepted values are "D", "W",
        "M", "Q" and "Y".
    test_freq : str
        Frequency forward of each training set for the unique dates in each
        test set to cover. Default is "D". Accepted values are "D", "W", "M",
        "Q" and "Y".
    min_cids : int
        Minimum number of cross-sections required for the initial training set.
        Default is 4. Either start_date or (min_cids, min_periods, min_xcats)
        must be provided. If both are provided, start_date takes precedence.
    min_periods : int
        Minimum number of time periods required for the initial training set.
        Default is 500. Either start_date or (min_cids, min_periods, min_xcats)
        must be provided. If both are provided, start_date takes precedence.
    min_xcats : int
        Minimum number of xcats required for the initial training set. Default
        is 1. Either start_date or (min_cids, min_periods, min_xcats) must be
        provided. If both are provided, start_date takes precedence.
    start_date : Optional[str]
        First rebalancing date in ISO 8601 format. This is the last date of the
        first training set. Default is None. Either start_date or (min_cids,
        min_periods, min_xcats) must be provided. If both are provided,
        start_date takes precedence.
    max_periods : Optional[int]
        The maximum number of time periods in each training set. If the maximum
        is exceeded, the earliest periods are cut off. This effectively creates
        rolling training sets. Default is None.

    Notes
    -----
    The first training set is either determined by the specification of
    `start_date` or by the parameters `min_cids`, `min_periods` and `min_xcats`
    collectively.

    When `start_date` is provided, the initial training set comprises all
    available data up to and including the `start_date`, unless `max_periods`
    is specified, in which case at most the last `max_periods` periods up to
    the `start_date` are included.

    If `start_date` is not provided, the first training set is determined by
    the parameters `min_cids`, `min_periods`, and `min_xcats`. This set
    comprises at least `min_xcats` categories for at least `min_periods` time
    periods for at least `min_cids` cross-sections.

    This initial training set is immediately adjusted depending on the
    specified training interval frequency. For instance, if the training
    frequency is "M", the initial training set is further expanded so that all
    samples prior to the end of the month are included.

    The associated test set immediately follows the adjusted initial training
    set and spans the specified test set frequency forward of its associated
    training set. For instance, if the test frequency is "Q", the available
    dates that cover the subsequent quarter are grouped together to form the
    test set.

    Subsequent training sets are created by expanding the previous training set
    by the smallest number of dates to cover the training frequency. As before,
    each test set immediately follows its associated training set and is
    determined in the same manner as the initial test set.
    """

    def __init__(
        self,
        expansion_freq="D",
        test_freq="D",
        min_cids=4,
        min_periods=500,
        min_xcats=1,
        start_date=None,
        max_periods=None,
    ):
        # Checks
        super().__init__(
            min_cids=min_cids,
            min_periods=min_periods,
            start_date=start_date,
            max_periods=max_periods,
            min_xcats=min_xcats,
        )
        self._check_init_params(
            expansion_freq=expansion_freq,
            test_freq=test_freq,
        )

        # Attributes
        self.expansion_freq = expansion_freq
        self.test_freq = test_freq
        # Calendar offset corresponding to each accepted frequency code.
        self.freq_offsets = {
            "D": pd.DateOffset(days=1),
            "W": pd.DateOffset(weeks=1),
            "M": pd.DateOffset(months=1),
            "Q": pd.DateOffset(months=3),
            "Y": pd.DateOffset(years=1),
        }

    def split(self, X, y, groups=None):
        """
        Generate indices to split data into training and test sets.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section,
            date). The dates must be in datetime format. Otherwise the
            dataframe must be in wide format: each feature is a column.
        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of a target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format. If a
            dataframe is provided, the target variable must be the sole column.
        groups : None
            Ignored. Exists for compatibility with scikit-learn.

        Yields
        ------
        train : np.ndarray
            The training set indices for that split.
        test : np.ndarray
            The testing set indices for that split.
        """
        # Checks
        self._check_split_params(X, y, groups)

        # Determine the unique dates added by each training split
        Xy = pd.concat([X, y], axis=1)
        real_dates = Xy.index.get_level_values(1)
        splits = self._determine_unique_training_times(X, real_dates)

        # Accumulate the increments into expanding (or, with max_periods,
        # rolling) sets of training dates.
        cap = self.max_periods
        current = splits[0][-cap:] if cap else splits[0]
        train_splits = [current]
        for increment in splits[1:]:
            current = np.concatenate([current, increment])
            if cap:
                # Drop beginning of training set if it exceeds max_periods.
                current = current[-cap:]
            train_splits.append(current)

        test_offset = self.freq_offsets[self.test_freq]
        last_split = len(train_splits) - 1
        for split_idx, train_dates in enumerate(train_splits):
            train_indices = np.where(real_dates.isin(train_dates))[0]

            # The test set starts on the first panel date after the training set.
            test_start = self.unique_dates[
                self.unique_dates.get_loc(train_dates.max()) + 1
            ]
            in_test = self.unique_dates >= test_start
            if split_idx != last_split:
                # Every test set but the last spans one `test_freq` period;
                # the last one takes all remaining dates of the panel.
                in_test &= self.unique_dates < test_start + test_offset
            test_indices = np.where(real_dates.isin(self.unique_dates[in_test]))[0]

            yield train_indices, test_indices

    def get_n_splits(self, X=None, y=None, groups=None) -> int:
        """
        Calculates and returns the number of splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section,
            date). The dates must be in datetime format. Otherwise the
            dataframe must be in wide format: each feature is a column.
        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of the target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format.
        groups : None
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            The number of splits.

        Raises
        ------
        ValueError
            If `X` or `y` is not provided.
        """
        if X is None or y is None:
            raise ValueError(
                "X and y must be provided to determine the number of splits."
            )

        # Run the same date computation as split() so that the reported number
        # matches the number of (train, test) pairs that split() yields.
        Xy = pd.concat([X, y], axis=1)
        real_dates = Xy.index.get_level_values(1)
        self._determine_unique_training_times(X, real_dates)

        return self.n_splits

    def _determine_unique_training_times(self, Xy, real_dates):
        """
        Returns the unique dates in each training split.

        Parameters
        ----------
        Xy : pd.DataFrame
            Pandas dataframe whose columns are checked for availability,
            multi-indexed by (cross-section, date).
        real_dates : pd.Index
            The dates associated with each sample in Xy.

        Returns
        -------
        splits : list
            The first element holds the dates of the initial training set;
            every further element holds the dates by which the previous
            training set is expanded.

        Raises
        ------
        ValueError
            If no date satisfies the `min_cids`, `min_periods` and `min_xcats`
            requirements, if the initial training set is empty, or if no date
            lies between the initial training set and the final test period.

        Notes
        -----
        This method is called by self.split(). As side effects it sets
        `self.unique_dates` and `self.n_splits`.
        """
        self.unique_dates = real_dates.unique().sort_values()

        # First determine the dates for the first training set
        if self.start_date:
            date_last_train = pd.Timestamp(self.start_date)
        else:
            # Number of features with min_periods of non-NaN data
            # for each cid/date pair
            num_xcats_available = (
                Xy.notna()
                .groupby("cid")
                .cumsum()
                .ge(self.min_periods)
                .sum(axis=1)
            )
            # Number of cids with at least min_xcats available
            cid_count = (
                num_xcats_available.ge(self.min_xcats).groupby("real_date").sum()
            )
            valid_dates = cid_count[cid_count >= self.min_cids].index
            date_last_train = valid_dates.min()
            # The minimum of an empty index is NaT, which is not a Timestamp.
            if not isinstance(date_last_train, pd.Timestamp):
                raise ValueError(
                    f"No splits that satisfy min_xcats {self.min_xcats}. "
                    "Try reducing value passed to min_xcats."
                )

        # The first training set: every panel date up to date_last_train.
        first_train_dates = self.unique_dates[self.unique_dates <= date_last_train]
        if first_train_dates.empty:
            raise ValueError(
                f"start_date {self.start_date} precedes all dates in the "
                "panel, so the first training set would be empty."
            )

        # Determine all remaining training splits by walking through the
        # unique panel dates in steps of "expansion_freq", stopping before
        # the last "test_freq" period of the panel, which is reserved for
        # testing.
        end_date = self.unique_dates[-1] - self.freq_offsets[self.test_freq]
        unique_dates_train = self.unique_dates[
            (self.unique_dates > date_last_train) & (self.unique_dates <= end_date)
        ]
        if unique_dates_train.empty:
            raise ValueError(
                "No dates are available between the first training set "
                f"(ending {date_last_train}) and the start of the final test "
                f"period ({end_date}). Use an earlier start_date, less "
                "restrictive minimum requirements or a shorter test_freq."
            )

        expansion_offset = self.freq_offsets[self.expansion_freq]
        splits = [first_train_dates]
        current_date = unique_dates_train[0]
        while current_date < end_date:
            next_date = current_date + expansion_offset
            if next_date > end_date:
                # Final, possibly shorter, increment: include end_date itself.
                mask = (unique_dates_train >= current_date) & (
                    unique_dates_train <= end_date
                )
            else:
                mask = (unique_dates_train >= current_date) & (
                    unique_dates_train < next_date
                )
            split_dates = unique_dates_train[mask]
            # Periods without any panel date do not form a split.
            if not split_dates.empty:
                splits.append(split_dates)
            current_date = next_date

        self.n_splits = len(splits)

        return splits

    def _check_init_params(
        self,
        expansion_freq,
        test_freq,
    ):
        """
        Type and value checks for the class initialisation parameters.

        Parameters
        ----------
        expansion_freq : str
            Frequency of training set expansion.
        test_freq : str
            Frequency that each test set covers.

        Raises
        ------
        TypeError
            If either argument is not a string.
        ValueError
            If either argument is not one of "D", "W", "M", "Q" or "Y".
        """
        accepted = ("D", "W", "M", "Q", "Y")
        # Both parameters obey the same rules; expansion_freq is checked first.
        for name, value in (
            ("expansion_freq", expansion_freq),
            ("test_freq", test_freq),
        ):
            if not isinstance(value, str):
                raise TypeError(f"{name} must be a string. Got {type(value)}.")
            if value not in accepted:
                raise ValueError(
                    f"{name} must be one of 'D', 'W', 'M', 'Q' or 'Y'."
                    f" Got {value}."
                )

    def visualise_splits(
        self,
        X,
        y,
        figsize=(20, 5),
        show_title=True,
        tick_fontsize=None,
        label_fontsize=None,
        subtitle_fontsize=None,
    ):
        """
        Visualise the cross-validation splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features/quantamental indicators, multi-indexed
            by (cross-section, date). The dates must be in datetime format. The
            dataframe must be in wide format: each feature is a column.
        y : pd.DataFrame
            Pandas dataframe of target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format.
        figsize : Tuple[int, int]
            Tuple of integers specifying the splitter visualisation figure
            size.
        show_title : bool, optional
            Boolean specifying whether to show the title of the figure. Default
            is True.
        tick_fontsize : int, optional
            Integer specifying the size of the x-axis tick labels. Default is
            None.
        label_fontsize : int, optional
            Integer specifying the size of the y-axis labels. Default is None.
        subtitle_fontsize : int, optional
            Integer specifying the size of the subplot titles. Default is None.
        """
        # Plotting is delegated entirely to the parent class.
        super().visualise_splits(
            X,
            y,
            figsize,
            show_title,
            tick_fontsize,
            label_fontsize,
            subtitle_fontsize,
        )
if __name__ == "__main__":
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm

    # Example dataset: four cross-sections and four categories of simulated data.
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Availability window and scaling for each cross-section.
    cid_specs = {
        "AUD": ["2002-01-01", "2020-12-31", 0, 1],
        "CAD": ["2003-01-01", "2020-12-31", 0, 1],
        "GBP": ["2000-01-01", "2020-12-31", 0, 1],
        "USD": ["2000-01-01", "2020-12-31", 0, 1],
    }
    df_cids = pd.DataFrame(index=cids, columns=cols[:4])
    for cid, spec in cid_specs.items():
        df_cids.loc[cid] = spec

    # Availability window and process parameters for each category.
    xcat_specs = {
        "XR": ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1],
        "GROWTH": ["2001-01-01", "2020-12-31", 1, 2, 0.9, 1],
        "INFL": ["2000-01-01", "2020-12-31", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    for xcat, spec in xcat_specs.items():
        df_xcats.loc[xcat] = spec

    # Simulate the panel, apply blacklist periods and pivot to wide format.
    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(len(dfd))
    black = {
        "GBP": ["2009-01-01", "2012-06-30"],
        "CAD": ["2018-01-01", "2100-01-01"],
    }
    dfd = msm.reduce_df(df=dfd, cids=cids, xcats=xcats, blacklist=black)
    dfd = dfd.pivot(index=["cid", "real_date"], columns="xcat", values="value")

    # Features are all categories but the return, which is the target.
    X = dfd.drop(columns=["XR"])
    y = dfd["XR"]

    # ExpandingIncrementPanelSplit
    splitter = ExpandingIncrementPanelSplit(
        train_intervals=21 * 12,
        test_size=21 * 12,
        min_cids=1,
        min_periods=12 * 12,
    )
    splitter.visualise_splits(X, y)

    # ExpandingFrequencyPanelSplit
    splitter = ExpandingFrequencyPanelSplit(
        expansion_freq="Y",
        test_freq="Y",
        min_cids=1,
        min_periods=12 * 12,
    )
    splitter.visualise_splits(X, y)