Source code for macrosynergy.learning.splitters.walk_forward_splitters

"""
Classes for incremental expanding panel cross-validators. 
"""

import numpy as np
import pandas as pd

from macrosynergy.learning.splitters.base_splitters import WalkForwardPanelSplit


[docs]class ExpandingIncrementPanelSplit(WalkForwardPanelSplit):
    """
    Walk-forward cross-validator over a panel.

    Provides train/test indices to split data into train/test sets. The dataset is split
    so that subsequent training sets are expanded by a fixed number of time periods to
    incorporate the latest available information. Each training set is followed by a test
    set of fixed length.

    Parameters
    ----------
    train_intervals : int
        The number of time periods by which the previous training set is expanded.
        Default is 21.
    test_size : int
        The number of time periods forward of each training set to use in the associated
        test set. Default is 21.
    min_cids : int
        The minimum number of cross-sections required for the first training set. Default
        is 4. Either start_date or (min_cids, min_periods, min_xcats) must be provided. If
        both are provided, start_date takes precedence.
    min_periods : int
        The minimum number of time periods required for the first training set. Default is
        500. Either start_date or (min_cids, min_periods, min_xcats) must be provided. If
        both are provided, start_date takes precedence.
    min_xcats : int
        The minimum number of xcats required for the first training set. Default is 1.
        Either start_date or (min_cids, min_periods, min_xcats) must be provided. If
        both are provided, start_date takes precedence.
    start_date : Optional[str]
        The targeted final date in the initial training set in ISO 8601 format.
        Default is None. Either start_date or (min_cids, min_periods) must be provided.
        If both are provided, start_date takes precedence.
    max_periods : Optional[int]
        The maximum number of time periods in each training set. If the maximum is
        exceeded, the earliest periods are cut off. This effectively creates rolling
        training sets. Default is None.

    Notes
    -----
    The first training set is determined by the specification of either `start_date` or
    by the parameters `min_cids`, `min_periods`, and `min_xcats`. When `start_date` is
    provided, the initial training set comprises all available data before and including
    the `start_date`, unless `max_periods` is specified, in which case at most the last
    `max_periods` periods prior to the `start_date` are included.

    If `start_date` is not provided, the first training set is determined by the
    parameters `min_cids`, `min_periods`, and `min_xcats`. This set comprises at
    least `min_xcats` categories for at least `min_periods` time periods for at
    least `min_cids` cross-sections.
    """

    def __init__(
        self,
        train_intervals=21,
        test_size=21,
        min_cids=4,
        min_periods=500,
        min_xcats=1,
        start_date=None,
        max_periods=None,
    ):
        # Checks
        super().__init__(
            min_cids=min_cids,
            min_periods=min_periods,
            min_xcats=min_xcats,
            start_date=start_date,
            max_periods=max_periods,
        )
        self._check_init_params(
            train_intervals=train_intervals,
            test_size=test_size,
        )

        # Attributes
        self.train_intervals = train_intervals
        self.test_size = test_size

[docs]    def split(self, X, y, groups=None):
        """
        Generate indices to split data into training and test sets.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section, date). The
            dates must be in datetime format. Otherwise the dataframe must be in wide
            format: each feature is a column.
        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of a target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format. If a dataframe
            is provided, the target variable must be the sole column.
        groups : None
            Ignored. Exists for compatibility with scikit-learn.

        Yields
        ------
        train : np.ndarray
            The training set indices for that split.
        test : np.ndarray
            The testing set indices for that split.
        """
        # Checks
        self._check_split_params(X, y, groups)

        # Determine the unique dates in each training split
        train_indices = []
        test_indices = []

        Xy = pd.concat([X, y], axis=1)
        real_dates = Xy.index.get_level_values(1)
        splits = self._determine_unique_training_times(X, real_dates)

        # Determine the training and test indices for each split
        train_splits: list = [
            splits[0] if not self.max_periods else splits[0][-self.max_periods :]
        ]
        for i in range(1, self.n_splits):
            train_splits.append(np.concatenate([train_splits[i - 1], splits[i]]))

            # Drop beginning of training set if it exceeds max_periods.
            if self.max_periods:
                train_splits[i] = train_splits[i][-self.max_periods :]

        for split in train_splits:
            train_indices: list = np.where(real_dates.isin(split))[0]
            test_start: int = self.unique_dates.get_loc(split.max()) + 1
            test_indices: list = np.where(
                real_dates.isin(
                    self.unique_dates[test_start : test_start + self.test_size]
                )
            )[0]

            yield train_indices, test_indices

    def _determine_unique_training_times(self, X, real_dates):
        """
        Returns the unique dates in each training split.

        Parameters
        ----------
        X : pd.DataFrame
            A pandas dataframe of features, multi-indexed by (cross-section, date).
        real_dates : pd.Index
            The dates associated with each sample in X.

        Notes
        -----
        This method is called by self.split(). It also returns other variables needed
        for ensuing components of the self.split() method.
        """
        self.unique_dates: pd.DatetimeIndex = real_dates.unique().sort_values()

        # First determine the dates for the first training set
        if self.start_date:
            date_last_train = self.start_date
        else:
            # Number of features with min_periods of non-NaN data
            # for each cid/date pair
            num_xcats_available = (
                X.notna()
                .groupby("cid")
                .cumsum()
                .ge(self.min_periods)
                .sum(axis=1)
            )

            # Number of cids with at least min_xcats available
            cid_count = (
                num_xcats_available
                .ge(self.min_xcats)
                .groupby("real_date")
                .sum()
            )

            valid_dates = cid_count[cid_count >= self.min_cids].index
            date_last_train: pd.Timestamp = valid_dates.min()

            if not isinstance(date_last_train, pd.Timestamp):
                raise ValueError(
                    f"No splits that satisfy min_xcats {self.min_xcats}."
                    f"Try reducing value passed to min_xcats."
                )

        # Determine all remaining training splits
        splits = [self.unique_dates[self.unique_dates <= date_last_train]]
        i = self.unique_dates.get_loc(date_last_train)

        # Loop until no test sets can be created
        while i < len(self.unique_dates) - (self.test_size + 1):
            next_loc = i + self.train_intervals
            splits.append(self.unique_dates[i + 1 : next_loc + 1])
            i = next_loc

        self.n_splits = len(splits)

        return splits

[docs]    def get_n_splits(self, X, y, groups=None):
        """
        Calculates and returns the number of splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section, date). The
            dates must be in datetime format. Otherwise the dataframe must be in wide
            format: each feature is a column.

        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of a target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format. If a dataframe
            is provided, the target variable must be the sole column.

        groups : None
            Ignored. Exists for compatibility with scikit-learn.

        Returns
        -------
        n_splits : int
            The number of splits.
        """
        Xy = pd.concat([X, y], axis=1)
        Xy.dropna(inplace=True)
        real_dates = Xy.index.get_level_values(1)
        self._determine_unique_training_times(Xy, real_dates)

        return self.n_splits

[docs]    def visualise_splits(
        self,
        X,
        y,
        figsize=(20, 5),
        show_title=True,
        tick_fontsize=None,
        label_fontsize=None,
        subtitle_fontsize=None,
    ):
        """
        Visualise the cross-validation splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features/quantamental indicators, multi-indexed by
            (cross-section, date). The dates must be in datetime format. The
            dataframe must be in wide format: each feature is a column.
        y : pd.DataFrame
            Pandas dataframe of target variable, multi-indexed by (cross-section, date).
            The dates must be in datetime format.
        figsize : Tuple[int, int]
            Tuple of integers specifying the splitter visualisation figure size.
        show_title : bool, optional
            Boolean specifying whether to show the title of the figure. Default is True.
        tick_fontsize : int, optional
            Integer specifying the size of the x-axis tick labels. Default is None.
        label_fontsize : int, optional
            Integer specifying the size of the y-axis labels. Default is None.
        subtitle_fontsize : int, optional
            Integer specifying the size of the subplot titles. Default is None.
        """
        super().visualise_splits(X, y, figsize, show_title, tick_fontsize, label_fontsize, subtitle_fontsize)

    def _check_init_params(
        self,
        train_intervals,
        test_size,
    ):
        """
        Type and value checks for the class initialisation parameters.

        Parameters
        ----------
        train_intervals : int
            The number of time periods by which the previous training set is expanded.
        test_size : int
            The number of time periods forward of each training set to use in the associated
            test set.
        """
        # train_intervals
        if not isinstance(train_intervals, int):
            raise TypeError(
                f"train_intervals must be an integer. Got {type(train_intervals)}."
            )
        if train_intervals < 1:
            raise ValueError(
                f"train_intervals must be an integer greater than 0. Got {train_intervals}."
            )

        # test_size
        if not isinstance(test_size, int):
            raise TypeError(f"test_size must be an integer. Got {type(test_size)}.")
        if test_size < 1:
            raise ValueError(
                f"test_size must be an integer greater than 0. Got {test_size}."
            )


[docs]class ExpandingFrequencyPanelSplit(WalkForwardPanelSplit):
    """
    Walk-forward cross-validator over a panel.

    Provides train/test indices to split data into train/test sets. The dataset is split
    so that subsequent training sets are expanded by a user-specified frequency to
    incorporate the latest available information. Each training set is followed by a test
    set spanning a user-defined frequency.

    Parameters
    ----------
    expansion_freq : str
        Frequency of training set expansion. For a given native dataset frequency, the
        training sets expand by the smallest number of dates to cover this frequency.
        Default is "D". Accepted values are "D", "W", "M", "Q" and "Y".
    test_freq : str
        Frequency forward of each training set for the unique dates in each test set to
        cover. Default is "D". Accepted values are "D", "W", "M", "Q" and "Y".
    min_cids : int
        Minimum number of cross-sections required for the initial training set.
        Default is 4. Either start_date or (min_cids, min_periods, min_xcats) must be
        provided. If both are provided, start_date takes precedence.
    min_periods : int
        Minimum number of time periods required for the initial training set. Default is
        500. Either start_date or (min_cids, min_periods, min_xcats) must be provided. If
        both are provided, start_date takes precedence.
    min_xcats : int
        Minimum number of xcats required for the initial training set. Default is 1.
        Either start_date or (min_cids, min_periods, min_xcats) must be provided. If
        both are provided, start_date takes precedence.
    start_date : Optional[str]
        First rebalancing date in ISO 8601 format. This is the last date of the first
        training set. Default is None. Either start_date or (min_cids, min_periods)
        must be provided. If both are provided, start_date takes precedence.
    max_periods : Optional[int]
        The maximum number of time periods in each training set. If the maximum is
        exceeded, the earliest periods are cut off. This effectively creates rolling
        training sets. Default is None.

    Notes
    -----
    The first training set is either determined by the specification of `start_date` or by
    the parameters `min_cids` and `min_periods` collectively. When
    `start_date` is provided, the initial training set comprises all available data prior
    to the `start_date`, unless `max_periods` is specified, in which case at most the last
    `max_periods` periods prior to the `start_date` are included.

    If `start_date` is not provided, the first training set is determined by the parameters
    `min_cids`, `min_periods`, and `min_xcats`. This set comprises at least `min_xcats`
    categories for at least `min_periods` time periods for at least `min_cids` cross-sections.

    This initial training set is immediately adjusted depending on the specified training
    interval frequency. For instance, if the training frequency is "M", the initial
    training set is further expanding so that all samples prior to the end of the month
    are included.

    The associated test set immediately follows the adjusted initial training set and spans
    the specified test set frequency forward of its associated training set.
    For instance, if the test frequency is "Q", the available dates that cover the
    subsequent quarter are grouped together to form the test set.

    Subsequent training sets are created by expanding the previous training set by the
    smallest number of dates to cover the training frequency. As before, each test set immediately
    follows its associated training set and is determined in the same manner as the initial test set.
    """

    def __init__(
        self,
        expansion_freq="D",
        test_freq="D",
        min_cids=4,
        min_periods=500,
        min_xcats=1,
        start_date=None,
        max_periods=None,
    ):
        # Checks
        super().__init__(
            min_cids=min_cids,
            min_periods=min_periods,
            start_date=start_date,
            max_periods=max_periods,
            min_xcats=min_xcats,
        )
        self._check_init_params(
            expansion_freq=expansion_freq,
            test_freq=test_freq,
        )

        # Attributes
        self.expansion_freq = expansion_freq
        self.test_freq = test_freq
        self.freq_offsets = {
            "D": pd.DateOffset(days=1),
            "W": pd.DateOffset(weeks=1),
            "M": pd.DateOffset(months=1),
            "Q": pd.DateOffset(months=3),
            "Y": pd.DateOffset(years=1),
        }

[docs]    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test sets.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features, multi-indexed by (cross-section, date). The
            dates must be in datetime format. Otherwise the dataframe must be in wide
            format: each feature is a column.

        y : Union[pd.DataFrame, pd.Series]
            Pandas dataframe or series of a target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format. If a dataframe
            is provided, the target variable must be the sole column.

        groups : None
            Ignored. Exists for compatibility with scikit-learn.

        Yields
        ------
        train : np.ndarray
            The training set indices for that split.

        test : np.ndarray
            The testing set indices for that split.
        """
        # Checks
        self._check_split_params(X, y, groups)

        # Determine the unique dates in each training split
        train_indices = []
        test_indices = []

        Xy = pd.concat([X, y], axis=1)
        real_dates = Xy.index.get_level_values(1)
        splits = self._determine_unique_training_times(X, real_dates)

        train_splits: list = [
            splits[0] if not self.max_periods else splits[0][-self.max_periods :]
        ]
        for i in range(1, self.n_splits):
            train_splits.append(np.concatenate([train_splits[i - 1], splits[i]]))

            # Drop beginning of training set if it exceeds max_periods.
            if self.max_periods:
                train_splits[i] = train_splits[i][-self.max_periods :]

        test_offset = self.freq_offsets[self.test_freq]
        for split_idx, split in enumerate(train_splits):
            train_indices = np.where(Xy.index.get_level_values(1).isin(split))[0]
            test_start: pd.Timestamp = self.unique_dates[
                self.unique_dates.get_loc(split.max()) + 1
            ]
            test_end: pd.Timestamp = test_start + test_offset
            if split_idx == len(train_splits) - 1:
                test_dates = sorted(self.unique_dates[self.unique_dates >= test_start])
            else:
                test_dates = sorted(
                    self.unique_dates[
                        (self.unique_dates >= test_start)
                        & (self.unique_dates < test_end)
                    ]
                )
            test_indices = np.where(Xy.index.get_level_values(1).isin(test_dates))[0]

            yield train_indices, test_indices

[docs]    def get_n_splits(self, X=None, y=None, groups=None) -> int:
        """
        Calculates and returns the number of splits.

        :param <pd.DataFrame> X: Pandas dataframe of features,
            multi-indexed by (cross-section, date). The dates must be in datetime format.
            Otherwise the dataframe must be in wide format: each feature is a column.
        :param <pd.DataFrame> y: Pandas dataframe of the target variable, multi-indexed by
            (cross-section, date). The dates must be in datetime format.
        :param <pd.DataFrame> groups: Always ignored, exists for compatibility.

        :return <int> n_splits: Returns the number of splits.
        """
        self._determine_unique_time_splits(X, y)

        return self.n_splits

    def _determine_unique_training_times(self, Xy, real_dates):
        """
        Returns the unique dates in each training split.

        Parameters
        ----------
        Xy : pd.DataFrame
            Combined pandas dataframe of features and the target variable, multi-indexed
            by (cross-section, date).
        real_dates : pd.Index
            The dates associated with each sample in Xy.

        Notes
        -----
        This method is called by self.split(). It also returns other variables needed
        for ensuing components of the self.split() method.
        """
        self.unique_dates: pd.DatetimeIndex = real_dates.unique().sort_values()
        # First determine the dates for the first training set
        if self.start_date:
            date_last_train = self.start_date
        else:
            # Number of features with min_periods of non-NaN data
            # for each cid/date pair
            num_xcats_available = (
                Xy.notna()
                .groupby("cid")
                .cumsum()
                .ge(self.min_periods)
                .sum(axis=1)
            )

            # Number of cids with at least min_xcats available
            cid_count = (
                num_xcats_available
                .ge(self.min_xcats)
                .groupby("real_date")
                .sum()
            )

            valid_dates = cid_count[cid_count >= self.min_cids].index
            date_last_train: pd.Timestamp = valid_dates.min()

            if not isinstance(date_last_train, pd.Timestamp):
                raise ValueError(
                    f"No splits that satisfy min_xcats {self.min_xcats}."
                    f"Try reducing value passed to min_xcats."
                )

        # Determine all remaining training splits
        # To do this, loop through unique panel dates to create the training and test sets
        # Loop until the last "test_freq" dates in the panel are reached
        end_date = self.unique_dates[-1] - self.freq_offsets[self.test_freq]
        unique_dates_train: pd.arrays.DatetimeArray = self.unique_dates[
            (self.unique_dates > date_last_train) & (self.unique_dates <= end_date)
        ].sort_values()
        current_date = unique_dates_train[0]
        splits = []

        while current_date < end_date:
            next_date = current_date + self.freq_offsets[self.expansion_freq]
            if next_date > end_date:
                mask = (unique_dates_train >= current_date) & (
                    unique_dates_train <= end_date
                )
            else:
                mask = (unique_dates_train >= current_date) & (
                    unique_dates_train < next_date
                )
            split_dates = unique_dates_train[mask]
            if not split_dates.empty:
                splits.append(split_dates)
            current_date = next_date

        # Add the first training set to the list of training splits, so that the dates
        # that constitute each training split are together.
        splits.insert(
            0,
            real_dates[real_dates <= date_last_train].unique().sort_values(),
        )

        self.n_splits = len(splits)

        return splits

    def _check_init_params(
        self,
        expansion_freq,
        test_freq,
    ):
        # expansion_freq
        if not isinstance(expansion_freq, str):
            raise TypeError(
                f"expansion_freq must be a string. Got {type(expansion_freq)}."
            )
        if expansion_freq not in ["D", "W", "M", "Q", "Y"]:
            raise ValueError(
                f"expansion_freq must be one of 'D', 'W', 'M', 'Q' or 'Y'."
                f" Got {expansion_freq}."
            )

        # test_freq
        if not isinstance(test_freq, str):
            raise TypeError(f"test_freq must be a string. Got {type(test_freq)}.")
        if test_freq not in ["D", "W", "M", "Q", "Y"]:
            raise ValueError(
                f"test_freq must be one of 'D', 'W', 'M', 'Q' or 'Y'."
                f" Got {test_freq}."
            )
        
[docs]    def visualise_splits(
        self,
        X,
        y,
        figsize=(20, 5),
        show_title=True,
        tick_fontsize=None,
        label_fontsize=None,
        subtitle_fontsize=None,
    ):
        """
        Visualise the cross-validation splits.

        Parameters
        ----------
        X : pd.DataFrame
            Pandas dataframe of features/quantamental indicators, multi-indexed by
            (cross-section, date). The dates must be in datetime format. The
            dataframe must be in wide format: each feature is a column.
        y : pd.DataFrame
            Pandas dataframe of target variable, multi-indexed by (cross-section, date).
            The dates must be in datetime format.
        figsize : Tuple[int, int]
            Tuple of integers specifying the splitter visualisation figure size.
        show_title : bool, optional
            Boolean specifying whether to show the title of the figure. Default is True.
        tick_fontsize : int, optional
            Integer specifying the size of the x-axis tick labels. Default is None.
        label_fontsize : int, optional
            Integer specifying the size of the y-axis labels. Default is None.
        subtitle_fontsize : int, optional
            Integer specifying the size of the subplot titles. Default is None.
        """
        super().visualise_splits(X, y, figsize, show_title, tick_fontsize, label_fontsize, subtitle_fontsize)

if __name__ == "__main__":
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm

    # Example dataset
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2002-01-01", "2020-12-31", 0, 1]
    df_cids.loc["CAD"] = ["2003-01-01", "2020-12-31", 0, 1]
    df_cids.loc["GBP"] = ["2000-01-01", "2020-12-31", 0, 1]
    df_cids.loc["USD"] = ["2000-01-01", "2020-12-31", 0, 1]

    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["CRY"] = ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1]
    df_xcats.loc["GROWTH"] = ["2001-01-01", "2020-12-31", 1, 2, 0.9, 1]
    df_xcats.loc["INFL"] = ["2000-01-01", "2020-12-31", 1, 2, 0.8, 0.5]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])
    black = {"GBP": ["2009-01-01", "2012-06-30"], "CAD": ["2018-01-01", "2100-01-01"]}
    dfd = msm.reduce_df(df=dfd, cids=cids, xcats=xcats, blacklist=black)

    dfd = dfd.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    X = dfd.drop(columns=["XR"])
    y = dfd["XR"]

    # ExpandingIncrementPanelSplit
    splitter = ExpandingIncrementPanelSplit(train_intervals=21 * 12, test_size=21 * 12, min_cids = 1, min_periods = 12*12)
    splitter.visualise_splits(X, y)

    # ExpandingFrequencyPanelSplit
    splitter = ExpandingFrequencyPanelSplit(expansion_freq="Y", test_freq="Y", min_cids = 1, min_periods = 12*12)
    splitter.visualise_splits(X, y)