Source code for macrosynergy.learning.splitters.kfold_splitters

"""
Panel K-Fold cross-validator classes. 
"""

import numpy as np
import pandas as pd

from macrosynergy.learning.splitters.base_splitters import KFoldPanelSplit


[docs]class ExpandingKFoldPanelSplit(KFoldPanelSplit):
    """
    Time-respecting K-Fold cross-validator for panel data.

    Parameters
    ----------
    n_splits : int
        Number of folds i.e. (training set, test set) pairs. Default is 5.
        Must be at least 2.

    Notes
    -----
    This splitter can be considered to be a panel data analogue to the `TimeSeriesSplit`
    splitter provided by `scikit-learn`.

    Unique dates in the panel are divided into 'n_splits + 1' sequential and
    non-overlapping intervals, resulting in 'n_splits' pairs of training and test sets.
    The 'i'th training set is the union of the first 'i' intervals, and the 'i'th test set
    is the 'i+1'th interval.
    """

    def _determine_splits(self, unique_dates, n_splits):
        """
        Determine panel time period splits based on the sorted collection of unique dates and the
        number of splits specified by the user.

        Parameters
        ----------
        unique_dates : pd.DatetimeIndex
            Sorted collection of unique dates in the panel.
        n_splits : int
            Number of splits to generate.

        Returns
        -------
        splits : list of np.ndarray
            List of numpy arrays denoting dates in each split.
        """
        return np.array_split(unique_dates, n_splits + 1)

    def _get_split_indicies(self, n_split, splits, Xy, dates, unique_dates):
        """
        Determine the training and test set indices for a given split.

        Parameters
        ----------
        n_split : int
            Index of the current split.
        splits : list of np.ndarray
            List of numpy arrays denoting dates in each split.
        Xy : pd.DataFrame
            Combined dataframe of the features and the target variable.
        dates : pd.DatetimeIndex
            DatetimeIndex of all dates in the panel.
        unique_dates : pd.DatetimeIndex
            Sorted collection of unique dates in the panel.

        Returns
        -------
        train : np.ndarray
            The training set indices for that split.
        test : np.ndarray
            The testing set indices for that split.
        """
        train_split = np.concatenate(splits[: n_split + 1])
        train_indices = np.where(dates.isin(train_split))[0]
        test_indices = np.where(dates.isin(splits[n_split + 1]))[0]

        return train_indices, test_indices


[docs]class RollingKFoldPanelSplit(KFoldPanelSplit):
    """
    Unshuffled K-Fold cross-validator for panel data.

    Parameters
    ----------
    n_splits : int
        Number of folds. Default is 5. Must be at least 2.

    Notes
    -----
    This splitter can be considered to be a panel data analogue to the `KFold` splitter
    provided by `scikit-learn`, with `shuffle=False` and with splits determined on the
    time dimension.

    Unique dates in the panel are divided into 'n_splits' sequential and non-overlapping
    intervals of equal size, resulting in 'n_splits' pairs of training and test sets.
    The 'i'th test set is the 'i'th interval, and the 'i'th training set is all other
    intervals.
    """

    def _determine_splits(self, unique_dates, n_splits):
        """
        Determine panel time period splits based on the sorted collection of unique dates and the
        number of splits specified by the user.

        Parameters
        ----------
        unique_dates : pd.DatetimeIndex
            Sorted collection of unique dates in the panel.
        n_splits : int
            Number of splits to generate.

        Returns
        -------
        splits : list of np.ndarray
            List of numpy arrays denoting dates in each split.
        """
        return np.array_split(unique_dates, n_splits)

    def _get_split_indicies(self, n_split, splits, Xy, dates, unique_dates):
        """
        Determine the training and test set indices for a given split.

        Parameters
        ----------
        n_split : int
            Index of the current split.
        splits : list of np.ndarray
            List of numpy arrays denoting dates in each split.
        Xy : pd.DataFrame
            Combined dataframe of the features and the target variable.
        dates : pd.DatetimeIndex
            DatetimeIndex of all dates in the panel.
        unique_dates : pd.DatetimeIndex
            Sorted collection of unique dates in the panel.

        Returns
        -------
        train : np.ndarray
            The training set indices for that split.
        test : np.ndarray
            The testing set indices for that split.
        """
        test_split = splits[n_split]
        train_split = np.concatenate(splits[:n_split] + splits[n_split + 1 :])
        train_indices = np.where(dates.isin(train_split))[0]
        test_indices = np.where(dates.isin(test_split))[0]

        return train_indices, test_indices


[docs]class RecencyKFoldPanelSplit(KFoldPanelSplit):
    """
    Time-respecting K-Fold panel cross-validator that creates training and test sets based
    on the most recent samples in the panel.

    Parameters
    ----------
    n_splits : int
        Number of folds i.e. (training set, test set) pairs. Default is 5.
        Must be at least 1.
    n_periods : int
        Number of time periods, in units of native dataset frequency, to comprise each
        test set. Default is 252 (1 year for daily data).

    Notes
    -----
    This splitter is similar to the ExpandingKFoldPanelSplit, except that the sorted
    unique timestamps are not divided into equal intervals. Instead, the last
    `n_periods` * `n_splits` timestamps in the panel are divided into `n_splits`
    non-overlapping intervals, each of which is used as a test set. The corresponding
    training set is comprised of all samples with timestamps earlier than its test set.
    Consequently, this is a K-Fold walk-forward cross-validator, but with test folds
    concentrated on the most recent information.
    """

    def __init__(self, n_splits=5, n_periods=252):
        super().__init__(n_splits=n_splits, min_n_splits=1)

        # Additional checks
        if not isinstance(n_periods, int):
            raise TypeError(f"n_periods must be an integer. Got {type(n_periods)}.")
        if n_periods < 1:
            raise ValueError(
                f"Cannot have number of periods less than 1. Got {n_periods}."
            )

        # Additional attributes
        self.n_periods = n_periods

    def _determine_splits(self, unique_dates, n_splits):
        """
        Determine panel time period splits based on the sorted collection of unique dates and the
        number of splits specified by the user.

        Parameters
        ----------
        unique_dates : pd.DatetimeIndex
            Sorted collection of unique dates in the panel.
        n_splits : int
            Number of splits to generate.

        Returns
        -------
        splits : list of np.ndarray
            List of numpy arrays denoting dates in each split.
        """
        return np.array_split(unique_dates[-n_splits * self.n_periods :], n_splits)

    def _get_split_indicies(self, n_split, splits, Xy, dates, unique_dates):
        """
        Determine the training and test set indices for a given split.

        Parameters
        ----------
        n_split : int
            Index of the current split.
        splits : list of np.ndarray
            List of numpy arrays denoting dates in each split.
        Xy : pd.DataFrame
            Combined dataframe of the features and the target variable.
        dates : pd.DatetimeIndex
            DatetimeIndex of all dates in the panel.
        unique_dates : pd.DatetimeIndex
            Sorted collection of unique dates in the panel.

        Returns
        -------
        train : np.ndarray
            The training set indices for that split.
        test : np.ndarray
            The testing set indices for that split.
        """
        test_split = np.array(splits[n_split], dtype=np.datetime64)
        train_split = unique_dates[unique_dates < test_split[0]]

        train_indices = np.where(dates.isin(train_split))[0]
        test_indices = np.where(dates.isin(test_split))[0]

        return train_indices, test_indices


if __name__ == "__main__":
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm

    # Example dataset
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2002-01-01", "2020-12-31", 0, 1]
    df_cids.loc["CAD"] = ["2003-01-01", "2020-12-31", 0, 1]
    df_cids.loc["GBP"] = ["2000-01-01", "2020-12-31", 0, 1]
    df_cids.loc["USD"] = ["2000-01-01", "2020-12-31", 0, 1]

    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["CRY"] = ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1]
    df_xcats.loc["GROWTH"] = ["2001-01-01", "2020-12-31", 1, 2, 0.9, 1]
    df_xcats.loc["INFL"] = ["2000-01-01", "2020-12-31", 1, 2, 0.8, 0.5]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])
    black = {"GBP": ["2009-01-01", "2012-06-30"], "CAD": ["2018-01-01", "2100-01-01"]}
    dfd = msm.reduce_df(df=dfd, cids=cids, xcats=xcats, blacklist=black)

    dfd = dfd.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    X = dfd.drop(columns=["XR"])
    y = dfd["XR"]    

    """ Single validation set example """
    splitter = RecencyKFoldPanelSplit(n_splits=1, n_periods=21*12)
    splitter.visualise_splits(X, y, show_title=False, tick_fontsize=12, label_fontsize=12)

    """ Cross-validation examples """
    # ExpandingKFoldPanelSplit
    splitter = ExpandingKFoldPanelSplit(n_splits=5)
    splitter.visualise_splits(X, y, tick_fontsize=12, label_fontsize=12, subtitle_fontsize=14)

    # RollingKFoldPanelSplit
    splitter = RollingKFoldPanelSplit(n_splits=5)
    splitter.visualise_splits(X, y, tick_fontsize=12, label_fontsize=12, subtitle_fontsize=14)

    # RecencyKFoldPanelSplit
    splitter = RecencyKFoldPanelSplit(n_splits=4, n_periods=21 * 3)
    splitter.visualise_splits(X, y, tick_fontsize=8, label_fontsize=8, subtitle_fontsize=20)