# Source code for macrosynergy.learning.cv_tools

"""
A set of tools for cross-validation of panel data.
"""

import numpy as np
import pandas as pd
import datetime

from macrosynergy.learning import ExpandingKFoldPanelSplit
from macrosynergy.learning.splitters.base_splitters import BasePanelSplit

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer


def panel_cv_scores(
    X,
    y,
    splitter,
    estimators,
    scoring,
    show_longbias=True,
    show_std=False,
    verbose=1,
    n_jobs=-1,
):
    """
    Returns a dataframe of cross-validation scores for a collection of models, with
    respect to a cross-validation splitter and a set of scorers.

    Parameters
    ----------
    X : pd.DataFrame
        Input feature matrix, multi-indexed with dates on the inner level.
    y : pd.DataFrame or pd.Series
        Target variable, carrying exactly the same multi-index as `X`.
    splitter : BasePanelSplit
        Panel cross-validation splitter.
    estimators : dict
        Non-empty dictionary of models, keyed by (string) model name.
    scoring : dict
        Non-empty dictionary of scorers, keyed by (string) metric name. The
        dictionary passed in is never modified.
    show_longbias : bool, optional, default=True
        Whether to show the proportion of times a model predicts a positive return.
        When True, the rows "Positive prediction ratio" and
        "Positive test-target ratio" are appended after the user-supplied metrics.
    show_std : bool, optional, default=False
        Whether to show the standard deviation of the cross-validation scores over folds.
    verbose : int, optional, default=1
        Verbosity level. A non-zero value prints one progress line per estimator;
        the value is also forwarded to `cross_validate`.
    n_jobs : int, optional, default=-1
        Number of jobs to run in parallel. Either a positive integer or -1.

    Returns
    -------
    pd.DataFrame
        Dataframe of cross-validation scores.

    Raises
    ------
    TypeError
        If any argument has the wrong type, if `X` or `y` is not multi-indexed, or
        if the inner index level of `X` or `y` does not hold dates.
    ValueError
        If `estimators` or `scoring` is empty, if the indices of `X` and `y`
        differ, if `verbose` is negative, or if `n_jobs` is neither a positive
        integer nor -1.

    Notes
    -----
    The columns of the returned dataframe are the estimators. With
    `show_std=False` the index holds the metric names and each cell is the mean
    score over validation splits. With `show_std=True` the dataframe is
    multi-indexed with the outer index representing a metric and the inner index
    being "mean" or "std" (computed with `np.std`) over validation splits.
    """

    # check input types
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas dataframe.")
    if not isinstance(y, (pd.DataFrame, pd.Series)):
        raise TypeError("y must be a pandas dataframe or series.")
    if not isinstance(X.index, pd.MultiIndex):
        raise TypeError("X must be multi-indexed.")
    if not isinstance(y.index, pd.MultiIndex):
        raise TypeError("y must be multi-indexed.")
    if not isinstance(splitter, BasePanelSplit):
        raise TypeError("splitter must be an inherit from BasePanelSplit.")
    if not isinstance(estimators, dict):
        raise TypeError("estimators must be a dictionary.")
    if not estimators:
        raise ValueError("estimators must not be an empty dictionary.")
    if any(not isinstance(est_name, str) for est_name in estimators):
        raise TypeError("estimator names must all be strings.")
    # NOTE: no per-estimator type check is made here; every Python value is an
    # `object`, so such a check could never fail. Invalid estimators surface
    # when `cross_validate` tries to use them.
    if not isinstance(scoring, dict):
        raise TypeError("scoring must be a dictionary.")
    if not scoring:
        raise ValueError("scoring must not be an empty dictionary.")
    if any(not isinstance(metric_name, str) for metric_name in scoring):
        raise TypeError("scorer names must all be strings.")

    if not isinstance(show_longbias, bool):
        raise TypeError("show_longbias must be a boolean.")
    if not isinstance(show_std, bool):
        raise TypeError("show_std must be a boolean.")
    if not isinstance(verbose, int):
        raise TypeError("verbose must be an integer.")
    if not isinstance(n_jobs, int):
        raise TypeError("n_jobs must be an integer.")

    # check the dataframes are in the right format
    if not isinstance(X.index.get_level_values(1)[0], datetime.date):
        raise TypeError("The inner index of X must be datetime.date.")
    if not isinstance(y.index.get_level_values(1)[0], datetime.date):
        raise TypeError("The inner index of y must be datetime.date.")
    if not X.index.equals(y.index):
        raise ValueError(
            "The indices of the input dataframe X and the output dataframe y don't match."
        )

    # check value ranges
    if verbose < 0:
        raise ValueError("verbose must be a non-negative integer.")
    if n_jobs < 1 and n_jobs != -1:
        raise ValueError("n_jobs must either be a positive integer or equal to -1.")

    # add the long-bias diagnostics on a copy so the caller's dict is untouched
    if show_longbias:
        scoring = scoring.copy()
        scoring["Positive prediction ratio"] = make_scorer(
            lambda y_true, y_pred: np.sum(y_pred > 0) / len(y_pred)
        )
        scoring["Positive test-target ratio"] = make_scorer(
            lambda y_true, y_pred: np.sum(y_true > 0) / len(y_true)
        )

    # one column of summary statistics per estimator, rows ordered as in `scoring`
    columns: dict = {}
    for estimator_name, estimator in estimators.items():
        if verbose:
            print(f"Calculating walk-forward validation metrics for {estimator_name}.")
        cv_results: dict = cross_validate(
            estimator,
            X,
            y,
            cv=splitter,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
        )

        column: list = []
        for metric_name in scoring:
            fold_scores: np.ndarray = cv_results[f"test_{metric_name}"]
            column.append(np.mean(fold_scores))
            if show_std:
                column.append(np.std(fold_scores))
        columns[estimator_name] = column

    # Build the row labels directly from the metric names. Parsing them back out
    # of flat "<metric>_std" strings would truncate any metric name containing an
    # underscore (e.g. "neg_rmse") and mislabel any name containing "std".
    if show_std:
        index = pd.MultiIndex.from_tuples(
            [
                (metric_name, statistic)
                for metric_name in scoring
                for statistic in ("mean", "std")
            ]
        )
    else:
        index = pd.Index(list(scoring))

    return pd.DataFrame(columns, index=index)


if __name__ == "__main__":
    # Demo script: simulate an unbalanced four-country panel and print the
    # cross-validation score table for two linear models.
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm
    import macrosynergy.learning as msl

    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import (
        mean_absolute_error,
        mean_absolute_percentage_error,
        root_mean_squared_error,
    )

    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Example 1: Unbalanced panel

    # Per-cross-section simulation settings (only the first four columns apply).
    cid_settings = {
        "AUD": ["2002-01-01", "2020-12-31", 0, 1],
        "CAD": ["2003-01-01", "2020-12-31", 0, 1],
        "GBP": ["2000-01-01", "2020-12-31", 0, 1],
        "USD": ["2000-01-01", "2020-12-31", 0, 1],
    }
    df_cids2 = pd.DataFrame(index=cids, columns=cols[:4])
    for cid, settings in cid_settings.items():
        df_cids2.loc[cid] = settings

    # Per-category simulation settings.
    xcat_settings = {
        "XR": ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1],
        "GROWTH": ["2000-01-01", "2020-12-31", 1, 2, 0.9, 1],
        "INFL": ["2000-01-01", "2020-12-31", 1, 2, 0.8, 0.5],
    }
    df_xcats2 = pd.DataFrame(index=xcats, columns=cols)
    for xcat, settings in xcat_settings.items():
        df_xcats2.loc[xcat] = settings

    dfd2 = make_qdf(df_cids2, df_xcats2, back_ar=0.75)
    dfd2["grading"] = np.ones(len(dfd2))
    black = {
        "GBP": ["2009-01-01", "2012-06-30"],
        "CAD": ["2018-01-01", "2100-01-01"],
    }
    dfd2 = msm.reduce_df(df=dfd2, cids=cids, xcats=xcats, blacklist=black)

    # Wide format: one row per (cid, real_date), one column per category.
    dfd2 = dfd2.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    y2 = dfd2["XR"]
    X2 = dfd2.drop("XR", axis=1)

    # 1) Demonstration of panel_cv_scores
    splitex = ExpandingKFoldPanelSplit(n_splits=100)
    models = {"OLS": LinearRegression(), "Lasso": Lasso(alpha=0.05)}

    # Error metrics are negated so that larger values are better.
    error_functions = {
        "neg_rmse": root_mean_squared_error,
        "neg_mae": mean_absolute_error,
        "neg_mape": mean_absolute_percentage_error,
    }
    score_functions = {
        "acc": msl.regression_accuracy,
        "bac": msl.regression_balanced_accuracy,
        "map": msl.panel_significance_probability,
        "sharpe": msl.sharpe_ratio,
        "sortino": msl.sortino_ratio,
    }
    metrics = {
        name: make_scorer(func, greater_is_better=False)
        for name, func in error_functions.items()
    }
    for name, func in score_functions.items():
        metrics[name] = make_scorer(func)

    df_ev = panel_cv_scores(
        X2,
        y2,
        splitter=splitex,
        estimators=models,
        scoring=metrics,
        show_longbias=True,
        show_std=False,
        verbose=1,
        n_jobs=-1,
    )
    print(df_ev)