# Source code for macrosynergy.learning.cv_tools

"""
A set of tools for cross-validation of panel data.
"""

import numpy as np
import pandas as pd
import datetime

from macrosynergy.learning import ExpandingKFoldPanelSplit
from macrosynergy.learning.splitters.base_splitters import BasePanelSplit

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer


def panel_cv_scores(
    X,
    y,
    splitter,
    estimators,
    scoring,
    show_longbias=True,
    show_std=False,
    verbose=1,
    n_jobs=-1,
):
    """
    Return a dataframe of cross-validation scores for a collection of models,
    with respect to a cross-validation splitter and a set of scorers.

    Parameters
    ----------
    X : pd.DataFrame
        Input feature matrix, multi-indexed with dates on the inner level.
    y : pd.DataFrame or pd.Series
        Target variable, carrying exactly the same index as `X`.
    splitter : BasePanelSplit
        Panel cross-validation splitter.
    estimators : dict
        Dictionary of models, keyed by model name (str).
    scoring : dict
        Dictionary of scorers, keyed by metric name (str). The dictionary
        passed in is never mutated.
    show_longbias : bool, optional, default=True
        Whether to append the rows "Positive prediction ratio" and
        "Positive test-target ratio" (share of strictly positive predictions
        and of strictly positive test targets respectively).
    show_std : bool, optional, default=False
        Whether to show the standard deviation of the cross-validation scores
        over folds.
    verbose : int, optional, default=1
        Verbosity level. Also forwarded to `cross_validate`.
    n_jobs : int, optional, default=-1
        Number of jobs to run in parallel. Forwarded to `cross_validate`.

    Returns
    -------
    pd.DataFrame
        Dataframe of cross-validation scores with one column per estimator.

    Raises
    ------
    TypeError
        If any argument has the wrong type, or if the inner index level of
        `X` or `y` does not hold `datetime.date` instances.
    ValueError
        If `estimators` or `scoring` is empty, if `X` or `y` has no rows, if
        the indices of `X` and `y` differ, if `verbose` is negative, or if
        `n_jobs` is neither a positive integer nor -1.

    Notes
    -----
    When `show_std` is False the rows are indexed by metric name and hold the
    mean score over validation splits. When `show_std` is True the rows are
    multi-indexed: the outer level is the full metric name and the inner level
    is either "mean" or "std".
    """
    # check input types
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas dataframe.")
    if not isinstance(y, (pd.DataFrame, pd.Series)):
        raise TypeError("y must be a pandas dataframe or series.")
    if not isinstance(X.index, pd.MultiIndex):
        raise TypeError("X must be multi-indexed.")
    if not isinstance(y.index, pd.MultiIndex):
        raise TypeError("y must be multi-indexed.")
    if not isinstance(splitter, BasePanelSplit):
        raise TypeError("splitter must be an inherit from BasePanelSplit.")
    if not isinstance(estimators, dict):
        raise TypeError("estimators must be a dictionary.")
    if not estimators:
        raise ValueError("estimators must not be an empty dictionary.")
    if not all(isinstance(est_name, str) for est_name in estimators):
        raise TypeError("estimator names must all be strings.")
    # NOTE: no per-estimator type check is made; every Python value is an
    # `object`, so such a check could never fail.
    if not isinstance(scoring, dict):
        raise TypeError("scoring must be a dictionary.")
    if not scoring:
        raise ValueError("scoring must not be an empty dictionary.")
    if not all(isinstance(metric_name, str) for metric_name in scoring):
        raise TypeError("scorer names must all be strings.")
    if not isinstance(show_longbias, bool):
        raise TypeError("show_longbias must be a boolean.")
    if not isinstance(show_std, bool):
        raise TypeError("show_std must be a boolean.")
    if not isinstance(verbose, int):
        raise TypeError("verbose must be an integer.")
    if not isinstance(n_jobs, int):
        raise TypeError("n_jobs must be an integer.")

    # check the dataframes are in the right format
    # Guard first: indexing element [0] of an empty index would otherwise
    # surface as an uninformative IndexError.
    if len(X.index) == 0 or len(y.index) == 0:
        raise ValueError("X and y must not be empty.")
    if not isinstance(X.index.get_level_values(1)[0], datetime.date):
        raise TypeError("The inner index of X must be datetime.date.")
    if not isinstance(y.index.get_level_values(1)[0], datetime.date):
        raise TypeError("The inner index of y must be datetime.date.")
    if not X.index.equals(y.index):
        raise ValueError(
            "The indices of the input dataframe X and the output dataframe y don't match."
        )

    if verbose < 0:
        raise ValueError("verbose must be a non-negative integer.")
    if n_jobs < 1 and n_jobs != -1:
        raise ValueError("n_jobs must either be a positive integer or equal to -1.")

    # Work on a copy so that the caller's scoring dictionary is left untouched.
    if show_longbias:
        scoring = scoring.copy()
        scoring["Positive prediction ratio"] = make_scorer(
            lambda y_true, y_pred: np.sum(y_pred > 0) / len(y_pred)
        )
        scoring["Positive test-target ratio"] = make_scorer(
            lambda y_true, y_pred: np.sum(y_true > 0) / len(y_true)
        )

    # One flat list of values per estimator, ordered metric by metric
    # (mean first, then the optional standard deviation).
    columns = {}
    for estimator_name, estimator in estimators.items():
        if verbose:
            print(f"Calculating walk-forward validation metrics for {estimator_name}.")

        cv_results = cross_validate(
            estimator,
            X,
            y,
            cv=splitter,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
        )

        column = []
        for metric_name in scoring:
            fold_scores = cv_results[f"test_{metric_name}"]
            column.append(np.mean(fold_scores))
            if show_std:
                column.append(np.std(fold_scores))
        columns[estimator_name] = column

    # Build the row labels straight from the metric names rather than by
    # parsing "<metric>_std" strings: splitting on "_" would truncate names
    # such as "neg_rmse" and collapse distinct metrics onto one label.
    if show_std:
        row_index = pd.MultiIndex.from_product([list(scoring), ["mean", "std"]])
    else:
        row_index = pd.Index(list(scoring))

    return pd.DataFrame(columns, index=row_index)
if __name__ == "__main__":
    # Demonstration script: builds a simulated panel and prints the
    # cross-validation score table produced by panel_cv_scores.
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm
    import macrosynergy.learning as msl

    from sklearn.linear_model import LinearRegression, Lasso
    from sklearn.metrics import (
        root_mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
    )

    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Example 1: Unbalanced panel
    # Per-cross-section rows: earliest, latest, mean_add, sd_mult.
    cid_rows = {
        "AUD": ["2002-01-01", "2020-12-31", 0, 1],
        "CAD": ["2003-01-01", "2020-12-31", 0, 1],
        "GBP": ["2000-01-01", "2020-12-31", 0, 1],
        "USD": ["2000-01-01", "2020-12-31", 0, 1],
    }
    df_cids2 = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid, row in cid_rows.items():
        df_cids2.loc[cid] = row

    # Per-category rows, in the column order given by `cols`.
    xcat_rows = {
        "XR": ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1],
        "GROWTH": ["2000-01-01", "2020-12-31", 1, 2, 0.9, 1],
        "INFL": ["2000-01-01", "2020-12-31", 1, 2, 0.8, 0.5],
    }
    df_xcats2 = pd.DataFrame(index=xcats, columns=cols)
    for xcat, row in xcat_rows.items():
        df_xcats2.loc[xcat] = row

    dfd2 = make_qdf(df_cids2, df_xcats2, back_ar=0.75)
    dfd2["grading"] = np.ones(dfd2.shape[0])

    # Date ranges handed to reduce_df through its `blacklist` argument.
    black = {
        "GBP": ["2009-01-01", "2012-06-30"],
        "CAD": ["2018-01-01", "2100-01-01"],
    }
    dfd2 = msm.reduce_df(df=dfd2, cids=cids, xcats=xcats, blacklist=black)

    # Reshape to wide form: (cid, real_date) rows, one column per category.
    dfd2 = dfd2.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    X2 = dfd2.drop(columns=["XR"])
    y2 = dfd2["XR"]

    # 1) Demonstration of panel_cv_scores
    splitex = ExpandingKFoldPanelSplit(n_splits=100)
    models = {"OLS": LinearRegression(), "Lasso": Lasso(alpha=0.05)}

    # Error metrics are wrapped with greater_is_better=False; the remaining
    # scorers use make_scorer's defaults.
    error_functions = {
        "neg_rmse": root_mean_squared_error,
        "neg_mae": mean_absolute_error,
        "neg_mape": mean_absolute_percentage_error,
    }
    default_functions = {
        "acc": msl.regression_accuracy,
        "bac": msl.regression_balanced_accuracy,
        "map": msl.panel_significance_probability,
        "sharpe": msl.sharpe_ratio,
        "sortino": msl.sortino_ratio,
    }
    metrics = {
        name: make_scorer(function, greater_is_better=False)
        for name, function in error_functions.items()
    }
    metrics.update(
        {name: make_scorer(function) for name, function in default_functions.items()}
    )

    df_ev = panel_cv_scores(
        X2,
        y2,
        splitter=splitex,
        estimators=models,
        scoring=metrics,
        show_longbias=True,
        show_std=False,
        n_jobs=-1,
        verbose=1,
    )
    print(df_ev)