# Source code for macrosynergy.learning.forecasting.model_systems.regressor_systems

import numpy as np
import pandas as pd
import scipy.stats as stats
from macrosynergy.learning.forecasting.model_systems import BaseRegressionSystem
from sklearn.linear_model import LinearRegression, Ridge

from macrosynergy.learning.forecasting import LADRegressor


class LinearRegressionSystem(BaseRegressionSystem):
    """
    Cross-sectional system of linear regression models for panel data.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to fit an intercept for each regression.
    positive : bool, default=False
        Whether to enforce positive coefficients for each regression.
    roll : int or str, default = "full"
        The lookback of the rolling window for the regression. Either an integer
        number of periods of the native dataset frequency, or the string
        `roll = 'full'` to use the entire available history.
    min_xs_samples : int, default=2
        The minimum number of samples required in each cross-section training set
        for a regression model to be fitted on that cross-section. If `data_freq`
        is None or 'unadjusted', this is expressed in units of the underlying
        dataset frequency. Otherwise it is expressed in units of the frequency
        given by `data_freq`.
    data_freq : str, optional
        Training set data frequency for resampling. Primarily used for market beta
        estimation in the `BetaEstimator` class in `macrosynergy.learning`, where
        it allows the underlying data frequency to be cross-validated. Accepted
        strings are 'unadjusted' (native dataset frequency), 'W' (weekly),
        'M' (monthly) and 'Q' (quarterly). It is recommended to use 'W', 'M' or
        'Q' only when the native dataset frequency is greater.

    Notes
    -----
    A separate regression model is fitted for each cross-section, but evaluation
    is performed over the panel. A hyperparameter search therefore selects one
    set of hyperparameters for all cross-sections, while the fitted model
    parameters may differ across cross-sections.

    This estimator is primarily intended for market beta estimation, but can
    plausibly be used for return forecasting or other downstream tasks. The
    `data_freq` parameter is particularly intended for cross-validating market
    beta estimation models.
    """

    def __init__(
        self,
        fit_intercept=True,
        positive=False,
        roll="full",
        min_xs_samples=2,
        data_freq=None,
    ):
        # Window and resampling settings are passed on to the parent constructor.
        super().__init__(roll=roll, data_freq=data_freq, min_xs_samples=min_xs_samples)

        # Validate the model-specific hyperparameters before storing them.
        self._check_init_params(fit_intercept, positive)
        self.fit_intercept = fit_intercept
        self.positive = positive

        # Fitted quantities, keyed by cross-section identifier.
        self.coefs_ = {}
        self.intercepts_ = {}

    def create_model(self):
        """
        Instantiate a linear regression model.

        Returns
        -------
        LinearRegression
            An unfitted linear regression model configured with this system's
            `fit_intercept` and `positive` settings.
        """
        model_kwargs = {
            "fit_intercept": self.fit_intercept,
            "positive": self.positive,
        }
        return LinearRegression(**model_kwargs)

    def store_model_info(self, section, model):
        """
        Record the fitted parameters of a cross-section's linear regression.

        Parameters
        ----------
        section : str
            The cross-section identifier.
        model : LinearRegression
            The fitted linear regression model.

        Notes
        -----
        Only the first entry of `model.coef_` is stored in `coefs_`; the
        intercept is stored as-is in `intercepts_`.
        """
        first_coef = model.coef_[0]
        self.coefs_[section] = first_coef
        self.intercepts_[section] = model.intercept_

    def _check_init_params(
        self,
        fit_intercept,
        positive,
    ):
        """
        Parameter checks for the LinearRegressionSystem constructor.

        Parameters
        ----------
        fit_intercept : bool
            Whether to fit an intercept for each regression.
        positive : bool
            Whether to enforce positive coefficients for each regression.

        Raises
        ------
        TypeError
            If either argument is not a boolean.
        """
        # Checked in declaration order so the first offending argument is reported.
        flags = (("fit_intercept", fit_intercept), ("positive", positive))
        for flag_name, flag_value in flags:
            if not isinstance(flag_value, bool):
                raise TypeError(f"{flag_name} must be a boolean.")
class LADRegressionSystem(BaseRegressionSystem):
    """
    Cross-sectional system of LAD regression models.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to fit an intercept for each regression.
    positive : bool, default=False
        Whether to enforce positive coefficients for each regression.
    roll : int or str, default = "full"
        The lookback of the rolling window for the regression. Either an integer
        number of periods of the native dataset frequency, or the string
        `roll = 'full'` to use the entire available history.
    min_xs_samples : int, default=2
        The minimum number of samples required in each cross-section training set
        for a regression model to be fitted on that cross-section. If `data_freq`
        is None or 'unadjusted', this is expressed in units of the underlying
        dataset frequency. Otherwise it is expressed in units of the frequency
        given by `data_freq`.
    data_freq : str, optional
        Training set data frequency for resampling. Primarily used for market beta
        estimation in the `BetaEstimator` class in `macrosynergy.learning`, where
        it allows the underlying data frequency to be cross-validated. Accepted
        strings are 'unadjusted' (native dataset frequency), 'W' (weekly),
        'M' (monthly) and 'Q' (quarterly). It is recommended to use 'W', 'M' or
        'Q' only when the native dataset frequency is greater.

    Notes
    -----
    A separate regression model is fitted for each cross-section, but evaluation
    is performed over the panel. A hyperparameter search therefore selects one
    set of hyperparameters for all cross-sections, while the fitted model
    parameters may differ across cross-sections.

    This estimator is primarily intended for market beta estimation, but can
    plausibly be used for return forecasting or other downstream tasks. The
    `data_freq` parameter is particularly intended for cross-validating market
    beta estimation models.
    """

    def __init__(
        self,
        fit_intercept=True,
        positive=False,
        roll="full",
        min_xs_samples=2,
        data_freq=None,
    ):
        # Window and resampling settings are passed on to the parent constructor.
        super().__init__(roll=roll, data_freq=data_freq, min_xs_samples=min_xs_samples)

        # Validate the model-specific hyperparameters before storing them.
        self._check_init_params(fit_intercept, positive)
        self.fit_intercept = fit_intercept
        self.positive = positive

        # Fitted quantities, keyed by cross-section identifier.
        self.coefs_ = {}
        self.intercepts_ = {}

    def create_model(self):
        """
        Instantiate a LAD regression model.

        Returns
        -------
        LADRegressor
            An unfitted LAD regression model configured with this system's
            `fit_intercept` and `positive` settings.
        """
        model_kwargs = {
            "fit_intercept": self.fit_intercept,
            "positive": self.positive,
        }
        return LADRegressor(**model_kwargs)

    def store_model_info(self, section, model):
        """
        Record the fitted parameters of a cross-section's LAD regression.

        Parameters
        ----------
        section : str
            The cross-section identifier.
        model : LADRegressor
            The fitted LAD regression model.

        Notes
        -----
        Only the first entry of `model.coef_` is stored in `coefs_`; the
        intercept is stored as-is in `intercepts_`.
        """
        first_coef = model.coef_[0]
        self.coefs_[section] = first_coef
        self.intercepts_[section] = model.intercept_

    def _check_init_params(
        self,
        fit_intercept,
        positive,
    ):
        """
        Parameter checks for the LADRegressionSystem constructor.

        Parameters
        ----------
        fit_intercept : bool
            Whether to fit an intercept for each regression.
        positive : bool
            Whether to enforce positive coefficients for each regression.

        Raises
        ------
        TypeError
            If either argument is not a boolean.
        """
        # Checked in declaration order so the first offending argument is reported.
        flags = (("fit_intercept", fit_intercept), ("positive", positive))
        for flag_name, flag_value in flags:
            if not isinstance(flag_value, bool):
                raise TypeError(f"{flag_name} must be a boolean.")
class RidgeRegressionSystem(BaseRegressionSystem):
    """
    Cross-sectional system of ridge regression models for panel data.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to fit an intercept for each regression.
    positive : bool, default=False
        Whether to enforce positive coefficients for each regression.
        scikit-learn's `Ridge` only supports this with `solver='auto'` or
        `solver='lbfgs'`, so the default `solver='lsqr'` must be overridden when
        `positive=True`.
    alpha : float, default=1.0
        L2 regularization hyperparameter. Greater values specify stronger
        regularization.
    tol : float, default=1e-4
        The tolerance for termination.
    solver : str, default='lsqr'
        Solver to use in the computational routines. Options are 'auto', 'svd',
        'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga' and 'lbfgs'. In
        scikit-learn, 'lbfgs' can only be used together with `positive=True`.
    roll : int or str, default = "full"
        The lookback of the rolling window for the regression. Either an integer
        number of periods of the native dataset frequency, or the string
        `roll = 'full'` to use the entire available history.
    min_xs_samples : int, default=2
        The minimum number of samples required in each cross-section training set
        for a regression model to be fitted on that cross-section. If `data_freq`
        is None or 'unadjusted', this is expressed in units of the underlying
        dataset frequency. Otherwise it is expressed in units of the frequency
        given by `data_freq`.
    data_freq : str, optional
        Training set data frequency for resampling. Primarily used for market beta
        estimation in the `BetaEstimator` class in `macrosynergy.learning`, where
        it allows the underlying data frequency to be cross-validated. Accepted
        strings are 'unadjusted' (native dataset frequency), 'W' (weekly),
        'M' (monthly) and 'Q' (quarterly). It is recommended to use 'W', 'M' or
        'Q' only when the native dataset frequency is greater.

    Notes
    -----
    A separate regression model is fitted for each cross-section, but evaluation
    is performed over the panel. A hyperparameter search therefore selects one
    set of hyperparameters for all cross-sections, while the fitted model
    parameters may differ across cross-sections.

    This estimator is primarily intended for market beta estimation, but can
    plausibly be used for return forecasting or other downstream tasks. The
    `data_freq` parameter is particularly intended for cross-validating market
    beta estimation models.
    """

    def __init__(
        self,
        fit_intercept=True,
        positive=False,
        alpha=1.0,
        tol=1e-4,
        solver="lsqr",
        roll="full",
        min_xs_samples=2,
        data_freq=None,
    ):
        # Call the parent class constructor
        super().__init__(roll=roll, data_freq=data_freq, min_xs_samples=min_xs_samples)

        # Checks
        self._check_init_params(
            alpha,
            fit_intercept,
            positive,
            tol,
            solver,
        )

        # Additional attributes
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.positive = positive
        self.tol = tol
        self.solver = solver

        # Create data structures to store model information for each cross-section
        self.coefs_ = {}
        self.intercepts_ = {}

    def create_model(self):
        """
        Instantiate a ridge regression model.

        Returns
        -------
        Ridge
            An unfitted ridge regression model configured with this system's
            hyperparameters.
        """
        return Ridge(
            fit_intercept=self.fit_intercept,
            positive=self.positive,
            alpha=self.alpha,
            tol=self.tol,
            solver=self.solver,
        )

    def store_model_info(self, section, model):
        """
        Store the coefficient and intercept of a fitted ridge regression model.

        Parameters
        ----------
        section : str
            The cross-section identifier.
        model : Ridge
            The fitted ridge regression model.

        Notes
        -----
        Only the first entry of `model.coef_` is stored in `coefs_`; the
        intercept is stored as-is in `intercepts_`.
        """
        self.coefs_[section] = model.coef_[0]
        self.intercepts_[section] = model.intercept_

    def _check_init_params(
        self,
        alpha,
        fit_intercept,
        positive,
        tol,
        solver,
    ):
        """
        Parameter checks for the RidgeRegressionSystem constructor.

        Parameters
        ----------
        alpha : float
            L2 regularization hyperparameter. Greater values specify stronger
            regularization.
        fit_intercept : bool
            Whether to fit an intercept for each regression.
        positive : bool
            Whether to enforce positive coefficients for each regression.
        tol : float
            The tolerance for termination.
        solver : str
            Solver to use in the computational routines.

        Raises
        ------
        TypeError
            If an argument has the wrong type.
        ValueError
            If `alpha` is negative, `tol` is not positive, or `solver` is not a
            recognised solver name.

        Warns
        -----
        UserWarning
            If `positive` and `solver` form a combination that scikit-learn's
            `Ridge` rejects at fit time.
        """
        import warnings

        if not isinstance(alpha, (int, float)):
            raise TypeError("alpha must be either an integer or a float.")
        if alpha < 0:
            raise ValueError("alpha must be non-negative.")
        if not isinstance(fit_intercept, bool):
            raise TypeError("fit_intercept must be a boolean.")
        if not isinstance(positive, bool):
            raise TypeError("positive must be a boolean.")
        if not isinstance(tol, (int, float)):
            raise TypeError("tol must be either an integer or a float.")
        if tol <= 0:
            raise ValueError("tol must be a positive number.")
        if solver not in [
            "auto",
            "svd",
            "cholesky",
            "lsqr",
            "sparse_cg",
            "sag",
            "saga",
            "lbfgs",
        ]:
            raise ValueError(
                "solver must be one of 'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', "
                "'sag', 'saga' or 'lbfgs'."
            )

        # scikit-learn only rejects these combinations inside `Ridge.fit`. Warn
        # here (rather than raise) so that the problem is visible at construction
        # time while hyperparameters can still be adjusted later via `set_params`.
        if positive and solver not in ("auto", "lbfgs"):
            warnings.warn(
                f"solver='{solver}' does not support positive=True in scikit-learn's "
                "Ridge; fitting will fail unless solver is set to 'auto' or 'lbfgs'.",
                UserWarning,
                stacklevel=3,
            )
        elif solver == "lbfgs" and not positive:
            warnings.warn(
                "solver='lbfgs' can only be used with positive=True in scikit-learn's "
                "Ridge; fitting will fail unless positive=True or another solver is "
                "chosen.",
                UserWarning,
                stacklevel=3,
            )
class CorrelationVolatilitySystem(BaseRegressionSystem):
    """
    Cross-sectional system of moving average models to estimate correlation and
    volatility components of a macro beta separately over a panel of financial
    contracts.

    Parameters
    ----------
    correlation_lookback : int or str, default="full"
        The lookback of the rolling window for correlation estimation. If "full",
        the entire cross-sectional history is used. Otherwise, this should be an
        integer specified in the native dataset frequency. If `data_freq` is not
        None or 'unadjusted', it should be expressed in units of the frequency
        specified in `data_freq`.
    correlation_type : str, default='pearson'
        The type of correlation to be calculated. Accepted values are 'pearson',
        'kendall' and 'spearman'.
    volatility_lookback : int or str, default="full"
        The lookback of the rolling window for volatility estimation. If "full",
        the entire cross-sectional history is used. Otherwise, this should be an
        integer specified in the native dataset frequency. If `data_freq` is not
        None or 'unadjusted', it should be expressed in units of the frequency
        specified in `data_freq`.
    volatility_window_type : str, default='rolling'
        The type of window to use for the volatility calculation. Accepted values
        are 'rolling' and 'exponential'. Only consulted when
        `volatility_lookback` is not "full".
    min_xs_samples : int, default=2
        The minimum number of samples required in each cross-section training set
        for estimation on that cross-section. If `data_freq` is None or
        'unadjusted', this is expressed in units of the underlying dataset
        frequency. Otherwise it is expressed in units of the frequency given by
        `data_freq`.
    data_freq : str, optional
        Training set data frequency for resampling. Primarily used for market beta
        estimation in the `BetaEstimator` class in `macrosynergy.learning`, where
        it allows the underlying data frequency to be cross-validated. Accepted
        strings are 'unadjusted' (native dataset frequency), 'W' (weekly),
        'M' (monthly) and 'Q' (quarterly). It is recommended to use 'W', 'M' or
        'Q' only when the native dataset frequency is greater.

    Notes
    -----
    This class is specifically designed for market beta estimation based on the
    decomposition of the beta into correlation and volatility components in
    univariate analysis.

    Separate estimators are fit for each cross-section, but evaluation is
    performed over the panel. A hyperparameter search therefore selects one set
    of hyperparameters for all cross-sections, while the estimated betas may
    differ across cross-sections.
    """

    def __init__(
        self,
        correlation_lookback="full",
        correlation_type="pearson",
        volatility_lookback="full",
        volatility_window_type="rolling",
        min_xs_samples=2,
        data_freq=None,
    ):
        # Call the parent class constructor. The windowing is handled by the
        # lookback parameters of this class, so `roll` is fixed to "full".
        super().__init__(
            roll="full", data_freq=data_freq, min_xs_samples=min_xs_samples
        )

        # Additional checks
        self._check_init_params(
            correlation_lookback,
            correlation_type,
            volatility_lookback,
            volatility_window_type,
        )

        # Additional attributes
        self.correlation_lookback = correlation_lookback
        self.correlation_type = correlation_type
        self.volatility_lookback = volatility_lookback
        self.volatility_window_type = volatility_window_type

        # Create data structures to store the estimated betas for each cross-section
        self.coefs_ = {}

    def _fit_cross_section(self, section, X_section, y_section):
        """
        Fit correlation and volatility estimators on a single cross-section.

        The beta is computed as the correlation between the first column of
        `X_section` and `y_section`, scaled by the ratio of their standard
        deviations, and is stored via `store_model_info`.

        Parameters
        ----------
        section : str
            The identifier of the cross-section.
        X_section : pd.DataFrame
            Input feature matrix for the cross-section. Only the first column
            is used.
        y_section : pd.Series
            Target variable for the cross-section.
        """
        benchmark = X_section.values[:, 0]
        target = y_section.values

        # First estimate local correlation between the benchmark and contract returns
        if self.correlation_lookback == "full":
            corr = self._estimate_correlation(benchmark, target)
        else:
            corr = self._estimate_correlation(
                benchmark[-self.correlation_lookback :],
                target[-self.correlation_lookback :],
            )

        # Now estimate local standard deviations of both return series
        X_section_std, y_section_std = self._estimate_volatilities(benchmark, target)

        # Get beta estimate and store it
        beta = corr * (y_section_std / X_section_std)
        self.store_model_info(section, beta)

    def _estimate_correlation(self, x, y):
        """
        Compute the configured type of correlation between two 1-D arrays.

        Parameters
        ----------
        x : np.ndarray
            First series of observations.
        y : np.ndarray
            Second series of observations, aligned with `x`.

        Returns
        -------
        float
            The Pearson, Spearman or Kendall correlation, as selected by
            `self.correlation_type`.

        Raises
        ------
        ValueError
            If `self.correlation_type` is not a supported value.
        """
        if self.correlation_type == "pearson":
            return np.corrcoef(x, y)[0, 1]
        if self.correlation_type == "spearman":
            # Spearman is the Pearson correlation of the ranks. `rankdata`
            # assigns average ranks to ties, whereas a double argsort would
            # break ties arbitrarily by position.
            return np.corrcoef(stats.rankdata(x), stats.rankdata(y))[0, 1]
        if self.correlation_type == "kendall":
            return stats.kendalltau(x, y)[0]
        raise ValueError(
            "correlation_type must be one of 'pearson', 'kendall' or 'spearman'."
        )

    def _estimate_volatilities(self, x, y):
        """
        Compute sample standard deviations of two series under the configured window.

        Parameters
        ----------
        x : np.ndarray
            Benchmark return series.
        y : np.ndarray
            Contract return series, aligned with `x`.

        Returns
        -------
        tuple
            The pair `(x_std, y_std)`.

        Raises
        ------
        ValueError
            If `self.volatility_window_type` is not a supported value.
        """
        if self.volatility_lookback == "full":
            return x.std(ddof=1), y.std(ddof=1)

        if self.volatility_window_type == "rolling":
            window = self.volatility_lookback
            return x[-window:].std(ddof=1), y[-window:].std(ddof=1)

        if self.volatility_window_type == "exponential":
            # Exponentially decaying weights over the whole history, with the
            # most recent observation receiving weight one.
            alpha = 2 / (self.volatility_lookback + 1)
            weights = (1 - alpha) ** np.arange(len(x) - 1, -1, -1)
            x_std = np.sqrt(np.cov(x, aweights=weights))
            y_std = np.sqrt(np.cov(y, aweights=weights))
            return x_std, y_std

        raise ValueError(
            "volatility_window_type must be one of 'rolling' or 'exponential'."
        )

    def predict(
        self,
        X,
    ):
        """
        Make naive zero predictions over a panel dataset.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.

        Returns
        -------
        predictions : pd.Series
            Pandas series of zero predictions, multi-indexed by cross-section and
            date.

        Raises
        ------
        TypeError
            If `X` is not a DataFrame or its index levels have unexpected dtypes.
        ValueError
            If `X` is not multi-indexed, its columns differ from those seen in
            training, or it contains missing or non-numeric values.

        Notes
        -----
        This method outputs zero predictions for all cross-sections and dates,
        since the CorrelationVolatilitySystem is solely used for beta estimation
        and no forecasting is performed.
        """
        # Checks
        if not isinstance(X, pd.DataFrame):
            raise TypeError("The X argument must be a pandas DataFrame.")
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("X must be multi-indexed.")
        if not X.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of X must be strings.")
        if not X.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of X must be datetime.date.")
        # Compare the number of columns first: an element-wise comparison of
        # column labels of different lengths fails inside pandas before the
        # intended error message can be raised.
        if len(X.columns) != self.n_features_in_:
            raise ValueError(
                "The input feature matrix must have the same number of "
                "columns as the training feature matrix."
            )
        if not np.all(X.columns == self.feature_names_in_):
            raise ValueError(
                "The input feature matrix must have the same columns as the "
                "training feature matrix."
            )
        if X.isnull().values.any():
            raise ValueError(
                "The input feature matrix must not contain any missing values."
            )
        if not X.apply(lambda x: pd.api.types.is_numeric_dtype(x)).all():
            raise ValueError(
                "All columns in the input feature matrix for "
                "CorrelationVolatilitySystem must be numeric."
            )

        predictions = pd.Series(index=X.index, data=0)

        return predictions

    def store_model_info(self, section, beta):
        """
        Store the betas induced by the correlation and volatility estimators.

        Parameters
        ----------
        section : str
            The cross-section identifier.
        beta : numbers.Number
            The beta estimate for the associated cross-section.
        """
        self.coefs_[section] = beta

    def create_model(self):
        """
        Redundant method for the CorrelationVolatilitySystem class.

        Raises
        ------
        NotImplementedError
            Always, since no underlying model object is created by this class.
        """
        raise NotImplementedError("This method is not implemented for this class.")

    def _check_xs_dates(self, min_xs_samples, num_dates):
        """
        Cross-sectional availability check.

        Parameters
        ----------
        min_xs_samples : int
            The minimum number of samples required in each cross-section training
            set for correlation and volatility estimation.
        num_dates : int
            The number of unique dates in the cross-section.

        Returns
        -------
        bool
            True if the number of samples is sufficient, False otherwise.
        """
        if num_dates < min_xs_samples:
            return False
        # A finite correlation lookback requires strictly more available dates
        # than the lookback itself.
        if (
            self.correlation_lookback != "full"
            and num_dates <= self.correlation_lookback
        ):
            return False
        # The same requirement applies to a finite volatility lookback.
        if self.volatility_lookback != "full" and num_dates <= self.volatility_lookback:
            return False

        return True

    @staticmethod
    def _check_lookback(name, value):
        """
        Validate a lookback argument that must be 'full' or a positive integer.

        Parameters
        ----------
        name : str
            The argument name, used in error messages.
        value : int or str
            The argument value to validate.

        Raises
        ------
        TypeError
            If `value` is neither an integer nor a string.
        ValueError
            If `value` is a non-positive integer or a string other than 'full'.
        """
        if not isinstance(value, (int, str)):
            raise TypeError(f"{name} must be an integer or a string.")
        if isinstance(value, int) and value <= 0:
            raise ValueError(f"{name} must be a positive integer.")
        if isinstance(value, str) and value != "full":
            raise ValueError(f"{name} must be 'full' or a positive integer.")

    def _check_init_params(
        self,
        correlation_lookback,
        correlation_type,
        volatility_lookback,
        volatility_window_type,
    ):
        """
        Parameter checks for the CorrelationVolatilitySystem constructor.

        Parameters
        ----------
        correlation_lookback : int or str
            The lookback of the rolling window for correlation estimation.
        correlation_type : str
            The type of correlation to be calculated.
        volatility_lookback : int or str
            The lookback of the rolling window for volatility estimation.
        volatility_window_type : str
            The type of window to use for the volatility calculation.

        Raises
        ------
        TypeError
            If an argument has the wrong type.
        ValueError
            If an argument has an unsupported value.
        """
        self._check_lookback("correlation_lookback", correlation_lookback)
        if not isinstance(correlation_type, str):
            raise TypeError("correlation_type must be a string.")
        if correlation_type not in ["pearson", "kendall", "spearman"]:
            raise ValueError(
                "correlation_type must be one of 'pearson', 'kendall' or 'spearman'."
            )
        self._check_lookback("volatility_lookback", volatility_lookback)
        if not isinstance(volatility_window_type, str):
            raise TypeError("volatility_window_type must be a string.")
        if volatility_window_type not in ["rolling", "exponential"]:
            raise ValueError(
                "volatility_window_type must be one of 'rolling' or 'exponential'."
            )
if __name__ == "__main__":
    # Small demonstration: build a simulated unbalanced panel and fit a
    # regression system on it. The unused matplotlib import and the redundant
    # re-import of LinearRegression have been dropped.
    import macrosynergy.management as msm
    from macrosynergy.management import make_qdf

    np.random.seed(1)

    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "BENCH_XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Example: Unbalanced panel
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2012-01-01", "2020-12-31", 0, 1]
    df_cids.loc["CAD"] = ["2013-01-01", "2020-12-31", 0, 1]
    df_cids.loc["GBP"] = ["2010-01-01", "2020-12-31", 0, 1]
    df_cids.loc["USD"] = ["2010-01-01", "2020-12-31", 0, 1]

    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2010-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["BENCH_XR"] = ["2010-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["CRY"] = ["2010-01-01", "2020-12-31", 1, 2, 0.95, 1]
    df_xcats.loc["GROWTH"] = ["2010-01-01", "2020-12-31", 1, 2, 0.9, 1]
    df_xcats.loc["INFL"] = ["2010-01-01", "2020-12-31", 1, 2, 0.8, 0.5]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])
    black = {"GBP": ["2009-01-01", "2012-06-30"], "CAD": ["2018-01-01", "2100-01-01"]}
    dfd = msm.reduce_df(df=dfd, cids=cids, xcats=xcats, blacklist=black)

    # Reshape to a wide frame indexed by (cid, real_date) with one column per xcat.
    dfd = dfd.pivot(index=["cid", "real_date"], columns="xcat", values="value")

    # Demonstration of CorrelationVolatilitySystem usage
    # X2 = pd.DataFrame(dfd["BENCH_XR"])
    # y2 = dfd["XR"]
    # cv = CorrelationVolatilitySystem().fit(X2, y2)
    # print(cv.coefs_)
    #
    # # Demonstration of LinearRegressionSystem usage
    # X1 = dfd.drop(columns=["XR", "BENCH_XR"])
    # y1 = dfd["XR"]
    # lr = LinearRegressionSystem(roll=5).fit(X1, y1)
    # print(lr.coefs_)

    # Demonstration of LADRegressionSystem usage
    X1 = dfd.drop(columns=["XR", "BENCH_XR"])
    y1 = dfd["XR"]
    lr = LADRegressionSystem().fit(X1, y1)
    print(lr.coefs_)