Source code for macrosynergy.learning.forecasting.bootstrap.base_modified_regressor

import numpy as np
import pandas as pd
import numbers

from sklearn.base import RegressorMixin, BaseEstimator, clone

from macrosynergy.learning.forecasting.bootstrap import BasePanelBootstrap

from abc import ABC


class BaseModifiedRegressor(BaseEstimator, RegressorMixin, BasePanelBootstrap, ABC):
    """
    Base class for regressors whose coefficients are scaled by standard errors.

    A wrapped `scikit-learn` style linear model is fitted, after which its
    intercept and coefficients are divided by estimated standard errors (plus a
    small offset) to form an auxiliary factor model. `predict` uses the
    unadjusted model whilst `create_signal` uses the adjusted coefficients.
    """

    def __init__(
        self,
        model,
        method,
        error_offset=1e-5,
        bootstrap_method="panel",
        bootstrap_iters=100,
        resample_ratio=1,
        max_features=None,
        analytic_method=None,
    ):
        """
        Modified linear regression model. Estimated coefficients are divided by
        estimated standard errors to form an auxiliary factor model.

        Parameters
        ----------
        model : RegressorMixin
            Underlying linear regression model to be modified to account for
            statistical precision of parameter estimates. This model must have
            `coef_` and `intercept_` attributes, in accordance with `scikit-learn`
            convention.
        method : str
            Method to modify coefficients. Accepted values are "analytic" or
            "bootstrap".
        error_offset : float, default = 1e-5
            Small offset to add to estimated standard errors in order to prevent
            small denominators during the coefficient adjustment.
        bootstrap_method : str, default = "panel"
            Method used to modify coefficients, when `method = bootstrap`.
            Accepted values are "panel", "period", "cross", "cross_per_period",
            "period_per_cross". Forwarded to the `BasePanelBootstrap` constructor.
        bootstrap_iters : int, default = 100
            Number of bootstrap iterations to determine standard errors, used only
            when `method = bootstrap`.
        resample_ratio : numbers.Number, default = 1
            Ratio of resampling units in each bootstrap dataset, used only when
            `method = bootstrap`. This is a fraction of the quantity of the panel
            component to be resampled. Forwarded to the `BasePanelBootstrap`
            constructor.
        max_features : str or int or float, default = None
            Number of features to consider in each bootstrap dataset. This is used
            to increase the amount of variation in bootstrap datasets. Accepted
            values are "sqrt", "log2", an integer number of features and a
            floating point proportion of features. Forwarded to the
            `BasePanelBootstrap` constructor.
            NOTE(review): the original documentation states that the "default
            behaviour is to raise a NotImplementedError"; that logic would live
            in `BasePanelBootstrap`, which is not visible here - confirm.
        analytic_method : str, default = None
            The analytic method used to determine standard errors. This parameter
            is passed into `adjust_analytical_se`, which should be implemented by
            the user if analytical, model-specific, expressions are required. In
            this base class `adjust_analytical_se` raises a NotImplementedError.

        Notes
        -----
        Parametric regression models are fit by finding optimal parameters that
        minimize a loss function. In the frequentist statistics framework, "true"
        population-wide values exist for these parameters, which can only be
        estimated from sampled data. Consequently, our parameter estimates can be
        considered to be realizations from a random variable, and hence subject to
        sampling variation. Broadly speaking, the greater the amount of independent
        data sampled, the smaller the variation in parameter estimates. In other
        words, parameter estimates are more unreliable when less data is seen
        during training. By estimating the standard deviation of their sampling
        distributions - a.k.a. their "standard errors" - we can adjust our model
        coefficients to account for lack of statistical precision.

        In our modified parametric regression models, each estimated parameter is
        divided by the estimated standard error (plus an offset). This means that
        greater volatility in a parameter estimate due to lack of data is accounted
        for by reducing the magnitude of this estimate, whilst greater certainty in
        the precision of the estimate is reflected by inflating a regression
        coefficient.

        Use of this class is only recommended for linear models, since these
        regression models are interpretable and the coefficient adjustment can
        accordingly be interpreted as increasing the relevance of factors whose
        coefficients we are more confident in, and decreasing relevance for factors
        whose coefficients we are less confident in. For a more complex function,
        for instance a neural network, amending model coefficients can be
        disastrous; it would be unclear how such adjustment would affect the
        downstream performance of the model. As a consequence, this class should be
        used with care and we recommend its use for linear models only.
        """
        # Bootstrap-related settings are owned by the panel bootstrap base class.
        super().__init__(
            bootstrap_method=bootstrap_method,
            resample_ratio=resample_ratio,
            max_features=max_features,
        )
        # Checks
        self._check_init_params(
            model=model,
            method=method,
            error_offset=error_offset,
            bootstrap_iters=bootstrap_iters,
            analytic_method=analytic_method,
        )

        # Set attributes
        self.model = model
        self.method = method
        self.error_offset = error_offset
        self.bootstrap_iters = bootstrap_iters
        self.analytic_method = analytic_method

    def _check_init_params(
        self,
        model,
        method,
        error_offset,
        bootstrap_iters,
        analytic_method,
    ):
        """
        Constructor parameter checks.

        Parameters
        ----------
        model : RegressorMixin
            Underlying linear regression model to be modified to account for
            statistical precision of parameter estimates. This model must have
            `coef_` and `intercept_` attributes, in accordance with `scikit-learn`
            convention.
        method : str
            Method to modify coefficients. Accepted values are "analytic" or
            "bootstrap".
        error_offset : float
            Small offset to add to estimated standard errors in order to prevent
            small denominators during the coefficient adjustment.
        bootstrap_iters : int
            Number of bootstrap iterations to determine standard errors. Only
            validated when `method = bootstrap`.
        analytic_method : str or None
            The analytic method used to determine standard errors. Only validated
            when `method = analytic`. This parameter is passed into
            `adjust_analytical_se`, which should be implemented by the user if
            analytical, model-specific, expressions are required.

        Raises
        ------
        TypeError
            If an argument has an unsupported type.
        ValueError
            If an argument has a supported type but an invalid value.
        """
        # model
        if not isinstance(model, BaseEstimator):
            raise TypeError("model must be a valid `scikit-learn` estimator.")
        if not isinstance(model, RegressorMixin):
            raise TypeError("model must be a valid `scikit-learn` regressor.")

        # method
        if not isinstance(method, str):
            raise TypeError("method must be a string.")
        if method not in ["analytic", "bootstrap"]:
            raise ValueError("method must be either 'analytic' or 'bootstrap'.")

        # error_offset
        if not isinstance(error_offset, numbers.Number):
            raise TypeError("error_offset must be a float or an integer.")
        if error_offset <= 0:
            raise ValueError("error_offset must be greater than 0.")

        # bootstrap_iters is only relevant to the bootstrap method
        if method == "bootstrap":
            if not isinstance(bootstrap_iters, numbers.Integral):
                raise TypeError("bootstrap_iters must be an integer.")
            if bootstrap_iters <= 0:
                raise ValueError("bootstrap_iters must be a positive integer.")

        # analytic_method is only relevant to the analytic method
        if method == "analytic":
            if analytic_method is not None:
                if not isinstance(analytic_method, str):
                    raise TypeError("analytic_method must be a string.")

    def fit(
        self,
        X,
        y,
    ):
        """
        Fit a linear model and modify coefficients based on standard errors.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Target vector associated with each sample in X.

        Returns
        -------
        self
            Fitted estimator.

        Raises
        ------
        AttributeError
            If the fitted underlying model lacks a `coef_` or `intercept_`
            attribute.
        """
        # Checks
        self._check_fit_params(X=X, y=y)

        # Fit the underlying (unadjusted) model on the full dataset
        self.model.fit(X, y)

        if not hasattr(self.model, "coef_"):
            raise AttributeError("The underlying model must have a `coef_` attribute.")
        if not hasattr(self.model, "intercept_"):
            raise AttributeError(
                "The underlying model must have an `intercept_` attribute."
            )

        # Modify coefficients
        if self.method == "analytic":
            self.intercept_, self.coef_ = self.adjust_analytical_se(
                self.model,
                X,
                y,
                self.analytic_method,
            )
        elif self.method == "bootstrap":
            # Refit a clone on each bootstrap dataset so that the model fitted on
            # the full dataset (used by `predict`) is left untouched.
            model = clone(self.model)
            self.intercept_, self.coef_ = self.adjust_bootstrap_se(
                model,
                X,
                y,
            )

        return self

    def predict(
        self,
        X,
    ):
        """
        Predict using the unadjusted linear model.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.

        Returns
        -------
        np.ndarray or pd.Series
            Predicted values, as returned by the underlying model's `predict`.
        """
        # Checks
        self._check_predict_params(X)

        return self.model.predict(X)

    def create_signal(
        self,
        X,
    ):
        """
        Predict using the coefficient-adjusted linear model.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.

        Returns
        -------
        np.ndarray
            Signal from the adjusted factor model based on X, computed as
            `np.dot(X, coef_) + intercept_`.

        Raises
        ------
        AttributeError
            If the estimator has not been fitted yet.

        Notes
        -----
        We define an additional `create_signal` method instead of using the
        `predict` method in order to not interfere with hyperparameter searches
        with standard metrics. Moreover, outputs from the adjusted factor model
        are not valid predictions, but are valid trading signals.
        """
        # Checks
        self._check_predict_params(X)

        # Adjusted coefficients only exist after `fit`; fail with a clear message
        # instead of a bare attribute lookup error.
        if not (hasattr(self, "coef_") and hasattr(self, "intercept_")):
            raise AttributeError(
                "This estimator has not been fitted yet. Call `fit` before "
                "`create_signal`."
            )

        return np.dot(X, self.coef_) + self.intercept_

    def adjust_bootstrap_se(
        self,
        model,
        X,
        y,
    ):
        """
        Adjust the coefficients of the linear model by bootstrap standard errors.

        Parameters
        ----------
        model : RegressorMixin
            Model that is refitted on each bootstrap dataset. `fit` passes a
            clone of the underlying model here.
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Target vector associated with each sample in X.

        Returns
        -------
        intercept : float
            Intercept of `self.model` divided by its bootstrap standard error
            plus `error_offset`.
        coef : np.ndarray
            One-dimensional array of the coefficients of `self.model`, each
            divided by its bootstrap standard error plus `error_offset`.

        Notes
        -----
        Standard errors are population standard deviations (`ddof=0`) of the
        estimates obtained over `bootstrap_iters` bootstrap datasets.
        `create_bootstrap_dataset` is not defined in this class; presumably it
        is inherited from `BasePanelBootstrap` - confirm.
        """
        # Create storage for bootstrap coefficients and intercepts
        bootstrap_coefs = np.zeros((self.bootstrap_iters, X.shape[1]))
        bootstrap_intercepts = np.zeros(self.bootstrap_iters)

        # Bootstrap loop
        for i in range(self.bootstrap_iters):
            X_resampled, y_resampled = self.create_bootstrap_dataset(X, y)
            model.fit(X_resampled, y_resampled)
            # A single-column dataframe target makes scikit-learn linear models
            # expose `coef_` with shape (1, n_features) and `intercept_` with
            # shape (1,); flatten so both target layouts are handled alike.
            bootstrap_coefs[i] = np.ravel(model.coef_)
            bootstrap_intercepts[i] = np.ravel(model.intercept_)[0]

        # Calculate standard errors
        coef_se = np.std(bootstrap_coefs, axis=0, ddof=0)
        intercept_se = np.std(bootstrap_intercepts, ddof=0)

        # Adjust the full-sample estimates by the standard errors. Flattening
        # keeps `coef` one-dimensional so that `np.dot(X, coef)` in
        # `create_signal` is well-defined.
        coef = np.ravel(self.model.coef_) / (coef_se + self.error_offset)
        intercept = np.ravel(self.model.intercept_)[0] / (
            intercept_se + self.error_offset
        )

        return intercept, coef

    def adjust_analytical_se(
        self,
        model,
        X,
        y,
        analytic_method,
    ):
        """
        Adjust the coefficients of the linear model by an analytical standard
        error formula.

        Parameters
        ----------
        model : RegressorMixin
            The underlying linear model to be modified.
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Target vector associated with each sample in X.
        analytic_method : str
            The analytic method used to calculate standard errors.

        Returns
        -------
        intercept : float
            Adjusted intercept.
        coef : np.ndarray
            Adjusted coefficients.

        Raises
        ------
        NotImplementedError
            Always, in this base class.

        Notes
        -----
        Analytical standard errors are model-specific, meaning that they must be
        implemented in a subclass of BaseModifiedRegressor.
        """
        raise NotImplementedError(
            "Analytical standard error adjustments are not available for most "
            "models. This function must be implemented in a subclass of "
            "BaseModifiedRegressor if known standard error expressions are "
            "available."
        )

    def _check_predict_params(
        self,
        X,
    ):
        """
        Check validity of a feature matrix passed to `predict` or `create_signal`.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.

        Raises
        ------
        TypeError
            If X is not a dataframe, if its index levels do not have the
            expected dtypes or if a column is not numeric.
        ValueError
            If X is not multi-indexed or contains missing values.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "Input feature matrix must be a pandas dataframe. "
                "If used as part of an sklearn pipeline, ensure that previous steps "
                "return a pandas dataframe."
            )
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("X must be multi-indexed.")
        if not X.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of X must be strings.")
        if not X.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of X must be datetime.date.")
        if not X.apply(lambda x: pd.api.types.is_numeric_dtype(x)).all():
            raise TypeError("All columns in X must be numeric.")
        if X.isnull().values.any():
            raise ValueError("X must not contain missing values.")

    def _check_fit_params(
        self,
        X,
        y,
    ):
        """
        Check parameter validity for the fit method.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.DataFrame or pd.Series
            Target vector associated with each sample in X.

        Raises
        ------
        TypeError
            If X or y has an unsupported type, if index levels do not have the
            expected dtypes or if X or y is not numeric.
        ValueError
            If y is a dataframe with more than one column, if X or y is not
            multi-indexed, if their indices differ or if either contains
            missing values.
        """
        # Container types
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "Input feature matrix must be a pandas dataframe. "
                "If used as part of an sklearn pipeline, ensure that previous steps "
                "return a pandas dataframe."
            )
        if not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame)):
            raise TypeError(
                "Target vector must be a pandas series or dataframe. "
                "If used as part of an sklearn pipeline, ensure that previous steps "
                "return a pandas series or dataframe."
            )
        if isinstance(y, pd.DataFrame) and y.shape[1] != 1:
            raise ValueError(
                "The target dataframe must have only one column. If used as part of "
                "an sklearn pipeline, ensure that previous steps return a pandas "
                "series or dataframe."
            )

        # Index structure
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("X must be multi-indexed.")
        if not isinstance(y.index, pd.MultiIndex):
            raise ValueError("y must be multi-indexed.")
        if not X.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of X must be strings.")
        if not X.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of X must be datetime.date.")
        if not y.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of y must be strings.")
        if not y.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of y must be datetime.date.")
        if not X.index.equals(y.index):
            raise ValueError(
                "The indices of the input dataframe X and the output dataframe y don't "
                "match."
            )

        # Numeric content
        if not X.apply(lambda x: pd.api.types.is_numeric_dtype(x)).all():
            raise TypeError("All columns in X must be numeric.")
        if isinstance(y, pd.DataFrame):
            if not pd.api.types.is_numeric_dtype(y.iloc[:, 0]):
                raise TypeError("All columns in y must be numeric.")
        else:
            if not pd.api.types.is_numeric_dtype(y):
                raise TypeError("All columns in y must be numeric.")

        # Missing values
        if X.isnull().values.any():
            raise ValueError("X must not contain missing values.")
        if y.isnull().values.any():
            raise ValueError("y must not contain missing values.")