Source code for macrosynergy.learning.forecasting.weighted_regressors

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, RegressorMixin

import numbers


class BaseWeightedRegressor(BaseEstimator, RegressorMixin):
    def __init__(
        self,
        model,
        sign_weighted=False,
        time_weighted=False,
        half_life=12 * 21,
    ):
        """
        Weighted regression model to prioritize contribution from certain samples over
        others during training.

        Parameters
        ----------
        model : RegressorMixin
            The underlying model to be trained with weighted sample contributions.
        sign_weighted : bool, optional
            Flag to weight samples based on the frequency of the label's sign in a
            training set.
        time_weighted : bool, optional
            Flag to weight samples based on the recency of the sample.
        half_life : numbers.Number or None, optional
            Half-life of the exponential decay function used to calculate the time
            weights. This should be expressed in units of the native dataset
            frequency. Default is 12 * 21, which corresponds to a half-life of 1-year
            for a daily dataset. The value is only used when `time_weighted` is True;
            otherwise it is stored as None, and None is accepted as an input.

        Raises
        ------
        TypeError
            If `model` is not an sklearn regressor, if either flag is not a boolean,
            or if `half_life` is not a number while time weighting is requested.
        ValueError
            If `half_life` is not strictly positive.

        Notes
        -----
        Sign-weighted regression models are useful when the dependent (return)
        variable has a directional bias. By assigning higher weights to the less
        frequent class, the model is encouraged to learn equally from both positive
        and negative return samples, irrespective of class imbalance.

        Time-weighted regression models are useful when the practitioner holds the
        prior that more recent samples are more informative than older samples. By
        assigning exponentially decaying weights to samples based on their recency,
        the model is encouraged to prioritize newer information.
        """
        # Checks
        self._check_init_params(model, sign_weighted, time_weighted, half_life)

        # Attributes
        self.model = model
        self.sign_weighted = sign_weighted
        self.time_weighted = time_weighted
        # The half-life has no meaning without time weighting, so it is blanked out.
        self.half_life = half_life if time_weighted else None

    def fit(
        self,
        X,
        y,
    ):
        """
        Learn optimal weighted regression parameters.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Pandas dataframe or numpy array of input features.
        y : pd.Series or pd.DataFrame or np.ndarray
            Pandas series, dataframe or numpy array of targets associated with each
            sample in X.

        Returns
        -------
        self : BaseWeightedRegressor
            The fitted estimator. After fitting, `n` and `p` hold the number of
            samples and features, `sample_weights` holds the weights passed to the
            underlying model, and `coef_` / `intercept_` mirror the underlying
            model's attributes when it exposes them.
        """
        # Checks
        self._check_fit_params(X, y)

        # Fit
        self.n = X.shape[0]
        self.p = X.shape[1]
        self.sample_weights = self._calculate_sample_weights(y)
        self.model.fit(X, y, sample_weight=self.sample_weights)

        # Expose the fitted parameters of the wrapped model, where available.
        if hasattr(self.model, "coef_"):
            self.coef_ = self.model.coef_
        if hasattr(self.model, "intercept_"):
            self.intercept_ = self.model.intercept_

        return self

    def predict(self, X):
        """
        Predict dependent variable using the fitted weighted regression model.

        Parameters
        ----------
        X : pd.DataFrame or numpy array
            Input feature matrix.

        Returns
        -------
        y_pred : np.ndarray
            Numpy array of predictions.

        Raises
        ------
        TypeError
            If X is neither a dataframe nor a numpy array, or if time weighting is
            enabled and X is not a dataframe.
        ValueError
            If X is a numpy array that is not two-dimensional, or if the number of
            columns differs from the number seen in `fit`.
        """
        # Checks
        if not isinstance(X, (pd.DataFrame, np.ndarray)):
            raise TypeError(
                "Input feature matrix for weighted regressors must be either a pandas "
                "dataframe or numpy array. If used as part of an sklearn pipeline, ensure "
                "that previous steps return a pandas dataframe or numpy array."
            )
        if self.time_weighted and not isinstance(X, pd.DataFrame):
            raise TypeError(
                "When time weighting is enabled, the input feature matrix for weighted "
                "regressors must be a pandas dataframe."
            )
        if isinstance(X, np.ndarray):
            if X.ndim != 2:
                raise ValueError(
                    "When the input feature matrix for weighted regressor forecasts is a "
                    "numpy array, it must have two dimensions. If used as part of an "
                    "sklearn pipeline, ensure that previous steps return a two-dimensional "
                    "data structure."
                )
        if X.shape[1] != self.p:
            raise ValueError(
                "The number of features in the input feature matrix must match the number "
                "seen in training."
            )

        # Predict
        return self.model.predict(X)

    def _calculate_sample_weights(
        self,
        y,
    ):
        """
        Determine sample weights based on the sign and time weighting flags.

        Parameters
        ----------
        y : pd.Series or pd.DataFrame or np.ndarray
            Target vector associated with each sample in X.

        Returns
        -------
        sample_weights : np.ndarray
            One weight per sample: the product of sign and time weights when both
            flags are set, the single relevant weighting when only one is set, and
            a vector of ones when neither is set.
        """
        if self.sign_weighted and self.time_weighted:
            sign_weights = self._calculate_sign_weights(y)
            time_weights = self._calculate_time_weights(y)
            return sign_weights * time_weights
        elif self.sign_weighted:
            return self._calculate_sign_weights(y)
        elif self.time_weighted:
            return self._calculate_time_weights(y)
        else:
            return np.ones(y.shape[0])

    def _calculate_sign_weights(
        self,
        targets,
    ):
        """
        Calculate balanced inverse frequency weights for positive and negative signs
        in the target vector.

        Parameters
        ----------
        targets : pd.Series or pd.DataFrame or np.ndarray
            Dependent variable. A dataframe is expected to hold a single column.

        Returns
        -------
        sample_weights : np.ndarray
            One-dimensional numpy array of sample weights.
        """
        # Flatten first so that a single-column dataframe yields one weight per
        # sample (a 1-D array) instead of an (n, 1) array.
        values = np.asarray(targets).reshape(-1)
        n_samples = values.shape[0]

        non_negative = values >= 0
        pos_sum = np.sum(non_negative)
        neg_sum = np.sum(values < 0)

        # Each sign class receives total weight n / 2; an absent class gets weight 0.
        pos_weight = n_samples / (2 * pos_sum) if pos_sum > 0 else 0
        neg_weight = n_samples / (2 * neg_sum) if neg_sum > 0 else 0

        sample_weights = np.where(non_negative, pos_weight, neg_weight)

        return sample_weights

    def _calculate_time_weights(
        self,
        targets,
    ):
        """
        Calculate exponentially decaying weights based on the recency of the sample
        in the panel.

        Parameters
        ----------
        targets : pd.Series or pd.DataFrame
            Dependent variable. The dates are read from the second level of its
            index, so a multi-index with at least two levels is required.

        Returns
        -------
        sample_weights : np.ndarray
            Numpy array of sample weights.
        """
        # Most recent date first: it receives weight 1, and each step back in the
        # ordered list of unique dates decays the weight by 2 ** (-1 / half_life).
        dates = sorted(targets.index.get_level_values(1).unique(), reverse=True)
        num_dates = len(dates)
        weights = np.power(2, -np.arange(num_dates) / self.half_life)

        weight_map = dict(zip(dates, weights))
        sample_weights = targets.index.get_level_values(1).map(weight_map).to_numpy()

        return sample_weights

    def _check_init_params(
        self,
        model,
        sign_weighted,
        time_weighted,
        half_life,
    ):
        """
        Checks for constructor parameters.

        Raises
        ------
        TypeError
            If `model` is not an sklearn regressor, if a flag is not a boolean, or
            if `half_life` is not a number when one is required.
        ValueError
            If `half_life` is not strictly positive.
        """
        if not isinstance(model, RegressorMixin):
            raise TypeError(
                "The model parameter must be an instance of a sklearn regressor."
            )
        if not isinstance(sign_weighted, bool):
            raise TypeError("The sign_weighted parameter must be a boolean.")
        if not isinstance(time_weighted, bool):
            raise TypeError("The time_weighted parameter must be a boolean.")

        # The constructor stores half_life=None when time weighting is disabled.
        # Accepting None in that case lets the parameters reported by get_params()
        # be fed back into the constructor (as sklearn's clone does).
        if half_life is None and not time_weighted:
            return

        if not isinstance(half_life, numbers.Number) or isinstance(half_life, bool):
            raise TypeError("The half_life parameter must be a number.")
        if half_life <= 0:
            raise ValueError("The half_life parameter must be a positive number.")

    def _check_fit_params(
        self,
        X,
        y,
    ):
        """
        Checks for fit method parameters.

        Raises
        ------
        TypeError
            If X or y has an unsupported type, or is not a pandas object while time
            weighting is enabled.
        ValueError
            If X or y has the wrong dimensionality, if their sample counts differ,
            or if their indices differ while time weighting is enabled.
        """
        # X
        if not isinstance(X, (pd.DataFrame, np.ndarray)):
            raise TypeError(
                "Input feature matrix for weighted regressors must be either a pandas "
                "dataframe or numpy array."
            )
        if self.time_weighted and not isinstance(X, pd.DataFrame):
            raise TypeError(
                "When time weighting is enabled, the input feature matrix for weighted "
                "regressors must be a pandas dataframe."
            )
        if isinstance(X, np.ndarray):
            if X.ndim != 2:
                raise ValueError(
                    "When the input feature matrix for weighted regressor forecasts is a "
                    "numpy array, it must have two dimensions."
                )

        # y
        if not isinstance(y, (pd.Series, pd.DataFrame, np.ndarray)):
            raise TypeError(
                "Target vector for weighted regressors must be either a pandas series, "
                "dataframe or numpy array."
            )
        if self.time_weighted and not isinstance(y, (pd.Series, pd.DataFrame)):
            raise TypeError(
                "When time weighting is enabled, the target vector for weighted regressors "
                "must be either a pandas series or pandas dataframe."
            )
        if isinstance(y, pd.DataFrame):
            if y.shape[1] != 1:
                raise ValueError(
                    "The dependent variable dataframe must have only one column. If used "
                    "as part of an sklearn pipeline, ensure that previous steps return "
                    "a pandas series or dataframe."
                )
        if isinstance(y, np.ndarray):
            if y.ndim != 1:
                raise ValueError(
                    "When the target vector for weighted regressor forecasts is a numpy "
                    "array, it must have one dimension."
                )

        # Joint X and y checks
        if len(X) != len(y):
            raise ValueError(
                "The number of samples in the input feature matrix must match the number "
                "of samples in the target vector."
            )
        if self.time_weighted:
            if not y.index.equals(X.index):
                raise ValueError(
                    "When time weighting is enabled, the target vector and input feature "
                    "matrix must have the same index."
                )
class SignWeightedRegressor(BaseWeightedRegressor):
    def __init__(self, model):
        """
        Regressor whose training samples are weighted by the sign of their label.

        Parameters
        ----------
        model : RegressorMixin
            The underlying regression model to be trained with weighted samples.

        Notes
        -----
        Weighting each training sample by the sign of its label encourages the
        model to learn equally from positive and negative return samples,
        irrespective of class imbalance.

        When positive targets outnumber negative targets in the training set, the
        negative target samples receive the higher weight during training. When
        negative targets outnumber positive ones, the positive target samples
        receive the higher weight instead.
        """
        # Sign weighting only: time weighting is switched off explicitly.
        super().__init__(
            model=model,
            sign_weighted=True,
            time_weighted=False,
        )
class TimeWeightedRegressor(BaseWeightedRegressor):
    def __init__(self, model, half_life):
        """
        Regressor whose training samples are weighted by their recency.

        Parameters
        ----------
        model : RegressorMixin
            The underlying regression model to be trained with weighted samples.
        half_life : numbers.Number
            Half-life of the exponential decay function used to calculate the time
            weights.

        Notes
        -----
        Weighting each training sample by its timestamp encourages the model to
        prioritise newer information.

        The half-life is the number of time periods, in units of the native data
        frequency, over which the weight attributed to the most recent sample
        decays by half.
        """
        # Time weighting only: sign weighting is switched off explicitly.
        super().__init__(
            model=model,
            sign_weighted=False,
            time_weighted=True,
            half_life=half_life,
        )