# Source code for macrosynergy.learning.forecasting.model_systems.base_regression_system

import numpy as np
import pandas as pd

import datetime
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, RegressorMixin


class BaseRegressionSystem(BaseEstimator, RegressorMixin, ABC):
    """
    Abstract base class for systems of regressors, where a separate regression
    model is fitted on each cross-section of a panel.

    Subclasses must implement `create_model` and `store_model_info`.
    """

    def __init__(
        self,
        roll="full",
        min_xs_samples=2,
        data_freq=None,
    ):
        """
        Base class for systems of regressors.

        Parameters
        ----------
        roll : int or str, default = "full"
            The lookback of the rolling window for the regression. If "full",
            the entire cross-sectional history is used for each regression. Otherwise,
            an integer greater than one. If `data_freq` is None or "unadjusted", the
            integer is expressed in units of the native dataset frequency. Otherwise,
            it is expressed in units of the frequency given by `data_freq`.
        min_xs_samples : int, default=2
            The minimum number of samples required in each cross-section training set for
            a regression model to be fitted for that cross-section. If `data_freq` is None
            or "unadjusted", this parameter is specified in terms of the native dataset
            frequency. Otherwise, this parameter should be expressed in units of the
            frequency specified in the `data_freq` argument.
        data_freq : str, optional
            Training set data frequency. This is primarily to be used within the context
            of market beta estimation in the `BetaEstimator` class in
            `macrosynergy.learning`, allowing one to cross-validate the underlying data
            frequency for good beta estimation. Accepted strings are 'unadjusted', 'W' for
            weekly, 'M' for monthly and 'Q' for quarterly. It is recommended to set this
            parameter to "W", "M" or "Q" only when the native dataset frequency is greater.

        Raises
        ------
        TypeError
            If `roll` is neither a string nor an integer, if `min_xs_samples` is not
            an integer, or if `data_freq` is neither None nor a string.
        ValueError
            If `roll` is a string other than "full" or an integer not greater than 1,
            if `min_xs_samples` is less than 2, or if `data_freq` is not one of the
            accepted strings.

        Notes
        -----
        Systems of regressors are used to fit a different regression model on each
        cross-section of a panel. This is useful when one believes the within-group
        relationships are sufficiently different to warrant separate models, or when
        Simpson's paradox is a concern.

        A concern with this approach, however, is that the number of samples in each
        cross-section may be too small to fit a model. This is particularly true when
        dealing with low-frequency macro quantamental data.
        """
        # Checks
        if not isinstance(roll, (str, int)):
            raise TypeError("roll must be either a string or integer.")
        if isinstance(roll, str) and roll != "full":
            raise ValueError("roll must equal `full` when a string is specified.")
        if isinstance(roll, int) and roll <= 1:
            raise ValueError(
                "roll must be greater than 1 when an integer is specified."
            )

        if not isinstance(min_xs_samples, int):
            raise TypeError("The min_xs_samples argument must be an integer.")
        if min_xs_samples < 2:
            raise ValueError("The min_xs_samples argument must be at least 2.")

        if data_freq is not None:
            if not isinstance(data_freq, str):
                raise TypeError("The data_freq argument must be a string.")
            if data_freq not in ["unadjusted", "W", "M", "Q"]:
                raise ValueError(
                    "data_freq must be one of 'unadjusted', 'W', 'M' or 'Q'."
                )

        # Set attributes
        self.roll = roll
        self.data_freq = data_freq
        self.min_xs_samples = min_xs_samples

        # Mapping from cross-section identifier to fitted model; populated by `fit`.
        self.models_ = None

    def fit(
        self,
        X,
        y,
    ):
        """
        Fit a regression on each cross-section of a panel, subject to availability.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.Series, pd.DataFrame or np.ndarray
            Target variable.

        Returns
        -------
        self : BaseRegressionSystem
            Fitted regression system object.

        Notes
        -----
        A cross-section is skipped (no model is stored for it) when it has fewer
        than `min_xs_samples` unique dates, or, when `roll` is an integer, when its
        number of unique dates does not exceed `roll`.
        """
        # Checks
        y = self._check_fit_params(X, y)
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = X.columns
        self.models_ = {}

        # Downsample data frequency if necessary
        if (self.data_freq is not None) and (self.data_freq != "unadjusted"):
            X = self._downsample_by_data_freq(X)
            y = self._downsample_by_data_freq(y)

        # Iterate over cross-sections and fit a regression model on each
        for section in X.index.unique(level=0):
            X_section = X.xs(section, level=0, drop_level=False)
            y_section = y.xs(section, level=0, drop_level=False)

            # Both index levels are retained above, so these are the sorted unique
            # index entries of this cross-section.
            unique_dates = sorted(X_section.index.unique())
            num_dates = len(unique_dates)

            # Skip cross-sections with insufficient samples
            if not self._check_xs_dates(self.min_xs_samples, num_dates):
                continue

            # Roll the data if necessary
            if self.roll and self.roll != "full":
                # Require strictly more history than the lookback window
                if num_dates <= self.roll:
                    continue
                X_section, y_section = self.roll_dates(
                    self.roll, X_section, y_section, unique_dates
                )

            # Fit the model
            self._fit_cross_section(section, X_section, y_section)

        return self

    def _fit_cross_section(self, section, X_section, y_section):
        """
        Fit a regression model on a single cross-section.

        Parameters
        ----------
        section : str
            The identifier of the cross-section.
        X_section : pd.DataFrame
            Input feature matrix for the cross-section.
        y_section : pd.Series
            Target variable for the cross-section.
        """
        model = self.create_model()
        model.fit(pd.DataFrame(X_section), y_section)
        # Store model and coefficients
        self.models_[section] = model
        self.store_model_info(section, model)

    def predict(
        self,
        X,
    ):
        """
        Make predictions over a panel dataset based on trained observation-specific
        models.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.

        Returns
        -------
        predictions : pd.Series
            Pandas series of predictions, multi-indexed by cross-section and date.
            Cross-sections for which no model was fitted are left as NaN.

        Raises
        ------
        TypeError
            If `X` is not a DataFrame or its index levels have unexpected dtypes.
        ValueError
            If `X` is not multi-indexed, its columns differ from those seen during
            fitting, it contains missing values or it has non-numeric columns.
        """
        # Checks
        self._check_panel_index(X)
        # The column count is compared first: an element-wise comparison of column
        # indexes of different lengths fails inside pandas with an unrelated message.
        if len(X.columns) != self.n_features_in_:
            raise ValueError(
                "The input feature matrix must have the same number of "
                "columns as the training feature matrix."
            )
        if not np.all(X.columns == self.feature_names_in_):
            raise ValueError(
                "The input feature matrix must have the same columns as the "
                "training feature matrix."
            )
        if X.isnull().values.any():
            raise ValueError(
                "The input feature matrix must not contain any missing values."
            )
        if not all(pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes):
            raise ValueError(
                "All columns in the input feature matrix for regression systems"
                " must be numeric."
            )

        predictions = pd.Series(index=X.index, data=np.nan)

        # Store predictions for each test cross-section, if an existing model is available
        section_labels = predictions.index.get_level_values(0)
        for section in section_labels.unique():
            if section in self.models_:
                section_predictions = self.models_[section].predict(
                    X.xs(section, level=0)
                )
                predictions[section_labels == section] = np.asarray(
                    section_predictions
                ).ravel()

        return predictions

    def roll_dates(self, roll, X_section, y_section, unique_dates):
        """
        Adjust dataset to be contained within a rolling window.

        Parameters
        ----------
        roll : int
            The lookback of the rolling window.
        X_section : pd.DataFrame
            Input feature matrix for the cross-section.
        y_section : pd.Series
            Target variable for the cross-section.
        unique_dates : list
            Sorted list of unique index entries in the cross-section.

        Returns
        -------
        X_section : pd.DataFrame
            Input feature matrix for the cross-section, adjusted for the rolling window.
        y_section : pd.Series
            Target variable for the cross-section, adjusted for the rolling window.
        """
        # Keep only the most recent `roll` entries
        right_dates = unique_dates[-roll:]
        common_index = X_section.index.intersection(right_dates)

        X_section = X_section.reindex(common_index)
        y_section = y_section.reindex(common_index)

        return X_section, y_section

    @abstractmethod
    def store_model_info(self, section, model):
        """
        Store necessary model information for explainability.

        Parameters
        ----------
        section : str
            The identifier of the cross-section.
        model : RegressorMixin
            The fitted regression model.

        Notes
        ------
        Must be overridden.
        """
        pass

    @abstractmethod
    def create_model(self):
        """
        Instantiate a regression model for a given cross-section.

        Notes
        -----
        Must be overridden.
        """
        pass

    def _check_xs_dates(self, min_xs_samples, num_dates):
        """
        Cross-sectional availability check.

        Parameters
        ----------
        min_xs_samples : int
            The minimum number of samples required in each cross-section training set for
            a regression model to be fitted.
        num_dates : int
            The number of unique dates in the cross-section.

        Returns
        -------
        bool
            True if the number of samples is sufficient, False otherwise
        """
        return num_dates >= min_xs_samples

    def _downsample_by_data_freq(self, df):
        """
        Resample the input dataset to the specified data frequency.

        Values are summed within each period of length `self.data_freq`, separately
        for each cross-section. The index levels of the input must be named "cid"
        and "real_date".

        Parameters
        ----------
        df : pd.DataFrame or pd.Series
            Input feature matrix or target vector.

        Returns
        -------
        pd.DataFrame or pd.Series
            Resampled dataset.
        """
        groupers = [
            pd.Grouper(level="cid"),
            pd.Grouper(level="real_date", freq=self.data_freq),
        ]
        return df.groupby(groupers).sum()

    def _check_panel_index(self, X):
        """
        Check that X is a DataFrame multi-indexed by cross-section and date.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.

        Raises
        ------
        TypeError
            If `X` is not a DataFrame, if the outer index level does not have object
            dtype, or if the inner index level does not have datetime64[ns] dtype.
        ValueError
            If `X` is not multi-indexed.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("The X argument must be a pandas DataFrame.")
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("X must be multi-indexed.")
        if not X.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of X must be strings.")
        if not X.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of X must be datetime.date.")

    def _check_fit_params(self, X, y):
        """
        Input checks for the fit method parameters.

        Parameters
        ----------
        X : pd.DataFrame
            Input feature matrix.
        y : pd.Series, pd.DataFrame or np.ndarray
            Target variable.

        Returns
        -------
        y : pd.Series or pd.DataFrame
            The validated target. A numpy array is converted to a pandas Series
            carrying the index of `X`; pandas inputs are returned unchanged.

        Raises
        ------
        TypeError
            If `X` or `y` are of an unsupported type, or the index levels of `X`
            have unexpected dtypes.
        ValueError
            If `X` is not multi-indexed, `X` or `y` contain missing or non-numeric
            values, their lengths differ, or a numpy `y` is not a 1D array or a
            single-column 2D array.
        """
        # X
        self._check_panel_index(X)
        if not all(pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes):
            raise ValueError(
                "All columns in the input feature matrix for regression systems"
                " must be numeric."
            )
        if X.isnull().values.any():
            raise ValueError(
                "The input feature matrix for regression systems must not contain any "
                "missing values."
            )

        # y
        if not isinstance(y, (pd.DataFrame, pd.Series, np.ndarray)):
            raise TypeError(
                "The y argument must be a pandas DataFrame, Series or numpy array."
            )
        if len(X) != len(y):
            raise ValueError("The number of samples in X and y must match.")
        if isinstance(y, np.ndarray):
            # This can happen during sklearn's GridSearch when a voting regressor is used
            if y.ndim != 1 and y.ndim != 2:
                raise ValueError("y must be a 1D or 2D array.")
            if y.ndim == 2 and y.shape[1] != 1:
                raise ValueError("y must have only one column.")
            # A Series requires 1D data, so a single-column 2D array is flattened
            y = pd.Series(y.ravel(), index=X.index)

        # From here on y is always a pandas object
        if not np.issubdtype(y.values.dtype, np.number):
            raise ValueError("The target vector must be numeric.")
        if y.isnull().values.any():
            raise ValueError(
                "The target vector must not contain any missing values."
            )

        return y