Source code for macrosynergy.learning.preprocessing.scalers.base_panel_scaler

from abc import ABC, abstractmethod

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from macrosynergy.compat import OneToOneFeatureMixin


class BasePanelScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin, ABC):
    """
    Base class for scaling a panel of features in a learning pipeline.

    Subclasses supply two hooks: `extract_statistics`, which computes whatever
    quantities are needed to scale one feature, and `scale`, which applies them.
    This class handles input validation, the bookkeeping of statistics per
    cross-section and for the whole panel, and the assembly of the output frame.

    Parameters
    ----------
    type : str, default="panel"
        The panel dimension over which the scaling is applied. Options are
        "panel" and "cross_section".

    Raises
    ------
    TypeError
        If `type` is not a string.
    ValueError
        If `type` is neither "panel" nor "cross_section".

    Notes
    -----
    Learning algorithms can benefit from scaling each feature to a similar range. This
    ensures they consider each feature equally in the model training process. It can
    also encourage faster convergence of an optimization algorithm.

    Fitted statistics are stored in `self.statistics`, a dictionary keyed by
    cross-section label plus the reserved key "panel"; each value maps a feature
    name to the object returned by `extract_statistics`.
    """

    def __init__(self, type="panel"):
        # Checks
        if not isinstance(type, str):
            raise TypeError("`type` must be a string.")
        if type not in ["panel", "cross_section"]:
            raise ValueError("`type` must be either 'panel' or 'cross_section'.")

        # Attributes
        self.type = type
        self.n_features_in_ = None
        self.feature_names_in_ = None

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit method to learn training set quantities for feature scaling.

        Parameters
        ----------
        X : pd.DataFrame
            The feature matrix, multi-indexed by cross-section and date.
        y : pd.Series or pd.DataFrame, default=None
            The target vector. Not used by the scaler.

        Returns
        -------
        self
            The fitted scaler.
        """
        # Checks
        self._check_fit_params(X, y)
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = X.columns

        # Set up hash table for storing statistics: one entry per training
        # cross-section plus a "panel" entry for the pooled statistics.
        unique_cross_sections = X.index.get_level_values(0).unique()
        self.statistics: dict = {
            cross_section: {feature_name: None for feature_name in X.columns}
            for cross_section in unique_cross_sections
        }
        self.statistics["panel"] = {feature_name: None for feature_name in X.columns}

        # Extract statistics for each feature
        for feature in X.columns:
            if self.type == "cross_section":
                for cross_section in unique_cross_sections:
                    self.statistics[cross_section][feature] = self.extract_statistics(
                        X.loc[cross_section], feature
                    )
            # Panel statistics are always stored: they are the fallback for
            # cross-sections that appear at transform time but not in training.
            self.statistics["panel"][feature] = self.extract_statistics(X, feature)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform method to scale the input data based on extracted training statistics.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.

        Returns
        -------
        X_transformed : pandas.DataFrame
            The feature matrix with scaled features, carrying the same index and
            columns as `X`.

        Notes
        -----
        When `type` is "cross_section", each cross-section is scaled with its own
        training statistics; a cross-section that was not seen during `fit` is
        scaled with the panel statistics instead.
        """
        # Checks
        self._check_transform_params(X)

        by_cross_section = self.type == "cross_section"
        if by_cross_section:
            cross_section_labels = X.index.get_level_values(0)
            masks = {
                cross_section: cross_section_labels == cross_section
                for cross_section in cross_section_labels.unique()
            }
            # Sub-frame per cross-section with the outer index level dropped.
            sections = {
                cross_section: X.loc[mask].droplevel(0)
                for cross_section, mask in masks.items()
            }
            # Original row position of every row, listed in the order in which
            # the per-cross-section pieces are concatenated below. Used to put
            # scaled values back on the rows they came from, so the result is
            # correct even when X is not grouped contiguously by cross-section.
            row_positions = pd.Series(range(len(X)))
            concat_order = pd.concat(
                [row_positions[mask] for mask in masks.values()]
            ).to_numpy()

        # Transform the data
        scaled_columns = []
        for feature in X.columns:
            if by_cross_section:
                # Scale each cross-section with its stored statistics, falling
                # back to the panel statistics for unseen cross-sections.
                scaled = pd.concat(
                    [
                        self.scale(
                            section,
                            feature,
                            self.statistics.get(
                                cross_section, self.statistics["panel"]
                            )[feature],
                        )
                        for cross_section, section in sections.items()
                    ],
                    axis=0,
                )
                # Restore the original row order of X.
                scaled.index = concat_order
                scaled = scaled.sort_index()
            else:
                # Scale the panel based on stored panel statistics
                scaled = self.scale(X, feature, self.statistics["panel"][feature])
            scaled_columns.append(scaled)

        # Concatenate the transformed columns
        X_transformed = pd.DataFrame(
            pd.concat(scaled_columns, axis=1).values, index=X.index, columns=X.columns
        )

        return X_transformed

    @abstractmethod
    def extract_statistics(self, X, feature):
        """
        Determine the relevant statistics for feature scaling.

        Parameters
        ----------
        X : pandas.DataFrame
            Either the full training panel or the rows of a single cross-section.
        feature : str
            The column of `X` for which statistics are required.

        Returns
        -------
        statistics
            Any object; it is stored as-is and later handed to `scale`.
        """
        pass

    @abstractmethod
    def scale(self, X, feature, statistics):
        """
        Scale the input data based on the relevant statistics.

        Parameters
        ----------
        X : pandas.DataFrame
            Either the full panel or the rows of a single cross-section.
        feature : str
            The column of `X` to scale.
        statistics
            The object returned by `extract_statistics` for this feature.

        Returns
        -------
        pandas.Series or pandas.DataFrame
            The scaled feature, with one row per row of `X`.
        """
        pass

    def _check_fit_params(self, X, y):
        """
        Checks the input data for the fit method.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Raises
        ------
        TypeError
            If `X` is not a dataframe or its index levels have the wrong dtype.
        ValueError
            If `X` has non-numeric columns, missing values or no multi-index.
        """
        # Checks only necessary on X
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "Input feature matrix for the scaler must be a pandas dataframe. "
                "If used as part of an sklearn pipeline, ensure that previous steps "
                "return a pandas dataframe."
            )
        if not X.apply(lambda x: pd.api.types.is_numeric_dtype(x)).all():
            raise ValueError(
                "All columns in the input feature matrix for PanelStandardScaler"
                " must be numeric."
            )
        if X.isnull().values.any():
            raise ValueError(
                "The input feature matrix for PanelStandardScaler must not contain any "
                "missing values."
            )
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("The input feature matrix for X must be multi-indexed.")
        if not X.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of X must be strings.")
        if not X.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of X must be datetime.date.")

    def _check_transform_params(self, X):
        """
        Input checks for the transform method.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.

        Raises
        ------
        TypeError
            If `X` is not a dataframe or its index levels have the wrong dtype.
        ValueError
            If `X` has non-numeric columns, missing values, no multi-index, or
            columns that differ from those seen during `fit`.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "Input feature matrix for the scaler must be a pandas dataframe. "
                "If used as part of an sklearn pipeline, ensure that previous steps "
                "return a pandas dataframe."
            )
        if not X.apply(lambda x: pd.api.types.is_numeric_dtype(x)).all():
            raise ValueError(
                "All columns in the input feature matrix for PanelStandardScaler"
                " must be numeric."
            )
        if X.isnull().values.any():
            raise ValueError(
                "The input feature matrix for PanelStandardScaler must not contain any "
                "missing values."
            )
        # Same structural check as at fit time; without it a flat index fails
        # below with an opaque error from `get_level_values(1)`.
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("The input feature matrix for X must be multi-indexed.")
        if not X.index.get_level_values(0).dtype == "object":
            raise TypeError("The outer index of X must be strings.")
        if not X.index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError("The inner index of X must be datetime.date.")

        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                "The input feature matrix must have the same number of columns as the "
                "training feature matrix."
            )
        if not X.columns.equals(self.feature_names_in_):
            raise ValueError(
                "The input feature matrix must have the same columns as the training "
                "feature matrix."
            )