Source code for macrosynergy.learning.preprocessing.scalers.scalers

from macrosynergy.learning.preprocessing.scalers.base_panel_scaler import (
    BasePanelScaler,
)


class PanelMinMaxScaler(BasePanelScaler):
    """
    Scale and translate panel features to lie within the range [0,1].

    Notes
    -----
    This class is designed to replicate scikit-learn's `MinMaxScaler()` class,
    with the additional option to scale within cross-sections. Unlike the
    `MinMaxScaler()` class, dataframes are always returned, preserving the
    multi-indexing of the inputs.

    A feature whose maximum equals its minimum has a zero range. In that case
    the range is treated as one, so the feature is mapped to zero instead of
    producing NaN values from a division by zero. This mirrors scikit-learn's
    handling of constant features.
    """

    def extract_statistics(self, X, feature):
        """
        Determine the minimum and maximum values of a feature in the input
        matrix.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        feature : str
            The feature to extract statistics for.

        Returns
        -------
        statistics : list
            List containing the minimum and maximum values of the feature.
        """
        column = X[feature]
        return [column.min(), column.max()]

    def scale(self, X, feature, statistics):
        """
        Scale the 'feature' column in the design matrix 'X' based on the
        minimum and maximum values of the feature.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        feature : str
            The feature to scale.
        statistics : list
            List containing the minimum and maximum values of the feature, in
            that order.

        Returns
        -------
        X_transformed : pandas.Series
            The scaled feature. If the maximum equals the minimum, the
            feature is only shifted by the minimum (no division by zero).
        """
        minimum, maximum = statistics[0], statistics[1]
        value_range = maximum - minimum
        # A constant feature has no spread to normalise by; dividing by one
        # avoids 0/0 and matches scikit-learn's treatment of zero ranges.
        if value_range == 0:
            value_range = 1
        return (X[feature] - minimum) / value_range
class PanelStandardScaler(BasePanelScaler):
    """
    Scale and translate panel features to have zero mean and unit variance.

    Parameters
    ----------
    type : str, default="panel"
        The panel dimension over which the scaling is applied. Options are
        "panel" and "cross_section".
    with_mean : bool, default=True
        Whether to centre the data before scaling.
    with_std : bool, default=True
        Whether to scale the data to unit variance.

    Raises
    ------
    TypeError
        If `with_mean` or `with_std` is not a boolean.

    Notes
    -----
    This class is designed to replicate scikit-learn's StandardScaler() class,
    with the additional option to scale within cross-sections. Unlike the
    StandardScaler() class, dataframes are always returned, preserving the
    multi-indexing of the inputs.

    The standard deviation is obtained from pandas' `std()` with its default
    settings, i.e. the sample estimate (ddof=1), whereas scikit-learn uses the
    population estimate (ddof=0).

    A feature with zero standard deviation is divided by one instead, so a
    constant feature does not produce NaN or infinite values. This mirrors
    scikit-learn's handling of constant features.
    """

    def __init__(self, type="panel", with_mean=True, with_std=True):
        # Validate the flags before storing anything on the instance.
        if not isinstance(with_mean, bool):
            raise TypeError("'with_mean' must be a boolean.")
        if not isinstance(with_std, bool):
            raise TypeError("'with_std' must be a boolean.")

        # Attributes
        self.with_mean = with_mean
        self.with_std = with_std

        super().__init__(type=type)

    def extract_statistics(self, X, feature):
        """
        Determine the mean and standard deviation of values of a feature in
        the input matrix.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        feature : str
            The feature to extract statistics for.

        Returns
        -------
        statistics : list
            List containing the mean and standard deviation of values of the
            feature.
        """
        column = X[feature]
        return [column.mean(), column.std()]

    def scale(self, X, feature, statistics):
        """
        Scale the 'feature' column in the design matrix 'X' based on the mean
        and standard deviation values of the feature.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        feature : str
            The feature to scale.
        statistics : list
            List containing the mean and standard deviation of values of the
            feature, in that order.

        Returns
        -------
        X_transformed : pandas.Series
            The scaled feature. The column is returned unchanged when both
            `with_mean` and `with_std` are False.
        """
        scaled = X[feature]

        if self.with_mean:
            scaled = scaled - statistics[0]

        if self.with_std:
            std = statistics[1]
            # A constant feature has zero spread; dividing by one avoids
            # NaN/inf output and matches scikit-learn's zero-scale handling.
            if std == 0:
                std = 1
            scaled = scaled / std

        return scaled
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    from macrosynergy.management import (
        categories_df,
        make_qdf,
    )

    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Example: unbalanced panel.
    # Per-cross-section settings, one row per entry of `cids`.
    cid_settings = {
        "AUD": ["2002-01-01", "2020-12-31", 0, 1],
        "CAD": ["2003-01-01", "2020-12-31", 0, 1],
        "GBP": ["2000-01-01", "2020-12-31", 0, 1],
        "USD": ["2000-01-01", "2020-12-31", 0, 1],
    }
    # The cross-section table only uses the first four column names.
    df_cids = pd.DataFrame(index=cids, columns=cols[:4])
    for cid, settings in cid_settings.items():
        df_cids.loc[cid] = settings

    # Per-category settings, one row per entry of `xcats`.
    xcat_settings = {
        "XR": ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1],
        "GROWTH": ["2000-01-01", "2020-12-31", 1, 2, 0.9, 1],
        "INFL": ["2000-01-01", "2020-12-31", -0.1, 2, 0.8, 0.3],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    for xcat, settings in xcat_settings.items():
        df_xcats.loc[xcat] = settings

    # Generate the example panel and attach a constant "grading" column.
    panel_df = make_qdf(df_cids, df_xcats, back_ar=0.75)
    panel_df["grading"] = np.ones(len(panel_df))

    # Date ranges passed to `categories_df` as the blacklist.
    blacklist = {
        "GBP": (
            pd.Timestamp(year=2009, month=1, day=1),
            pd.Timestamp(year=2012, month=6, day=30),
        ),
        "CAD": (
            pd.Timestamp(year=2015, month=1, day=1),
            pd.Timestamp(year=2100, month=1, day=1),
        ),
    }

    wide_df = categories_df(
        df=panel_df,
        xcats=xcats,
        cids=cids,
        val="value",
        blacklist=blacklist,
        freq="M",
        lag=1,
    )
    wide_df = wide_df.dropna()

    # Keep only rows whose second index level is on or after the start date.
    start_date = pd.Timestamp(year=2005, month=8, day=1)
    second_level = wide_df.index.get_level_values(1)
    wide_df = wide_df[second_level >= start_date]

    X_train = wide_df.drop(columns=["XR"])
    y_train = wide_df["XR"]

    # Standard scaling over each cross-section
    standard_scaler = PanelStandardScaler(type="cross_section")
    standard_scaler.fit(X_train, y_train)
    print(standard_scaler.transform(X_train))

    # MinMax scaling over the panel
    minmax_scaler = PanelMinMaxScaler()
    minmax_scaler.fit(X_train, y_train)
    print(minmax_scaler.transform(X_train))