Source code for macrosynergy.learning.forecasting.meta_estimators.probability

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin

[docs]class ProbabilityEstimator(BaseEstimator, MetaEstimatorMixin, ClassifierMixin): """ Meta estimator to create trading signals based on the probability of going long. Parameters ---------- classifier : ClassifierMixin A scikit-learn classifier. Notes ----- This class stores feature importances as the feature importances of the base estimator as well as defining a create_signal method that returns the probability of going long in excess of 0.5. This is taken into account when used in the SignalOptimizer class in this package. """ def __init__(self, classifier): if not isinstance(classifier, ClassifierMixin): raise TypeError("classifier must be a scikit-learn classifier.") self.classifier = classifier self.classes_ = [-1,1]
[docs] def fit(self, X, y): """ Fit the underlying classifier. Parameters ---------- X : pd.DataFrame or np.ndarray Pandas dataframe or numpy array of input features. y : pd.Series or pd.DataFrame or np.ndarray Pandas series, dataframe or numpy array of targets associated with each sample in X. """ # Checks self._check_fit_params(X, y) # Model fitting self.classifier.fit(X, y) # Store feature importances if hasattr(self.classifier, "feature_importances_"): self.feature_importances_ = self.classifier.feature_importances_ elif hasattr(self.classifier, "coef_"): self.feature_importances_ = np.abs(self.classifier.coef_) / np.sum(np.abs(self.classifier.coef_)) return self
[docs] def predict(self, X): """ Predict the class labels for the provided data. Parameters ---------- X : pd.DataFrame or numpy array Input feature matrix. Returns ------- y_pred : np.ndarray Numpy array of predictions. """ # Checks self._check_predict_params(X) # Predict return self.classifier.predict(X)
[docs] def create_signal(self, X): """ Create a trading signal based on the probability of going long. Parameters ---------- X : pd.DataFrame or numpy array Input feature matrix. Returns ------- y_pred : np.ndarray Numpy array of signals. """ # Checks self._check_predict_params(X) # Create signal return self.classifier.predict_proba(X)[:,1] - 0.5
def _check_fit_params( self, X, y, ): """ Checks for fit method parameters. """ # X if not isinstance(X, (pd.DataFrame, np.ndarray)): raise TypeError( "Input feature matrix for the probability estimator must be either a pandas " "dataframe or numpy array." ) if isinstance(X, np.ndarray): if X.ndim != 2: raise ValueError( "When the input feature matrix for the probability estimator is a " "numpy array, it must have two dimensions." ) # y if not isinstance(y, (pd.Series, pd.DataFrame, np.ndarray)): raise TypeError( "Target vector for the probability estimator must be either a pandas series, " "dataframe or numpy array." ) if isinstance(y, pd.DataFrame): if y.shape[1] != 1: raise ValueError( "The dependent variable dataframe must have only one column. If used " "as part of an sklearn pipeline, ensure that previous steps return " "a pandas series or dataframe." ) if isinstance(y, np.ndarray): if y.ndim != 1: raise ValueError( "When the target vector for the probability estimator is a numpy " "array, it must have one dimension." ) if X.shape[0] != y.shape[0]: raise ValueError( "The number of samples in the input feature matrix must match the number " "of samples in the target vector." ) def _check_predict_params(self, X): """ Checks for predict method parameters. """ if not isinstance(X, (pd.DataFrame, np.ndarray)): raise TypeError( "Input feature matrix for the probability estimator must be either a pandas " "dataframe or numpy array. If used as part of an sklearn pipeline, ensure " "that previous steps return a pandas dataframe or numpy array." ) if isinstance(X, np.ndarray): if X.ndim != 2: raise ValueError( "When the input feature matrix for the probability estimator is a " "numpy array, it must have two dimensions. If used as part of an " "sklearn pipeline, ensure that previous steps return a two-dimensional " "data structure." ) if X.shape[1] != self.n_features_in_: raise ValueError( "The number of features in the input feature matrix must match the number " "seen in training." ) def __getattr__(self, attr): """ Get attributes from the underlying classifier. """ if hasattr(self.classifier, attr): return getattr(self.classifier, attr) raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")
if __name__ == "__main__": import macrosynergy.management as msm from macrosynergy.management.simulate import make_qdf import pandas as pd import numpy as np from sklearn.linear_model import LogisticRegression cids = ["AUD", "CAD", "GBP", "USD"] xcats = ["XR", "CRY", "GROWTH", "INFL"] cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"] """Example: Unbalanced panel """ df_cids = pd.DataFrame( index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"] ) df_cids.loc["AUD"] = ["2002-01-01", "2020-12-31", 0, 1] df_cids.loc["CAD"] = ["2003-01-01", "2020-12-31", 0, 1] df_cids.loc["GBP"] = ["2000-01-01", "2020-12-31", 0, 1] df_cids.loc["USD"] = ["2000-01-01", "2020-12-31", 0, 1] df_xcats = pd.DataFrame(index=xcats, columns=cols) df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3] df_xcats.loc["CRY"] = ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1] df_xcats.loc["GROWTH"] = ["2000-01-01", "2020-12-31", 1, 2, 0.9, 1] df_xcats.loc["INFL"] = ["2000-01-01", "2020-12-31", -0.1, 2, 0.8, 0.3] dfd = make_qdf(df_cids, df_xcats, back_ar=0.75) dfd["grading"] = np.ones(dfd.shape[0]) black = { "GBP": ( pd.Timestamp(year=2009, month=1, day=1), pd.Timestamp(year=2012, month=6, day=30), ), "CAD": ( pd.Timestamp(year=2015, month=1, day=1), pd.Timestamp(year=2100, month=1, day=1), ), } train = msm.categories_df( df=dfd, xcats=xcats, cids=cids, val="value", blacklist=black, freq="M", lag=1 ).dropna() # Regressor X_train = train.drop(columns=["XR"]) y_train = np.sign(train["XR"]) pe = ProbabilityEstimator(LogisticRegression()).fit(X_train, y_train) print(pe.create_signal(X_train))