import numbers
import numpy as np
import pandas as pd
from sklearn.linear_model import Lars, lars_path
from scipy.stats import kendalltau
from macrosynergy.learning.preprocessing.panel_selectors.base_panel_selector import (
BasePanelSelector,
)
from macrosynergy.learning.random_effects import RandomEffects
class LassoSelector(BasePanelSelector):
    def __init__(self, n_factors=10, positive=False):
        """
        Feature selection driven by the LASSO path, computed with LARS.

        Parameters
        ----------
        n_factors : int
            Number of factors to select.
        positive : bool
            Whether to constrain the LASSO coefficients to be positive.

        Raises
        ------
        TypeError
            If `n_factors` is not an ``int`` or `positive` is not a ``bool``.
        ValueError
            If `n_factors` is not strictly positive.

        Notes
        -----
        The Least Absolute Shrinkage and Selection Operator (LASSO) [1]_ is a
        linear model that estimates sparse coefficients: the penalty encourages
        some coefficients to be exactly zero, so fitting a LASSO doubles as
        feature selection. The LARS algorithm [2]_ (see `LarsSelector`) can be
        used to trace the LASSO coefficients as the sparsity level varies, so
        the LASSO path is computed with LARS here and the desired number of
        factors is read off that path. See [3]_ for the `scikit-learn`
        documentation on the LASSO-LARS model fit.

        References
        ----------
        .. [1] Tibshirani, R., 1996. Regression shrinkage and selection via the lasso.
            Journal of the Royal Statistical Society Series B: Statistical Methodology,
            58(1), pp.267-288.
            https://www.jstor.org/stable/2346178
        .. [2] Efron, B., Hastie, T., Johnstone, I. and Tibshirani, R., 2004.
            Least angle regression.
            https://arxiv.org/abs/math/0406456
        .. [3] https://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LassoLars.html
        """
        # Validate the hyperparameters before storing them.
        if not isinstance(n_factors, int):
            raise TypeError("'n_factors' must be an integer.")
        if n_factors < 1:
            raise ValueError("'n_factors' must be a positive integer.")
        if not isinstance(positive, bool):
            raise TypeError("'positive' must be a boolean.")

        self.n_factors = n_factors
        self.positive = positive

        super().__init__()

    def determine_features(self, X, y):
        """
        Build the feature mask from the LASSO-LARS coefficient path.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features.
        """
        # Only the coefficient path is needed; it is indexed below as
        # (feature, step along the path).
        _, _, path = lars_path(
            X.values,
            y.values,
            method="lasso",
            positive=self.positive,
        )

        # Read the path at step `n_factors`, falling back to the final step
        # when the path is shorter than that.
        final_step = path.shape[1] - 1
        step = min(self.n_factors, final_step)

        # A feature is selected when its coefficient is non-zero at that step.
        return path[:, step] != 0
class MapSelector(BasePanelSelector):
    def __init__(self, n_factors=None, significance_level=0.05, positive=False):
        """
        Univariate statistical feature selection using the Macrosynergy panel test.

        Parameters
        ----------
        n_factors : int, optional
            Number of factors to select. When given, the `n_factors` features
            with the smallest p-values are selected and both
            `significance_level` and `positive` are ignored.
        significance_level : float, default=0.05
            Significance level. Only used when `n_factors` is None.
        positive : bool, default=False
            Whether to only keep features with positive estimated model
            coefficients. Only used when `n_factors` is None.

        Raises
        ------
        TypeError
            If `n_factors` is not an integer (or None), `significance_level`
            is not a number, or `positive` is not a boolean.
        ValueError
            If `n_factors` is not strictly positive or `significance_level`
            lies outside [0, 1].

        Notes
        -----
        The Macrosynergy panel test [1]_ is a univariate test that estimates the
        significance of a relationship between each feature and the target variable, over
        a panel. This test accounts for cross-sectional correlations. Often, different
        cross-sections in a panel are highly correlated - particularly in the case of
        dependent variable return data. This violates the assumption of independence
        in the usual z-test or t-test, from which the usual p-values are derived. As a
        consequence, probabilities of significance can be overstated.

        In the Macrosynergy panel test, a Wald test is used to compare the null hypothesis
        of an intercept + period-specific random effects model against the alternative
        hypothesis of an intercept + period-specific random effects model + the feature
        of interest. This works because the null-alternative hypotheses are nested models.
        The model in the null hypothesis accounts for the cross-sectional correlations
        that exist in each time period. Rejecting this model in favour of the alternative
        model indicates that the feature of interest is significant, accounting for those
        cross-sectional correlations.

        References
        ----------
        .. [1] Gholkar, Rushil and Sueppel, Ralph, 2023.
            Testing macro trading factors.
            https://macrosynergy.com/research/testing-macro-trading-factors/
        """
        # Checks
        if n_factors is not None:
            if not isinstance(n_factors, int):
                raise TypeError("The 'n_factors' parameter must be an integer.")
            if n_factors <= 0:
                raise ValueError(
                    "The 'n_factors' parameter must be a positive integer."
                )
        if not isinstance(significance_level, numbers.Number):
            raise TypeError("The significance_level must be a float.")
        if (significance_level < 0) or (significance_level > 1):
            raise ValueError("The significance_level must be in between 0 and 1.")
        if not isinstance(positive, (bool, np.bool_)):
            raise TypeError("The 'positive' parameter must be a boolean.")

        # Attributes
        self.significance_level = significance_level
        self.positive = positive
        self.n_factors = n_factors

        # Initialise the base selector, as the sibling selectors in this
        # module do.
        super().__init__()

    def determine_features(self, X, y):
        """
        Create feature mask based on the Macrosynergy panel test.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features, ordered as
            ``self.feature_names_in_``.
        """
        # Fit one random effects model per feature and record both the
        # estimated coefficient and its p-value, so that the sign filter
        # below tests the coefficient that belongs to each p-value.
        estimates = []
        pvalues = []
        for col in self.feature_names_in_:
            re = RandomEffects(fit_intercept=True).fit(X[col], y)
            estimates.append(re.params[col])
            pvalues.append(re.pvals[col])

        estimates = np.asarray(estimates, dtype=float)
        pvalues = np.asarray(pvalues, dtype=float)

        if self.n_factors is not None:
            # Keep the `n_factors` features with the smallest p-values.
            mask = np.zeros(len(pvalues), dtype=bool)
            mask[np.argsort(pvalues)[: self.n_factors]] = True
            return mask

        # Keep features whose p-value is below the significance level ...
        mask = pvalues < self.significance_level
        if self.positive:
            # ... and, if requested, whose estimated coefficient is positive.
            mask &= estimates > 0

        return mask
class KendallSignificanceSelector(BasePanelSelector):
    """
    Univariate statistical feature selection using Kendall correlation tests.

    Every feature whose Kendall tau test against the target has a p-value
    below `alpha` is selected. If no feature is significant, the single
    feature with the smallest p-value is selected instead, so that at least
    one feature is always kept.

    Future enhancements will include Bonferroni corrections for multiple testing.

    Parameters
    ----------
    alpha : float, default=0.05
        Significance level. Must lie strictly between 0 and 1.

    Raises
    ------
    TypeError
        If `alpha` is not a number.
    ValueError
        If `alpha` is not strictly between 0 and 1.
    """

    def __init__(self, alpha=0.05):
        if not isinstance(alpha, numbers.Number):
            raise TypeError("The 'alpha' parameter must be a number.")
        if alpha <= 0 or alpha >= 1:
            raise ValueError("The 'alpha' parameter must be between 0 and 1.")

        self.alpha = alpha

        # Initialise the base selector, as the sibling selectors in this
        # module do.
        super().__init__()

    def determine_features(self, X, y):
        """
        Create feature mask based on Kendall tau significance tests.

        Besides returning the mask, this stores the per-feature tau statistics
        in ``scores_``, the p-values in ``pvalues_`` and the mask itself in
        ``support_``.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        n_features = X.shape[1]
        taus = np.zeros(n_features)
        pvalues = np.zeros(n_features)

        # Compute Kendall tau and p-values, one feature at a time
        for i in range(n_features):
            taus[i], pvalues[i] = kendalltau(X[:, i], y)

        self.scores_ = taus
        self.pvalues_ = pvalues

        # Select all significant features. Comparisons with NaN are False,
        # so features with an undefined p-value are never significant.
        support = pvalues < self.alpha

        if n_features > 0 and not support.any():
            # Select only the most significant feature. NaN p-values are
            # ranked last: np.argmin would otherwise return the position of
            # the first NaN and select a feature whose test is undefined.
            ranked = np.where(np.isnan(pvalues), np.inf, pvalues)
            support = np.zeros(n_features, dtype=bool)
            support[np.argmin(ranked)] = True

        self.support_ = support

        return np.array(self.support_)
class FactorAvailabilitySelector(BasePanelSelector):
    def __init__(self, min_cids=2, min_periods=36):
        """
        Feature selector based on data availability.

        A feature is kept when it is sufficiently covered across both
        cross-sections and time: there must be at least ``min_periods`` time
        periods in which the feature has non-missing values for at least
        ``min_cids`` cross-sections.

        Notes
        -----
        If the dataset contains fewer time periods than ``min_periods``, the
        threshold is automatically reduced to the number of available periods.

        Parameters
        ----------
        min_cids : int, default=2
            Minimum number of cross-sections that must have non-missing data
            for a feature at a given time period for that period to count
            as sufficiently covered.
        min_periods : int, default=36
            Minimum number of sufficiently covered time periods required for
            a feature to be selected.

        Raises
        ------
        TypeError
            If either parameter is not an integer.
        ValueError
            If either parameter is negative.
        """
        super().__init__()

        cids_msg = "The 'min_cids' parameter must be a positive integer."
        periods_msg = "The 'min_periods' parameter must be a positive integer."
        checks = ((min_cids, cids_msg), (min_periods, periods_msg))

        # Type checks for both parameters come before any value check.
        for value, message in checks:
            if not isinstance(value, numbers.Integral):
                raise TypeError(message)

        # NOTE(review): the messages say "positive" but zero is accepted here.
        for value, message in checks:
            if value < 0:
                raise ValueError(message)

        self.min_cids = min_cids
        self.min_periods = min_periods

    def determine_features(self, X, y):
        """
        Create feature mask based on data availability.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix. Rows are grouped by level 1 of its index,
            which is treated as the time period.
        y : pandas.Series or pandas.DataFrame
            The target vector. Not used by this selector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features.
        """
        # The dataset may hold fewer periods than `min_periods`; cap the
        # requirement at the number of distinct periods actually present.
        periods = X.index.get_level_values(1)
        required_periods = min(self.min_periods, len(periods.unique()))

        # Per period and feature: number of rows with a non-missing value.
        available_per_period = X.notna().groupby(level=1).sum()

        # Per feature: number of periods that reach the `min_cids` threshold.
        covered_periods = (available_per_period >= self.min_cids).sum()

        selected = covered_periods >= required_periods
        return np.array(selected)
if __name__ == "__main__":
    # Demo: simulate an unbalanced panel and run each selector on it.
    import macrosynergy.management as msm
    from macrosynergy.management.simulate import make_qdf

    # Randomly generate a panel
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Example: Unbalanced panel
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2002-01-01", "2020-12-31", 0, 1]
    df_cids.loc["CAD"] = ["2003-01-01", "2020-12-31", 0, 1]
    df_cids.loc["GBP"] = ["2000-01-01", "2020-12-31", 0, 1]
    df_cids.loc["USD"] = ["2000-01-01", "2020-12-31", 0, 1]

    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["CRY"] = ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1]
    df_xcats.loc["GROWTH"] = ["2000-01-01", "2020-12-31", 1, 2, 0.9, 1]
    df_xcats.loc["INFL"] = ["2000-01-01", "2020-12-31", -0.1, 2, 0.8, 0.3]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])

    # Date ranges passed to `categories_df` as its blacklist.
    black = {
        "GBP": (
            pd.Timestamp(year=2009, month=1, day=1),
            pd.Timestamp(year=2012, month=6, day=30),
        ),
        "CAD": (
            pd.Timestamp(year=2015, month=1, day=1),
            pd.Timestamp(year=2100, month=1, day=1),
        ),
    }

    train = msm.categories_df(
        df=dfd, xcats=xcats, cids=cids, val="value", blacklist=black, freq="M", lag=1
    ).dropna()
    train = train[
        train.index.get_level_values(1) >= pd.Timestamp(year=2005, month=8, day=1)
    ]

    # "XR" is the target; the remaining categories are the candidate features.
    X_train = train.drop(columns=["XR"])
    y_train = train["XR"]

    # Kendall selector
    kendall = KendallSignificanceSelector(alpha=0.1).fit(X_train, y_train)
    print(f"Kendall significance alpha 0.1: {kendall.get_feature_names_out()}")
    kendall = KendallSignificanceSelector(alpha=0.01).fit(X_train, y_train)
    print(f"Kendall significance alpha 0.01: {kendall.get_feature_names_out()}")

    # LARS selector
    # NOTE(review): `LarsSelector` is not defined in the part of the module
    # visible here - confirm it is defined elsewhere in this file.
    lars = LarsSelector(n_factors=2).fit(X_train, y_train)
    print(f"LARS 2-factors, no intercept: {lars.get_feature_names_out()}")
    lars = LarsSelector(n_factors=2, fit_intercept=True).fit(X_train, y_train)
    print(f"LARS 2-factors, with intercept: {lars.get_feature_names_out()}")
    print(lars.transform(X_train))

    # LASSO selector
    lasso = LassoSelector(n_factors=1, positive=True).fit(X_train, y_train)
    print(f"Lasso 1-factor, positive restriction: {lasso.get_feature_names_out()}")
    lasso = LassoSelector(n_factors=3, positive=False).fit(X_train, y_train)
    # LassoSelector has no intercept option; this run differs from the one
    # above only in dropping the positivity constraint.
    print(f"Lasso 3-factors, no positive restriction: {lasso.get_feature_names_out()}")
    print(lasso.transform(X_train))

    # Map selector
    map_selector = MapSelector(n_factors=2).fit(X_train, y_train)
    print(f"Map 2-factors: {map_selector.get_feature_names_out()}")
    map_selector = MapSelector(significance_level=0.2).fit(X_train, y_train)
    print(f"Map significance 0.2: {map_selector.get_feature_names_out()}")
    print(map_selector.transform(X_train))