Source code for macrosynergy.learning.preprocessing.panel_selectors.panel_selectors

import numbers

import numpy as np
import pandas as pd
from sklearn.linear_model import Lars, lars_path

from scipy.stats import kendalltau

from macrosynergy.learning.preprocessing.panel_selectors.base_panel_selector import (
    BasePanelSelector,
)
from macrosynergy.learning.random_effects import RandomEffects


[docs]class LarsSelector(BasePanelSelector):
    def __init__(self, n_factors=10, fit_intercept=False):
        """
        Statistical feature selection using LARS.

        Parameters
        ----------
        n_factors : int, default=10
            Number of factors to select.
        fit_intercept : bool, default=False
            Whether to fit an intercept term in the LARS model.

        Notes
        -----
        The Least Angle Regression (LARS) algorithm was designed to fit high dimensional
        linear models. It is a means of estimating the covariates to include in the model,
        as well as associated coefficients. LARS can be considered to be a continuous
        equivalent to forward selection.

        The algorithm is described in detail in [1]_ and is implemented in the
        `scikit-learn` library [2]_. It works as follows:

        1. Set coefficients to zero.
        2. Find the covariate that has the highest correlation with the target variable.
        3. Increase the coefficient of this covariate in a stepwise fashion, recording
            the residual at each step. Stop when another covariate is as correlated with
            the residuals as the current one.
        4. Add this second covariate to the model and the compute the two-variable OLS
            solution.
        5. Increase the coefficients of the two covariates in a stepwise fashion towards
            the OLS solution, recording the residuals at each step. Stop when another
            covariate is as correlated with the residuals as the current ones.
        6. Add this third covariate to the model and compute the three-variable OLS
            solution.
        7. Iterate this process until the desired number of covariates have been selected.

        References
        ----------
        .. [1] Efron, B., Hastie, T., Johnstone, I. and Tibshirani, R., 2004.
            Least angle regression.
            https://arxiv.org/abs/math/0406456
        .. [2] https://scikit-learn.org/dev/modules/linear_model.html#least-angle-regression

        """
        # Checks
        if not isinstance(fit_intercept, bool):
            raise TypeError("'fit_intercept' must be a boolean.")
        if not isinstance(n_factors, int):
            raise TypeError("'n_factors' must be an integer.")
        if n_factors <= 0:
            raise ValueError("'n_factors' must be a positive integer.")

        # Attributes
        self.fit_intercept = fit_intercept
        self.n_factors = n_factors

        super().__init__()

[docs]    def determine_features(self, X, y):
        """
        Create feature mask based on the LARS algorithm.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : list
            Boolean mask of selected features.
        """
        lars = Lars(fit_intercept=self.fit_intercept, n_nonzero_coefs=self.n_factors)
        lars.fit(X.values, y.values.reshape(-1, 1))
        coefs = lars.coef_

        return [True if coef != 0 else False for coef in coefs]


[docs]class LassoSelector(BasePanelSelector):
    def __init__(self, n_factors=10, positive=False):
        """
        Statistical feature selection with LASSO-LARS.

        Parameters
        ----------
        n_factors : int
            Number of factors to select.
        positive : bool
            Whether to constrain the LASSO coefficients to be positive.

        Notes
        -----
        The Least Absolute Shrinkage and Selection Operator (LASSO) [1]_ is a linear model
        that estimates sparse coefficients. This means that some encouragement is given
        for the model to set some coefficients to zero. Hence, the LASSO can be said to
        perform feature selection. It transpires that the LARS algorithm [2]_
        (see `LarsSelector`) can be used to track the LASSO coefficients as the user-defined
        sparsity level is increased. Consequently, we use the LARS algorithm to
        compute the LASSO paths and select the desired number of factors. See [3]_ for
        the `scikit-learn` documentation on the LASSO-LARS model fit.

        References
        ----------
        .. [1] Tibshirani, R., 1996. Regression shrinkage and selection via the lasso.
            Journal of the Royal Statistical Society Series B: Statistical Methodology,
            58(1), pp.267-288.
            https://www.jstor.org/stable/2346178
        .. [2] Efron, B., Hastie, T., Johnstone, I. and Tibshirani, R., 2004.
            Least angle regression.
            https://arxiv.org/abs/math/0406456
        .. [3] https://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LassoLars.html
        """
        # Checks
        if not isinstance(n_factors, int):
            raise TypeError("'n_factors' must be an integer.")
        if n_factors <= 0:
            raise ValueError("'n_factors' must be a positive integer.")
        if not isinstance(positive, bool):
            raise TypeError("'positive' must be a boolean.")

        # Attributes
        self.n_factors = n_factors
        self.positive = positive

        super().__init__()

[docs]    def determine_features(self, X, y):
        """
        Create feature mask based on the LASSO-LARS algorithm.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features.
        """
        # Obtain coefficient paths with dimensions (n_features, n_alphas)
        _, _, coefs_path = lars_path(
            X.values,
            y.values,
            positive=self.positive,
            method="lasso",
        )

        mask = coefs_path[:, min(self.n_factors, coefs_path.shape[1] - 1)] != 0

        return mask


[docs]class MapSelector(BasePanelSelector):
    def __init__(self, n_factors=None, significance_level=0.05, positive=False):
        """
        Univariate statistical feature selection using the Macrosynergy panel test.

        Parameters
        ----------
        n_factors : int, optional
            Number of factors to select.
        significance_level : float, default=0.05
            Significance level.
        positive : bool, default=False
            Whether to only keep features with positive estimated model coefficients.

        Notes
        -----
        The Macrosynergy panel test [1]_ is a univariate test that estimates the
        significance of a relationship between each feature and the target variable, over
        a panel. This test accounts for cross-sectional correlations. Often, different
        cross-sections in a panel are highly correlated - particularly in the case of
        dependent variable return data. This violates the assumption of independence
        in the usual z-test or t-test, from which the usual p-values are derived. As a
        consequence, probabilities of significance can be overstated.

        In the Macrosynergy panel test, a Wald test is used to compare the null hypothesis
        of an intercept + period-specific random effects model against the alternative
        hypothesis of an intercept + period-specific random effects model + the feature
        of interest. This works because the null-alternative hypotheses are nested models.
        The model in the null hypothesis accounts for the cross-sectional correlations
        that exist in each time period. Rejecting this model in favour of the alternative
        model indicates that the feature of interest is significant, accounting for those
        cross-sectional correlations.

        References
        ----------
        .. [1] Gholkar, Rushil and Sueppel, Ralph, 2023.
            Testing macro trading factors.
            https://macrosynergy.com/research/testing-macro-trading-factors/
        """
        # Checks
        if n_factors is not None:
            if not isinstance(n_factors, int):
                raise TypeError("The 'n_factors' parameter must be an integer.")
            if n_factors <= 0:
                raise ValueError(
                    "The 'n_factors' parameter must be a positive integer."
                )
        if not isinstance(significance_level, numbers.Number):
            raise TypeError("The significance_level must be a float.")
        if (significance_level < 0) or (significance_level > 1):
            raise ValueError("The significance_level must be in between 0 and 1.")
        if not isinstance(positive, (bool, np.bool_)):
            raise TypeError("The 'positive' parameter must be a boolean.")

        self.significance_level = significance_level
        self.positive = positive
        self.n_factors = n_factors

[docs]    def determine_features(self, X, y):
        """
        Create feature mask based on the Macrosynergy panel test.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features.
        """
        # Iterate through each feature and perform the panel test
        factor_pvals = []

        for col in self.feature_names_in_:
            ftr = X[col]
            re = RandomEffects(fit_intercept=True).fit(ftr, y)
            est = re.params[col]
            pval = re.pvals[col]
            factor_pvals.append(pval)

        if self.n_factors is not None:
            # Return a mask of factors with `n_factors` smallest p_values
            factor_indexes = np.argsort(factor_pvals)[: self.n_factors]
            mask = [
                True if idx in factor_indexes else False
                for idx in range(len(factor_pvals))
            ]

        else:
            if self.positive:
                # Return a mask of factors with positive estimated coefficients and
                # p_values < significance_level
                mask = [
                    True if ((est > 0) and (pval < self.significance_level)) else False
                    for est in factor_pvals
                ]
            else:
                # Return as mask of factors with p_values < significance_level
                mask = [
                    True if pval < self.significance_level else False
                    for pval in factor_pvals
                ]

        return np.array(mask)
    
[docs]class KendallSignificanceSelector(BasePanelSelector):
    """
    Univariate statistical feature selection using Kendall correlation tests.

    Future enhancements will include Bonferroni corrections for multiple testing.

    Parameters
    ----------
    alpha : float, default=0.05
        Significance level.
    """
    def __init__(self, alpha=0.05):
        if not isinstance(alpha, numbers.Number):
            raise TypeError("The 'alpha' parameter must be a number.")
        if alpha <= 0 or alpha >= 1:
            raise ValueError("The 'alpha' parameter must be between 0 and 1.")
        
        self.alpha = alpha

[docs]    def determine_features(self, X, y):
        """
        Create feature mask based on the Macrosynergy panel test.

        Parameters
        ----------
        X : pandas.DataFrame
            The feature matrix.
        y : pandas.Series or pandas.DataFrame
            The target vector.

        Returns
        -------
        mask : np.ndarray
            Boolean mask of selected features.
        """
        n_features = X.shape[1]
        pvalues = np.zeros(n_features)
        taus = np.zeros(n_features)

        X = np.asarray(X)
        y = np.asarray(y)

        # Compute Kendall tau and p-values
        for i in range(n_features):
            tau, p = kendalltau(X[:, i], y)
            taus[i] = tau
            pvalues[i] = p

        self.scores_ = taus
        self.pvalues_ = pvalues

        # Select all significant features
        significant = pvalues < self.alpha

        if significant.any():
            self.support_ = significant
        else:
            # Select only the most significant feature
            best_feature = np.argmin(pvalues)
            support = np.zeros(n_features, dtype=bool)
            support[best_feature] = True
            self.support_ = support

        return np.array(self.support_)

[docs]class FactorAvailabilitySelector(BasePanelSelector):
    def __init__(self, min_cids=2, min_periods=36):
        """
        Feature selector based on availability criteria.

        Selects features that have sufficient data coverage across both
        cross-sections and time periods. A feature is selected if there
        are at least ``min_periods`` time periods in which that feature
        has non-missing values for at least ``min_cids`` cross-sections.

        Notes
        -----
        If the dataset contains fewer time periods than ``min_periods``, the
        threshold is automatically reduced to the number of available periods.

        Parameters
        ----------
        min_cids : int, default=2
            Minimum number of cross-sections that must have non-missing data
            for a feature at a given time period for that period to count
            as sufficiently covered.
        min_periods : int, default=36
            Minimum number of sufficiently covered time periods required for
            a feature to be selected.
        """
        super().__init__()

        if not isinstance(min_cids, numbers.Integral):
            raise TypeError("The 'min_cids' parameter must be a positive integer.")
        if not isinstance(min_periods, numbers.Integral):
            raise TypeError("The 'min_periods' parameter must be a positive integer.")
        if min_cids < 0:
            raise ValueError("The 'min_cids' parameter must be a positive integer.")
        if min_periods < 0:
            raise ValueError("The 'min_periods' parameter must be a positive integer.")

        self.min_cids = min_cids
        self.min_periods = min_periods

[docs]    def determine_features(self, X, y):
        # If the dataset doesn't have self.min_periods periods, then we need to
        # restrict analysis to the number of periods in the dataset
        num_dates = len(X.index.get_level_values(1).unique())
        min_periods = min(self.min_periods, num_dates)

        mask = (
            X.notna()
            .groupby(level=1)
            .sum()
            .ge(self.min_cids)
            .sum()
            .ge(min_periods)
        )

        return np.array(mask)


if __name__ == "__main__":
    import macrosynergy.management as msm
    from macrosynergy.management.simulate import make_qdf

    # Randomly generate a panel
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    """Example: Unbalanced panel """

    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2002-01-01", "2020-12-31", 0, 1]
    df_cids.loc["CAD"] = ["2003-01-01", "2020-12-31", 0, 1]
    df_cids.loc["GBP"] = ["2000-01-01", "2020-12-31", 0, 1]
    df_cids.loc["USD"] = ["2000-01-01", "2020-12-31", 0, 1]

    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["CRY"] = ["2000-01-01", "2020-12-31", 1, 2, 0.95, 1]
    df_xcats.loc["GROWTH"] = ["2000-01-01", "2020-12-31", 1, 2, 0.9, 1]
    df_xcats.loc["INFL"] = ["2000-01-01", "2020-12-31", -0.1, 2, 0.8, 0.3]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])
    black = {
        "GBP": (
            pd.Timestamp(year=2009, month=1, day=1),
            pd.Timestamp(year=2012, month=6, day=30),
        ),
        "CAD": (
            pd.Timestamp(year=2015, month=1, day=1),
            pd.Timestamp(year=2100, month=1, day=1),
        ),
    }

    train = msm.categories_df(
        df=dfd, xcats=xcats, cids=cids, val="value", blacklist=black, freq="M", lag=1
    ).dropna()
    train = train[
        train.index.get_level_values(1) >= pd.Timestamp(year=2005, month=8, day=1)
    ]

    X_train = train.drop(columns=["XR"])
    y_train = train["XR"]

    # Kendall selector
    kendall = KendallSignificanceSelector(alpha=0.1).fit(X_train, y_train)
    print(f"Kendall significance alpha 0.1: {kendall.get_feature_names_out()}")
    kendall = KendallSignificanceSelector(alpha=0.01).fit(X_train, y_train)
    print(f"Kendall significance alpha 0.01: {kendall.get_feature_names_out()}")
                                              
    # LARS selector
    lars = LarsSelector(n_factors=2).fit(X_train, y_train)
    print(f"LARS 2-factors, no intercept: {lars.get_feature_names_out()}")
    lars = LarsSelector(n_factors=2, fit_intercept=True).fit(X_train, y_train)
    print(f"LARS 2-factors, with intercept: {lars.get_feature_names_out()}")

    print(lars.transform(X_train))

    # LASSO selector
    lasso = LassoSelector(n_factors=1, positive=True).fit(X_train, y_train)
    print(f"Lasso 1-factor, positive restriction: {lasso.get_feature_names_out()}")
    lasso = LassoSelector(n_factors=3, positive=False).fit(X_train, y_train)
    print(f"Lasso 3-factors, with intercept: {lasso.get_feature_names_out()}")

    print(lasso.transform(X_train))

    # Map selector
    map_selector = MapSelector(n_factors=2).fit(X_train, y_train)
    print(f"Map 2-factors: {map_selector.get_feature_names_out()}")
    map_selector = MapSelector(significance_level=0.2).fit(X_train, y_train)
    print(f"Map significance 0.2: {map_selector.get_feature_names_out()}")

    print(map_selector.transform(X_train))