Source code for macrosynergy.learning.model_evaluation.scorers.scorers

from collections import defaultdict
import numpy as np
import pandas as pd
import scipy.stats as stats

from sklearn.ensemble import VotingRegressor
from sklearn.base import RegressorMixin, ClassifierMixin

from macrosynergy.learning.forecasting.model_systems import BaseRegressionSystem


def neg_mean_abs_corr(
    estimator,
    X_test,
    y_test,
    correlation_type="pearson",
):
    """
    Negative mean absolute correlation between a time series of benchmark returns and a
    panel of computed hedged returns, with average taken over all cross-sections.

    Parameters
    ----------
    estimator : BaseRegressionSystem or VotingRegressor
        A fitted `scikit-learn` regression object with separate linear models for each
        cross-section of returns, regressed against a time series of benchmark risk basket
        returns. It is expected to possess a `coefs_` dictionary attribute with keys
        corresponding to the cross-sections of returns and values corresponding to the
        estimated coefficients of the linear model for each cross-section. A
        `VotingRegressor` whose fitted sub-estimators are all `BaseRegressionSystem`
        objects is also accepted; its coefficients are the unweighted mean, per
        cross-section, over the sub-estimators that hold a coefficient for it.
    X_test : pd.DataFrame
        Risk-basket returns replicated for each cross-section of returns in `y_test`.
        Must have a single numeric column, no missing values and a
        (cross-section, date) multi-index.
    y_test : pd.Series
        Panel of financial contract returns, indexed identically to `X_test`.
    correlation_type : str
        Type of correlation to compute between each hedged return
        series and the risk basket return series. Default is "pearson".
        Alternatives are "spearman" and "kendall".

    Returns
    -------
    neg_mean_abs_corr : float
        Negative mean absolute correlation between benchmark risk basket returns and
        computed hedged returns. `np.nan` is returned when no cross-section in
        `X_test` has an estimated coefficient.

    Raises
    ------
    TypeError
        If `estimator`, `X_test`, `y_test` or `correlation_type` has the wrong type,
        or if an index level has the wrong dtype.
    ValueError
        If the estimator is not fitted, the inputs are mis-shaped, non-numeric,
        contain missing values or are indexed differently, or if
        `correlation_type` is not one of the supported names.

    Notes
    -----
    For each cross-section :math:`c` in `X_test`, hedged returns are calculated by
    subtracting :math:`X_{test, c} \\cdot \\text{coefs_}[c]` from each `y_{test, c}`.
    Following this, the negative mean absolute correlation over cross-sections can be
    calculated:

    ```{math}
    :label: neg_mean_abs_corr
    \\text{neg_mean_abs_corr} = - (1/C)\\sum_{c=1}^{C} \\left [ abs_corr_{c} \\right ]
    ```

    Cross-sections without an estimated coefficient are skipped and do not count
    towards :math:`C`.

    This function is a specialised scorer to evaluate the quality of a hedge within the
    `BetaEstimator` class in the `macrosynergy.learning` subpackage.
    """
    # Checks
    # estimator
    if isinstance(estimator, BaseRegressionSystem):
        if estimator.models_ is None:
            raise ValueError("estimator must be a fitted model.")
    elif isinstance(estimator, VotingRegressor):
        if not all(
            isinstance(est, BaseRegressionSystem) for est in estimator.estimators_
        ):
            raise TypeError(
                "estimator must be a VotingRegressor with BaseRegressionSystem estimators."
            )
        if not all(est.models_ is not None for est in estimator.estimators_):
            raise ValueError("estimator must be a VotingRegressor with fitted models.")
    else:
        raise TypeError(
            "estimator must be a BaseRegressionSystem or VotingRegressor object."
        )

    # X_test
    if not isinstance(X_test, pd.DataFrame):
        raise TypeError("X_test must be a pandas DataFrame.")
    if X_test.ndim != 2:
        raise ValueError("X_test must be a 2-dimensional DataFrame.")
    if X_test.shape[1] != 1:
        raise ValueError("X_test must have only one column.")
    if not isinstance(X_test.index, pd.MultiIndex):
        raise ValueError("X_test must be multi-indexed.")
    if not X_test.index.get_level_values(0).dtype == "object":
        raise TypeError("The outer index of X_test must be strings.")
    if not X_test.index.get_level_values(1).dtype == "datetime64[ns]":
        raise TypeError("The inner index of X_test must be datetime.date.")
    if not X_test.apply(pd.api.types.is_numeric_dtype).all():
        # Adjacent literals (no comma) so the exception carries one message string
        # rather than a two-element args tuple.
        raise ValueError(
            "The input feature matrix column for neg_mean_abs_corr"
            " must be numeric."
        )
    if X_test.isnull().values.any():
        raise ValueError(
            "The input feature matrix for neg_mean_abs_corr must not contain any "
            "missing values."
        )

    # y_test
    if not isinstance(y_test, pd.Series):
        raise TypeError("y_test must be a pandas Series.")
    if not isinstance(y_test.index, pd.MultiIndex):
        raise ValueError("y_test must be multi-indexed.")
    if not y_test.index.get_level_values(0).dtype == "object":
        raise TypeError("The outer index of y_test must be strings.")
    if not y_test.index.get_level_values(1).dtype == "datetime64[ns]":
        raise TypeError("The inner index of y_test must be datetime.date.")
    if not y_test.index.equals(X_test.index):
        raise ValueError("y_test and X_test must have the same index.")
    if not pd.api.types.is_numeric_dtype(y_test):
        raise ValueError(
            "The input target vector for neg_mean_abs_corr"
            " must be numeric."
        )
    if y_test.isnull().values.any():
        raise ValueError(
            "The input target vector for neg_mean_abs_corr must not contain any "
            "missing values."
        )

    # correlation_type
    # Reject unknown names explicitly; otherwise a typo would silently fall
    # through to Kendall's tau.
    correlation_functions = {
        "pearson": stats.pearsonr,
        "spearman": stats.spearmanr,
        "kendall": stats.kendalltau,
    }
    if not isinstance(correlation_type, str):
        raise TypeError("correlation_type must be a string.")
    if correlation_type not in correlation_functions:
        raise ValueError(
            "correlation_type must be one of 'pearson', 'spearman' or 'kendall'."
        )
    correlation_function = correlation_functions[correlation_type]

    # Obtain key information
    market_returns = X_test.iloc[:, 0].copy()
    contract_returns = y_test.copy()
    unique_cross_sections = X_test.index.get_level_values(0).unique()

    if isinstance(estimator, VotingRegressor):
        # Average each cross-section's coefficient over the sub-estimators that
        # actually estimated one: [running total, number of contributors].
        coef_accumulator = defaultdict(lambda: [0, 0])
        for sub_estimator in estimator.estimators_:
            for key, value in sub_estimator.coefs_.items():
                coef_accumulator[key][0] += value
                coef_accumulator[key][1] += 1

        estimated_coefs = {
            key: total / count for key, (total, count) in coef_accumulator.items()
        }
    else:
        estimated_coefs = estimator.coefs_

    running_sum = 0
    xs_count = 0
    for cross_section in unique_cross_sections:
        # Skip cross-sections for which no model was estimated
        if cross_section not in estimated_coefs:
            continue
        xs_count += 1
        # Get cross-section returns and matched risk basket returns
        contract_returns_c = contract_returns.xs(cross_section)
        market_returns_c = market_returns.xs(cross_section)
        hedged_returns_c = (
            contract_returns_c - estimated_coefs[cross_section] * market_returns_c
        )
        # Absolute correlation of the hedged returns with the risk basket;
        # element [0] of the scipy result is the correlation statistic.
        running_sum += abs(correlation_function(hedged_returns_c, market_returns_c)[0])

    if xs_count == 0:
        return np.nan

    return -running_sum / xs_count
    
def multi_output_sortino(estimator, X_test, y_test):
    """
    Sortino ratio of a naive long-short directional strategy based on multi-output
    model predictions.

    Parameters
    ----------
    estimator : RegressorMixin or ClassifierMixin
        A `scikit-learn` regressor or classifier exposing `predict`. Its predictions
        for `X_test` are laid out with the index and columns of `y_test`, so it is
        expected to produce one prediction per column of `y_test`.
    X_test : pd.DataFrame
        Input feature matrix passed to `estimator.predict`.
    y_test : pd.DataFrame
        Targets with one column per model output; they are treated as the returns
        earned by each position.

    Returns
    -------
    float
        Mean portfolio return divided by the sample standard deviation of the
        strictly negative portfolio returns. No annualisation is applied. The
        result is NaN when fewer than two portfolio returns are negative, because
        the sample standard deviation is then undefined.

    Raises
    ------
    TypeError
        If `estimator` is not a scikit-learn regressor or classifier, or if
        `X_test` or `y_test` is not a pandas DataFrame.
    ValueError
        If `X_test` and `y_test` have a different number of rows.

    Notes
    -----
    The position taken in each output is the sign of its prediction. The portfolio
    return for a row is the sum over columns of position times realised target.
    """
    if not isinstance(estimator, (RegressorMixin, ClassifierMixin)):
        raise TypeError("estimator must be a scikit-learn regressor or classifier.")
    # TODO: add check that estimator is a multi-output model
    if not isinstance(X_test, pd.DataFrame):
        raise TypeError("X_test must be a pandas DataFrame.")
    if not isinstance(y_test, pd.DataFrame):
        raise TypeError("y_test must be a pandas DataFrame.")
    if X_test.shape[0] != y_test.shape[0]:
        raise ValueError("X_test and y_test must have the same number of rows.")

    # Label predictions like y_test so the element-wise product below aligns.
    preds = pd.DataFrame(
        estimator.predict(X_test), index=y_test.index, columns=y_test.columns
    )
    # Long (+1), short (-1) or flat (0) according to the predicted direction.
    signals = np.sign(preds)
    returns = signals * y_test
    portfolio_returns = returns.sum(axis=1)

    # Downside risk: dispersion of the losing periods only.
    downside_returns = portfolio_returns[portfolio_returns < 0]

    return portfolio_returns.mean() / downside_returns.std()

def multi_output_sharpe(estimator, X_test, y_test):
    """
    Sharpe ratio of a naive long-short directional strategy based on multi-output
    model predictions.

    Parameters
    ----------
    estimator : RegressorMixin or ClassifierMixin
        A `scikit-learn` regressor or classifier exposing `predict`. Its predictions
        for `X_test` are laid out with the index and columns of `y_test`, so it is
        expected to produce one prediction per column of `y_test`.
    X_test : pd.DataFrame
        Input feature matrix passed to `estimator.predict`.
    y_test : pd.DataFrame
        Targets with one column per model output; they are treated as the returns
        earned by each position.

    Returns
    -------
    float
        Mean portfolio return divided by the sample standard deviation of all
        portfolio returns. No annualisation is applied. The result is NaN when
        there are fewer than two rows, because the sample standard deviation is
        then undefined.

    Raises
    ------
    TypeError
        If `estimator` is not a scikit-learn regressor or classifier, or if
        `X_test` or `y_test` is not a pandas DataFrame.
    ValueError
        If `X_test` and `y_test` have a different number of rows.

    Notes
    -----
    The position taken in each output is the sign of its prediction. The portfolio
    return for a row is the sum over columns of position times realised target.
    """
    if not isinstance(estimator, (RegressorMixin, ClassifierMixin)):
        raise TypeError("estimator must be a scikit-learn regressor or classifier.")
    # TODO: add check that estimator is a multi-output model
    if not isinstance(X_test, pd.DataFrame):
        raise TypeError("X_test must be a pandas DataFrame.")
    if not isinstance(y_test, pd.DataFrame):
        raise TypeError("y_test must be a pandas DataFrame.")
    if X_test.shape[0] != y_test.shape[0]:
        raise ValueError("X_test and y_test must have the same number of rows.")

    # Label predictions like y_test so the element-wise product below aligns.
    preds = pd.DataFrame(
        estimator.predict(X_test), index=y_test.index, columns=y_test.columns
    )
    # Long (+1), short (-1) or flat (0) according to the predicted direction.
    signals = np.sign(preds)
    returns = signals * y_test
    portfolio_returns = returns.sum(axis=1)

    return portfolio_returns.mean() / portfolio_returns.std()


if __name__ == "__main__":
    # Demo: build a simulated panel, fit a ridge regression system on it and
    # print the hedge-quality score for the fitted model.
    import macrosynergy.management as msm
    from macrosynergy.management.simulate import make_qdf
    from macrosynergy.learning import RidgeRegressionSystem

    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "BMXR"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    # Every cross-section shares the same sample window and scaling.
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid in cids:
        df_cids.loc[cid] = ["2012-01-01", "2020-12-31", 0, 1]

    # Per-category simulation settings, one row per extended category.
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2012-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["BMXR"] = ["2012-01-01", "2020-12-31", 1, 2, 0.95, 1]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    # Wide panel with rows containing missing values dropped; the last column
    # is used as the target and the remaining columns as features.
    Xy = msm.categories_df(
        df=dfd, xcats=xcats, cids=cids, freq="M", lag=1, xcat_aggs=["last", "sum"]
    ).dropna()
    X, y = Xy.iloc[:, :-1], Xy.iloc[:, -1]

    ridge = RidgeRegressionSystem()
    ridge.fit(X, y)

    score = neg_mean_abs_corr(ridge, X, y, correlation_type="pearson")
    print(
        "\nNegative mean absolute correlation: "
        f"{score}"
    )