from collections import defaultdict
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.ensemble import VotingRegressor
from sklearn.base import RegressorMixin, ClassifierMixin
from macrosynergy.learning.forecasting.model_systems import BaseRegressionSystem
def neg_mean_abs_corr(
    estimator,
    X_test,
    y_test,
    correlation_type="pearson",
):
    r"""
    Negative mean absolute correlation between a time series of benchmark returns and a
    panel of computed hedged returns, with average taken over all cross-sections.

    Parameters
    ----------
    estimator : BaseRegressionSystem or VotingRegressor
        A fitted regression system with separate linear models for each
        cross-section of returns, regressed against a time series of benchmark risk
        basket returns. It is expected to possess a `coefs_` dictionary attribute with
        keys corresponding to the cross-sections of returns and values corresponding
        to the estimated coefficients of the linear model for each cross-section.
        A `VotingRegressor` whose fitted estimators are all `BaseRegressionSystem`
        objects is also accepted; in that case the coefficient used for a
        cross-section is the unweighted average over the estimators that hold a
        coefficient for it.
    X_test : pd.DataFrame
        Risk-basket returns replicated for each cross-section of returns in `y_test`.
        Must have exactly one numeric column, no missing values and a
        (cross-section, date) multi-index.
    y_test : pd.Series
        Panel of financial contract returns, sharing the index of `X_test`.
    correlation_type : str
        Type of correlation to compute between each hedged return
        series and the risk basket return series. Default is "pearson".
        Alternatives are "spearman" and "kendall".

    Returns
    -------
    neg_mean_abs_corr : float
        Negative mean absolute correlation between benchmark risk basket returns and
        computed hedged returns. `np.nan` is returned when no cross-section in
        `X_test` has an estimated coefficient.

    Raises
    ------
    TypeError
        If `estimator`, `X_test`, `y_test` or the index levels have the wrong type.
    ValueError
        If the estimator is not fitted, the inputs are malformed (wrong shape,
        non-numeric, missing values, mismatched indices) or `correlation_type` is
        not one of the supported options.

    Notes
    -----
    For each cross-section :math:`c` in `X_test`, hedged returns are calculated by
    subtracting :math:`X_{test, c} \cdot \text{coefs_}[c]` from each `y_{test, c}`.
    Following this, the negative mean absolute correlation over cross-sections can be
    calculated:

    ```{math}
    :label: neg_mean_abs_corr
    \text{neg_mean_abs_corr} = - (1/C)\sum_{c=1}^{C} \left [ abs_corr_{c} \right ]
    ```

    This function is a specialised scorer to evaluate the quality of a hedge within the
    `BetaEstimator` class in the `macrosynergy.learning` subpackage.
    """
    # Checks
    # estimator
    if isinstance(estimator, BaseRegressionSystem):
        if estimator.models_ is None:
            raise ValueError("estimator must be a fitted model.")
    elif isinstance(estimator, VotingRegressor):
        if not all(
            isinstance(est, BaseRegressionSystem) for est in estimator.estimators_
        ):
            raise TypeError(
                "estimator must be a VotingRegressor with BaseRegressionSystem estimators."
            )
        if not all(est.models_ is not None for est in estimator.estimators_):
            raise ValueError("estimator must be a VotingRegressor with fitted models.")
    else:
        raise TypeError(
            "estimator must be a BaseRegressionSystem or VotingRegressor object."
        )

    # X_test
    if not isinstance(X_test, pd.DataFrame):
        raise TypeError("X_test must be a pandas DataFrame.")
    if X_test.ndim != 2:
        raise ValueError("X_test must be a 2-dimensional DataFrame.")
    if X_test.shape[1] != 1:
        raise ValueError("X_test must have only one column.")
    if not isinstance(X_test.index, pd.MultiIndex):
        raise ValueError("X_test must be multi-indexed.")
    if not X_test.index.get_level_values(0).dtype == "object":
        raise TypeError("The outer index of X_test must be strings.")
    if not X_test.index.get_level_values(1).dtype == "datetime64[ns]":
        raise TypeError("The inner index of X_test must be datetime.date.")
    if not X_test.apply(lambda x: pd.api.types.is_numeric_dtype(x)).all():
        # Single concatenated message: passing two strings to ValueError would
        # make the exception render as a tuple.
        raise ValueError(
            "The input feature matrix column for neg_mean_abs_corr"
            " must be numeric."
        )
    if X_test.isnull().values.any():
        raise ValueError(
            "The input feature matrix for neg_mean_abs_corr must not contain any "
            "missing values."
        )

    # y_test
    if not isinstance(y_test, pd.Series):
        raise TypeError("y_test must be a pandas Series.")
    if not isinstance(y_test.index, pd.MultiIndex):
        raise ValueError("y_test must be multi-indexed.")
    if not y_test.index.get_level_values(0).dtype == "object":
        raise TypeError("The outer index of y_test must be strings.")
    if not y_test.index.get_level_values(1).dtype == "datetime64[ns]":
        raise TypeError("The inner index of y_test must be datetime.date.")
    if not y_test.index.equals(X_test.index):
        raise ValueError("y_test and X_test must have the same index.")
    if not pd.api.types.is_numeric_dtype(y_test):
        raise ValueError(
            "The input target vector for neg_mean_abs_corr"
            " must be numeric."
        )
    if y_test.isnull().values.any():
        raise ValueError(
            "The input target vector for neg_mean_abs_corr must not contain any "
            "missing values."
        )

    # correlation_type
    # Each scipy function returns a result whose first element is the statistic.
    correlation_functions = {
        "pearson": stats.pearsonr,
        "spearman": stats.spearmanr,
        "kendall": stats.kendalltau,
    }
    if correlation_type not in ("pearson", "spearman", "kendall"):
        # Reject unknown values instead of silently falling back to Kendall's tau.
        raise ValueError(
            "correlation_type must be one of 'pearson', 'spearman' or 'kendall'."
        )
    correlation = correlation_functions[correlation_type]

    # Obtain key information
    market_returns = X_test.iloc[:, 0].copy()
    contract_returns = y_test.copy()
    unique_cross_sections = X_test.index.get_level_values(0).unique()

    # Resolve one coefficient per cross-section
    if isinstance(estimator, VotingRegressor):
        # Unweighted average of each cross-section's coefficient over the
        # sub-estimators that actually estimated a model for it.
        coef_totals = defaultdict(lambda: [0, 0])
        for est in estimator.estimators_:
            for key, value in est.coefs_.items():
                coef_totals[key][0] += value
                coef_totals[key][1] += 1
        estimated_coefs = {
            key: total / count for key, (total, count) in coef_totals.items()
        }
    else:
        estimated_coefs = estimator.coefs_

    abs_correlations = []
    for cross_section in unique_cross_sections:
        # Skip cross-sections for which no model was estimated
        if cross_section not in estimated_coefs:
            continue

        # Get cross-section returns and matched risk basket returns
        contract_returns_c = contract_returns.xs(cross_section)
        market_returns_c = market_returns.xs(cross_section)
        hedged_returns_c = (
            contract_returns_c - estimated_coefs[cross_section] * market_returns_c
        )

        # Absolute correlation of the hedged returns with the market
        abs_correlations.append(
            abs(correlation(hedged_returns_c, market_returns_c)[0])
        )

    if not abs_correlations:
        return np.nan

    return -sum(abs_correlations) / len(abs_correlations)
def multi_output_sortino(estimator, X_test, y_test):
    """
    Sortino ratio of a naive long-short directional strategy based on multi-output
    model predictions.

    Parameters
    ----------
    estimator : RegressorMixin or ClassifierMixin
        Model whose `predict` output is reshaped to the index and columns of
        `y_test`.
    X_test : pd.DataFrame
        Features passed to `estimator.predict`.
    y_test : pd.DataFrame
        Realised targets, one column per output. Must have as many rows as
        `X_test`.

    Returns
    -------
    float
        Mean of the per-row portfolio returns divided by the standard deviation of
        the negative per-row portfolio returns. A portfolio return is the sum over
        columns of `sign(prediction) * y_test`.

    Raises
    ------
    TypeError
        If `estimator` is not a scikit-learn regressor or classifier, or if
        `X_test` / `y_test` are not DataFrames.
    ValueError
        If `X_test` and `y_test` differ in row count.
    """
    is_sklearn_model = isinstance(estimator, (RegressorMixin, ClassifierMixin))
    if not is_sklearn_model:
        raise TypeError("estimator must be a scikit-learn regressor or classifier.")
    # TODO: add check that estimator is a multi-output model
    if not isinstance(X_test, pd.DataFrame):
        raise TypeError("X_test must be a pandas DataFrame.")
    if not isinstance(y_test, pd.DataFrame):
        raise TypeError("y_test must be a pandas DataFrame.")
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same number of rows.")

    # Label the raw predictions like the targets so they multiply element-wise.
    raw_predictions = estimator.predict(X_test)
    prediction_frame = pd.DataFrame(
        raw_predictions, index=y_test.index, columns=y_test.columns
    )

    # Position is the sign of each prediction; row sums give portfolio returns.
    positions = np.sign(prediction_frame)
    pnl = (positions * y_test).sum(axis=1)

    # Only losing periods enter the denominator.
    downside = pnl[pnl < 0]
    return pnl.mean() / downside.std()
def multi_output_sharpe(estimator, X_test, y_test):
    """
    Sharpe ratio of a naive long-short directional strategy based on multi-output
    model predictions.

    Parameters
    ----------
    estimator : RegressorMixin or ClassifierMixin
        Model whose `predict` output is reshaped to the index and columns of
        `y_test`.
    X_test : pd.DataFrame
        Features passed to `estimator.predict`.
    y_test : pd.DataFrame
        Realised targets, one column per output. Must have as many rows as
        `X_test`.

    Returns
    -------
    float
        Mean of the per-row portfolio returns divided by their standard deviation.
        A portfolio return is the sum over columns of `sign(prediction) * y_test`.

    Raises
    ------
    TypeError
        If `estimator` is not a scikit-learn regressor or classifier, or if
        `X_test` / `y_test` are not DataFrames.
    ValueError
        If `X_test` and `y_test` differ in row count.
    """
    if not isinstance(estimator, (RegressorMixin, ClassifierMixin)):
        raise TypeError("estimator must be a scikit-learn regressor or classifier.")
    # TODO: add check that estimator is a multi-output model
    if not isinstance(X_test, pd.DataFrame):
        raise TypeError("X_test must be a pandas DataFrame.")
    if not isinstance(y_test, pd.DataFrame):
        raise TypeError("y_test must be a pandas DataFrame.")

    n_feature_rows = X_test.shape[0]
    n_target_rows = y_test.shape[0]
    if n_feature_rows != n_target_rows:
        raise ValueError("X_test and y_test must have the same number of rows.")

    # Align predictions with the targets, then trade the sign of each prediction.
    forecast = pd.DataFrame(
        estimator.predict(X_test), index=y_test.index, columns=y_test.columns
    )
    strategy_returns = np.sign(forecast) * y_test

    # Aggregate across outputs to get one portfolio return per row.
    total_returns = strategy_returns.sum(axis=1)
    average_return = total_returns.mean()
    volatility = total_returns.std()
    return average_return / volatility
if __name__ == "__main__":
    import macrosynergy.management as msm
    from macrosynergy.management.simulate import make_qdf
    from macrosynergy.learning import RidgeRegressionSystem

    # Demo: build a simulated panel, fit a ridge regression system and score it.
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "BMXR"]
    first_date, last_date = "2012-01-01", "2020-12-31"

    # Every cross-section shares the same simulation settings.
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid in cids:
        df_cids.loc[cid] = [first_date, last_date, 0, 1]

    # Per-category simulation settings.
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = [first_date, last_date, 0.1, 1, 0, 0.3]
    df_xcats.loc["BMXR"] = [first_date, last_date, 1, 2, 0.95, 1]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    # Wide frame of both categories with rows containing missing values dropped.
    Xy = msm.categories_df(
        df=dfd, xcats=xcats, cids=cids, freq="M", lag=1, xcat_aggs=["last", "sum"]
    ).dropna()

    # All but the last column are features; the last column is the target.
    X, y = Xy.iloc[:, :-1], Xy.iloc[:, -1]

    ridge = RidgeRegressionSystem()
    ridge.fit(X, y)

    score = neg_mean_abs_corr(ridge, X, y, correlation_type="pearson")
    print(f"\nNegative mean absolute correlation: {score}")