"""
Module for analysing and visualizing the relationship between signal and return series.
"""
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics as skm
from scipy import stats
from typing import List, Union, Tuple, Dict, Any, Optional, Callable
import statsmodels.api as sm
from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import (
apply_slip as apply_slip_util,
reduce_df,
categories_df,
update_df,
)
from macrosynergy.management.types import QuantamentalDataFrame
import macrosynergy.visuals as msv
# Ensure warnings are printed
warnings.simplefilter("always")
[docs]class SignalReturnRelations:
"""
Class for analysing and visualizing signals and return series. The class is designed
to provide a comprehensive analysis of the relationship between signals and returns
across different frequencies and aggregation methods. The class can be used
to calculate and visualize the following metrics:
- Accuracy
- Balanced accuracy
- Positive signal ratio
- Positive return ratio
- Positive precision
- Negative precision
- Pearson correlation
- Pearson correlation p-value
- Kendall correlation
- Kendall correlation p-value
- AUC
- Macrosynergy Panel test
Parameters
----------
df : ~pandas.DataFrame
standardized DataFrame with the following necessary columns: 'cid', 'xcat',
'real_date' and 'value'.
rets : str, List[str]
one or several target return categories.
sigs : str, List[str]
list of signal categories to be considered for which detailed relational
statistics can be calculated.
sig_neg : bool, List[bool]
if set to True puts the signal in negative terms for all analysis. If more than
one signal is tested, `sig_neg` must be an ordered list of the same length as the
signals, containing a True for each signal that needs to be negative.
Default is False.
cosp : bool
If True the comparative statistics are calculated only for the "communal sample
periods", i.e. periods and cross-sections that have values for all compared
signals. Default is False.
start : str
earliest date in ISO format. Default is None in which case the earliest date
available will be used.
end : str
latest date in ISO format. Default is None in which case the latest date in the
dataframe will be used.
blacklist : dict
cross-sections with date ranges that should be excluded from the data frame. If
one cross-section has several blacklist periods append numbers to the cross-section
code.
freqs : str, List[str]
letters denoting all frequencies at which the series may be sampled. This must
be a selection of 'D', 'W', 'M', 'Q', 'A'. Default is only 'M'. The return series
will always be summed over the sample period. The signal series will be aggregated
according to the values of `agg_sigs`.
agg_sigs : str, List[str]
aggregation method applied to the signal values in down-sampling. The default is
"last". Alternatives are "mean", "median" and "sum". If a single aggregation type is
chosen for multiple signal categories it is applied to all of them.
fwin : int
forward window of return category in base periods. Default is 1. This
conceptually corresponds to the holding period of a position in accordance with the
signal.
slip : int
Default is 0, implied slippage of feature availability for relationship with the
target category. See :func:`macrosynergy.management.utils.apply_slip` for more
information.
ms_panel_test : bool
if True the Macrosynergy Panel test is calculated. Please note that this is a
very time-consuming operation and should be used only if you require the result.
additional_metrics : List[Callable]
list of additional metrics to be calculated and added to the output table.
"""
def __init__(
self,
df: pd.DataFrame,
rets: Union[str, List[str]] = None,
sigs: Union[str, List[str]] = None,
cids: Union[str, List[str]] = None,
sig_neg: Union[bool, List[bool]] = None,
cosp: bool = False,
start: str = None,
end: str = None,
blacklist: dict = None,
freqs: Union[str, List[str]] = "M",
agg_sigs: Union[str, List[str]] = "last",
fwin: int = 1,
slip: int = 0,
ms_panel_test: bool = False,
additional_metrics: List[Callable] = None,
):
if rets is None:
raise ValueError("Target return must be defined.")
if sigs is None:
raise ValueError("Signal must be defined.")
if not isinstance(df, pd.DataFrame):
raise TypeError(f"DataFrame expected and not {type(df)}.")
if not isinstance(cids, str) and cids is not None:
if not isinstance(cids, list):
raise TypeError(f"List or string expected and not {type(cids)}.")
else:
if not all(isinstance(cid, str) for cid in cids):
raise TypeError(f"List of strings expected for cids.")
required_columns = ["cid", "xcat", "real_date", "value"]
if not all(col in df.columns for col in required_columns):
raise ValueError(
"Dataframe columns must be of value: 'cid', 'xcat','real_date' and \
'value'"
)
df["real_date"] = pd.to_datetime(df["real_date"], format="%Y-%m-%d")
df = QuantamentalDataFrame(df)
self.dic_freq = {
"D": "daily",
"W": "weekly",
"M": "monthly",
"Q": "quarterly",
"A": "annual",
}
freq_error = f"Frequency parameter must be one of {list(self.dic_freq.keys())}."
if isinstance(freqs, list):
seen = set()
self.freqs = []
for f in freqs:
if not f in self.dic_freq.keys():
raise ValueError(freq_error)
else:
if f not in seen:
seen.add(f)
self.freqs.append(f)
else:
warnings.warn(
f"Frequency {f} is repeated, dropping repeated frequency."
)
else:
if not freqs in self.dic_freq.keys():
raise ValueError(freq_error)
else:
self.freqs = [freqs]
if not isinstance(ms_panel_test, bool):
raise TypeError(
f"<bool> object expected for ms_panel_test and not {type(ms_panel_test)}."
)
self.ms_panel_test = ms_panel_test
self.metrics = [
"accuracy",
"bal_accuracy",
"pos_sigr",
"pos_retr",
"pos_prec",
"neg_prec",
"pearson",
"pearson_pval",
"kendall",
"kendall_pval",
"auc",
]
if self.ms_panel_test:
self.metrics.append("map_pval")
if additional_metrics:
self.metrics.extend(
metric.__name__
for metric in additional_metrics
if hasattr(metric, "__name__")
)
else:
additional_metrics = []
self.additional_metrics = additional_metrics
if not isinstance(cosp, bool):
raise TypeError(f"<bool> object expected and not {type(cosp)}.")
if isinstance(cids, str):
self.cids = [cids]
else:
self.cids = cids
self.rets = rets
self.slip = slip
self.agg_sigs = agg_sigs
self.xcats = list(df["xcat"].unique())
self.df = df
self.cosp = cosp
self.start = start
self.end = end
self.blacklist = blacklist
self.fwin = fwin
if not self.is_list_of_strings(rets):
self.rets = [rets]
if not self.is_list_of_strings(sigs):
self.sigs = [sigs]
else:
self.sigs = sigs.copy()
if not self.is_list_of_strings(agg_sigs):
self.agg_sigs = [agg_sigs]
if not self.is_list_of_strings(freqs):
self.freqs = [freqs]
for sig in self.sigs:
assert (
sig in self.xcats
), "Primary signal must be available in the DataFrame."
for ret in self.rets:
assert (
ret in self.xcats
), "Target return must be available in the DataFrame."
if sig_neg is None:
self.signs = [False for _ in self.sigs]
else:
self.signs = sig_neg if isinstance(sig_neg, list) else [sig_neg]
for sign in self.signs:
if not sign in [False, True]:
raise TypeError("Sign must be either False or True.")
if len(self.signs) != len(self.sigs):
raise ValueError("Signs must have a length equal to signals")
self.xcats = self.rets + self.sigs
self.df = reduce_df(
df,
xcats=self.xcats,
cids=self.cids,
start=self.start,
end=self.end,
blacklist=self.blacklist,
)
new_sigs = []
for i, sig in enumerate(self.sigs):
if self.signs[i]:
neg_sig = f"{sig}_NEG"
neg_df = self.df[self.df["xcat"] == sig].copy()
neg_df["value"] *= -1
neg_df["xcat"] = neg_sig
# Append the negated version to the main df
self.df = update_df(self.df, neg_df)
new_sigs.append(neg_sig)
else:
new_sigs.append(sig)
self.sigs = new_sigs
self.original_df = self.df.copy()
def __rival_sigs__(self, ret, sigs=None):
"""
Helper function used to produce the panel-level table for the additional signals.
"""
if sigs is None:
sigs = self.sigs
df_out = pd.DataFrame(index=sigs, columns=self.metrics)
df = self.df
for s in sigs:
# Entire panel will be passed in.
df_out = self.__table_stats__(
df_segment=df, df_out=df_out, segment=s, signal=s, ret=ret
)
return df_out
@staticmethod
def __yaxis_lim__(accuracy_df: pd.DataFrame):
"""
Helper function to determine the range the y-axis is defined over.
Parameters
----------
accuracy_df : ~pandas.DataFrame
two dimensional DataFrame with accuracy & balanced accuracy columns.
.. note::
The returned range will always be below 0.5.
"""
y_axis = lambda min_correl: min_correl > 0.45
min_value = accuracy_df.min().min()
# Ensures any accuracy statistics greater than 0.5 are more pronounced given the
# adjusted scale.
y_input = 0.45 if y_axis(min_value) else min_value
return y_input
def accuracy_bars(
    self,
    ret: str = None,
    sigs: Union[str, List[str]] = None,
    freq: str = None,
    agg_sig: str = None,
    view: str = "cross_section",
    title: str = None,
    title_fontsize: int = 16,
    size: Tuple[float, float] = None,
    legend_pos: str = "best",
    x_labels: Dict = None,
    x_labels_rotate: int = 0,
    return_fig: bool = False,
    **kwargs,
):
    """
    Plot bar chart for the overall and balanced accuracy metrics, segmented by
    cross-section, year or signal.

    Parameters
    ----------
    ret : str, optional
        return category. Default is None, in which case the first return category
        will be used.
    sigs : str, or List[str], optional
        signal category. Default is None, in which case all signals will be used.
        The passed list is not modified. For the views "cross_section" and
        "years" only the first signal is used.
    freq : str, optional
        frequency to be used in analysis. Default is None, in which case the first
        frequency will be used.
    agg_sig : str, optional
        aggregation method to be used in analysis. Default is None, in which case
        the first aggregation method will be used.
    view : str, optional
        type of segment over which bars are drawn. Either "cross_section"
        (default), "years" or "signals".
    title : str, optional
        chart header - default will be applied if none is chosen.
    title_fontsize : int
        font size of chart header. Default is 16.
    size : Tuple[float], optional
        2-tuple of width and height of plot - default will be applied if none is
        chosen.
    legend_pos : str
        position of legend box. Default is 'best'. See the documentation of
        matplotlib.pyplot.legend.
    x_labels : Dict[str]
        dictionary of x-axis labels, keyed by signal (view "signals") or by
        cross-section (view "cross_section"). Default is None.
    x_labels_rotate : int
        rotation of x-axis labels. Default is 0.
    return_fig : bool
        if True the figure is returned instead of shown. Default is False.
    **kwargs
        only the deprecated keyword `type` is read; it overrides `view`.

    Returns
    -------
    matplotlib.figure.Figure or None
        the figure if `return_fig` is True, otherwise None.

    Raises
    ------
    ValueError
        if `view` is not recognised or a signal is not defined for this instance.
    """
    if "type" in kwargs:
        warnings.warn(
            "`type` parameter is deprecated; use `view` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        view = kwargs["type"]

    if view not in ["cross_section", "years", "signals"]:
        raise ValueError(
            "View parameter must be either 'cross_section', 'years' or 'signals'."
        )

    # Always work on a private list so that neither the caller's argument nor
    # `self.sigs` is mutated.
    if sigs is None:
        sigs = list(self.sigs)
    elif isinstance(sigs, str):
        sigs = [sigs]
    else:
        sigs = list(sigs)

    # Map a signal onto its negated counterpart when only that one is stored.
    resolved = []
    for sig in sigs:
        if sig not in self.sigs and sig + "_NEG" in self.sigs:
            sig = sig + "_NEG"
        resolved.append(sig)
    sigs = resolved

    for sig in sigs:
        if sig not in self.sigs:
            raise ValueError(
                f"Signal {sig} is not defined in Signal Return Relations."
            )

    if freq is None:
        freq = self.freqs[0]
    if agg_sig is None:
        agg_sig = self.agg_sigs[0]
    if ret is None:
        ret = self.rets[0]

    # `manipulate_df` restarts from a fresh copy of `self.original_df` itself.
    self.manipulate_df(xcats=sigs + [ret], freq=freq, agg_sig=agg_sig)

    if view == "cross_section":
        df_xs = self.__output_table__(cs_type="cids", ret=ret, sig=sigs[0])
    elif view == "years":
        df_xs = self.__output_table__(cs_type="years", ret=ret, sig=sigs[0])
    else:
        df_xs = self.__rival_sigs__(ret, sigs)

    dfx = df_xs[~df_xs.index.isin(["PosRatio"])]

    if title is None:
        refsig = "various signals" if view == "signals" else sigs[0]
        # Report the frequency that was actually used for the calculation.
        title = (
            f"Accuracy for sign prediction of {ret} based on {refsig} "
            f"at {self.dic_freq[freq]} frequency."
        )
    if size is None:
        size = (np.max([dfx.shape[0] / 2, 8]), 6)

    sns.set_style("darkgrid")
    fig, ax = plt.subplots(figsize=size)
    x_indexes = np.arange(dfx.shape[0])
    w = 0.4
    ax.bar(
        x_indexes - w / 2,
        dfx["accuracy"],
        label="Accuracy",
        width=w,
        color="lightblue",
    )
    ax.bar(
        x_indexes + w / 2,
        dfx["bal_accuracy"],
        label="Balanced Accuracy",
        width=w,
        color="steelblue",
    )

    if x_labels:
        # Only keys that refer to a known signal / cross-section are honoured.
        validated_labels = {}
        if view == "signals":
            for key, value in x_labels.items():
                if key in self.sigs:
                    validated_labels[key] = value
                elif key + "_NEG" in self.sigs:
                    validated_labels[key + "_NEG"] = value
        elif view == "cross_section":
            for key, value in x_labels.items():
                if key in self.cids:
                    validated_labels[key] = value
        labels = [validated_labels.get(xcat, xcat) for xcat in dfx.index]
    else:
        labels = dfx.index

    ax.set_xticks(x_indexes)
    ax.set_xticklabels(labels, rotation=x_labels_rotate)
    ax.axhline(y=0.5, color="black", linestyle="-", linewidth=0.5)

    y_input = self.__yaxis_lim__(
        accuracy_df=dfx.loc[:, ["accuracy", "bal_accuracy"]]
    )
    # Only the lower bound is set; the upper bound is left to matplotlib.
    ax.set_ylim(round(y_input, 2))
    ax.set_title(title, fontsize=title_fontsize)
    ax.legend(loc=legend_pos)

    if return_fig:
        return fig
    else:
        plt.show()
def correlation_bars(
    self,
    ret: str = None,
    sigs: Union[str, List[str]] = None,
    freq: str = None,
    type: str = "cross_section",
    title: str = None,
    title_fontsize: int = 16,
    size: Tuple[float, float] = None,
    legend_pos: str = "best",
    x_labels: Dict = None,
    x_labels_rotate: int = 0,
    return_fig: bool = False,
):
    """
    Plot the probability of a positive Pearson and Kendall correlation between
    signal and return, segmented by cross-section, year or signal.

    Parameters
    ----------
    ret : str, optional
        return category. Default is the first return category.
    sigs : str, List[str], optional
        signal category or categories. Default is all signals of the instance.
        The passed list is not modified. For the types "cross_section" and
        "years" only the first signal is used.
    freq : str, optional
        frequency to be used in analysis. Default is the first frequency.
    type : str, optional
        type of segment over which bars are drawn. Either "cross_section"
        (default), "years" or "signals".
    title : str, optional
        chart header. Default is None, in which case the default title will be
        applied.
    title_fontsize : int
        font size of chart header. Default is 16.
    size : Tuple[float, float], optional
        2-tuple of width and height of plot. If None, the default size will be
        applied.
    legend_pos : str
        position of legend box. Default is 'best'. See matplotlib.pyplot.legend.
    x_labels : Dict[str]
        dictionary of x-axis labels, keyed by signal (type "signals") or by
        cross-section (type "cross_section"). Default is None.
    x_labels_rotate : int
        rotation of x-axis labels. Default is 0.
    return_fig : bool
        if True the figure is returned instead of shown. Default is False.

    Returns
    -------
    matplotlib.figure.Figure or None
        the figure if `return_fig` is True, otherwise None.

    Raises
    ------
    AssertionError
        if `type` is not one of the supported segment types.
    ValueError
        if a signal is not defined for this instance.
    """
    assert type in ["cross_section", "years", "signals"]

    if freq is None:
        freq = self.freqs[0]
    if ret is None:
        ret = self.rets[0]

    # Always work on a private list so that neither the caller's argument nor
    # `self.sigs` is mutated.
    if sigs is None:
        sigs = list(self.sigs)
    elif isinstance(sigs, str):
        sigs = [sigs]
    else:
        sigs = list(sigs)

    # Map a signal onto its negated counterpart when only that one is stored.
    resolved = []
    for sig in sigs:
        if sig not in self.sigs and sig + "_NEG" in self.sigs:
            sig = sig + "_NEG"
        resolved.append(sig)
    sigs = resolved

    # Fail early and clearly, in line with `accuracy_bars`, instead of blindly
    # appending "_NEG" to an unknown name.
    for sig in sigs:
        if sig not in self.sigs:
            raise ValueError(
                f"Signal {sig} is not defined in Signal Return Relations."
            )

    # `manipulate_df` restarts from a fresh copy of `self.original_df` itself.
    self.manipulate_df(
        xcats=sigs + [ret],
        freq=freq,
        agg_sig=self.agg_sigs[0],
    )

    if type == "cross_section":
        df_xs = self.__output_table__(cs_type="cids", ret=ret, sig=sigs[0])
    elif type == "years":
        df_xs = self.__output_table__(cs_type="years", ret=ret, sig=sigs[0])
    else:
        df_xs = self.__rival_sigs__(ret, sigs)

    dfx = df_xs[~df_xs.index.isin(["PosRatio", "Mean"])]

    # (1 - p-value) for positive coefficients, 0 for negative ones.
    pprobs = np.array(
        [
            (1 - pv) * (np.sign(cc) + 1) / 2
            for pv, cc in zip(dfx["pearson_pval"], dfx["pearson"])
        ]
    )
    # Exact zeros are lifted to 0.01 so that the bar remains visible.
    pprobs[pprobs == 0] = 0.01
    kprobs = np.array(
        [
            (1 - pv) * (np.sign(cc) + 1) / 2
            for pv, cc in zip(dfx["kendall_pval"], dfx["kendall"])
        ]
    )
    kprobs[kprobs == 0] = 0.01

    if title is None:
        refsig = "various signals" if type == "signals" else sigs[0]
        title = (
            f"Positive correlation probability of {ret} "
            f"and lagged {refsig} at {self.dic_freq[freq]} frequency."
        )
    if size is None:
        size = (np.max([dfx.shape[0] / 2, 8]), 6)

    sns.set_style("darkgrid")
    fig, ax = plt.subplots(figsize=size)
    x_indexes = np.arange(len(dfx.index))
    w = 0.4
    ax.bar(x_indexes - w / 2, pprobs, label="Pearson", width=w, color="lightblue")
    ax.bar(x_indexes + w / 2, kprobs, label="Kendall", width=w, color="steelblue")

    if x_labels:
        # Only keys that refer to a known signal / cross-section are honoured.
        validated_labels = {}
        if type == "signals":
            for key, value in x_labels.items():
                if key in self.sigs:
                    validated_labels[key] = value
                elif key + "_NEG" in self.sigs:
                    validated_labels[key + "_NEG"] = value
        elif type == "cross_section":
            for key, value in x_labels.items():
                if key in self.cids:
                    validated_labels[key] = value
        labels = [validated_labels.get(xcat, xcat) for xcat in dfx.index]
    else:
        labels = dfx.index

    ax.set_xticks(x_indexes)
    ax.set_xticklabels(labels, rotation=x_labels_rotate)
    ax.axhline(
        y=0.95,
        color="orange",
        linestyle="--",
        linewidth=0.5,
        label="95% probability",
    )
    ax.axhline(
        y=0.99, color="red", linestyle="--", linewidth=0.5, label="99% probability"
    )
    ax.set_title(title, fontsize=title_fontsize)
    ax.legend(loc=legend_pos)

    if return_fig:
        return fig
    else:
        plt.show()
@staticmethod
def __slice_df__(df: pd.DataFrame, cs: str, cs_type: str):
"""
Slice DataFrame by year, cross-section, or use full panel.
Parameters
----------
df : ~pandas.DataFrame
standardised DataFrame.
cs : str
individual segment, cross-section or year.
cs_type : str
segmentation type.
"""
# Row names of cross-sections or years.
if cs != "Panel" and cs_type == "cids":
df_cs = df.loc[cs]
elif cs != "Panel":
df_cs = df[df["year"] == float(cs)]
else:
df_cs = df
return df_cs
@staticmethod
def apply_slip(
    df: pd.DataFrame,
    slip: int,
    cids: List[str],
    xcats: List[str],
    metrics: List[str],
) -> pd.DataFrame:
    """
    Thin wrapper around `apply_slip` from `macrosynergy.management.utils`.

    All arguments are forwarded unchanged and `raise_error=False` is always
    passed to the underlying function.

    Parameters
    ----------
    df : ~pandas.DataFrame
        standardised DataFrame.
    slip : int
        slip value to apply to df.
    cids : List[str]
        list of cids in df to apply slip.
    xcats : List[str]
        list of xcats in df to apply slip.
    metrics : List[str]
        list of metrics in df to apply slip.

    Returns
    -------
    ~pandas.DataFrame
        whatever the underlying `apply_slip` returns.
    """
    return apply_slip_util(
        df=df, slip=slip, cids=cids, xcats=xcats, metrics=metrics, raise_error=False
    )
[docs] @staticmethod
def is_list_of_strings(variable: Any) -> bool:
"""
Function used to test whether a variable is a list of strings, to avoid the
compiler saying a string is a list of characters.
Parameters
----------
variable : Any
variable to be tested.
Returns
-------
bool
True if variable is a list of strings, False otherwise.
"""
return isinstance(variable, list) and all(
isinstance(item, str) for item in variable
)
def manipulate_df(self, xcats: List[str], freq: str, agg_sig: str):
    """
    Rebuild `self.df` in the wide format used by the analysis methods.

    Starting from a copy of `self.original_df`, the data are reduced to the
    requested categories, the slip is applied to the first category, communal
    sampling is applied if requested, and the result is converted with
    `categories_df`. As side effects `self.dfd` holds the intermediate long
    DataFrame and `self.cids` is replaced by the sorted cross-sections found in
    the outer index level of the result.

    Parameters
    ----------
    xcats : List[str]
        categories to keep; the signal(s) first and the return category last.
    freq : str
        frequency to be used in analysis.
    agg_sig : str
        aggregation method to be used in analysis.
    """
    self.df = self.original_df.copy()
    selected_cids = self.cids

    reduced = reduce_df(
        self.df,
        xcats=xcats,
        cids=selected_cids,
        start=self.start,
        end=self.end,
        blacklist=self.blacklist,
    )

    # Every column that is not an identifier is treated as a metric column.
    identifier_cols = set(["real_date", "xcat", "cid", "ticker", "last_updated"])
    metric_cols: List[str] = list(set(reduced.columns.tolist()) - identifier_cols)

    # here, the slip is applied to the first xcat (explanatory variable)
    first_xcat = [xcats[0]]
    reduced: pd.DataFrame = self.apply_slip(
        df=reduced,
        slip=self.slip,
        cids=selected_cids,
        xcats=first_xcat,
        metrics=metric_cols,
    )

    if self.cosp and len(self.sigs) > 1:
        reduced = self.__communal_sample__(
            df=reduced, signal=xcats[:-1], ret=xcats[-1]
        )

    self.dfd = reduced

    wide = categories_df(
        reduced,
        xcats=xcats,
        cids=selected_cids,
        val="value",
        start=None,
        end=None,
        freq=freq,
        blacklist=None,
        lag=1,
        fwin=self.fwin,
        xcat_aggs=[agg_sig, "sum"],
    )
    self.df = wide

    outer_level = self.df.index.get_level_values(0).unique()
    self.cids = list(np.sort(outer_level))
def __communal_sample__(self, df: pd.DataFrame, signal: str, ret: str):
"""
On a multi-index DataFrame, where the outer index are the cross-sections and the
inner index are the timestamps, exclude any row where all signals do not have a
realised value.
Parameters
----------
df : ~pandas.DataFrame
standardized DataFrame with the following necessary columns: 'cid', 'xcat',
'real_date' and 'value'.
signal : str
signal category.
ret : str
return category.
.. note::
Remove the return category from establishing the intersection to preserve the
maximum amount of signal data available (required because of the applied lag).
"""
df_w = df.pivot(index=("cid", "real_date"), columns="xcat", values="value")
storage = []
cid_name: str
cid_df: pd.DataFrame
for cid_name, cid_df in df_w.groupby(level=0, observed=True):
cid_df = cid_df[signal + [ret]]
final_df = pd.DataFrame(
data=np.empty(shape=cid_df.shape),
columns=cid_df.columns,
index=cid_df.index,
)
final_df.loc[:, :] = np.nan
# Return category is preserved.
final_df.loc[:, ret] = cid_df[ret]
intersection_df = cid_df.loc[:, signal].droplevel(level=0)
# Intersection exclusively across the signals.
intersection_df = intersection_df.dropna(how="any")
if not intersection_df.empty:
s_date = intersection_df.index[0]
e_date = intersection_df.index[-1]
final_df.loc[(cid_name, s_date):(cid_name, e_date), signal] = (
intersection_df.to_numpy()
)
storage.append(final_df)
else:
warnings.warn(
f"Cross-section {cid_name} has no common sample periods for the signals \
{signal} and return {ret}."
)
df = pd.concat(storage)
df = df.stack().reset_index().sort_values(["cid", "xcat", "real_date"])
df.columns = ["cid", "real_date", "xcat", "value"]
return df[["cid", "xcat", "real_date", "value"]]
def __table_stats__(
    self,
    df_segment: pd.DataFrame,
    df_out: pd.DataFrame,
    segment: str,
    signal: str,
    ret: str,
):
    """
    Compute all metrics for one segment and write them into one row of `df_out`.

    Sign-based metrics (accuracy, balanced accuracy, positive ratios, precisions
    and AUC) use only rows where neither the return nor the signal is exactly
    zero. Correlation metrics, the panel test and the additional metrics use all
    rows where both values are available.

    Parameters
    ----------
    df_segment : ~pandas.DataFrame
        segmented DataFrame containing at least the columns `ret` and `signal`.
    df_out : ~pandas.DataFrame
        metric DataFrame where the index will be all segments for the respective
        segmentation type. It is modified in place and returned.
    segment : str
        segment which could either be an individual cross-section, year or category.
        Will form the index of the returned DataFrame.
    signal : str
        signal category.
    ret : str
        return category.

    Returns
    -------
    ~pandas.DataFrame
        `df_out` with the row `segment` filled.
    """
    # Account for NaN values between the single respective signal and return. Only
    # applicable for rival signals panel level calculations.
    df_segment = df_segment.loc[:, [ret, signal]].dropna(axis=0, how="any")
    df_sgs = np.sign(df_segment.loc[:, [ret, signal]])
    # Exact zeroes are disqualified for sign analysis only.
    df_sgs = df_sgs[~((df_sgs.iloc[:, 0] == 0) | (df_sgs.iloc[:, 1] == 0))]
    sig_sign = df_sgs[signal]
    ret_sign = df_sgs[ret]
    df_out.loc[segment, "accuracy"] = skm.accuracy_score(sig_sign, ret_sign)
    # NOTE(review): the signal sign is passed in the first (y_true) position here,
    # whereas the precision and AUC calls below pass the return sign first.
    # Balanced accuracy is not symmetric in its arguments - confirm this order is
    # intended.
    df_out.loc[segment, "bal_accuracy"] = skm.balanced_accuracy_score(
        sig_sign, ret_sign
    )
    # Share of positive signals / positive returns among the non-zero pairs.
    df_out.loc[segment, "pos_sigr"] = np.mean(sig_sign == 1)
    df_out.loc[segment, "pos_retr"] = np.mean(ret_sign == 1)
    df_out.loc[segment, "pos_prec"] = skm.precision_score(
        ret_sign, sig_sign, pos_label=1
    )
    df_out.loc[segment, "neg_prec"] = skm.precision_score(
        ret_sign, sig_sign, pos_label=-1
    )
    # Correlations are computed on the values, including exact zeroes.
    ret_vals, sig_vals = df_segment[ret], df_segment[signal]
    df_out.loc[segment, ["kendall", "kendall_pval"]] = stats.kendalltau(
        ret_vals, sig_vals
    )
    # The Pearson correlation is skipped when at most one non-zero pair exists.
    if len(ret_sign) <= 1:
        corr, corr_pval = np.nan, np.nan
    else:
        corr, corr_pval = stats.pearsonr(ret_vals, sig_vals)
    df_out.loc[segment, ["pearson", "pearson_pval"]] = np.array([corr, corr_pval])
    # The AUC is only computed when both return signs are present.
    if (ret_sign == -1.0).all() or (ret_sign == 1.0).all():
        df_out.loc[segment, "auc"] = np.nan
        warnings.warn(
            "AUC could not be calculated, since the return category has a lack of "
            "class diversity."
        )
    else:
        df_out.loc[segment, "auc"] = skm.roc_auc_score(ret_sign, sig_sign)
    if self.ms_panel_test:
        df_out.loc[segment, "map_pval"] = self.map_pval(ret_vals, sig_vals)
    # User-supplied metrics are called as metric(returns, signals).
    for metric in self.additional_metrics:
        df_out.loc[segment, metric.__name__] = metric(ret_vals, sig_vals)
    return df_out
[docs] def map_pval(self, ret_vals, sig_vals) -> float:
"""
Calculates the p-value using statsmodels MixedLM.
Parameters
----------
ret_vals : ~pandas.Series
return values.
sig_vals : ~pandas.Series
signal values.
Returns
-------
float
p-value of the MixedLM model.
"""
if (
not "cid" in ret_vals.index.names
or ret_vals.index.get_level_values("cid").nunique() <= 1
):
warnings.warn(
"P-value could not be calculated, since there wasn't enough datapoints."
)
return np.nan
X = sm.add_constant(ret_vals)
y = sig_vals.copy()
groups = ret_vals.index.get_level_values("real_date")
mlm = sm.MixedLM(y, X, groups=groups)
try:
re = mlm.fit(reml=False)
except np.linalg.LinAlgError:
warnings.warn(
"Singular matrix encountered, so p-value could not be calculated."
)
return np.nan
if re.summary().tables[1].iloc[1, 3] == "":
warnings.warn(
"P-value could not be calculated, since there wasn't enough datapoints."
)
return np.nan
pval_string = re.summary().tables[1].iloc[1, 3]
return float(pval_string)
def __output_table__(
    self,
    cs_type: str = "cids",
    ret: str = None,
    sig: str = None,
    srt: bool = False,
):
    """
    Build the metric table for one signal-return pair across cross-sections or
    years, together with the panel row.

    Parameters
    ----------
    cs_type : str
        the segmentation type. "cids" segments by cross-section, any other value
        by calendar year.
    ret : str
        return category. Default is the first return category.
    sig : str
        signal category. Default is the first signal category.
    srt : bool
        if True, only the "Panel" row is computed. If False (default), the table
        also holds the rows "Mean" and "PosRatio" and one row per segment.

    Returns
    -------
    ~pandas.DataFrame
        float table indexed by segment with one column per metric.
    """
    if ret is None:
        ret = self.rets[0] if isinstance(self.rets, list) else self.rets
    if sig is None:
        sig = self.sigs[0] if isinstance(self.sigs, list) else self.sigs

    # Only the chosen pair is analysed; rows where either value is missing are
    # removed (the communal sample alignment does not cover the return).
    pair_df = self.df[[ret, sig]].dropna(how="any")

    if cs_type == "cids":
        wanted = set(self.cids)
        present = set(pair_df.index.get_level_values(0).unique())
        if not wanted.issubset(present):
            warnings.warn(
                f"Cross-sections {wanted - present} have no corresponding xcats "
                "in the dataframe."
            )
            wanted = wanted.intersection(present)
        segments = sorted(list(wanted))
    else:
        pair_df["year"] = np.array(
            pair_df.reset_index(level=1)["real_date"].dt.year
        )
        segments = sorted([str(y) for y in list(set(pair_df["year"]))])

    metric_names = self.metrics

    if srt:
        segments = []
        row_labels = ["Panel"]
    else:
        row_labels = ["Panel", "Mean", "PosRatio"] + segments

    table = pd.DataFrame(index=row_labels, columns=metric_names)

    for label in segments + ["Panel"]:
        segment_df = self.__slice_df__(df=pair_df, cs=label, cs_type=cs_type)
        table = self.__table_stats__(
            df_segment=segment_df, df_out=table, segment=label, signal=sig, ret=ret
        )

    if srt:
        return table.astype("float")

    table.loc["Mean", :] = table.loc[segments, :].mean()

    # Share of segments with a ratio-type metric above 50%.
    ratio_cols = metric_names[0:6] + [metric_names[metric_names.index("auc")]]
    table.loc["PosRatio", ratio_cols] = (table.loc[segments, ratio_cols] > 0.5).mean()

    # Share of segments with a positive correlation coefficient.
    coef_cols = metric_names[6:9:2]
    positive_coefs = table.loc[segments, coef_cols] > 0
    table.loc["PosRatio", coef_cols] = positive_coefs.mean()

    # Positive correlation with error prob < 50%.
    pval_cols = metric_names[7:10:2]
    low_pvals = table.loc[segments, pval_cols] < 0.5
    joint = np.array(low_pvals) * np.array(positive_coefs)
    table.loc["PosRatio", pval_cols] = np.mean(joint, axis=0)

    if self.ms_panel_test:
        # The product with NaN makes this entry NaN for every input.
        low_map_pvals = table.loc[segments, "map_pval"] < 0.5
        table.loc["PosRatio", "map_pval"] = np.mean(
            np.array(low_map_pvals) * np.nan
        )

    return table.astype("float")
[docs] def calculate_single_stat(
self, stat: str, ret: str = None, sig: str = None, type: str = None
) -> float:
"""
Calculates a single statistic for a given signal-return relation.
Parameters
----------
stat : str
statistic to be calculated.
ret : str
return category. Default is the first return category.
sig : str
signal category. Default is the first signal category.
type : str
type of segment over which bars are drawn. Either "panel" (default), "years"
or "signals".
Returns
-------
float
statistic value.
"""
r = [ret]
r.append(sig)
df = self.df[r]
df = df.dropna(how="any")
if type == "panel":
css = ["Panel"]
cs_type = "cids"
elif type == "mean_cids" or type == "pr_cids":
css = set(self.cids)
unique_cids_df = set(df.index.get_level_values(0).unique())
if not css.issubset(unique_cids_df):
warnings.warn(
f"Cross-sections {css - unique_cids_df} have no corresponding xcats \
in the dataframe."
)
css = css.intersection(unique_cids_df)
css = sorted(list(css))
cs_type = "cids"
elif type == "mean_years" or type == "pr_years":
df["year"] = np.array(df.reset_index(level=1)["real_date"].dt.year)
css = [str(y) for y in list(set(df["year"]))]
css = sorted(css)
cs_type = "years"
else:
raise ValueError("Invalid segmentation type.")
list_of_results = []
for cs in css:
df_segment = self.__slice_df__(df=df, cs=cs, cs_type=cs_type)
df_segment = df_segment.loc[:, [ret, sig]].dropna(axis=0, how="any")
df_sgs = np.sign(df_segment.loc[:, [ret, sig]])
# Exact zeroes are disqualified for sign analysis only.
df_sgs = df_sgs[~((df_sgs.iloc[:, 0] == 0) | (df_sgs.iloc[:, 1] == 0))]
sig_sign = df_sgs[sig]
ret_sign = df_sgs[ret]
ret_vals, sig_vals = df_segment[ret], df_segment[sig]
if stat == "accuracy":
list_of_results.append(skm.accuracy_score(sig_sign, ret_sign))
elif stat == "bal_accuracy":
list_of_results.append(skm.balanced_accuracy_score(sig_sign, ret_sign))
elif stat == "pos_sigr":
list_of_results.append(np.mean(sig_sign == 1))
elif stat == "pos_retr":
list_of_results.append(np.mean(ret_sign == 1))
elif stat == "pos_prec":
list_of_results.append(
skm.precision_score(ret_sign, sig_sign, pos_label=1)
)
elif stat == "neg_prec":
list_of_results.append(
skm.precision_score(ret_sign, sig_sign, pos_label=-1)
)
elif stat == "kendall":
list_of_results.append(stats.kendalltau(ret_vals, sig_vals)[0])
elif stat == "kendall_pval":
list_of_results.append(stats.kendalltau(ret_vals, sig_vals)[1])
elif stat == "pearson":
list_of_results.append(stats.pearsonr(ret_vals, sig_vals)[0])
elif stat == "pearson_pval":
list_of_results.append(stats.pearsonr(ret_vals, sig_vals)[1])
elif stat == "auc":
if (ret_sign == -1.0).all() or (ret_sign == 1.0).all():
list_of_results.append(np.nan)
warnings.warn(
"AUC could not be calculated, since the return category has a "
"lack of class diversity."
)
else:
list_of_results.append(skm.roc_auc_score(ret_sign, sig_sign))
elif stat == "map_pval" and self.ms_panel_test:
list_of_results.append(self.map_pval(ret_vals, sig_vals))
elif True in [
stat == metric.__name__ for metric in self.additional_metrics
]:
idx = [
stat == metric.__name__ for metric in self.additional_metrics
].index(True)
list_of_results.append(self.additional_metrics[idx](ret_vals, sig_vals))
else:
raise ValueError("Invalid statistic.")
if type == "panel":
return list_of_results[0]
elif type == "mean_years" or type == "mean_cids":
return np.mean(np.array(list_of_results))
elif type == "pr_years" or type == "pr_cids":
if stat in self.metrics[0:6] + ["auc"]:
return np.mean(np.array(list_of_results) > 0.5)
elif stat in self.metrics[6:9:2]:
return np.mean(np.array(list_of_results) > 0)
elif stat in self.metrics[7:10:2]:
return np.mean(np.array(list_of_results) < 0.5)
[docs] def summary_table(self, cross_section: bool = False, years: bool = False):
"""
Generates a summary table for the signal-return relations.
Parameters
----------
cross_section : bool
if True, the summary table will be generated for cross-sections.
years : bool
if True, the summary table will be generated for years. Must be False if
cross_section is True.
Returns
-------
~pandas.DataFrame
summary table.
"""
warnings.warn(
"summary_table() has been deprecated will be removed in a subsequent "
"version, please now use single_relation_table(table_type='summary').",
FutureWarning,
)
if cross_section and years:
raise ValueError("Both cross_section and years cannot be True")
if not (cross_section and years):
return self.single_relation_table(table_type="summary")
else:
return self.single_relation_table(
table_type="years" if years else "cross_section"
)
def signals_table(self, sigs: List[str] = None):
    """
    Deprecated method for the signals table. Use `multiple_relations_table`
    instead.

    Emits a FutureWarning and forwards to `multiple_relations_table` with the
    first return category, the first frequency and the first signal aggregation
    method of the instance.

    Parameters
    ----------
    sigs : List[str]
        signal categories passed on as `xcats`. Default is None, in which case
        all signals of the instance are used.

    Returns
    -------
    result of `multiple_relations_table`.
    """
    warnings.warn(
        "signals_table() has been deprecated will be removed in a subsequent "
        "version, please now use multiple_relations_table()",
        FutureWarning,
    )
    if sigs is None:
        sigs = self.sigs
    return self.multiple_relations_table(
        rets=self.rets[0],
        xcats=sigs,
        freqs=self.freqs[0],
        agg_sigs=self.agg_sigs[0],
    )
[docs] def cross_section_table(self):
"""
Deprecated method for cross-section table. Use `single_relation_table` instead.
Shows a table of category values across cross-sections for a given date.
"""
warnings.warn(
"cross_section_table() has been deprecated will be removed in a subsequent "
"version, please now use "
" single_relation_table(table_type='cross_section_table')",
FutureWarning,
)
return self.single_relation_table(table_type="cross_section")
def yearly_table(self):
    """
    Deprecated method for yearly table. Use `single_relation_table` instead.

    Emits a FutureWarning and returns `single_relation_table(table_type="years")`.

    Returns
    -------
    ~pandas.DataFrame
        table of the signal-return statistics across years.
    """
    warnings.warn(
        "yearly_table() has been deprecated will be removed in a subsequent "
        "version, please now use single_relation_table(table_type='years')",
        FutureWarning,
    )
    return self.single_relation_table(table_type="years")
def single_relation_table(
    self,
    ret: str = None,
    xcat: str = None,
    freq: str = None,
    agg_sigs: str = None,
    table_type: str = None,
) -> pd.DataFrame:
    """
    Computes all the statistics for one specific signal-return relation.

    Parameters
    ----------
    ret : str
        single target return category. Default is first in target return list of the
        class.
    xcat : str
        single signal category to be considered. Default is first in feature
        category list of the class. If only the negated version of the signal is
        stored, that version ('<xcat>_NEG') is used.
    freq : str
        letter denoting single frequency at which the series will be sampled. This
        must be one of the frequencies selected for the class. If not specified uses the
        first frequency stored in the class.
    agg_sigs : str
        aggregation method applied to the signal values in down-sampling. If not
        specified uses the first aggregation method stored in the class.
    table_type : str
        type of table to be returned. Either "summary", "years", "cross_section".
        Default is None, in which case only the panel row is returned.

    Returns
    -------
    ~pandas.DataFrame
        table with the statistics for the single signal-return relation, rounded
        to 5 decimals. The panel row is labelled
        "<freq>: <signal>/<agg_sigs> => <ret>".

    Raises
    ------
    TypeError
        if `xcat`, `ret`, `freq` or `agg_sigs` is not a string.
    ValueError
        if `table_type` is not None and not one of the supported types.
    """
    self.df = self.original_df
    # Fall back to the first stored return, frequency and aggregation method.
    if ret is None:
        ret = self.rets if not isinstance(self.rets, list) else self.rets[0]
    if freq is None:
        freq = self.freqs if not isinstance(self.freqs, list) else self.freqs[0]
    if agg_sigs is None:
        agg_sigs = (
            self.agg_sigs
            if not isinstance(self.agg_sigs, list)
            else self.agg_sigs[0]
        )
    # From here on `xcat` is the list [signal, return] handed to `manipulate_df`.
    if xcat is None:
        sig = self.sigs if not isinstance(self.sigs, list) else self.sigs[0]
        xcat = [sig, ret]
    elif not isinstance(xcat, str):
        raise TypeError("xcat must be a string")
    else:  # If xcat is a string
        if xcat not in self.sigs and xcat + "_NEG" in self.sigs:
            xcat = xcat + "_NEG"
        sig = xcat
        xcat = [sig, ret]
    if not isinstance(ret, str):
        raise TypeError("ret must be a string")
    if not isinstance(freq, str):
        raise TypeError("freq must be a string")
    if not isinstance(agg_sigs, str):
        raise TypeError("agg_sigs must be a string")
    # Rebuilds `self.df` in wide format for the chosen pair.
    self.manipulate_df(xcats=xcat, freq=freq, agg_sig=agg_sigs)
    if not sig in self.sigs:
        sig = sig + "_NEG"
    # NOTE(review): the table type is validated only after the data have been
    # transformed above.
    if table_type is not None:
        if not table_type in ["summary", "years", "cross_section"]:
            raise ValueError("Invalid table type")
    if table_type == "years":
        cs_type = "years"
    else:
        cs_type = "cids"
    if table_type == "summary":
        # Panel, mean and positive ratio across years, followed by mean and
        # positive ratio across cross-sections.
        df_result = pd.concat(
            [
                self.__output_table__(
                    cs_type="years", ret=ret, sig=sig, srt=False
                ).iloc[:3],
                self.__output_table__(
                    cs_type="cids", ret=ret, sig=sig, srt=False
                ).iloc[1:3],
            ],
            axis=0,
        )
        df_result.index = [
            df_result.index[0],
            "Mean years",
            "Positive ratio",
            "Mean cids",
            "Positive ratio",
        ]
    else:
        # Without a table type only the panel row is computed (srt=True).
        df_result = self.__output_table__(
            cs_type=cs_type, ret=ret, sig=sig, srt=table_type is None
        )
    # Point `self.df` back at the long-format data (no copy is made here).
    self.df = self.original_df
    index = f"{freq}: {sig}/{agg_sigs} => {ret}"
    df_result.rename(index={"Panel": index}, inplace=True)
    return df_result.round(5)
def reindex_multindex_df(
    self, df: pd.DataFrame, desired_order: List[str], var_type: str
):
    """
    Return the rows of `df` re-ordered by the labels of one index level.

    Parameters
    ----------
    df : ~pandas.DataFrame
        table whose index contains a level named `var_type`. It is not modified.
    desired_order : List[str]
        labels of that level in the order in which they should appear. Any
        iterable of unique labels (for instance a ``dict.values()`` view) is
        accepted.
    var_type : str
        name of the index level to order by.

    Returns
    -------
    ~pandas.DataFrame
        new frame with the same columns. Rows whose label is not listed in
        `desired_order` are placed last; rows sharing a label keep their
        original relative order.
    """
    ranking = pd.Categorical(
        df.index.get_level_values(var_type),
        categories=list(desired_order),
        ordered=True,
    )
    codes = np.asarray(ranking.codes)
    # Labels absent from desired_order get code -1; send them to the end.
    sort_keys = np.where(codes < 0, len(ranking.categories), codes)
    # Positional selection avoids writing a helper column into the caller's
    # frame, and a stable sort keeps ties in their incoming order.
    return df.iloc[np.argsort(sort_keys, kind="stable")]
def multiple_relations_table(
    self,
    rets: Union[str, List[str]] = None,
    xcats: Union[str, List[str]] = None,
    freqs: Union[str, List[str]] = None,
    agg_sigs: Union[str, List[str]] = None,
    signal_name_dict: Optional[Dict[str, str]] = None,
    return_name_dict: Optional[Dict[str, str]] = None,
):
    """
    Calculates all the statistics for each return and signal category specified
    with each frequency and aggregation method. If none are defined it does this
    for all categories, frequencies and aggregation methods that were stored in
    the class.

    Parameters
    ----------
    rets : str, List[str]
        target return categories. Default is None, in which case the stored
        return categories are used.
    xcats : str, List[str]
        signal categories to be considered. Default is None, in which case all
        categories of the DataFrame that are stored signals are used. A name that
        is not a stored signal but whose ``_NEG`` variant is stored is resolved to
        that variant. A list passed by the caller is not modified.
    freqs : str, List[str]
        letters denoting the frequencies at which the series are to be sampled.
        Each must be one of the frequencies stored in the class. Default is None,
        in which case the stored frequencies are used.
    agg_sigs : str, List[str]
        aggregation methods applied to the signal values in down-sampling. Each
        must be one of the methods stored in the class. Default is None, in which
        case the stored methods are used.
    signal_name_dict : dict, optional
        mapping from signal names to display names. When given, index labels are
        renamed and the rows are re-ordered to follow the order of the dict
        values.
    return_name_dict : dict, optional
        mapping from return names to display names, applied like
        `signal_name_dict` on the "Return" level.

    Returns
    -------
    ~pandas.DataFrame
        one row per (return, signal, frequency, aggregation) combination, indexed
        by a MultiIndex with levels "Return", "Signal", "Frequency" and
        "Aggregation".

    Raises
    ------
    ValueError
        if a requested return, signal, frequency or aggregation method is not
        available.
    """
    self.df = self.original_df
    self.xcats = list(self.df["xcat"].unique())

    def _as_list(value):
        return value if isinstance(value, list) else [value]

    def _resolve_neg(name):
        # Map a signal onto its stored "_NEG" variant when only that one exists.
        if name not in self.sigs and name + "_NEG" in self.sigs:
            return name + "_NEG"
        return name

    rets = _as_list(self.rets if rets is None else rets)
    freqs = _as_list(self.freqs if freqs is None else freqs)
    agg_sigs = _as_list(self.agg_sigs if agg_sigs is None else agg_sigs)

    if xcats is None:
        xcats = self.xcats
    elif isinstance(xcats, str):
        xcats = _resolve_neg(xcats)
    elif isinstance(xcats, list):
        # Build a new list so the caller's argument is never rewritten in place.
        xcats = [_resolve_neg(name) for name in xcats]
    xcats = _as_list(xcats)

    for rets_elem in rets:
        if rets_elem not in self.xcats:
            raise ValueError(f"{rets_elem} is not a valid return category")
    for xcats_elem in xcats:
        if xcats_elem not in self.xcats:
            raise ValueError(f"{xcats_elem} is not a valid signal category")
    for freqs_elem in freqs:
        if freqs_elem not in self.freqs:
            raise ValueError(f"{freqs_elem} is not a valid frequency")
    for agg_sigs_elem in agg_sigs:
        if agg_sigs_elem not in self.agg_sigs:
            raise ValueError(f"{agg_sigs_elem} is not a valid aggregation method")

    # Only categories registered as signals are tabulated.
    xcats = [x for x in xcats if x in self.sigs]

    # Built in the same nesting order as the computation loop below so that the
    # labels line up with the concatenated rows.
    multiindex = pd.MultiIndex.from_tuples(
        [
            (ret, xcat, freq, agg_sig)
            for freq in freqs
            for agg_sig in agg_sigs
            for ret in rets
            for xcat in xcats
        ],
        names=["Return", "Signal", "Frequency", "Aggregation"],
    )

    df_rows = []
    try:
        for freq in freqs:
            for agg_sig in agg_sigs:
                for ret in rets:
                    self.manipulate_df(
                        xcats=xcats + [ret], freq=freq, agg_sig=agg_sig
                    )
                    for xcat in xcats:
                        df_rows.append(
                            self.__output_table__(
                                cs_type="cids", ret=ret, sig=xcat, srt=True
                            )
                        )
    finally:
        # Always hand back the untouched panel, even if a helper raised.
        self.df = self.original_df

    df_result = pd.concat(df_rows, axis=0)
    df_result.index = multiindex

    if signal_name_dict is not None:
        df_result.rename(index=signal_name_dict, inplace=True)
        df_result = self.reindex_multindex_df(
            df_result, signal_name_dict.values(), "Signal"
        )
    if return_name_dict is not None:
        df_result.rename(index=return_name_dict, inplace=True)
        df_result = self.reindex_multindex_df(
            df_result, return_name_dict.values(), "Return"
        )

    return df_result
def single_statistic_table(
    self,
    stat: str,
    type: str = "panel",
    rows: List[str] = ["xcat", "agg_sigs"],
    columns: List[str] = ["ret", "freq"],
    show_heatmap: bool = False,
    title: Optional[str] = None,
    title_fontsize: int = 16,
    row_names: Optional[List[str]] = None,
    column_names: Optional[List[str]] = None,
    signal_name_dict: Optional[Dict[str, str]] = None,
    return_name_dict: Optional[Dict[str, str]] = None,
    xcat_labels: Optional[Dict[str, str]] = None,
    freq_labels: Optional[Dict[str, str]] = None,
    agg_sigs_labels: Optional[Dict[str, str]] = None,
    min_color: Optional[float] = None,
    max_color: Optional[float] = None,
    figsize: Tuple[float, float] = (14, 8),
    annotate: bool = True,
    round: int = 3,
    pval_stat: Optional[str] = None,
    round_pval: int = 3,
    significance_threshold: Optional[float] = 0.9,
    xlabel: Optional[str] = None,
    ylabel: Optional[str] = None,
    collapse_constant_levels: bool = False,
    axis_label_levels: Optional[List[str]] = None,
    footnote: Optional[str] = None,
    footnote_fontsize: int = 10,
):
    """
    Creates a table which shows the specified statistic for every combination of
    the stored returns, signals, frequencies and aggregation methods, arranged by
    the requested row and column keys, and optionally renders it as a heatmap.

    Parameters
    ----------
    stat : str
        type of statistic to be displayed. Must be one of ``self.metrics`` (the
        column names of the summary table).
    type : str
        type of the statistic displayed. This can be based on the overall panel
        ("panel", default), an average of annual panels ("mean_years"), an average
        of cross-sectional relations ("mean_cids"), the positive ratio across
        years ("pr_years") or the positive ratio across sections ("pr_cids").
    rows : List[str]
        row indices, which can be return categories, feature categories,
        frequencies and/or aggregations. The choice is made through a list of one
        or more of "xcat", "ret", "freq" and "agg_sigs". Default is
        ["xcat", "agg_sigs"].
    columns : List[str]
        column indices, chosen from the same four keys. Default is
        ["ret", "freq"].
    show_heatmap : bool
        if True, the table is visualized as a heatmap. Default is False.
    title : str, optional
        plot title. Default is None, in which case `stat` is used as title.
    title_fontsize : int
        font size of title. Default is 16.
    row_names : List[str]
        specifies the labels of rows in the heatmap. Default is None, in which
        case the indices of the generated DataFrame are used.
    column_names : List[str]
        specifies the labels of columns in the heatmap. Default is None, in which
        case the columns of the generated DataFrame are used.
    signal_name_dict : dict, optional
        dictionary mapping the signal names to the desired names in the table.
        The signals are re-ordered to follow the order of the dict values.
        Renamed values flow through to the auto axis label produced by the
        constant-level collapse. Default is None.
    return_name_dict : dict, optional
        dictionary mapping the return names to the desired names in the table,
        applied like `signal_name_dict`. Default is None.
    xcat_labels : dict, optional
        unified rename dictionary covering both signal and return categories.
        Internally split by membership in ``self.sigs`` / ``self.rets`` and routed
        through `signal_name_dict` / `return_name_dict`; categories not listed in
        the dict are kept verbatim. Mutually exclusive with the two legacy
        arguments. Default is None (no rename).
    freq_labels : dict, optional
        mapping from frequency code ("M", "Q", …) to the display label used in
        the table and in the auto axis label produced by the constant-level
        collapse. Frequencies not listed are kept verbatim. Default is None.
    agg_sigs_labels : dict, optional
        mapping from aggregation code ("last", "mean", …) to the display label,
        applied like `freq_labels`. Default is None.
    min_color : float, optional
        minimum value of the color scale. Default is None, in which case the
        smallest non-missing value of the table is used.
    max_color : float, optional
        maximum value of the color scale. Default is None, in which case the
        largest non-missing value of the table is used.
    figsize : Tuple[float, float]
        Tuple (w, h) of width and height of graph. Default is (14, 8).
    annotate : bool
        Default is True, where the values shown in the heatmap are annotated.
    round : int
        number of decimals to round the primary statistic to in the heatmap
        annotations. Default is 3.
    pval_stat : str, optional
        name of a p-value statistic, typically "kendall_pval", "pearson_pval" or
        "map_pval" (the Macrosynergy Panel test). When set, each heatmap cell
        shows the probability of significance, ``1 - pval_stat``, in brackets
        beneath the primary statistic. "map_pval" requires the class to have been
        constructed with ``ms_panel_test=True``. Default is None.
    round_pval : int
        number of decimals to round the bracketed probability of significance to
        in the heatmap annotations. Default is 3.
    significance_threshold : float, optional
        probability-of-significance cutoff. Cells whose bracketed value
        (``1 - pval_stat``) exceeds it are passed to the heatmap as the highlight
        mask, so 0.9 highlights raw p-values below 0.1. Only takes effect when
        `pval_stat` is set. Pass None to disable. Default is 0.9.
    xlabel : str, optional
        label drawn beneath the heatmap columns. Default is None. When
        ``collapse_constant_levels=True`` and this is None, column-index levels
        whose values are constant are collapsed into this label (joined by
        " · ").
    ylabel : str, optional
        label drawn beside the heatmap rows. Default is None. When
        ``collapse_constant_levels=True`` and this is None, row-index levels whose
        values are constant are collapsed into this label (joined by " · ").
    collapse_constant_levels : bool, optional
        when True, row/column index levels whose values are constant across the
        table are stripped from the tick labels (unless `row_names` /
        `column_names` are given) and promoted to the corresponding axis label
        (unless `xlabel` / `ylabel` are given). The returned DataFrame is
        unchanged in every case. Default is False.
    axis_label_levels : List[str], optional
        subset of ["xcat", "ret", "freq", "agg_sigs"] naming the level keys
        eligible for promotion into the auto axis labels. Constant levels not in
        this list still collapse from the tick labels but do not appear in the
        axis label. Requires ``collapse_constant_levels=True``. Default is None,
        which promotes every collapsed level.
    footnote : str, optional
        free-text caption passed to the heatmap. Default is None (no footnote).
    footnote_fontsize : int, optional
        font size for the footnote text. Default is 10.

    Returns
    -------
    ~pandas.DataFrame
        DataFrame with the specified statistic for each row and column.

    Raises
    ------
    ValueError
        if `stat`, `pval_stat`, `type`, `rows`, `columns` or `axis_label_levels`
        hold values that are not accepted, if `axis_label_levels` is given
        without ``collapse_constant_levels=True``, or if `xcat_labels` is
        combined with `signal_name_dict` / `return_name_dict`.
    TypeError
        if `rows` or `columns` is not a list.
    """
    self.df = self.original_df.copy()

    if stat not in self.metrics:
        raise ValueError(f"Stat must be one of {self.metrics}")
    if pval_stat is not None:
        if pval_stat == "map_pval" and not self.ms_panel_test:
            raise ValueError(
                "pval_stat='map_pval' requires SignalReturnRelations to "
                "be constructed with ms_panel_test=True."
            )
        if pval_stat not in self.metrics:
            raise ValueError(f"pval_stat must be one of {self.metrics}")
    if not isinstance(rows, list):
        raise TypeError("Rows must be a list")
    if not isinstance(columns, list):
        raise TypeError("Columns must be a list")

    type_values = ["panel", "mean_years", "mean_cids", "pr_years", "pr_cids"]
    rows_values = ["xcat", "ret", "freq", "agg_sigs"]
    if type not in type_values:
        raise ValueError(f"Type must be one of {type_values}")
    if not all(x in rows_values for x in rows):
        raise ValueError(f"Rows must only contain {rows_values}")
    if not all(x in rows_values for x in columns):
        raise ValueError(f"Columns must only contain {rows_values}")
    if axis_label_levels is not None:
        if not collapse_constant_levels:
            raise ValueError(
                "axis_label_levels requires collapse_constant_levels=True."
            )
        if not all(x in rows_values for x in axis_label_levels):
            raise ValueError(
                f"axis_label_levels must only contain {rows_values}"
            )
    if xcat_labels is not None:
        if signal_name_dict is not None or return_name_dict is not None:
            raise ValueError(
                "Pass either xcat_labels or "
                "signal_name_dict/return_name_dict, not both."
            )
        # Build identity-filled rename dicts so existing keys preserve
        # their position and unrenamed xcats are not dropped by the
        # downstream reorder.
        signal_name_dict = {s: xcat_labels.get(s, s) for s in self.sigs}
        return_name_dict = {r: xcat_labels.get(r, r) for r in self.rets}

    # Work on private copies so the shared default lists in the signature can
    # never be altered through the helpers they are handed to.
    rows = list(rows)
    columns = list(columns)

    rows_dict = {
        "xcat": self.sigs,
        "ret": self.rets,
        "freq": self.freqs,
        "agg_sigs": self.agg_sigs,
    }

    df_pval: Optional[pd.DataFrame] = None
    try:
        df_row_names, df_column_names = self.set_df_labels(
            rows_dict, rows, columns
        )
        df_result = pd.DataFrame(
            columns=df_column_names, index=df_row_names, dtype=np.float64
        )
        # sort index to prevent performance degradation: PerformanceWarning
        df_result.sort_index(inplace=True)
        if pval_stat is not None:
            df_pval = pd.DataFrame(
                columns=df_column_names, index=df_row_names, dtype=np.float64
            )
            df_pval.sort_index(inplace=True)

        loop_tuples: List[Tuple[str, str, str, str]] = [
            (ret, sig, freq, agg_sig)
            for ret in self.rets
            for sig in self.sigs
            for freq in self.freqs
            for agg_sig in self.agg_sigs
        ]
        for ret, sig, freq, agg_sig in loop_tuples:
            # Resample the panel for this combination, then locate its cell.
            self.manipulate_df(xcats=[sig, ret], freq=freq, agg_sig=agg_sig)
            cell_key = f"{ret}/{sig}/{freq}/{agg_sig}"
            row = self.get_rowcol(cell_key, rows)
            column = self.get_rowcol(cell_key, columns)
            df_result.loc[row, column] = self.calculate_single_stat(
                stat, ret, sig, type
            )
            if pval_stat is not None:
                df_pval.loc[row, column] = self.calculate_single_stat(
                    pval_stat, ret, sig, type
                )
    finally:
        # Always hand back the untouched panel, even if a helper raised.
        self.df = self.original_df

    def _relabel(frame, mapping, key, level, reorder_columns):
        # Rename the labels of one key on ``frame``. On the row axis the rows
        # are re-ordered to follow the mapping's values. On the column axis the
        # columns are re-selected in that order only when ``reorder_columns``
        # is set; otherwise they are renamed only if the key is a column key.
        if frame is None:
            return None
        if key in rows:
            frame.rename(index=mapping, inplace=True)
            return self.reindex_multindex_df(
                frame, list(mapping.values()), level
            )
        if reorder_columns:
            frame.rename(columns=mapping, inplace=True)
            return frame[list(mapping.values())]
        if key in columns:
            frame.rename(columns=mapping, inplace=True)
        return frame

    if signal_name_dict is not None:
        df_result = _relabel(df_result, signal_name_dict, "xcat", "Signal", True)
        df_pval = _relabel(df_pval, signal_name_dict, "xcat", "Signal", True)
    if return_name_dict is not None:
        df_result = _relabel(df_result, return_name_dict, "ret", "Return", True)
        df_pval = _relabel(df_pval, return_name_dict, "ret", "Return", True)

    # Frequency / aggregation display renames. Identity-fill so that
    # frequencies (or aggregations) not listed in the user dict keep
    # their slot in the renamed axis instead of being dropped by the
    # downstream reorder. The renamed values flow into both the heatmap tick
    # labels and the auto axis label produced by ``collapse_constant_levels``.
    if freq_labels is not None:
        freq_labels_full = {f: freq_labels.get(f, f) for f in self.freqs}
        df_result = _relabel(
            df_result, freq_labels_full, "freq", "Frequency", False
        )
        df_pval = _relabel(df_pval, freq_labels_full, "freq", "Frequency", False)
    if agg_sigs_labels is not None:
        agg_sigs_labels_full = {
            a: agg_sigs_labels.get(a, a) for a in self.agg_sigs
        }
        df_result = _relabel(
            df_result, agg_sigs_labels_full, "agg_sigs", "Aggregation", False
        )
        df_pval = _relabel(
            df_pval, agg_sigs_labels_full, "agg_sigs", "Aggregation", False
        )

    if show_heatmap:
        if not title:
            title = f"{stat}"

        # Derive default colour limits from the non-missing cells only; a plain
        # min()/max() would turn the whole scale into NaN as soon as a single
        # statistic is undefined.
        cell_values = df_result.to_numpy(dtype=np.float64)
        has_data = cell_values.size > 0 and not np.isnan(cell_values).all()
        if min_color is None:
            min_color = np.nanmin(cell_values) if has_data else np.nan
        if max_color is None:
            max_color = np.nanmax(cell_values) if has_data else np.nan

        # Convert raw p-values to probability of significance (1 - pval)
        # so the bracketed value and the highlight threshold share the
        # same scale.
        df_psig = 1.0 - df_pval if df_pval is not None else None
        if annotate and df_psig is not None:
            heatmap_annot = self._format_dual_annot(
                df_result, df_psig, round, round_pval
            )
            heatmap_fmt = ""
        else:
            heatmap_annot = annotate
            heatmap_fmt = f".{round}f"

        highlight_mask = None
        if df_psig is not None and significance_threshold is not None:
            highlight_mask = df_psig > float(significance_threshold)

        yticklabels_to_pass = row_names
        xticklabels_to_pass = column_names
        ylabel_to_pass = ylabel
        xlabel_to_pass = xlabel
        if collapse_constant_levels:
            # Strip row/column index levels whose values are constant
            # so they don't clutter the tick labels. The collapsed
            # values are promoted to the corresponding axis label
            # when the caller did not provide one. ``df_result``
            # itself is left untouched.
            display_yticks, constant_y = self._collapse_constant_levels(
                df_result.index
            )
            display_xticks, constant_x = self._collapse_constant_levels(
                df_result.columns
            )
            if yticklabels_to_pass is None:
                yticklabels_to_pass = display_yticks
            if xticklabels_to_pass is None:
                xticklabels_to_pass = display_xticks

            # ``axis_label_levels`` is expressed in the same vocabulary as
            # ``rows`` / ``columns``; translate to the display level names
            # used in the MultiIndex before filtering.
            label_dict = {
                "xcat": "Signal",
                "ret": "Return",
                "freq": "Frequency",
                "agg_sigs": "Aggregation",
            }
            if axis_label_levels is not None:
                allowed = {label_dict[k] for k in axis_label_levels}
                constant_y = [(n, v) for n, v in constant_y if n in allowed]
                constant_x = [(n, v) for n, v in constant_x if n in allowed]
            if ylabel_to_pass is None and constant_y:
                ylabel_to_pass = " · ".join(v for _, v in constant_y)
            if xlabel_to_pass is None and constant_x:
                xlabel_to_pass = " · ".join(v for _, v in constant_x)

        msv.view_table(
            df_result,
            title=title,
            title_fontsize=title_fontsize,
            min_color=min_color,
            max_color=max_color,
            figsize=figsize,
            fmt=heatmap_fmt,
            annot=heatmap_annot,
            xlabel=xlabel_to_pass,
            ylabel=ylabel_to_pass,
            xticklabels=xticklabels_to_pass,
            yticklabels=yticklabels_to_pass,
            highlight_mask=highlight_mask,
            footnote=footnote,
            footnote_fontsize=footnote_fontsize,
        )

    return df_result
def show_single_statistic_table(self, *args, **kwargs) -> pd.DataFrame:
    """
    Return the single statistic table without rendering a heatmap.

    Thin wrapper around :meth:`single_statistic_table` that forces
    ``show_heatmap=False``. All positional and keyword arguments are forwarded
    unchanged; ``show_heatmap`` must therefore not be passed positionally.

    Parameters
    ----------
    stat : str
        type of statistic to be displayed (this can be any of the column names of
        summary_table).
    type : str
        type of the statistic displayed. This can be based on the overall panel
        ("panel", default), an average of annual panels ("mean_years"), an average
        of cross-sectional relations ("mean_cids"), the positive ratio across
        years ("pr_years") or the positive ratio across sections ("pr_cids").
    rows : List[str]
        row indices, which can be return categories, feature categories,
        frequencies and/or aggregations. The choice is made through a list of one
        or more of "xcat", "ret", "freq" and "agg_sigs". Default is
        ["xcat", "agg_sigs"].
    columns : List[str]
        column indices, chosen from the same four keys. Default is
        ["ret", "freq"].
    show_heatmap : bool
        ignored; any value supplied as keyword is overridden with False.
    signal_name_dict : dict, optional
        dictionary mapping the signal names to the desired names in the table.
        Default is None, in which case the signal names are used.
    return_name_dict : dict, optional
        dictionary mapping the return names to the desired names in the table.
        Default is None, in which case the return names are used.
    xcat_labels : dict, optional
        unified rename dictionary covering both signal and return categories;
        categories not listed in the dict are kept verbatim. Mutually exclusive
        with `signal_name_dict` / `return_name_dict`. Default is None.
    freq_labels : dict, optional
        mapping from frequency code ("M", "Q", …) to its display label.
        Frequencies not listed in the dict are kept verbatim. Default is None.
    agg_sigs_labels : dict, optional
        mapping from aggregation code ("last", "mean", …) to its display label.
        Aggregations not listed in the dict are kept verbatim. Default is None.
    pval_stat : str, optional
        name of a p-value statistic. It is validated and computed by
        :meth:`single_statistic_table` but only displayed in the heatmap, so it
        does not change the table returned here. Default is None.
    title, title_fontsize, row_names, column_names, min_color, max_color,
    figsize, annotate, round, round_pval, significance_threshold, xlabel,
    ylabel, collapse_constant_levels, axis_label_levels, footnote,
    footnote_fontsize
        forwarded to :meth:`single_statistic_table`, where they only affect the
        heatmap; accepted here for API symmetry even though this wrapper renders
        no heatmap.

    Returns
    -------
    ~pandas.DataFrame
        DataFrame with the specified statistic for each row and column.
    """
    kwargs["show_heatmap"] = False
    return self.single_statistic_table(*args, **kwargs)
def plot_single_statistic_heatmap(self, *args, **kwargs) -> None:
    """
    Render the heatmap of the single statistic table.

    Thin wrapper around :meth:`single_statistic_table` that forces
    ``show_heatmap=True``. All positional and keyword arguments are forwarded
    unchanged; ``show_heatmap`` must therefore not be passed positionally. The
    computed table itself is not returned.

    Parameters
    ----------
    stat : str
        type of statistic to be displayed (this can be any of the column names of
        summary_table).
    type : str
        type of the statistic displayed. This can be based on the overall panel
        ("panel", default), an average of annual panels ("mean_years"), an average
        of cross-sectional relations ("mean_cids"), the positive ratio across
        years ("pr_years") or the positive ratio across sections ("pr_cids").
    rows : List[str]
        row indices, which can be return categories, feature categories,
        frequencies and/or aggregations. The choice is made through a list of one
        or more of "xcat", "ret", "freq" and "agg_sigs". Default is
        ["xcat", "agg_sigs"].
    columns : List[str]
        column indices, chosen from the same four keys. Default is
        ["ret", "freq"].
    show_heatmap : bool
        ignored; any value supplied as keyword is overridden with True.
    title : str, optional
        plot title. Default is None, in which case `stat` is used as title.
    title_fontsize : int
        font size of title. Default is 16.
    row_names : List[str]
        specifies the labels of rows in the heatmap. Default is None, in which
        case the indices of the generated DataFrame are used.
    column_names : List[str]
        specifies the labels of columns in the heatmap. Default is None, in which
        case the columns of the generated DataFrame are used.
    signal_name_dict : dict, optional
        dictionary mapping the signal names to the desired names in the heatmap.
        Default is None, in which case the signal names are used.
    return_name_dict : dict, optional
        dictionary mapping the return names to the desired names in the heatmap.
        Default is None, in which case the return names are used.
    xcat_labels : dict, optional
        unified rename dictionary covering both signal and return categories;
        categories not listed in the dict are kept verbatim. Mutually exclusive
        with `signal_name_dict` / `return_name_dict`. Default is None.
    freq_labels : dict, optional
        mapping from frequency code ("M", "Q", …) to its display label.
        Frequencies not listed in the dict are kept verbatim. Default is None.
    agg_sigs_labels : dict, optional
        mapping from aggregation code ("last", "mean", …) to its display label.
        Aggregations not listed in the dict are kept verbatim. Default is None.
    min_color : float, optional
        minimum value of the color scale. Default is None, in which case it is
        derived from the table by :meth:`single_statistic_table`.
    max_color : float, optional
        maximum value of the color scale. Default is None, in which case it is
        derived from the table by :meth:`single_statistic_table`.
    figsize : Tuple[float, float]
        Tuple (w, h) of width and height of graph. Default is (14, 8).
    annotate : bool
        Default is True, where the values shown in the heatmap are annotated.
    round : int
        number of decimals to round the primary statistic to in the heatmap
        annotations. Default is 3.
    pval_stat : str, optional
        name of a p-value statistic, typically "kendall_pval", "pearson_pval" or
        "map_pval" (the Macrosynergy Panel test). When set, each heatmap cell
        shows the probability of significance, ``1 - pval_stat``, in brackets
        beneath the primary statistic. "map_pval" requires the class to have been
        constructed with ``ms_panel_test=True``. Default is None.
    round_pval : int
        number of decimals to round the bracketed probability of significance to
        in the heatmap annotations. Default is 3.
    significance_threshold : float, optional
        probability-of-significance cutoff. Cells whose bracketed value
        (``1 - pval_stat``) exceeds it are highlighted, so 0.9 highlights raw
        p-values below 0.1. Only takes effect when `pval_stat` is set. Pass None
        to disable. Default is 0.9.
    xlabel : str, optional
        Label drawn beneath the heatmap columns. Default is None.
    ylabel : str, optional
        Label drawn beside the heatmap rows. Default is None.
    collapse_constant_levels : bool, optional
        when True, index levels whose values are constant across the table are
        stripped from the tick labels and promoted to the axis labels unless
        those are given explicitly. Default is False.
    axis_label_levels : List[str], optional
        subset of ["xcat", "ret", "freq", "agg_sigs"] restricting which collapsed
        levels appear in the auto axis labels. Requires
        ``collapse_constant_levels=True``. Default is None.
    footnote : str, optional
        Free-text caption rendered below the heatmap. Default is None.
    footnote_fontsize : int, optional
        Font size for the footnote text. Default is 10.
    """
    kwargs["show_heatmap"] = True
    self.single_statistic_table(*args, **kwargs)
@staticmethod
def _format_dual_annot(
    df_stat: pd.DataFrame,
    df_pval: pd.DataFrame,
    round_stat: int,
    round_pval: int,
) -> pd.DataFrame:
    """
    Build a string-typed DataFrame of cell annotations of the form
    ``"<stat>\\n(<pval>)"`` aligned with ``df_stat``.

    Parameters
    ----------
    df_stat : ~pandas.DataFrame
        table of the primary statistic. Its index and columns define the
        shape and labels of the returned annotation table.
    df_pval : ~pandas.DataFrame
        table of the bracketed values. It is looked up by label, so it must
        contain every (row, column) label pair present in ``df_stat``.
    round_stat : int
        number of decimals used to format the values of ``df_stat``.
    round_pval : int
        number of decimals used to format the values of ``df_pval``.

    Returns
    -------
    ~pandas.DataFrame
        object-dtype DataFrame of strings with the index and columns of
        ``df_stat``. A missing value renders as an empty string; when the
        bracketed value is missing the brackets are omitted altogether.

    Raises
    ------
    KeyError
        if ``df_pval`` lacks a label pair that is present in ``df_stat``.
    """

    def _fmt(value: Any, ndigits: int) -> str:
        # pd.isna recognises every scalar missing-value marker (None, float
        # NaN, NumPy NaN of any width such as np.float32, pd.NA, NaT). An
        # ``isinstance(value, float)`` test only catches 64-bit floats, so a
        # float32 NaN would otherwise be rendered as the literal "nan".
        if value is None or pd.isna(value):
            return ""
        return f"{value:.{ndigits}f}"

    annot = pd.DataFrame(index=df_stat.index, columns=df_stat.columns, dtype=object)
    for row in df_stat.index:
        for col in df_stat.columns:
            stat_str = _fmt(df_stat.loc[row, col], round_stat)
            pval_str = _fmt(df_pval.loc[row, col], round_pval)
            if pval_str == "":
                # No bracketed value: show the statistic alone (which is
                # itself the empty string when the statistic is missing).
                annot.loc[row, col] = stat_str
            else:
                annot.loc[row, col] = f"{stat_str}\n({pval_str})"
    return annot
def _collapse_constant_levels(
    self, idx: pd.Index
) -> Tuple[Optional[List[str]], List[Tuple[str, str]]]:
    """
    Drop the MultiIndex levels that hold a single value and report those
    values so that they can be moved into an axis label.

    Parameters
    ----------
    idx : pd.Index
        Row or column index of the assembled statistic table, either a
        plain :class:`~pandas.Index` or a :class:`~pandas.MultiIndex`.

    Returns
    -------
    Tuple[Optional[List[str]], List[Tuple[str, str]]]
        ``(display_labels, constant_pairs)``.

        ``display_labels`` holds one tick label per index entry, built from
        the levels that are not constant; several surviving levels are
        joined with ``" · "``. It is ``None`` whenever the tick labels
        should be left untouched: ``idx`` is not a MultiIndex, has fewer
        than two levels, has no constant level, or has only constant
        levels.

        ``constant_pairs`` lists ``(level_name, value)`` (both converted to
        ``str``) for every constant level, in level order. It is empty
        when nothing was found to collapse.
    """
    is_multi = isinstance(idx, pd.MultiIndex) and idx.nlevels >= 2
    if not is_multi:
        return None, []

    # Distinct values of every level, in level order.
    distinct_per_level = [
        idx.get_level_values(position).unique() for position in range(idx.nlevels)
    ]
    fixed_positions: List[int] = [
        position
        for position, distinct in enumerate(distinct_per_level)
        if len(distinct) == 1
    ]
    constant_pairs: List[Tuple[str, str]] = [
        (str(idx.names[position]), str(distinct_per_level[position][0]))
        for position in fixed_positions
    ]

    if len(fixed_positions) == 0:
        return None, []

    if len(fixed_positions) == idx.nlevels:
        # Single-row/column table: nothing would be left to display, so the
        # existing tick labels stay, but the values are still reported.
        return None, constant_pairs

    trimmed = idx.droplevel(fixed_positions)
    if isinstance(trimmed, pd.MultiIndex):
        display = []
        for entry in trimmed.tolist():
            display.append(" · ".join(str(part) for part in entry))
    else:
        display = [str(label) for label in trimmed.tolist()]
    return display, constant_pairs
def set_df_labels(self, rows_dict: Dict, rows: List[str], columns: List[str]):
    """
    Creates the row and column labels for the resulting dataframe.

    Parameters
    ----------
    rows_dict : dict
        dictionary containing the values for each of the xcat, ret, freq and
        agg_sigs categories.
    rows : List[str]
        list of strings specifying which of the categories are included in the rows
        of the dataframe.
    columns : List[str]
        list of strings specifying which of the categories are included in the
        columns of the dataframe.

    Returns
    -------
    tuple
        ``(rows_names, columns_names)``. For an axis made of a single category
        the labels are the corresponding entry of `rows_dict`, returned as is.
        For an axis made of several categories the labels are a
        :class:`~pandas.MultiIndex` over every combination of the categories'
        values (first category varying slowest), with the levels named
        "Signal", "Return", "Frequency" and "Aggregation" respectively.

    Raises
    ------
    ValueError
        if `rows` or `columns` is empty.
    KeyError
        if a category is missing from `rows_dict`, or if a category of a
        multi-category axis is not one of 'xcat', 'ret', 'freq', 'agg_sigs'.
    """
    from itertools import product

    label_dict = {
        "xcat": "Signal",
        "ret": "Return",
        "freq": "Frequency",
        "agg_sigs": "Aggregation",
    }

    def _axis_labels(categories: List[str], axis_name: str):
        # Build the labels of one axis. Handling each axis independently
        # covers every split of the categories between rows and columns,
        # rather than only the 2/2, 1/3 and 3/1 layouts.
        if len(categories) == 0:
            raise ValueError(f"`{axis_name}` must contain at least one category.")
        if len(categories) == 1:
            return rows_dict[categories[0]]
        return pd.MultiIndex.from_tuples(
            list(product(*(rows_dict[category] for category in categories))),
            names=[label_dict[category] for category in categories],
        )

    rows_names = _axis_labels(rows, "rows")
    columns_names = _axis_labels(columns, "columns")
    return rows_names, columns_names
def get_rowcol(self, hash: str, rowcols: List[str]):
    """
    Extracts from a statistic's hash the label of the row/column it belongs to.

    The hash is read as a "/"-separated string whose components are, in
    order, the 'ret', 'xcat', 'freq' and 'agg_sigs' values.

    Parameters
    ----------
    hash : str
        hash of the statistic.
    rowcols : List[str]
        list of strings specifying which of the categories are in the rows/columns
        of the dataframe.

    Returns
    -------
    str or tuple
        the single matching component when `rowcols` has one entry, a tuple of
        the matching components (in the order of `rowcols`) when it has two or
        three entries, and an empty string for any other length.
    """
    idx: List[str] = ["ret", "xcat", "freq", "agg_sigs"]
    assert all(name in idx for name in rowcols), "rowcols must be a subset of idx"

    n_selected = len(rowcols)
    if n_selected not in (1, 2, 3):
        return ""

    components = hash.split("/")
    selected = tuple(components[idx.index(name)] for name in rowcols)
    if n_selected == 1:
        return selected[0]
    return selected
if __name__ == "__main__":
    # Demonstration script: simulate a small panel and exercise the
    # SignalReturnRelations tables and charts on it.
    from statsmodels.tsa.stattools import grangercausalitytests

    # ------------------------------------------------------------------
    # Simulated panel
    # ------------------------------------------------------------------
    cids = ["AUD", "CAD", "GBP", "NZD", "USD"]
    xcats = ["XR", "XRH", "CRY", "GROWTH", "INFL"]

    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    # "BRL" is not part of `cids`; assigning it through .loc appends a row.
    cid_specs = {
        "AUD": ["2000-01-01", "2020-12-31", 0.1, 1],
        "CAD": ["2001-01-01", "2020-11-30", 0, 1],
        "BRL": ["2001-01-01", "2020-11-30", -0.1, 2],
        "GBP": ["2002-01-01", "2020-11-30", 0, 2],
        "NZD": ["2002-01-01", "2020-09-30", -0.1, 2],
        "USD": ["2003-01-01", "2020-12-31", -0.1, 2],
    }
    for cid, spec in cid_specs.items():
        df_cids.loc[cid] = spec

    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    xcat_specs = {
        "XR": ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3],
        "XRH": ["2000-01-01", "2020-12-31", 0.2, 1, 0, 0.25],
        "CRY": ["2000-01-01", "2020-10-30", 1, 2, 0.95, 1],
        "GROWTH": ["2001-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFL": ["2001-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
    }
    for xcat, spec in xcat_specs.items():
        df_xcats.loc[xcat] = spec

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(len(dfd))

    black = {
        "AUD": ["2000-01-01", "2003-12-31"],
        "GBP": ["2018-01-01", "2100-01-01"],
    }

    # Reduced DataFrame without the AUD GROWTH and NZD INFL series.
    # NOTE(review): `dfdx` and `cidx` are not used by any call below.
    filt1 = (dfd["xcat"] == "GROWTH") & (dfd["cid"] == "AUD")
    filt2 = (dfd["xcat"] == "INFL") & (dfd["cid"] == "NZD")
    dfdx = dfd[~filt1 & ~filt2].copy()
    dfdx["ERA"] = "before 2007"
    dfdx.loc[dfdx["real_date"].dt.year > 2007, "ERA"] = "from 2010"

    cidx = ["AUD", "CAD", "GBP", "USD"]

    # ------------------------------------------------------------------
    # User-defined metrics passed as `additional_metrics`
    # ------------------------------------------------------------------
    def spearman(x, y):
        return stats.spearmanr(x, y)[0]

    def _granger_lag1_ssr_ftest(x, y):
        # Runs the test for lags 1..3 and picks the "ssr_ftest" entry of the
        # lag-1 results; element 0 is the statistic and element 1 the p-value.
        outcome = grangercausalitytests(
            np.array([x, y]).T, maxlag=3, addconst=True, verbose=False
        )
        return outcome[1][0]["ssr_ftest"]

    def granger(x, y):
        return _granger_lag1_ssr_ftest(x, y)[0]

    def granger_pval(x, y):
        return _granger_lag1_ssr_ftest(x, y)[1]

    extra_metrics = [spearman, granger, granger_pval]

    # ------------------------------------------------------------------
    # Single signal, quarterly, with panel test and additional metrics
    # ------------------------------------------------------------------
    sigs = ["CRY"]

    srn = SignalReturnRelations(
        dfd,
        rets="XR",
        sigs=sigs,
        sig_neg=True,
        cosp=True,
        freqs="Q",
        start="2002-01-01",
        ms_panel_test=True,
        additional_metrics=extra_metrics,
    )
    print(sigs)

    df_dep = srn.summary_table()
    print(df_dep)

    dfsum = srn.single_relation_table(table_type="summary")
    print(dfsum)

    # ------------------------------------------------------------------
    # Several signals, monthly, with additional metrics
    # ------------------------------------------------------------------
    srn = SignalReturnRelations(
        dfd,
        rets="XR",
        sigs=["CRY", "CRY", "INFL", "GROWTH"],
        sig_neg=[True, False, True, True],
        cosp=True,
        freqs="M",
        start="2002-01-01",
        additional_metrics=[spearman, granger, granger_pval],
    )

    df_sigs = srn.multiple_relations_table()
    print(df_sigs)

    dfsum = srn.single_relation_table(table_type="cross_section")
    print(dfsum)

    srn.accuracy_bars(
        view="signals",
        title="Accuracy",
        x_labels={"CRY": "Cry", "INFL": "Inflation", "GROWTH": "Growth"},
        x_labels_rotate=45,
    )

    sst = srn.single_statistic_table(stat="granger_pval")
    print(sst)

    # ------------------------------------------------------------------
    # Two return categories, three signals
    # ------------------------------------------------------------------
    sr = SignalReturnRelations(
        dfd,
        rets=["XR", "XRH"],
        sigs=["CRY", "INFL", "GROWTH"],
        freqs="M",
        start="2002-01-01",
        agg_sigs="last",
    )

    srt = sr.single_relation_table()
    mrt = sr.multiple_relations_table()
    sst = sr.single_statistic_table(
        stat="accuracy",
        type="mean_years",
        rows=["ret", "agg_sigs"],
        columns=["xcat", "freq"],
    )
    for table in (srt, mrt, sst):
        print(table)

    # ------------------------------------------------------------------
    # Basic Signal Returns showing for multiple input values
    # ------------------------------------------------------------------
    sr = SignalReturnRelations(
        dfd,
        rets=["XR", "XRH"],
        sigs=["CRY", "INFL", "GROWTH"],
        sig_neg=[True, True, False],
        cosp=True,
        freqs=["M", "Q"],
        agg_sigs=["last", "mean"],
        blacklist=black,
    )

    sr.accuracy_bars(sigs=["CRY", "INFL_NEG"], view="signals", title="Accuracy")
    sr.correlation_bars(sigs=["CRY", "INFL_NEG"], type="signals", title="Correlation")

    srt = sr.single_relation_table(
        ret="XRH", xcat="INFL_NEG", freq="Q", agg_sigs="last"
    )
    mrt = sr.multiple_relations_table()
    sst = sr.single_statistic_table(stat="pearson", show_heatmap=True)
    for table in (srt, mrt, sst):
        print(table)

    # ------------------------------------------------------------------
    # Specifying specific arguments for each of the Signal Return Functions
    # ------------------------------------------------------------------
    srt = sr.single_relation_table(ret="XR", xcat="CRY_NEG", freq="Q", agg_sigs="last")
    print(srt)

    mrt = sr.multiple_relations_table(
        rets=["XR", "GROWTH"], xcats="INFL", freqs=["M", "Q"], agg_sigs=["last", "mean"]
    )
    print(mrt)

    sst = sr.single_statistic_table(
        stat="auc",
        rows=["ret", "xcat", "freq"],
        columns=["agg_sigs"],
        type="mean_cids",
    )
    print(sst)