# Source code for macrosynergy.signal.signal_return_relations

"""
Module for analysing and visualizing the relationship between signal and return
series.
"""

import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import is_color_like
from scipy import stats
from sklearn import metrics as skm

import macrosynergy.visuals as msv
from macrosynergy.learning.random_effects import RandomEffects
from macrosynergy.management.simulate import make_qdf, make_test_df
from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management.utils import (
    apply_slip as apply_slip_util,
)
from macrosynergy.management.utils import (
    categories_df,
    get_cid,
    get_xcat,
    reduce_df,
    update_df,
)

# Ensure warnings are printed
warnings.simplefilter("always")


class SignalReturnRelations:
    """
    Class for analysing and visualizing signals and return series. The class is designed
    to provide a comprehensive analysis of the relationship between signals and returns
    across different frequencies and aggregation methods. The class can be used
    to calculate and visualize the following metrics:

    - Accuracy
    - Balanced accuracy
    - Positive signal ratio
    - Positive return ratio
    - Positive precision
    - Negative precision
    - Pearson correlation
    - Pearson correlation p-value
    - Kendall correlation
    - Kendall correlation p-value
    - AUC
    - Macrosynergy Panel test

    Parameters
    ----------
    df : ~pandas.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date' and 'value'.
    rets : str, List[str]
        one or several target return categories.
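    sigs : str, List[str]
        list of signal categories to be considered for which detailed relational
        statistics can be calculated.
    cids : str, List[str]
        cross-section(s) to be considered. Default is None, in which case no explicit
        selection is passed to `reduce_df`. In any case, only cross-sections that have
        at least one signal series and one return series are retained.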
    sig_neg : bool, List[bool]
        if set to True puts the signal in negative terms for all analysis. If more than
        one signal is tested, `sig_neg` must be an ordered list of the same length as the
        signals, containing a True for each signal that needs to be negative.
        Default is False.
    cosp : bool
        If True the comparative statistics are calculated only for the "communal sample
        periods", i.e. periods and cross-sections that have values for all compared
        signals. Default is False.
    start : str
        earliest date in ISO format. Default is None in which case the earliest date
        available will be used.
    end : str
        latest date in ISO format. Default is None in which case the latest date in the
        dataframe will be used.
    blacklist : dict
        cross-sections with date ranges that should be excluded from the data frame. If
        one cross-section has several blacklist periods append numbers to the cross-section
        code.
    freqs : str, List[str]
        letters denoting all frequencies at which the series may be sampled. This must
        be a selection of 'D', 'W', 'M', 'Q', 'A'. Default is only 'M'. The return series
        will always be summed over the sample period. The signal series will be aggregated
        according to the values of `agg_sigs`.
    agg_sigs : str, List[str]
        aggregation method applied to the signal values in down-sampling. The default is
        "last". Alternatives are "mean", "median" and "sum". If a single aggregation type is
        chosen for multiple signal categories it is applied to all of them.
    fwin : int
        forward window of return category in base periods. Default is 1. This
        conceptually corresponds to the holding period of a position in accordance with the
        signal.
    slip : int
        Default is 0, implied slippage of feature availability for relationship with the
        target category. See :func:`macrosynergy.management.utils.apply_slip` for more
        information.
    ms_panel_test : bool
        if True the Macrosynergy Panel test is calculated. Please note that this is a
        very time-consuming operation and should be used only if you require the result.
    additional_metrics : List[Callable]
        list of additional metrics to be calculated and added to the output table.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        rets: Union[str, List[str]] = None,
        sigs: Union[str, List[str]] = None,
        cids: Union[str, List[str]] = None,
        sig_neg: Union[bool, List[bool]] = None,
        cosp: bool = False,
        start: str = None,
        end: str = None,
        blacklist: dict = None,
        freqs: Union[str, List[str]] = "M",
        agg_sigs: Union[str, List[str]] = "last",
        fwin: int = 1,
        slip: int = 0,
        ms_panel_test: bool = False,
        additional_metrics: List[Callable] = None,
    ):
        """
        Validate the inputs, reduce the panel to the requested categories and store
        the configuration used by the analysis methods.

        The passed DataFrame is not modified: the date conversion is performed on a
        copy. For every signal flagged in `sig_neg` a negated series named
        ``<signal>_NEG`` is added to the working panel and replaces the original name
        in `self.sigs`.

        Raises
        ------
        ValueError
            if `rets` or `sigs` is None, if a required column is missing, if a
            frequency is not one of the supported letters (or no frequency is given),
            or if `sig_neg` does not have the same length as the signals.
        TypeError
            if `df`, `cids`, `ms_panel_test`, `cosp` or an element of `sig_neg` has
            the wrong type.
        AssertionError
            if a signal or return category is not available in the DataFrame.
        """
        if rets is None:
            raise ValueError("Target return must be defined.")
        if sigs is None:
            raise ValueError("Signal must be defined.")
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"DataFrame expected and not {type(df)}.")
        if cids is not None and not isinstance(cids, str):
            if not isinstance(cids, list):
                raise TypeError(f"List or string expected and not {type(cids)}.")
            if not all(isinstance(cid, str) for cid in cids):
                raise TypeError("List of strings expected for cids.")

        required_columns = ["cid", "xcat", "real_date", "value"]
        if not all(col in df.columns for col in required_columns):
            raise ValueError(
                "Dataframe columns must be of value: 'cid', 'xcat', 'real_date' and "
                "'value'."
            )

        # `assign` returns a new frame, so the caller's DataFrame keeps its original
        # `real_date` column untouched.
        df = df.assign(
            real_date=pd.to_datetime(df["real_date"], format="%Y-%m-%d")
        )
        df = QuantamentalDataFrame(df)

        self.dic_freq = {
            "D": "daily",
            "W": "weekly",
            "M": "monthly",
            "Q": "quarterly",
            "A": "annual",
        }

        freq_error = f"Frequency parameter must be one of {list(self.dic_freq.keys())}."
        requested_freqs = freqs if isinstance(freqs, list) else [freqs]
        self.freqs = []
        for f in requested_freqs:
            if f not in self.dic_freq:
                raise ValueError(freq_error)
            if f in self.freqs:
                warnings.warn(
                    f"Frequency {f} is repeated, dropping repeated frequency."
                )
                continue
            self.freqs.append(f)
        if not self.freqs:
            # An empty list would otherwise only fail later on `self.freqs[0]`.
            raise ValueError(freq_error)

        if not isinstance(ms_panel_test, bool):
            raise TypeError(
                f"<bool> object expected for ms_panel_test and not {type(ms_panel_test)}."
            )

        self.ms_panel_test = ms_panel_test

        self.metrics = [
            "accuracy",
            "bal_accuracy",
            "pos_sigr",
            "pos_retr",
            "pos_prec",
            "neg_prec",
            "pearson",
            "pearson_pval",
            "kendall",
            "kendall_pval",
            "auc",
        ]

        if self.ms_panel_test:
            self.metrics.append("map_pval")

        if additional_metrics:
            # Only callables exposing `__name__` get a column in the output tables.
            self.metrics.extend(
                metric.__name__
                for metric in additional_metrics
                if hasattr(metric, "__name__")
            )
        else:
            additional_metrics = []

        self.additional_metrics = additional_metrics

        if not isinstance(cosp, bool):
            raise TypeError(f"<bool> object expected and not {type(cosp)}.")

        if isinstance(cids, str):
            cids = [cids]

        self.cids = cids
        self.rets = rets
        self.slip = slip
        self.agg_sigs = agg_sigs
        self.xcats = list(df["xcat"].unique())
        self.df = df
        self.cosp = cosp
        self.start = start
        self.end = end
        self.blacklist = blacklist
        self.fwin = fwin

        # Scalars are wrapped into single-element lists; `sigs` is copied because the
        # list is rewritten below when negated signals are introduced.
        if not self.is_list_of_strings(rets):
            self.rets = [rets]

        if not self.is_list_of_strings(sigs):
            self.sigs = [sigs]
        else:
            self.sigs = sigs.copy()

        if not self.is_list_of_strings(agg_sigs):
            self.agg_sigs = [agg_sigs]

        # Raised explicitly (instead of `assert`) so the checks survive `python -O`;
        # the exception type is kept for backward compatibility.
        for sig in self.sigs:
            if sig not in self.xcats:
                raise AssertionError(
                    "Primary signal must be available in the DataFrame."
                )

        for ret in self.rets:
            if ret not in self.xcats:
                raise AssertionError(
                    "Target return must be available in the DataFrame."
                )

        if sig_neg is None:
            self.signs = [False for _ in self.sigs]
        else:
            self.signs = sig_neg if isinstance(sig_neg, list) else [sig_neg]

        for sign in self.signs:
            if sign not in [False, True]:
                raise TypeError("Sign must be either False or True.")

        if len(self.signs) != len(self.sigs):
            raise ValueError("Signs must have a length equal to signals")

        self.xcats = self.rets + self.sigs
        self.df = reduce_df(
            df,
            xcats=self.xcats,
            cids=self.cids,
            start=self.start,
            end=self.end,
            blacklist=self.blacklist,
        )

        resolved_sigs = []
        for sig, negate in zip(self.sigs, self.signs):
            if not negate:
                resolved_sigs.append(sig)
                continue

            neg_sig = f"{sig}_NEG"
            neg_df = self.df[self.df["xcat"] == sig].copy()
            neg_df["value"] *= -1
            neg_df["xcat"] = neg_sig

            # Append the negated version to the main df
            self.df = update_df(self.df, neg_df)
            resolved_sigs.append(neg_sig)

        self.sigs = resolved_sigs

        self.df = QuantamentalDataFrame(self.df)

        # keep only cids that have at least one sig AND one ret
        cids_with_sig, cids_with_ret = set(), set()
        for tk in self.df.list_tickers():
            cid, xcat = get_cid(tk), get_xcat(tk)
            if xcat in self.sigs:
                cids_with_sig.add(cid)
            if xcat in self.rets:
                cids_with_ret.add(cid)
        self.cids = sorted(cids_with_sig & cids_with_ret)

        self.df = self.df.reduce_df(cids=self.cids, blacklist=self.blacklist)
        # Pristine copy that the analysis methods restore before each calculation.
        self.original_df = QuantamentalDataFrame(self.df.copy())
        self.cids_used_in_last_calculation = None

    def __rival_sigs__(self, ret, sigs=None):
        """
        Helper function used to produce the panel-level table for the additional signals.
        """

        if sigs is None:
            sigs = self.sigs

        df_out = pd.DataFrame(index=sigs, columns=self.metrics)
        df = self.df

        for s in sigs:
            # Entire panel will be passed in.
            df_out = self.__table_stats__(
                df_segment=df, df_out=df_out, segment=s, signal=s, ret=ret
            )

        return df_out

    @staticmethod
    def __yaxis_lim__(accuracy_df: pd.DataFrame):
        """
        Helper function to determine the range the y-axis is defined over.

        Parameters
        ----------
        accuracy_df : ~pandas.DataFrame
            two dimensional DataFrame with accuracy & balanced accuracy columns.


        .. note::
            The returned range will always be below 0.5.
        """

        y_axis = lambda min_correl: min_correl > 0.45
        min_value = accuracy_df.min().min()
        # Ensures any accuracy statistics greater than 0.5 are more pronounced given the
        # adjusted scale.
        y_input = 0.45 if y_axis(min_value) else min_value

        return y_input

[docs]    def accuracy_bars(
        self,
        ret: str = None,
        sigs: Union[str, List[str]] = None,
        freq: str = None,
        agg_sig: str = None,
        view: str = "cross_section",
        title: str = None,
        title_fontsize: int = 16,
        size: Tuple[float, float] = None,
        legend_pos: str = "best",
        x_labels: Dict = None,
        x_labels_rotate: int = 0,
        return_fig: bool = False,
        **kwargs,
    ):
        """
        Plot bar chart for the overall and balanced accuracy metrics. For types:
        cross_section and years.

        Parameters
        ----------
        ret : str, optional
            return category. Default is None, in which case the first return category will
            be used.
        sigs : str, or List[str], optional
            signal category. Default is None, in which case all signals will be used.
        freq : str, optional
            frequency to be used in analysis. Default is None, in which case the first
            frequency will be used.
        agg_sig : str, optional
            aggregation method to be used in analysis. Default is None, in which case the
            first aggregation method will be used.
        view : str, optional
            type of segment over which bars are drawn. Either "cross_section" (default),
            "years" or "signals".
        title : str, optional
            chart header - default will be applied if none is chosen.
        title_fontsize : int
            font size of chart header. Default is 16.
        size : Tuple[float], optional
            2-tuple of width and height of plot - default will be applied if none is
            chosen.
        legend_pos : str
            position of legend box. Default is 'best'. See the documentation of
            matplotlib.pyplot.legend.
        x_labels : Dict[str]
            dictionary of x-axis labels. Default is None.
        x_labels_rotate : int
            rotation of x-axis labels. Default is 0.
        """
        if "type" in kwargs:
            warnings.warn(
                "`type` parameter is deprecated; use `view` instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            view = kwargs["type"]
        if view not in ["cross_section", "years", "signals"]:
            raise ValueError(
                "View parameter must be either 'cross_section', 'years' or 'signals'."
            )

        if sigs is None:
            sigs = self.sigs
        elif isinstance(sigs, str):
            if sigs not in self.sigs and sigs + "_NEG" in self.sigs:
                sigs = sigs + "_NEG"
        if isinstance(sigs, list):
            for sig in sigs:
                if sig not in self.sigs and sig + "_NEG" in self.sigs:
                    sigs[sigs.index(sig)] = sig + "_NEG"

        if isinstance(sigs, str):
            sigs = [sigs]

        for sig in sigs:
            if sig not in self.sigs:
                raise ValueError(
                    f"Signal {sig} is not defined in Signal Return Relations."
                )

        if freq is None:
            freq = self.freqs[0]

        if agg_sig is None:
            agg_sig = self.agg_sigs[0]

        if ret is None:
            ret = self.rets[0]

        self.df = self.original_df.copy()
        self.manipulate_df(xcats=sigs + [ret], freq=freq, agg_sig=agg_sig)

        for i in range(len(sigs)):
            if sigs[i] not in self.sigs:
                sigs[i] = sigs[i] + "_NEG"

        if view == "cross_section":
            df_xs = self.__output_table__(cs_type="cids", ret=ret, sig=sigs[0])
        elif view == "years":
            df_xs = self.__output_table__(cs_type="years", ret=ret, sig=sigs[0])
        else:
            df_xs = self.__rival_sigs__(ret, sigs)

        dfx = df_xs[~df_xs.index.isin(["PosRatio"])]

        if title is None:
            refsig = "various signals" if view == "signals" else sigs[0]
            title = (
                f"Accuracy for sign prediction of {ret} based on {refsig} "
                f"at {self.dic_freq[self.freqs[0]]} frequency."
            )
        if size is None:
            size = (np.max([dfx.shape[0] / 2, 8]), 6)

        sns.set_style("darkgrid")
        fig, ax = plt.subplots(figsize=size)
        x_indexes = np.arange(dfx.shape[0])

        w = 0.4
        ax.bar(
            x_indexes - w / 2,
            dfx["accuracy"],
            label="Accuracy",
            width=w,
            color="lightblue",
        )
        ax.bar(
            x_indexes + w / 2,
            dfx["bal_accuracy"],
            label="Balanced Accuracy",
            width=w,
            color="steelblue",
        )

        if x_labels:
            validated_labels = {}
            if view == "signals":
                for key, value in x_labels.items():
                    if key in self.sigs:
                        validated_labels[key] = value
                    elif key + "_NEG" in self.sigs:
                        validated_labels[key + "_NEG"] = value
            elif view == "cross_section":
                for key, value in x_labels.items():
                    if key in self.cids:
                        validated_labels[key] = value
            labels = [validated_labels.get(xcat, xcat) for xcat in dfx.index]
        else:
            labels = dfx.index

        ax.set_xticks(x_indexes)
        ax.set_xticklabels(labels, rotation=x_labels_rotate)
        ax.axhline(y=0.5, color="black", linestyle="-", linewidth=0.5)

        y_input = self.__yaxis_lim__(
            accuracy_df=dfx.loc[:, ["accuracy", "bal_accuracy"]]
        )
        ax.set_ylim(round(y_input, 2))

        ax.set_title(title, fontsize=title_fontsize)
        ax.legend(loc=legend_pos)

        if return_fig:
            return fig
        else:
            plt.show()

[docs]    def correlation_bars(
        self,
        ret: str = None,
        sigs: Union[str, List[str]] = None,
        freq: str = None,
        type: str = "cross_section",
        title: str = None,
        title_fontsize: int = 16,
        size: Tuple[float, float] = None,
        legend_pos: str = "best",
        x_labels: Dict = None,
        x_labels_rotate: int = 0,
        return_fig: bool = False,
    ):
        """
        Plot correlation coefficients and significance. For types: cross_section and
        years.

        Parameters
        ----------
        ret : str, optional
            return category. Default is the first return category.
        sig : str, List[str], optional
            signal category. Default is the first signal category.
        type : str, optional
            type of segment over which bars are drawn. Either "cross_section" (default),
            "years" or "signals".
        title : str, optional
            chart header. Default is None, in which case the default title will be applied.
        title_fontsize : int
            font size of chart header. Default is 16.
        size : Tuple[float, float], optional
            2-tuple of width and height of plot. If None, the default size will be applied.
        legend_pos : str
            position of legend box. Default is 'best'. See matplotlib.pyplot.legend.
        x_labels : Dict[str]
            dictionary of x-axis labels. Default is None.
        x_labels_rotate : int
            rotation of x-axis labels. Default is 0.
        """

        assert type in ["cross_section", "years", "signals"]

        if freq is None:
            freq = self.freqs[0]

        if ret is None and sigs is None:
            ret = self.rets[0]
            sigs = self.sigs
        else:
            if ret is None:
                ret = self.rets[0]
            if sigs is None:
                sigs = self.sigs
            elif isinstance(sigs, str):
                if sigs not in self.sigs and sigs + "_NEG" in self.sigs:
                    sigs = sigs + "_NEG"
            if isinstance(sigs, list):
                for sig in sigs:
                    if sig not in self.sigs and sig + "_NEG" in self.sigs:
                        sigs[sigs.index(sig)] = sig + "_NEG"
            self.df = self.original_df.copy()

        if isinstance(sigs, str):
            sigs = [sigs]

        self.manipulate_df(
            xcats=sigs + [ret],
            freq=freq,
            agg_sig=self.agg_sigs[0],
        )
        for i in range(len(sigs)):
            if sigs[i] not in self.sigs:
                sigs[i] = sigs[i] + "_NEG"
        if type == "cross_section":
            df_xs = self.__output_table__(cs_type="cids", ret=ret, sig=sigs[0])
        elif type == "years":
            df_xs = self.__output_table__(cs_type="years", ret=ret, sig=sigs[0])
        else:
            df_xs = self.__rival_sigs__(ret, sigs)

        dfx = df_xs[~df_xs.index.isin(["PosRatio", "Mean"])]

        pprobs = np.array(
            [
                (1 - pv) * (np.sign(cc) + 1) / 2
                for pv, cc in zip(dfx["pearson_pval"], dfx["pearson"])
            ]
        )
        pprobs[pprobs == 0] = 0.01
        kprobs = np.array(
            [
                (1 - pv) * (np.sign(cc) + 1) / 2
                for pv, cc in zip(dfx["kendall_pval"], dfx["kendall"])
            ]
        )
        kprobs[kprobs == 0] = 0.01

        if title is None:
            refsig = "various signals" if type == "signals" else sigs[0]
            title = (
                f"Positive correlation probability of {ret} "
                f"and lagged {refsig} at {self.dic_freq[freq]} frequency."
            )
        if size is None:
            size = (np.max([dfx.shape[0] / 2, 8]), 6)

        sns.set_style("darkgrid")
        fig, ax = plt.subplots(figsize=size)
        x_indexes = np.arange(len(dfx.index))
        w = 0.4
        ax.bar(x_indexes - w / 2, pprobs, label="Pearson", width=w, color="lightblue")
        ax.bar(x_indexes + w / 2, kprobs, label="Kendall", width=w, color="steelblue")

        if x_labels:
            validated_labels = {}
            for key, value in x_labels.items():
                if key in self.sigs:
                    validated_labels[key] = value
                elif key + "_NEG" in self.sigs:
                    validated_labels[key + "_NEG"] = value
            labels = [validated_labels.get(xcat, xcat) for xcat in dfx.index]
        else:
            labels = dfx.index

        ax.set_xticks(x_indexes)
        ax.set_xticklabels(labels, rotation=x_labels_rotate)

        ax.axhline(
            y=0.95,
            color="orange",
            linestyle="--",
            linewidth=0.5,
            label="95% probability",
        )
        ax.axhline(
            y=0.99, color="red", linestyle="--", linewidth=0.5, label="99% probability"
        )

        ax.set_title(title, fontsize=title_fontsize)
        ax.legend(loc=legend_pos)

        if return_fig:
            return fig
        else:
            plt.show()

    @staticmethod
    def __slice_df__(df: pd.DataFrame, cs: str, cs_type: str):
        """
        Slice DataFrame by year, cross-section, or use full panel.

        Parameters
        ----------
        df : ~pandas.DataFrame
            standardised DataFrame.
        cs : str
            individual segment, cross-section or year.
        cs_type : str
            segmentation type.
        """

        # Row names of cross-sections or years.
        if cs != "Panel" and cs_type == "cids":
            df_cs = df.loc[cs]
        elif cs != "Panel":
            df_cs = df[df["year"] == float(cs)]
        else:
            df_cs = df

        return df_cs

    @staticmethod
    def apply_slip(
        df: pd.DataFrame,
        slip: int,
        cids: List[str],
        xcats: List[str],
        metrics: List[str],
    ) -> pd.DataFrame:
        """
        Thin wrapper around the `apply_slip` utility imported from
        `macrosynergy.management.utils`, called with `raise_error=False`.

        Parameters
        ----------
        df : ~pandas.DataFrame
            standardised DataFrame.
        slip : int
            slip value to apply to df.
        cids : List[str]
            list of cids in df to apply slip.
        xcats : List[str]
            list of xcats in df to apply slip.
        metrics : List[str]
            list of metrics in df to apply slip.

        Returns
        -------
        ~pandas.DataFrame
            the result of the underlying utility.
        """

        slip_arguments = dict(
            df=df,
            slip=slip,
            cids=cids,
            xcats=xcats,
            metrics=metrics,
            raise_error=False,
        )
        return apply_slip_util(**slip_arguments)

    @staticmethod
    def is_list_of_strings(variable: Any) -> bool:
        """
        Function used to test whether a variable is a list of strings, to avoid the
        compiler saying a string is a list of characters.

        Parameters
        ----------
        variable : Any
            variable to be tested.

        Returns
        -------
        bool
            True if variable is a list of strings, False otherwise.
        """

        return isinstance(variable, list) and all(
            isinstance(item, str) for item in variable
        )

[docs]    def manipulate_df(self, xcats: List[str], freq: str, agg_sig: str):
        """
        Used to manipulate the DataFrame to the desired format for the analysis. Firstly
        reduces the dataframe to only include data outside of the blacklist and data
        that is relevant to xcat and sig. Then applies the slip to the dataframe. It
        then converts the dataframe to the desired format for the analysis and checks
        whether any negative signs should be introduced.

        Parameters
        ----------
        xcats : List[str]
            list of xcats in df to apply slip.
        freq : str
            frequency to be used in analysis.
        agg_sig : str
            aggregation method to be used in analysis.
        """

        self.df = self.original_df.copy()

        cids = None if self.cids is None else self.cids
        dfd = reduce_df(
            self.df,
            xcats=xcats,
            cids=cids,
            start=self.start,
            end=self.end,
            blacklist=self.blacklist,
        )
        metric_cols: List[str] = list(
            set(dfd.columns.tolist())
            - set(["real_date", "xcat", "cid", "ticker", "last_updated"])
        )
        # here, the slip is applied to the the first xcat (explanatory variable)
        dfd: pd.DataFrame = self.apply_slip(
            df=dfd,
            slip=self.slip,
            cids=cids,
            xcats=[xcats[0]],
            metrics=metric_cols,
        )

        if self.cosp and len(self.sigs) > 1:
            dfd = self.__communal_sample__(df=dfd, signal=xcats[:-1], ret=xcats[-1])

        self.dfd = dfd

        df = categories_df(
            dfd,
            xcats=xcats,
            cids=cids,
            val="value",
            start=None,
            end=None,
            freq=freq,
            blacklist=None,
            lag=1,
            fwin=self.fwin,
            xcat_aggs=[agg_sig, "sum"],
        )
        self.df = df
        self.cids_used_in_last_calculation = list(
            np.sort(self.df.index.get_level_values(0).unique())
        )

    def __communal_sample__(self, df: pd.DataFrame, signal: str, ret: str):
        """
        On a multi-index DataFrame, where the outer index are the cross-sections and the
        inner index are the timestamps, exclude any row where all signals do not have a
        realised value.

        Parameters
        ----------
        df : ~pandas.DataFrame
            standardized DataFrame with the following necessary columns: 'cid', 'xcat',
            'real_date' and 'value'.
        signal : str
            signal category.
        ret : str
            return category.

        .. note::
            Remove the return category from establishing the intersection to preserve the
            maximum amount of signal data available (required because of the applied lag).
        """

        df_w = df.pivot(index=("cid", "real_date"), columns="xcat", values="value")

        storage = []
        cid_name: str
        cid_df: pd.DataFrame
        for cid_name, cid_df in df_w.groupby(level=0, observed=True):
            cid_df = cid_df[signal + [ret]]

            final_df = pd.DataFrame(
                data=np.empty(shape=cid_df.shape),
                columns=cid_df.columns,
                index=cid_df.index,
            )
            final_df.loc[:, :] = np.nan

            # Return category is preserved.
            final_df.loc[:, ret] = cid_df[ret]

            intersection_df = cid_df.loc[:, signal].droplevel(level=0)
            # Intersection exclusively across the signals.
            intersection_df = intersection_df.dropna(how="any")
            if not intersection_df.empty:
                s_date = intersection_df.index[0]
                e_date = intersection_df.index[-1]

                final_df.loc[(cid_name, s_date) : (cid_name, e_date), signal] = (
                    intersection_df.to_numpy()
                )
                storage.append(final_df)
            else:
                warnings.warn(
                    f"Cross-section {cid_name} has no common sample periods for the signals \
                    {signal} and return {ret}."
                )

        df = pd.concat(storage)
        df = df.stack().reset_index().sort_values(["cid", "xcat", "real_date"])
        df.columns = ["cid", "real_date", "xcat", "value"]

        return df[["cid", "xcat", "real_date", "value"]]

    def __table_stats__(
        self,
        df_segment: pd.DataFrame,
        df_out: pd.DataFrame,
        segment: str,
        signal: str,
        ret: str,
    ):
        """
        Compute the evaluation metrics for one segment and write them into the row
        `segment` of `df_out`.

        Sign-based metrics (accuracy, balanced accuracy, positive ratios, precisions
        and AUC) use only observations where neither the return nor the signal is
        exactly zero. Correlation-based metrics, the optional panel test and the
        additional metrics use all observations where both values are available.

        Parameters
        ----------
        df_segment : ~pandas.DataFrame
            segmented DataFrame containing at least the columns `ret` and `signal`.
        df_out : ~pandas.DataFrame
            metric DataFrame where the index will be all segments for the respective
            segmentation type.
        segment : str
            segment which could either be an individual cross-section, year or category.
            Will form the index of the returned DataFrame.
        signal : str
            signal category.
        ret : str
            return category.

        Returns
        -------
        ~pandas.DataFrame
            `df_out` with the row `segment` filled in.
        """

        # Keep only rows where both the return and the signal are available. This
        # matters for rival-signal panel calculations, where the panel may hold NaNs
        # for the signal at hand.
        df_segment = df_segment.loc[:, [ret, signal]].dropna(axis=0, how="any")

        signs = np.sign(df_segment.loc[:, [ret, signal]])
        # Exact zeroes are disqualified for sign analysis only.
        both_nonzero = (signs.iloc[:, 0] != 0) & (signs.iloc[:, 1] != 0)
        signs = signs[both_nonzero]

        sig_sign = signs[signal]
        ret_sign = signs[ret]

        # NOTE(review): the two accuracy scores receive the signal signs as the first
        # argument, whereas the precision and AUC calls below pass the return signs
        # first. Balanced accuracy is not symmetric in its arguments -- confirm that
        # this order is intended.
        df_out.loc[segment, "accuracy"] = skm.accuracy_score(sig_sign, ret_sign)
        df_out.loc[segment, "bal_accuracy"] = skm.balanced_accuracy_score(
            sig_sign, ret_sign
        )

        df_out.loc[segment, "pos_sigr"] = np.mean(sig_sign == 1)
        df_out.loc[segment, "pos_retr"] = np.mean(ret_sign == 1)
        for column, label in (("pos_prec", 1), ("neg_prec", -1)):
            df_out.loc[segment, column] = skm.precision_score(
                ret_sign, sig_sign, pos_label=label
            )

        ret_vals = df_segment[ret]
        sig_vals = df_segment[signal]
        df_out.loc[segment, ["kendall", "kendall_pval"]] = stats.kendalltau(
            ret_vals, sig_vals
        )

        # The Pearson correlation is only computed with more than one sign-qualified
        # observation.
        if len(ret_sign) > 1:
            corr, corr_pval = stats.pearsonr(ret_vals, sig_vals)
        else:
            corr, corr_pval = np.nan, np.nan
        df_out.loc[segment, ["pearson", "pearson_pval"]] = np.array([corr, corr_pval])

        # AUC needs both return classes to be present.
        single_class = (ret_sign == -1.0).all() or (ret_sign == 1.0).all()
        if single_class:
            df_out.loc[segment, "auc"] = np.nan
            warnings.warn(
                "AUC could not be calculated, since the return category has a lack of "
                "class diversity."
            )
        else:
            df_out.loc[segment, "auc"] = skm.roc_auc_score(ret_sign, sig_sign)

        if self.ms_panel_test:
            df_out.loc[segment, "map_pval"] = self.map_pval(ret_vals, sig_vals)

        # User-supplied metrics are called with (returns, signals) and stored under
        # the name of the callable.
        for extra_metric in self.additional_metrics:
            df_out.loc[segment, extra_metric.__name__] = extra_metric(
                ret_vals, sig_vals
            )

        return df_out

[docs]    def map_pval(self, ret_vals, sig_vals) -> float:
        """
        Calculates the p-value of the Macrosynergy Panel (MAP) significance test for
        the signal-return relationship.

        The test fits a period random-effects panel model ``signal ~ 1 + return`` with
        a fixed intercept and random effects grouped by ``real_date``, reusing the
        package's Swamy-Arora feasible-GLS estimator
        :class:`macrosynergy.learning.random_effects.RandomEffects`. The returned
        statistic is the two-sided p-value of the fixed-effect slope on the return,
        rounded to three decimal places.

        Parameters
        ----------
        ret_vals : ~pandas.Series
            return values.
        sig_vals : ~pandas.Series
            signal values.

        Returns
        -------
        float
            p-value of the return slope in the MAP random-effects model, or ``np.nan``
            if it could not be calculated.
        """

        if (
            "cid" not in ret_vals.index.names
            or ret_vals.index.get_level_values("cid").nunique() <= 1
        ):
            warnings.warn(
                "P-value could not be calculated, since there wasn't enough datapoints."
            )
            return np.nan

        # Degeneracy guard: a signal or return column with no genuine variation gives a
        # rank-deficient design. RandomEffects does NOT raise on this -- an
        # (near-)constant column drives the fitted slope and its standard error to
        # machine epsilon, so re.pvals comes out spuriously finite (~0), i.e. a FALSE
        # significant result. Reject up front on a relative-std test so both the
        # exactly-constant and near-constant (constant + tiny noise) cases return nan.
        # Threshold 1e-9 sits far below any legitimate panel's relative variation.
        for _col in (ret_vals, sig_vals):
            _v = np.asarray(_col, dtype=np.float64)
            _std = np.nanstd(_v)
            _scale = np.nanmean(np.abs(_v)) + 1.0
            if not np.isfinite(_std) or _std <= 1e-9 * _scale:
                warnings.warn(
                    "Singular matrix encountered, so p-value could not be calculated."
                )
                return np.nan

        try:
            # divide="raise"/invalid="raise" turns RandomEffects' internal
            # divide-by-zero on a degenerate design (only a RuntimeWarning by default)
            # into a FloatingPointError we can catch rather than returning a bogus 0.
            with np.errstate(divide="raise", invalid="raise"):
                re = RandomEffects(group_col="real_date", fit_intercept=True).fit(
                    ret_vals, sig_vals
                )
                # Features are ['const', <return>]; the return slope is the last entry.
                pval = float(re.pvals.iloc[-1])
        except (KeyError, ValueError, FloatingPointError, np.linalg.LinAlgError):
            # Degenerate/singular design (e.g. a constant/collinear column collapses the
            # intercept, so the 'const' coefficient is absent) -> no slope p-value.
            warnings.warn(
                "Singular matrix encountered, so p-value could not be calculated."
            )
            return np.nan
        if np.isnan(pval):
            warnings.warn(
                "P-value could not be calculated, since there wasn't enough datapoints."
            )
            return np.nan
        return round(pval, 3)

    def __output_table__(
        self,
        cs_type: str = "cids",
        ret: str = None,
        sig: str = None,
        srt: bool = False,
    ):
        """
        Creates a DataFrame with information on the signal-return relation across cross-
        sections or years and, additionally, the panel.

        Parameters
        ----------
        cs_type : str
            the segmentation type.
        ret : str
            return category. Default is the first return category.
        sig : str
            signal category. Default is the first signal category.
        srt : bool
            if True, the DataFrame will be sorted by the cross-sections. Default is False.
        """

        if ret is None:
            ret = self.rets if not isinstance(self.rets, list) else self.rets[0]
        if sig is None:
            sig = self.sigs if not isinstance(self.sigs, list) else self.sigs[0]

        # Analysis completed exclusively on the primary signal.
        r = [ret]
        r.append(sig)
        df = self.df[r]

        # Will remove any timestamps where both the signal & return are not realised.
        # Applicable even if communal sampling has been applied given the alignment
        # excludes the return category.
        df = df.dropna(how="any")

        if cs_type == "cids":
            css = set(self.cids)
            unique_cids_df = set(df.index.get_level_values(0).unique())

            if not css.issubset(unique_cids_df):
                warnings.warn(
                    f"Cross-sections {css - unique_cids_df} have no corresponding xcats \
                        in the dataframe."
                )
                css = css.intersection(unique_cids_df)

            css = sorted(list(css))
        else:
            df["year"] = np.array(df.reset_index(level=1)["real_date"].dt.year)
            css = [str(y) for y in list(set(df["year"]))]
            css = sorted(css)

        statms = self.metrics
        if srt:
            css = []
            index = ["Panel"]
        else:
            index = ["Panel", "Mean", "PosRatio"] + css

        df_out = pd.DataFrame(index=index, columns=statms)

        for cs in css + ["Panel"]:
            df_cs = self.__slice_df__(df=df, cs=cs, cs_type=cs_type)
            df_out = self.__table_stats__(
                df_segment=df_cs, df_out=df_out, segment=cs, signal=sig, ret=ret
            )
        if not srt:
            df_out.loc["Mean", :] = df_out.loc[css, :].mean()

            above50s = statms[0:6] + [statms[statms.index("auc")]]
            # Overview of the cross-sectional performance.
            df_out.loc["PosRatio", above50s] = (df_out.loc[css, above50s] > 0.5).mean()

            above0s = statms[6:9:2]
            pos_corr_coefs = df_out.loc[css, above0s] > 0
            df_out.loc["PosRatio", above0s] = pos_corr_coefs.mean()

            below50s = statms[7:10:2]
            pvals_bool = df_out.loc[css, below50s] < 0.5
            pos_pvals = np.mean(np.array(pvals_bool) * np.array(pos_corr_coefs), axis=0)
            # Positive correlation with error prob < 50%.
            df_out.loc["PosRatio", below50s] = pos_pvals
            if self.ms_panel_test:
                map_pval_bool = df_out.loc[css, "map_pval"] < 0.5
                pos_map_pval = np.mean(np.array(map_pval_bool) * np.nan)
                df_out.loc["PosRatio", "map_pval"] = pos_map_pval

        return df_out.astype("float")

[docs]    def calculate_single_stat(
        self, stat: str, ret: str = None, sig: str = None, type: str = None
    ) -> float:
        """
        Calculates a single statistic for a given signal-return relation.

        Parameters
        ----------
        stat : str
            statistic to be calculated.
        ret : str
            return category. Default is the first return category.
        sig : str
            signal category. Default is the first signal category.
        type : str
            type of segment over which bars are drawn. Either "panel" (default), "years"
            or "signals".

        Returns
        -------
        float
            statistic value.
        """

        r = [ret]
        r.append(sig)
        df = self.df[r]

        df = df.dropna(how="any")

        if type == "panel":
            css = ["Panel"]
            cs_type = "cids"
        elif type == "mean_cids" or type == "pr_cids":
            css = set(self.cids)
            unique_cids_df = set(df.index.get_level_values(0).unique())
            if not css.issubset(unique_cids_df):
                warnings.warn(
                    f"Cross-sections {css - unique_cids_df} have no corresponding xcats \
                        in the dataframe."
                )
                css = css.intersection(unique_cids_df)
            css = sorted(list(css))
            cs_type = "cids"
        elif type == "mean_years" or type == "pr_years":
            df["year"] = np.array(df.reset_index(level=1)["real_date"].dt.year)
            css = [str(y) for y in list(set(df["year"]))]
            css = sorted(css)
            cs_type = "years"
        else:
            raise ValueError("Invalid segmentation type.")

        list_of_results = []
        for cs in css:
            df_segment = self.__slice_df__(df=df, cs=cs, cs_type=cs_type)
            df_segment = df_segment.loc[:, [ret, sig]].dropna(axis=0, how="any")

            df_sgs = np.sign(df_segment.loc[:, [ret, sig]])
            # Exact zeroes are disqualified for sign analysis only.
            df_sgs = df_sgs[~((df_sgs.iloc[:, 0] == 0) | (df_sgs.iloc[:, 1] == 0))]

            sig_sign = df_sgs[sig]
            ret_sign = df_sgs[ret]
            ret_vals, sig_vals = df_segment[ret], df_segment[sig]
            if stat == "accuracy":
                list_of_results.append(skm.accuracy_score(sig_sign, ret_sign))
            elif stat == "bal_accuracy":
                list_of_results.append(skm.balanced_accuracy_score(sig_sign, ret_sign))
            elif stat == "pos_sigr":
                list_of_results.append(np.mean(sig_sign == 1))
            elif stat == "pos_retr":
                list_of_results.append(np.mean(ret_sign == 1))
            elif stat == "pos_prec":
                list_of_results.append(
                    skm.precision_score(ret_sign, sig_sign, pos_label=1)
                )
            elif stat == "neg_prec":
                list_of_results.append(
                    skm.precision_score(ret_sign, sig_sign, pos_label=-1)
                )
            elif stat == "kendall":
                list_of_results.append(stats.kendalltau(ret_vals, sig_vals)[0])
            elif stat == "kendall_pval":
                list_of_results.append(stats.kendalltau(ret_vals, sig_vals)[1])
            elif stat == "pearson":
                list_of_results.append(stats.pearsonr(ret_vals, sig_vals)[0])
            elif stat == "pearson_pval":
                list_of_results.append(stats.pearsonr(ret_vals, sig_vals)[1])
            elif stat == "auc":
                if (ret_sign == -1.0).all() or (ret_sign == 1.0).all():
                    list_of_results.append(np.nan)
                    warnings.warn(
                        "AUC could not be calculated, since the return category has a "
                        "lack of class diversity."
                    )
                else:
                    list_of_results.append(skm.roc_auc_score(ret_sign, sig_sign))
            elif stat == "map_pval" and self.ms_panel_test:
                list_of_results.append(self.map_pval(ret_vals, sig_vals))
            elif True in [
                stat == metric.__name__ for metric in self.additional_metrics
            ]:
                idx = [
                    stat == metric.__name__ for metric in self.additional_metrics
                ].index(True)
                list_of_results.append(self.additional_metrics[idx](ret_vals, sig_vals))
            else:
                raise ValueError("Invalid statistic.")

        if type == "panel":
            return list_of_results[0]
        elif type == "mean_years" or type == "mean_cids":
            return np.mean(np.array(list_of_results))
        elif type == "pr_years" or type == "pr_cids":
            if stat in self.metrics[0:6] + ["auc"]:
                return np.mean(np.array(list_of_results) > 0.5)
            elif stat in self.metrics[6:9:2]:
                return np.mean(np.array(list_of_results) > 0)
            elif stat in self.metrics[7:10:2]:
                return np.mean(np.array(list_of_results) < 0.5)

[docs]    def summary_table(self, cross_section: bool = False, years: bool = False):
        """
        Generates a summary table for the signal-return relations.

        Parameters
        ----------
        cross_section : bool
            if True, the summary table will be generated for cross-sections.
        years : bool
            if True, the summary table will be generated for years. Must be False if
            cross_section is True.

        Returns
        -------
        ~pandas.DataFrame
            summary table.
        """

        warnings.warn(
            "summary_table() has been deprecated will be removed in a subsequent "
            "version, please now use single_relation_table(table_type='summary').",
            FutureWarning,
        )
        if cross_section and years:
            raise ValueError("Both cross_section and years cannot be True")
        if not (cross_section and years):
            return self.single_relation_table(table_type="summary")
        else:
            return self.single_relation_table(
                table_type="years" if years else "cross_section"
            )

[docs]    def signals_table(self, sigs: List[str] = None):
        warnings.warn(
            "signals_table() has been deprecated will be removed in a subsequent "
            "version, please now use multiple_relations_table()",
            FutureWarning,
        )
        if sigs is None:
            sigs = self.sigs
        return self.multiple_relations_table(
            rets=self.rets[0],
            xcats=sigs,
            freqs=self.freqs[0],
            agg_sigs=self.agg_sigs[0],
        )

[docs]    def cross_section_table(self):
        """
        Deprecated method for cross-section table. Use `single_relation_table` instead.
        Shows a table of category values across cross-sections for a given date.
        """
        warnings.warn(
            "cross_section_table() has been deprecated will be removed in a subsequent "
            "version, please now use "
            " single_relation_table(table_type='cross_section_table')",
            FutureWarning,
        )
        return self.single_relation_table(table_type="cross_section")

[docs]    def yearly_table(self):
        """
        Deprecated method for yearly table. Use `single_relation_table` instead.
        Displays annual average values of selected categories across cross-sections.
        """
        warnings.warn(
            "yearly_table() has been deprecated will be removed in a subsequent "
            "version, please now use single_relation_table(table_type='years')",
            FutureWarning,
        )
        return self.single_relation_table(table_type="years")

[docs]    def single_relation_table(
        self,
        ret: str = None,
        xcat: str = None,
        freq: str = None,
        agg_sigs: str = None,
        table_type: str = None,
    ) -> pd.DataFrame:
        """
        Computes all the statistics for one specific signal-return relation:

        Parameters
        ----------
        ret : str
            single target return category. Default is first in target return list of the
            class.
        xcat : str
            single signal category to be considered. Default is first in feature
            category list of the class.
        freq : str
            letter denoting single frequency at which the series will be sampled. This
            must be one of the frequencies selected for the class. If not specified uses the
            freq stored in the class.
        agg_sigs : str
            aggregation method applied to the signal values in down-sampling.
        table_type : str
            type of table to be returned. Either "summary", "years", "cross_section".

        Returns
        -------
        ~pandas.DataFrame
            table with the statistics for the single signal-return relation.
        """

        self.df = self.original_df
        if ret is None:
            ret = self.rets if not isinstance(self.rets, list) else self.rets[0]
        if freq is None:
            freq = self.freqs if not isinstance(self.freqs, list) else self.freqs[0]
        if agg_sigs is None:
            agg_sigs = (
                self.agg_sigs
                if not isinstance(self.agg_sigs, list)
                else self.agg_sigs[0]
            )
        if xcat is None:
            sig = self.sigs if not isinstance(self.sigs, list) else self.sigs[0]
            xcat = [sig, ret]
        elif not isinstance(xcat, str):
            raise TypeError("xcat must be a string")
        else:  # If xcat is a string
            if xcat not in self.sigs and xcat + "_NEG" in self.sigs:
                xcat = xcat + "_NEG"
            sig = xcat
            xcat = [sig, ret]

        if not isinstance(ret, str):
            raise TypeError("ret must be a string")
        if not isinstance(freq, str):
            raise TypeError("freq must be a string")
        if not isinstance(agg_sigs, str):
            raise TypeError("agg_sigs must be a string")

        self.manipulate_df(xcats=xcat, freq=freq, agg_sig=agg_sigs)

        if sig not in self.sigs:
            sig = sig + "_NEG"

        if table_type is not None:
            if table_type not in ["summary", "years", "cross_section"]:
                raise ValueError("Invalid table type")

        if table_type == "years":
            cs_type = "years"
        else:
            cs_type = "cids"

        if table_type == "summary":
            df_result = pd.concat(
                [
                    self.__output_table__(
                        cs_type="years", ret=ret, sig=sig, srt=False
                    ).iloc[:3],
                    self.__output_table__(
                        cs_type="cids", ret=ret, sig=sig, srt=False
                    ).iloc[1:3],
                ],
                axis=0,
            )
            df_result.index = [
                df_result.index[0],
                "Mean years",
                "Positive ratio",
                "Mean cids",
                "Positive ratio",
            ]
        else:
            df_result = self.__output_table__(
                cs_type=cs_type, ret=ret, sig=sig, srt=table_type is None
            )

        self.df = self.original_df
        index = f"{freq}: {sig}/{agg_sigs} => {ret}"

        df_result.rename(index={"Panel": index}, inplace=True)

        return df_result.round(5)

[docs]    def reindex_multindex_df(
        self, df: pd.DataFrame, desired_order: List[str], var_type: str
    ):
        df["Signal_Order"] = pd.Categorical(
            df.index.get_level_values(var_type), categories=desired_order, ordered=True
        )
        df_sorted = df.sort_values("Signal_Order")
        df_sorted.drop("Signal_Order", axis=1, inplace=True)
        return df_sorted

[docs]    def multiple_relations_table(
        self,
        rets: Union[str, List[str]] = None,
        xcats: Union[str, List[str]] = None,
        freqs: Union[str, List[str]] = None,
        agg_sigs: Union[str, List[str]] = None,
        signal_name_dict: Optional[Dict[str, str]] = None,
        return_name_dict: Optional[Dict[str, str]] = None,
    ):
        """
        Calculates all the statistics for each return and signal category specified with
        each frequency and aggregation method, note that if none are defined it does
        this for all categories, frequencies and aggregation methods that were stored in
        the class.

        Parameters
        ----------
        rets : str, List[str]
            target return category
        xcats : str, List[str]
            signal categories to be considered
        freqs : str, List[str]
            letters denoting frequency at which the series are to be sampled. This must
            be one of 'D', 'W', 'M', 'Q', 'A'. If not specified uses the freq stored in the
            class.
        agg_sigs : str, List[str]
            aggregation methods applied to the signal values in down-sampling.
        """

        self.df = self.original_df
        self.xcats = list(self.df["xcat"].unique())
        if rets is None:
            rets = self.rets
        if freqs is None:
            freqs = self.freqs
        if agg_sigs is None:
            agg_sigs = self.agg_sigs
        if not isinstance(agg_sigs, list):
            agg_sigs = [agg_sigs]
        if xcats is None:
            xcats = self.xcats
        else:
            if isinstance(xcats, str):
                if xcats not in self.sigs and xcats + "_NEG" in self.sigs:
                    xcats = xcats + "_NEG"
            if isinstance(xcats, list):
                for xcat in xcats:
                    if xcat not in self.sigs and xcat + "_NEG" in self.sigs:
                        xcats[xcats.index(xcat)] = xcat + "_NEG"
        if not isinstance(xcats, list):
            xcats = [xcats]
        if not isinstance(rets, list):
            rets = [rets]
        if not isinstance(freqs, list):
            freqs = [freqs]

        for rets_elem in rets:
            if rets_elem not in self.xcats:
                raise ValueError(f"{rets_elem} is not a valid return category")

        for xcats_elem in xcats:
            if xcats_elem not in self.xcats:
                raise ValueError(f"{xcats_elem} is not a valid signal category")

        for freqs_elem in freqs:
            if freqs_elem not in self.freqs:
                raise ValueError(f"{freqs_elem} is not a valid frequency")

        for agg_sigs_elem in agg_sigs:
            if agg_sigs_elem not in self.agg_sigs:
                raise ValueError(f"{agg_sigs_elem} is not a valid aggregation method")

        xcats = [x for x in xcats if x in self.sigs]

        multiindex = pd.MultiIndex.from_tuples(
            [
                (ret, xcat, freq, agg_sig)
                for freq in freqs
                for agg_sig in agg_sigs
                for ret in rets
                for xcat in xcats
            ],
            names=["Return", "Signal", "Frequency", "Aggregation"],
        )

        df_rows = []
        for freq in freqs:
            for agg_sig in agg_sigs:
                for ret in rets:
                    self.manipulate_df(xcats=xcats + [ret], freq=freq, agg_sig=agg_sig)
                    for xcat in xcats:
                        df_rows.append(
                            self.__output_table__(
                                cs_type="cids", ret=ret, sig=xcat, srt=True
                            )
                        )

        df_result = pd.concat(df_rows, axis=0)

        df_result.index = multiindex

        if signal_name_dict is not None:
            df_result.rename(index=signal_name_dict, inplace=True)
            df_result = self.reindex_multindex_df(
                df_result, signal_name_dict.values(), "Signal"
            )

        if return_name_dict is not None:
            df_result.rename(index=return_name_dict, inplace=True)
            df_result = self.reindex_multindex_df(
                df_result, return_name_dict.values(), "Return"
            )

        self.df = self.original_df

        return df_result

[docs]    def single_statistic_table(
        self,
        stat: str,
        type: str = "panel",
        rows: List[str] = ["xcat", "agg_sigs"],
        columns: List[str] = ["ret", "freq"],
        show_heatmap: bool = False,
        title: Optional[str] = None,
        title_fontsize: int = 16,
        row_names: Optional[List[str]] = None,
        column_names: Optional[List[str]] = None,
        signal_name_dict: Optional[Dict[str, str]] = None,
        return_name_dict: Optional[Dict[str, str]] = None,
        xcat_labels: Optional[Dict[str, str]] = None,
        freq_labels: Optional[Dict[str, str]] = None,
        agg_sigs_labels: Optional[Dict[str, str]] = None,
        emphasize_rows: Optional[Union[List[str], Dict[str, str]]] = None,
        xcat_row_order: Optional[List[str]] = None,
        min_color: Optional[float] = None,
        max_color: Optional[float] = None,
        figsize: Tuple[float, float] = (14, 8),
        annotate: bool = True,
        round: int = 3,
        pval_stat: Optional[str] = None,
        round_pval: int = 3,
        significance_threshold: Optional[float] = 0.9,
        xlabel: Optional[str] = None,
        ylabel: Optional[str] = None,
        collapse_constant_levels: bool = False,
        axis_label_levels: Optional[List[str]] = None,
        footnote: Optional[str] = None,
        footnote_fontsize: int = 10,
    ):
        """
        Creates a table which shows the specified statistic for each row and column
        specified as arguments:

        Parameters
        ----------
        stat : str
            type of statistic to be displayed (this can be any of the column names of
            summary_table).
        type : str
            type of the statistic displayed. This can be based on the overall panel
            ("panel", default), an average of annual panels (mean_years), an average of
            cross-sectional relations ("mean_cids"), the positive ratio across
            years("pr_years"), positive ratio across sections ("pr_cids").
        rows : List[str]
            row indices, which can be return categories, feature categories, frequencies
            and/or aggregations. The choice is made through a list of one or more of "xcat",
            "ret", "freq" and "agg_sigs". The default is ["xcat", "agg_sigs"] resulting in
            index strings (<agg_signs>) or if only one aggregation is available.
        columns : List[str]
            column indices, which can be return categories, feature categories,
            frequencies and/or aggregations. The choice is made through a list of one or
            more of "xcat", "ret", "freq" and "agg_sigs". The default is ["ret", "freq]
            resulting in index strings () or if only one frequency is available.
        show_heatmap : bool
            if True, the table is visualized as a heatmap. Default is False.
        title : str, optional
            plot title. Default is None in which case the default title is used.
        title_fontsize : int
            font size of title. Default is 16.
        row_names : List[str]
            specifies the labels of rows in the heatmap. Default is None, the indices of
            the generated DataFrame are used.
        column_names : List[str]
            specifies the labels of columns in the heatmap. Default is None, the columns
            of the generated DataFrame are used.
        signal_name_dict : dict, optional
            dictionary mapping the signal names to the desired names in the heatmap.
            Default is None, in which case the signal names are used. Renamed
            values flow through to the auto axis label produced by the
            constant-level collapse described under ``ylabel``.
        return_name_dict : dict, optional
            dictionary mapping the return names to the desired names in the heatmap.
            Default is None, in which case the return names are used. Renamed
            values flow through to the auto axis label produced by the
            constant-level collapse described under ``xlabel``.
        xcat_labels : dict, optional
            Unified rename dictionary covering both signal and return
            ``xcats``. Internally split by membership in ``self.sigs`` /
            ``self.rets`` and routed through ``signal_name_dict`` /
            ``return_name_dict``; xcats not listed in the dict are kept
            verbatim. Mutually exclusive with the two legacy kwargs — pass
            either ``xcat_labels`` or ``signal_name_dict`` /
            ``return_name_dict``, not both. Default is None (no rename).
        freq_labels : dict, optional
            Mapping from frequency code (``"M"``, ``"Q"``, …) to the
            display label used on the heatmap and in the auto axis label
            produced by the constant-level collapse. Frequencies not
            listed in the dict are kept verbatim. Default is None
            (raw codes are shown).
        agg_sigs_labels : dict, optional
            Mapping from aggregation code (``"last"``, ``"mean"``, …) to
            the display label used on the heatmap and in the auto axis
            label produced by the constant-level collapse. Aggregations
            not listed in the dict are kept verbatim. Default is None
            (raw codes are shown).
        emphasize_rows : dict or List[str], optional
            Signal xcats (as passed to ``sigs``) whose rows are outlined
            with a box in the heatmap, scorecard-style. Pass a mapping
            ``{xcat: color}`` to set each box colour (any valid matplotlib
            colour); a plain list is treated as ``{xcat: "black"}``.
            Requires ``"xcat"`` in ``rows``; raises ``ValueError`` if set
            without it, if a name is not among ``sigs``, or if a colour is
            invalid. Boxing does not reorder the table — use
            ``xcat_row_order`` for that. Default is None.
        xcat_row_order : List[str], optional
            Signal xcats (as passed to ``sigs``) giving the desired
            top-to-bottom row order. Listed signals are placed in this
            order; any signals not listed keep their original order above
            them. Requires ``"xcat"`` in ``rows``; raises ``ValueError``
            if set without it or if a name is not among ``sigs``. The
            returned DataFrame reflects the new row order; the statistics
            are unchanged. Default is None.
        min_color : float, optional
            minimum value of the color scale. Default is None, in which case the minimum
            value of the table is used.
        max_color : float, optional
            maximum value of the color scale. Default is None, in which case the maximum
            value of the table is used.
        figsize : Tuple[float, float]
            Tuple (w, h) of width and height of graph. Default is (14, 8).
        annotate : bool
            Default is True, where the values shown in the heatmap are annotated.
        round : int
            number of decimals to round the primary statistic to in the heatmap
            annotations. Default is 3.
        pval_stat : str, optional
            name of a p-value statistic — typically ``"kendall_pval"``,
            ``"pearson_pval"`` or ``"map_pval"`` (the Macrosynergy Panel
            test). When set, each heatmap cell shows the **probability of
            significance**, ``1 - pval_stat``, in brackets beneath the
            primary statistic. Default is None. When ``pval_stat="map_pval"``
            the SignalReturnRelations must have been constructed with
            ``ms_panel_test=True``.
        round_pval : int
            number of decimals to round the bracketed probability of
            significance to in the heatmap annotations. Default is 3.
        significance_threshold : float, optional
            probability-of-significance cutoff above which a cell's
            annotation is rendered in black and bold. Compared directly
            against the bracketed value (``1 - pval_stat``), so 0.9
            highlights cells whose probability of significance exceeds 0.9
            (equivalently, raw p-value below 0.1). Only takes effect when
            ``pval_stat`` is set. Pass ``None`` to disable. Default is 0.9.
        xlabel : str, optional
            Label drawn beneath the heatmap columns, useful for naming
            the target return (e.g. ``"Forward return (target)"``).
            Default is None. When ``collapse_constant_levels=True`` and
            the caller leaves this None, any column-index levels whose
            values are constant across the table are auto-collapsed into
            this label (joined by ``" · "``). See ``axis_label_levels``
            to restrict which constant levels feed into the label.
        ylabel : str, optional
            Label drawn beside the heatmap rows, useful for naming the
            feature (e.g. ``"Factor (feature)"``). Default is None. When
            ``collapse_constant_levels=True`` and the caller leaves this
            None, any row-index levels whose values are constant across
            the table are auto-collapsed into this label (joined by
            ``" · "``). For instance, a table whose rows iterate over
            one signal, one aggregation, and several frequencies will
            display only the frequencies as y-tick labels and place
            ``"<signal> · <aggregation>"`` on the y-axis label. See
            ``axis_label_levels`` to restrict which constant levels
            feed into the label.
        collapse_constant_levels : bool, optional
            When True, row/column index levels whose values are constant
            across the table are stripped from the tick labels and
            promoted to the corresponding axis label (joined by
            ``" · "``) when the caller did not pass ``xlabel``/``ylabel``
            (or ``row_names``/``column_names``) explicitly. The returned
            DataFrame is unchanged in every case. Default is False (raw
            MultiIndex tuples appear as tick labels, matching the
            historical rendering). Required to be True before passing
            ``axis_label_levels``.
        axis_label_levels : List[str], optional
            Subset of ``["xcat", "ret", "freq", "agg_sigs"]`` naming the
            level keys eligible for promotion into the auto x/y-axis
            label. Constant levels not in this list still collapse from
            the tick labels but do not appear in the axis label. Only
            takes effect when ``collapse_constant_levels=True``; raises
            ``ValueError`` otherwise. Default is None, which promotes
            every collapsed level into the label. Pass e.g.
            ``["xcat", "ret"]`` to keep the auto-label limited to the
            signal/return identity and drop the aggregation/frequency
            suffix.
        footnote : str, optional
            Free-text caption rendered below the heatmap. Useful for
            recording the significance test, panel scope, or annotation
            legend (e.g. ``"Significance computed with the Macrosynergy
            panel test."``). Multi-line strings are supported. Default
            is None (no footnote).
        footnote_fontsize : int, optional
            Font size for the footnote text. Default is 10.

        Returns
        -------
        ~pandas.DataFrame
            DataFrame with the specified statistic for each row and column.
        """

        self.df = self.original_df.copy()

        if stat not in self.metrics:
            raise ValueError(f"Stat must be one of {self.metrics}")

        if pval_stat is not None:
            if pval_stat == "map_pval" and not self.ms_panel_test:
                raise ValueError(
                    "pval_stat='map_pval' requires SignalReturnRelations to "
                    "be constructed with ms_panel_test=True."
                )
            if pval_stat not in self.metrics:
                raise ValueError(f"pval_stat must be one of {self.metrics}")

        if not isinstance(rows, list):
            raise TypeError("Rows must be a list")
        if not isinstance(columns, list):
            raise TypeError("Columns must be a list")

        type_values = ["panel", "mean_years", "mean_cids", "pr_years", "pr_cids"]
        rows_values = ["xcat", "ret", "freq", "agg_sigs"]

        if type not in type_values:
            raise ValueError(f"Type must be one of {type_values}")

        if not all([x in rows_values for x in rows]):
            raise ValueError(f"Rows must only contain {rows_values}")

        if not all([x in rows_values for x in columns]):
            raise ValueError(f"Columns must only contain {rows_values}")

        if axis_label_levels is not None:
            if not collapse_constant_levels:
                raise ValueError(
                    "axis_label_levels requires collapse_constant_levels=True."
                )
            if not all(x in rows_values for x in axis_label_levels):
                raise ValueError(f"axis_label_levels must only contain {rows_values}")

        if xcat_labels is not None:
            if signal_name_dict is not None or return_name_dict is not None:
                raise ValueError(
                    "Pass either xcat_labels or "
                    "signal_name_dict/return_name_dict, not both."
                )
            # Build identity-filled rename dicts so existing keys preserve
            # their position and unrenamed xcats are not dropped by the
            # downstream reorder.
            signal_name_dict = {s: xcat_labels.get(s, s) for s in self.sigs}
            return_name_dict = {r: xcat_labels.get(r, r) for r in self.rets}

        def _resolve_sig(name: str, label: str) -> str:
            # Resolve the sig_neg suffix the same way as elsewhere so the
            # original xcat still matches self.sigs.
            if name not in self.sigs and f"{name}_NEG" in self.sigs:
                name = f"{name}_NEG"
            if name not in self.sigs:
                raise ValueError(
                    f"{label} entries must be among sigs {self.sigs}; "
                    f"got unknown {name!r}."
                )
            return name

        if (emphasize_rows is not None or xcat_row_order is not None) and (
            "xcat" not in rows
        ):
            raise ValueError("emphasize_rows / xcat_row_order require 'xcat' in rows.")

        # Map matched signal -> box colour (list form defaults to black).
        emphasize_colors: Dict[str, str] = {}
        if emphasize_rows is not None:
            items = (
                emphasize_rows.items()
                if isinstance(emphasize_rows, dict)
                else [(e, "black") for e in emphasize_rows]
            )
            for e, color in items:
                if not is_color_like(color):
                    raise ValueError(
                        f"emphasize_rows colour {color!r} for {e!r} is not a "
                        "valid matplotlib colour."
                    )
                emphasize_colors[_resolve_sig(e, "emphasize_rows")] = color

        row_order_sigs: List[str] = (
            [_resolve_sig(x, "xcat_row_order") for x in xcat_row_order]
            if xcat_row_order is not None
            else []
        )

        rows_dict = {
            "xcat": self.sigs,
            "ret": self.rets,
            "freq": self.freqs,
            "agg_sigs": self.agg_sigs,
        }

        df_row_names, df_column_names = self.set_df_labels(rows_dict, rows, columns)

        df_result = pd.DataFrame(
            columns=df_column_names, index=df_row_names, dtype=np.float64
        )
        # sort index to prevent performance degradation: PerformanceWarning
        df_result.sort_index(inplace=True)

        df_pval: Optional[pd.DataFrame] = None
        if pval_stat is not None:
            df_pval = pd.DataFrame(
                columns=df_column_names, index=df_row_names, dtype=np.float64
            )
            df_pval.sort_index(inplace=True)

        loop_tuples: List[Tuple[str, str, str, str]] = [
            (ret, sig, freq, agg_sig)
            for ret in self.rets
            for sig in self.sigs
            for freq in self.freqs
            for agg_sig in self.agg_sigs
        ]

        # Reorder tuples

        for ret, sig, freq, agg_sig in loop_tuples:
            # Prepare xcat and manipulate DataFrame
            xcat = [sig, ret]
            self.manipulate_df(xcats=xcat, freq=freq, agg_sig=agg_sig)
            hash = f"{ret}/{sig}/{freq}/{agg_sig}"

            row = self.get_rowcol(hash, rows)
            column = self.get_rowcol(hash, columns)
            df_result.loc[row, column] = self.calculate_single_stat(
                stat, ret, sig, type
            )
            if pval_stat is not None:
                df_pval.loc[row, column] = self.calculate_single_stat(
                    pval_stat, ret, sig, type
                )

            # Reset self.df and sig to original values
            self.df = self.original_df

        if signal_name_dict is not None:
            # Reorder the index according to the signal_name_dict
            if "xcat" in rows:
                df_result.rename(index=signal_name_dict, inplace=True)
                df_result = self.reindex_multindex_df(
                    df_result, signal_name_dict.values(), "Signal"
                )
                if df_pval is not None:
                    df_pval.rename(index=signal_name_dict, inplace=True)
                    df_pval = self.reindex_multindex_df(
                        df_pval, signal_name_dict.values(), "Signal"
                    )
            else:
                df_result.rename(columns=signal_name_dict, inplace=True)
                df_result = df_result[signal_name_dict.values()]
                if df_pval is not None:
                    df_pval.rename(columns=signal_name_dict, inplace=True)
                    df_pval = df_pval[signal_name_dict.values()]

        if return_name_dict is not None:
            # Reorder the index according to the return_name_dict
            if "ret" in rows:
                df_result.rename(index=return_name_dict, inplace=True)
                df_result = self.reindex_multindex_df(
                    df_result, return_name_dict.values(), "Return"
                )
                if df_pval is not None:
                    df_pval.rename(index=return_name_dict, inplace=True)
                    df_pval = self.reindex_multindex_df(
                        df_pval, return_name_dict.values(), "Return"
                    )
            else:
                df_result.rename(columns=return_name_dict, inplace=True)
                df_result = df_result[return_name_dict.values()]
                if df_pval is not None:
                    df_pval.rename(columns=return_name_dict, inplace=True)
                    df_pval = df_pval[return_name_dict.values()]

        # Frequency / aggregation display renames. Identity-fill so that
        # frequencies (or aggregations) not listed in the user dict keep
        # their slot in the renamed axis instead of being dropped by the
        # downstream reorder, mirroring the xcat_labels pattern above.
        # The renamed values flow into both the heatmap tick labels and
        # the auto axis label produced by ``collapse_constant_levels``.
        if freq_labels is not None:
            freq_labels_full = {f: freq_labels.get(f, f) for f in self.freqs}
            if "freq" in rows:
                df_result.rename(index=freq_labels_full, inplace=True)
                df_result = self.reindex_multindex_df(
                    df_result, list(freq_labels_full.values()), "Frequency"
                )
                if df_pval is not None:
                    df_pval.rename(index=freq_labels_full, inplace=True)
                    df_pval = self.reindex_multindex_df(
                        df_pval, list(freq_labels_full.values()), "Frequency"
                    )
            elif "freq" in columns:
                df_result.rename(columns=freq_labels_full, inplace=True)
                if df_pval is not None:
                    df_pval.rename(columns=freq_labels_full, inplace=True)

        if agg_sigs_labels is not None:
            agg_sigs_labels_full = {a: agg_sigs_labels.get(a, a) for a in self.agg_sigs}
            if "agg_sigs" in rows:
                df_result.rename(index=agg_sigs_labels_full, inplace=True)
                df_result = self.reindex_multindex_df(
                    df_result, list(agg_sigs_labels_full.values()), "Aggregation"
                )
                if df_pval is not None:
                    df_pval.rename(index=agg_sigs_labels_full, inplace=True)
                    df_pval = self.reindex_multindex_df(
                        df_pval, list(agg_sigs_labels_full.values()), "Aggregation"
                    )
            elif "agg_sigs" in columns:
                df_result.rename(columns=agg_sigs_labels_full, inplace=True)
                if df_pval is not None:
                    df_pval.rename(columns=agg_sigs_labels_full, inplace=True)

        box_rows: Optional[Dict[int, str]] = None
        if row_order_sigs or emphasize_colors:
            # Translate resolved signals through the effective rename so they
            # still match after signal_name_dict / xcat_labels renamed the index.
            def _translate(name: str) -> str:
                return signal_name_dict.get(name, name) if signal_name_dict else name

            signal_level = (
                df_result.index.get_level_values("Signal")
                if isinstance(df_result.index, pd.MultiIndex)
                else df_result.index
            )

            if row_order_sigs:
                # Stable-sort listed signals to the given order; unlisted rank
                # -1 and keep their original order above them.
                rank_map = {_translate(s): i for i, s in enumerate(row_order_sigs)}
                rank = [rank_map.get(s, -1) for s in signal_level]
                order = np.argsort(rank, kind="stable")
                df_result = df_result.iloc[order]
                if df_pval is not None:
                    df_pval = df_pval.iloc[order]
                signal_level = signal_level[order]

            if emphasize_colors:
                color_map = {_translate(s): c for s, c in emphasize_colors.items()}
                box_rows = {
                    i: color_map[s]
                    for i, s in enumerate(signal_level)
                    if s in color_map
                }

        if show_heatmap:
            if not title:
                title = f"{stat}"

            if min_color is None:
                min_color = df_result.values.min()
            if max_color is None:
                max_color = df_result.values.max()

            # Convert raw p-values to probability of significance (1 - pval)
            # so the bracketed value and the highlight threshold share the
            # same scale.
            df_psig = 1.0 - df_pval if df_pval is not None else None

            if annotate and df_psig is not None:
                heatmap_annot = self._format_dual_annot(
                    df_result, df_psig, round, round_pval
                )
                heatmap_fmt = ""
            else:
                heatmap_annot = annotate
                heatmap_fmt = f".{round}f"

            highlight_mask = None
            if df_psig is not None and significance_threshold is not None:
                highlight_mask = df_psig > float(significance_threshold)

            yticklabels_to_pass = row_names
            xticklabels_to_pass = column_names
            ylabel_to_pass = ylabel
            xlabel_to_pass = xlabel

            if collapse_constant_levels:
                # Strip row/column index levels whose values are constant
                # so they don't clutter the tick labels. The collapsed
                # values are promoted to the corresponding axis label
                # when the caller did not provide one. ``df_result``
                # itself is left untouched.
                display_yticks, constant_y = self._collapse_constant_levels(
                    df_result.index
                )
                display_xticks, constant_x = self._collapse_constant_levels(
                    df_result.columns
                )

                if yticklabels_to_pass is None:
                    yticklabels_to_pass = display_yticks
                if xticklabels_to_pass is None:
                    xticklabels_to_pass = display_xticks

                # Filter which collapsed levels feed into the auto axis
                # label. ``axis_label_levels`` is expressed in the same
                # vocabulary as ``rows`` / ``columns`` (``"xcat"``,
                # ``"ret"``, ``"freq"``, ``"agg_sigs"``); translate to
                # the display level names used in the MultiIndex.
                label_dict = {
                    "xcat": "Signal",
                    "ret": "Return",
                    "freq": "Frequency",
                    "agg_sigs": "Aggregation",
                }
                if axis_label_levels is not None:
                    allowed = {label_dict[k] for k in axis_label_levels}
                    constant_y = [(n, v) for n, v in constant_y if n in allowed]
                    constant_x = [(n, v) for n, v in constant_x if n in allowed]

                if ylabel_to_pass is None and constant_y:
                    ylabel_to_pass = " · ".join(v for _, v in constant_y)
                if xlabel_to_pass is None and constant_x:
                    xlabel_to_pass = " · ".join(v for _, v in constant_x)

            msv.view_table(
                df_result,
                title=title,
                title_fontsize=title_fontsize,
                min_color=min_color,
                max_color=max_color,
                figsize=figsize,
                fmt=heatmap_fmt,
                annot=heatmap_annot,
                xlabel=xlabel_to_pass,
                ylabel=ylabel_to_pass,
                xticklabels=xticklabels_to_pass,
                yticklabels=yticklabels_to_pass,
                highlight_mask=highlight_mask,
                box_rows=box_rows,
                footnote=footnote,
                footnote_fontsize=footnote_fontsize,
            )

        return df_result

[docs]    def show_single_statistic_table(self, *args, **kwargs) -> pd.DataFrame:
        """
        Return the single statistic table without rendering a heatmap.

        Thin wrapper around :meth:`single_statistic_table` that forces
        ``show_heatmap=False``.

        Parameters
        ----------
        stat : str
            type of statistic to be displayed (this can be any of the column names of
            summary_table).
        type : str
            type of the statistic displayed. This can be based on the overall panel
            ("panel", default), an average of annual panels (mean_years), an average of
            cross-sectional relations ("mean_cids"), the positive ratio across
            years("pr_years"), positive ratio across sections ("pr_cids").
        rows : List[str]
            row indices, which can be return categories, feature categories, frequencies
            and/or aggregations. The choice is made through a list of one or more of "xcat",
            "ret", "freq" and "agg_sigs". The default is ["xcat", "agg_sigs"] resulting in
            index strings (<agg_signs>) or if only one aggregation is available.
        columns : List[str]
            column indices, which can be return categories, feature categories,
            frequencies and/or aggregations. The choice is made through a list of one or
            more of "xcat", "ret", "freq" and "agg_sigs". The default is ["ret", "freq]
            resulting in index strings () or if only one frequency is available.
        title : str, optional
            plot title. Default is None in which case the default title is used.
        title_fontsize : int
            font size of title. Default is 16.
        row_names : List[str]
            specifies the labels of rows in the heatmap. Default is None, the indices of
            the generated DataFrame are used.
        column_names : List[str]
            specifies the labels of columns in the heatmap. Default is None, the columns
            of the generated DataFrame are used.
        signal_name_dict : dict, optional
            dictionary mapping the signal names to the desired names in the heatmap.
            Default is None, in which case the signal names are used. Renamed
            values flow through to the auto axis label produced by the
            constant-level collapse described under ``ylabel``.
        return_name_dict : dict, optional
            dictionary mapping the return names to the desired names in the heatmap.
            Default is None, in which case the return names are used. Renamed
            values flow through to the auto axis label produced by the
            constant-level collapse described under ``xlabel``.
        xcat_labels : dict, optional
            Unified rename dictionary covering both signal and return
            ``xcats``. Internally split by membership in ``self.sigs`` /
            ``self.rets`` and routed through ``signal_name_dict`` /
            ``return_name_dict``; xcats not listed in the dict are kept
            verbatim. Mutually exclusive with the two legacy kwargs — pass
            either ``xcat_labels`` or ``signal_name_dict`` /
            ``return_name_dict``, not both. Default is None (no rename).
        freq_labels : dict, optional
            Mapping from frequency code (``"M"``, ``"Q"``, …) to its
            display label. Frequencies not listed in the dict are kept
            verbatim. Default is None.
        agg_sigs_labels : dict, optional
            Mapping from aggregation code (``"last"``, ``"mean"``, …) to
            its display label. Aggregations not listed in the dict are
            kept verbatim. Default is None.
        min_color : float, optional
            minimum value of the color scale. Default is None, in which case the minimum
            value of the table is used.
        max_color : float, optional
            maximum value of the color scale. Default is None, in which case the maximum
            value of the table is used.
        figsize : Tuple[float, float]
            Tuple (w, h) of width and height of graph. Default is (14, 8).
        annotate : bool
            Default is True, where the values shown in the heatmap are annotated.
        round : int
            number of decimals to round the primary statistic to in the heatmap
            annotations. Default is 3.
        pval_stat : str, optional
            name of a p-value statistic — typically ``"kendall_pval"``,
            ``"pearson_pval"`` or ``"map_pval"`` (the Macrosynergy Panel
            test). When set, each heatmap cell shows the **probability of
            significance**, ``1 - pval_stat``, in brackets beneath the
            primary statistic. Default is None. When ``pval_stat="map_pval"``
            the SignalReturnRelations must have been constructed with
            ``ms_panel_test=True``.
        round_pval : int
            number of decimals to round the bracketed probability of
            significance to in the heatmap annotations. Default is 3.
        significance_threshold : float, optional
            probability-of-significance cutoff above which a cell's
            annotation is rendered in black and bold. Compared directly
            against the bracketed value (``1 - pval_stat``), so 0.9
            highlights cells whose probability of significance exceeds 0.9
            (equivalently, raw p-value below 0.1). Only takes effect when
            ``pval_stat`` is set. Pass ``None`` to disable. Default is 0.9.
        xlabel, ylabel, footnote, footnote_fontsize
            Forwarded to :meth:`single_statistic_table` and only affect
            the heatmap; accepted here for API symmetry even though this
            wrapper renders no heatmap.

        Returns
        -------
        ~pandas.DataFrame
            DataFrame with the specified statistic for each row and column.
        """
        kwargs["show_heatmap"] = False
        return self.single_statistic_table(*args, **kwargs)

[docs]    def plot_single_statistic_heatmap(self, *args, **kwargs) -> None:
        """
        Render the heatmap of the single statistic table.

        Thin wrapper around :meth:`single_statistic_table` that forces
        ``show_heatmap=True``. The computed table itself is not returned.

        Parameters
        ----------
        stat : str
            type of statistic to be displayed (this can be any of the column names of
            summary_table).
        type : str
            type of the statistic displayed. This can be based on the overall panel
            ("panel", default), an average of annual panels (mean_years), an average of
            cross-sectional relations ("mean_cids"), the positive ratio across
            years("pr_years"), positive ratio across sections ("pr_cids").
        rows : List[str]
            row indices, which can be return categories, feature categories, frequencies
            and/or aggregations. The choice is made through a list of one or more of "xcat",
            "ret", "freq" and "agg_sigs". The default is ["xcat", "agg_sigs"] resulting in
            index strings (<agg_signs>) or if only one aggregation is available.
        columns : List[str]
            column indices, which can be return categories, feature categories,
            frequencies and/or aggregations. The choice is made through a list of one or
            more of "xcat", "ret", "freq" and "agg_sigs". The default is ["ret", "freq]
            resulting in index strings () or if only one frequency is available.
        show_heatmap : bool
            not allowed; this wrapper always forces ``show_heatmap=True`` and
            any value supplied by the caller is overridden.
        title : str, optional
            plot title. Default is None in which case the default title is used.
        title_fontsize : int
            font size of title. Default is 16.
        row_names : List[str]
            specifies the labels of rows in the heatmap. Default is None, the indices of
            the generated DataFrame are used.
        column_names : List[str]
            specifies the labels of columns in the heatmap. Default is None, the columns
            of the generated DataFrame are used.
        signal_name_dict : dict, optional
            dictionary mapping the signal names to the desired names in the heatmap.
            Default is None, in which case the signal names are used. Renamed
            values flow through to the auto axis label produced by the
            constant-level collapse described under ``ylabel``.
        return_name_dict : dict, optional
            dictionary mapping the return names to the desired names in the heatmap.
            Default is None, in which case the return names are used. Renamed
            values flow through to the auto axis label produced by the
            constant-level collapse described under ``xlabel``.
        xcat_labels : dict, optional
            Unified rename dictionary covering both signal and return
            ``xcats``. Internally split by membership in ``self.sigs`` /
            ``self.rets`` and routed through ``signal_name_dict`` /
            ``return_name_dict``; xcats not listed in the dict are kept
            verbatim. Mutually exclusive with the two legacy kwargs — pass
            either ``xcat_labels`` or ``signal_name_dict`` /
            ``return_name_dict``, not both. Default is None (no rename).
        freq_labels : dict, optional
            Mapping from frequency code (``"M"``, ``"Q"``, …) to its
            display label. Frequencies not listed in the dict are kept
            verbatim. Default is None.
        agg_sigs_labels : dict, optional
            Mapping from aggregation code (``"last"``, ``"mean"``, …) to
            its display label. Aggregations not listed in the dict are
            kept verbatim. Default is None.
        min_color : float, optional
            minimum value of the color scale. Default is None, in which case the minimum
            value of the table is used.
        max_color : float, optional
            maximum value of the color scale. Default is None, in which case the maximum
            value of the table is used.
        figsize : Tuple[float, float]
            Tuple (w, h) of width and height of graph. Default is (14, 8).
        annotate : bool
            Default is True, where the values shown in the heatmap are annotated.
        round : int
            number of decimals to round the primary statistic to in the heatmap
            annotations. Default is 3.
        pval_stat : str, optional
            name of a p-value statistic — typically ``"kendall_pval"``,
            ``"pearson_pval"`` or ``"map_pval"`` (the Macrosynergy Panel
            test). When set, each heatmap cell shows the **probability of
            significance**, ``1 - pval_stat``, in brackets beneath the
            primary statistic. Default is None. When ``pval_stat="map_pval"``
            the SignalReturnRelations must have been constructed with
            ``ms_panel_test=True``.
        round_pval : int
            number of decimals to round the bracketed probability of
            significance to in the heatmap annotations. Default is 3.
        significance_threshold : float, optional
            probability-of-significance cutoff above which a cell's
            annotation is rendered in black and bold. Compared directly
            against the bracketed value (``1 - pval_stat``), so 0.9
            highlights cells whose probability of significance exceeds 0.9
            (equivalently, raw p-value below 0.1). Only takes effect when
            ``pval_stat`` is set. Pass ``None`` to disable. Default is 0.9.
        xlabel : str, optional
            Label drawn beneath the heatmap columns. Default is None.
        ylabel : str, optional
            Label drawn beside the heatmap rows. Default is None.
        footnote : str, optional
            Free-text caption rendered below the heatmap. Useful for
            recording the significance test, panel scope, or annotation
            legend. Multi-line strings are supported. Default is None.
        footnote_fontsize : int, optional
            Font size for the footnote text. Default is 10.
        """
        kwargs["show_heatmap"] = True
        self.single_statistic_table(*args, **kwargs)

    @staticmethod
    def _format_dual_annot(
        df_stat: pd.DataFrame,
        df_pval: pd.DataFrame,
        round_stat: int,
        round_pval: int,
    ) -> pd.DataFrame:
        """
        Build a string-typed DataFrame of cell annotations of the form
        ``"<stat>\\n(<pval>)"`` aligned with ``df_stat``. NaN values render
        as empty strings.
        """

        def _fmt(value: float, ndigits: int) -> str:
            if value is None or (isinstance(value, float) and np.isnan(value)):
                return ""
            return f"{value:.{ndigits}f}"

        annot = pd.DataFrame(index=df_stat.index, columns=df_stat.columns, dtype=object)
        for row in df_stat.index:
            for col in df_stat.columns:
                stat_str = _fmt(df_stat.loc[row, col], round_stat)
                pval_str = _fmt(df_pval.loc[row, col], round_pval)
                if stat_str == "" and pval_str == "":
                    annot.loc[row, col] = ""
                elif pval_str == "":
                    annot.loc[row, col] = stat_str
                else:
                    annot.loc[row, col] = f"{stat_str}\n({pval_str})"
        return annot

    def _collapse_constant_levels(
        self, idx: pd.Index
    ) -> Tuple[Optional[List[str]], List[Tuple[str, str]]]:
        """
        Strip levels of a MultiIndex whose values are constant across the
        index and surface those values for axis-label use.

        Parameters
        ----------
        idx : pd.Index
            Row or column index of the assembled statistic table. May be a
            plain :class:`~pandas.Index` or a :class:`~pandas.MultiIndex`.

        Returns
        -------
        Tuple[Optional[List[str]], List[Tuple[str, str]]]
            ``(display_labels, constant_pairs)``.
            ``display_labels`` is a list of tick labels with constant levels
            removed, joined by ``" · "`` when more than one level survives.
            It is ``None`` when no collapse applies (plain ``Index``, single
            level, no constant levels, or all levels constant — in which
            case the existing tick labels are kept). ``constant_pairs`` is
            an ordered list of ``(level_name, value)`` for each collapsed
            level, suitable for filtering and joining into an auto axis
            label.
        """
        if not isinstance(idx, pd.MultiIndex) or idx.nlevels < 2:
            return None, []

        constant_level_nos: List[int] = []
        constant_pairs: List[Tuple[str, str]] = []
        for level_no in range(idx.nlevels):
            uniq = idx.get_level_values(level_no).unique()
            if len(uniq) == 1:
                constant_level_nos.append(level_no)
                constant_pairs.append((str(idx.names[level_no]), str(uniq[0])))

        if not constant_level_nos:
            return None, []
        if len(constant_level_nos) == idx.nlevels:
            # Every level is constant (single-row/column table): leave the
            # tick labels alone but still expose the values for the axis.
            return None, constant_pairs

        remaining = idx.droplevel(constant_level_nos)
        if isinstance(remaining, pd.MultiIndex):
            display = [
                " · ".join(str(part) for part in tup) for tup in remaining.tolist()
            ]
        else:
            display = [str(v) for v in remaining.tolist()]
        return display, constant_pairs

[docs]    def set_df_labels(self, rows_dict: Dict, rows: List[str], columns: List[str]):
        """
        Creates two lists of strings that will be used as the row and column labels for
        the resulting dataframe.

        Parameters
        ----------
        rows_dict : dict
            dictionary containing the each value for each of the xcat, ret, freq and
            agg_sigs categories.
        rows : List[str]
            list of strings specifying which of the categories are included in the rows
            of the dataframe.
        columns : List[str]
            list of strings specifying which of the categories are included in the
            columns of the dataframe.
        """

        label_dict = {
            "xcat": "Signal",
            "ret": "Return",
            "freq": "Frequency",
            "agg_sigs": "Aggregation",
        }
        if len(rows) == 2:
            rows_names = pd.MultiIndex.from_tuples(
                [(a, b) for a in rows_dict[rows[0]] for b in rows_dict[rows[1]]],
                names=[label_dict[rows[0]], label_dict[rows[1]]],
            )
            columns_names = pd.MultiIndex.from_tuples(
                [(a, b) for a in rows_dict[columns[0]] for b in rows_dict[columns[1]]],
                names=[label_dict[columns[0]], label_dict[columns[1]]],
            )
        elif len(rows) == 1:
            rows_names = rows_dict[rows[0]]
            columns_names = pd.MultiIndex.from_tuples(
                [
                    (a, b, c)
                    for a in rows_dict[columns[0]]
                    for b in rows_dict[columns[1]]
                    for c in rows_dict[columns[2]]
                ],
                names=[
                    label_dict[columns[0]],
                    label_dict[columns[1]],
                    label_dict[columns[2]],
                ],
            )
        elif len(columns) == 1:
            rows_names = pd.MultiIndex.from_tuples(
                [
                    (a, b, c)
                    for a in rows_dict[rows[0]]
                    for b in rows_dict[rows[1]]
                    for c in rows_dict[rows[2]]
                ],
                names=[label_dict[rows[0]], label_dict[rows[1]], label_dict[rows[2]]],
            )
            columns_names = rows_dict[columns[0]]

        return rows_names, columns_names

[docs]    def get_rowcol(self, hash: str, rowcols: List[str]):
        """
        Calculates which row/column the hash belongs to.

        Parameters
        ----------
        hash : str
            hash of the statistic.
        rowcols : List[str]
            list of strings specifying which of the categories are in the rows/columns
            of the dataframe.
        """

        result = ""
        idx: List[str] = ["ret", "xcat", "freq", "agg_sigs"]
        assert all([x in idx for x in rowcols]), "rowcols must be a subset of idx"

        if len(rowcols) == 1:
            result = hash.split("/")[idx.index(rowcols[0])]
        if len(rowcols) == 2:
            result = (
                hash.split("/")[idx.index(rowcols[0])],
                hash.split("/")[idx.index(rowcols[1])],
            )
        if len(rowcols) == 3:
            result = (
                hash.split("/")[idx.index(rowcols[0])],
                hash.split("/")[idx.index(rowcols[1])],
                hash.split("/")[idx.index(rowcols[2])],
            )

        return result


if __name__ == "__main__":
    # Demonstration script: builds a simulated panel and runs the tables and
    # plots of SignalReturnRelations on it.
    cids = ["AUD", "CAD", "GBP", "NZD", "USD"]
    xcats = ["XR", "XRH", "CRY", "GROWTH", "INFL"]
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2000-01-01", "2020-12-31", 0.1, 1]
    df_cids.loc["CAD"] = ["2001-01-01", "2020-11-30", 0, 1]
    # "BRL" is not in `cids`, so this assignment appends a new row.
    df_cids.loc["BRL"] = ["2001-01-01", "2020-11-30", -0.1, 2]
    df_cids.loc["GBP"] = ["2002-01-01", "2020-11-30", 0, 2]
    df_cids.loc["NZD"] = ["2002-01-01", "2020-09-30", -0.1, 2]
    df_cids.loc["USD"] = ["2003-01-01", "2020-12-31", -0.1, 2]

    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3]
    df_xcats.loc["XRH"] = ["2000-01-01", "2020-12-31", 0.2, 1, 0, 0.25]
    df_xcats.loc["CRY"] = ["2000-01-01", "2020-10-30", 1, 2, 0.95, 1]
    df_xcats.loc["GROWTH"] = ["2001-01-01", "2020-10-30", 1, 2, 0.9, 1]
    df_xcats.loc["INFL"] = ["2001-01-01", "2020-10-30", 1, 2, 0.8, 0.5]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])
    black = {"AUD": ["2000-01-01", "2003-12-31"], "GBP": ["2018-01-01", "2100-01-01"]}

    # All AUD GROWTH and NZD INFL locations.
    filt1 = (dfd["xcat"] == "GROWTH") & (dfd["cid"] == "AUD")
    filt2 = (dfd["xcat"] == "INFL") & (dfd["cid"] == "NZD")

    # Reduced DataFrame.
    # NOTE: `dfdx` and `cidx` are not used by the demonstrations below.
    dfdx = dfd[~(filt1 | filt2)].copy()
    dfdx["ERA"] = "before 2007"
    dfdx.loc[dfdx["real_date"].dt.year > 2007, "ERA"] = "from 2010"

    cidx = ["AUD", "CAD", "GBP", "USD"]

    # User-defined metrics passed to the class through `additional_metrics`.
    def spearman(x, y):
        return stats.spearmanr(x, y)[0]

    from statsmodels.tsa.stattools import grangercausalitytests

    # Both Granger helpers run the test up to lag 3 but read the lag-1
    # "ssr_ftest" entry: element 0 for `granger`, element 1 for `granger_pval`.
    def granger(x, y):
        return grangercausalitytests(
            np.array([x, y]).T, maxlag=3, addconst=True, verbose=False
        )[1][0]["ssr_ftest"][0]

    def granger_pval(x, y):
        return grangercausalitytests(
            np.array([x, y]).T, maxlag=3, addconst=True, verbose=False
        )[1][0]["ssr_ftest"][1]

    sigs = ["CRY"]
    # Additional signals.
    srn = SignalReturnRelations(
        dfd,
        rets="XR",
        sigs=sigs,
        sig_neg=True,
        cosp=True,
        freqs="Q",
        start="2002-01-01",
        ms_panel_test=True,
        additional_metrics=[spearman, granger, granger_pval],
    )

    print(sigs)

    df_dep = srn.summary_table()
    print(df_dep)

    dfsum = srn.single_relation_table(table_type="summary")
    print(dfsum)

    srn = SignalReturnRelations(
        dfd,
        rets="XR",
        sigs=["CRY", "CRY", "INFL", "GROWTH"],
        sig_neg=[True, False, True, True],
        cosp=True,
        freqs="M",
        start="2002-01-01",
        additional_metrics=[spearman, granger, granger_pval],
    )

    df_sigs = srn.multiple_relations_table()
    print(df_sigs)

    dfsum = srn.single_relation_table(table_type="cross_section")
    print(dfsum)

    srn.accuracy_bars(
        view="signals",
        title="Accuracy",
        x_labels={"CRY": "Cry", "INFL": "Inflation", "GROWTH": "Growth"},
        x_labels_rotate=45,
    )

    sst = srn.single_statistic_table(stat="granger_pval")

    print(sst)

    sr = SignalReturnRelations(
        dfd,
        rets=["XR", "XRH"],
        sigs=["CRY", "INFL", "GROWTH"],
        freqs="M",
        start="2002-01-01",
        agg_sigs="last",
    )

    srt = sr.single_relation_table()
    mrt = sr.multiple_relations_table()
    sst = sr.single_statistic_table(
        stat="accuracy",
        type="mean_years",
        rows=["ret", "agg_sigs"],
        columns=["xcat", "freq"],
    )

    print(srt)
    print(mrt)
    print(sst)

    # Basic Signal Returns showing for multiple input values

    sr = SignalReturnRelations(
        dfd,
        rets=["XR", "XRH"],
        sigs=["CRY", "INFL", "GROWTH"],
        sig_neg=[True, True, False],
        cosp=True,
        freqs=["M", "Q"],
        agg_sigs=["last", "mean"],
        blacklist=black,
    )

    sr.accuracy_bars(sigs=["CRY", "INFL_NEG"], view="signals", title="Accuracy")
    sr.correlation_bars(sigs=["CRY", "INFL_NEG"], type="signals", title="Correlation")

    srt = sr.single_relation_table(
        ret="XRH", xcat="INFL_NEG", freq="Q", agg_sigs="last"
    )
    mrt = sr.multiple_relations_table()
    # xcat_row_order sets the top-to-bottom row order; emphasize_rows draws a
    # coloured box around each named signal (grey for the second-last row,
    # black for the last), scorecard-style.
    sst = sr.single_statistic_table(
        stat="pearson",
        show_heatmap=True,
        xcat_row_order=["CRY", "INFL", "GROWTH"],
        # The colours must match the comment above and the title below.
        emphasize_rows={"INFL": "grey", "GROWTH": "black"},
        title="Pearson (INFL boxed grey, GROWTH boxed black)",
    )

    print(srt)
    print(mrt)
    print(sst)

    # Specifying specific arguments for each of the Signal Return Functions

    srt = sr.single_relation_table(ret="XR", xcat="CRY_NEG", freq="Q", agg_sigs="last")
    print(srt)

    mrt = sr.multiple_relations_table(
        rets=["XR", "GROWTH"], xcats="INFL", freqs=["M", "Q"], agg_sigs=["last", "mean"]
    )
    print(mrt)

    sst = sr.single_statistic_table(
        stat="auc",
        rows=["ret", "xcat", "freq"],
        columns=["agg_sigs"],
        type="mean_cids",
    )
    print(sst)