# Source code for macrosynergy.visuals.multiple_reg_scatter

from typing import List, Tuple
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from macrosynergy.management.simulate.simulate_quantamental_data import make_qdf
from macrosynergy.panel.category_relations import CategoryRelations
import textwrap
import seaborn as sns


def multiple_reg_scatter(
    cat_rels: List[CategoryRelations],
    ncol: int = 0,
    nrow: int = 0,
    figsize: Tuple[float, float] = (20, 15),
    title: str = "",
    title_xadj: float = 0.5,
    title_yadj: float = 0.99,
    title_fontsize: int = 20,
    xlab: str = "",
    ylab: str = "",
    label_fontsize: int = 12,
    fit_reg: bool = True,
    reg_ci: int = 95,
    reg_order: int = 1,
    reg_robust: bool = False,
    coef_box: str = None,
    coef_box_size: Tuple[float, float] = (0.4, 2.5),
    coef_box_font_size: int = 12,
    prob_est: str = "pool",
    separator: int = None,
    single_chart: bool = False,
    subplot_titles: List[str] = None,
    subplot_title_fontsize: int = 14,
    color_cids: bool = False,
    remove_zero_predictor: bool = False,
    share_axes: bool = True,
    return_fig: bool = False,
):
    """
    Displays multiple regression scatter plots across categories. The categories are
    passed as a list of CategoryRelations objects; each object draws its own scatter
    into one cell of a shared grid through its `reg_scatter` method.

    Parameters
    ----------
    cat_rels : List[CategoryRelations]
        list of CategoryRelations objects, plotted in row-major order.
    ncol : int
        number of columns in the grid. Default is 0, which will be set to the length of
        cat_rels.
    nrow : int
        number of rows in the grid. Default is 0, which will be set to the smallest
        number of rows that fits all of cat_rels given `ncol` (1 when `ncol` is also
        left at its default).
    figsize : Tuple[float, float]
        size of the figure. Default is (20, 15).
    title : str
        title of the figure. Default is an empty string.
    title_xadj : float
        x position of the figure title. Default is 0.5.
    title_yadj : float
        y position of the figure title. Default is 0.99.
    title_fontsize : int
        font size of the figure title. Default is 20.
    xlab : str
        label of the x-axis, set once for the whole figure. Default is an empty string.
    ylab : str
        label of the y-axis, set once for the whole figure. Default is an empty string.
    label_fontsize : int
        font size of the figure-level axis labels. Default is 12.
    fit_reg : bool
        if True (default) a linear regression line is fitted to the data.
    reg_ci : int
        confidence interval for the regression line. Default is 95.
    reg_order : int
        order of the regression line. Default is 1.
    reg_robust : bool
        forwarded unchanged to `reg_scatter` as `reg_robust`. Default is False.
    coef_box : str
        if not None, a box with the coefficients of the regression is displayed. Default
        is None.
    coef_box_size : Tuple[float, float]
        forwarded unchanged to `reg_scatter` as `coef_box_size`. Default is (0.4, 2.5).
    coef_box_font_size : int
        font size of the coefficients box. Default is 12. If set to 0 it automatically
        sets the fontsize according to matplotlib.
    prob_est : str
        method to estimate the probability. Default is 'pool'.
    separator : int
        allows categorizing the scatter analysis by integer. This is done by setting it
        to a year [2010, for instance] which will subsequently split the time-period into
        the sample before (not including) that year and from (including) that year.
        The string 'cids' is rejected here; use `color_cids` instead. Ignored when
        `color_cids` is True.
    single_chart : bool
        forwarded unchanged to `reg_scatter` as `single_chart`. Default is False.
    subplot_titles : List[str]
        list of titles for each subplot, one per element of cat_rels. Default is None,
        in which case each title is built from the two category names of the relation,
        followed by the first and last date of its data when its `years` attribute is
        None.
    subplot_title_fontsize : int
        font size of the subplot titles. Default is 14.
    color_cids : bool
        if True (default is False) each cross section is given a distinct color in the
        scatter plot. Internally this calls `reg_scatter` with `separator='cids'` and
        `single_scatter=True`, overriding any `separator` passed in.
    remove_zero_predictor : bool, default=False
        Remove zeros from the input series.
    share_axes : bool
        if True (default is True) the axes are shared across subplots.
    return_fig : bool
        if True the figure is returned instead of being shown. Default is False.

    Returns
    -------
    matplotlib.figure.Figure or None
        the figure when `return_fig` is True; otherwise the figure is displayed with
        `plt.show()` and None is returned.

    Raises
    ------
    ValueError
        if `subplot_titles` does not have one entry per element of `cat_rels`, if
        `separator` is 'cids', or if the requested grid has fewer cells than there are
        elements in `cat_rels`.
    """

    sns.set_theme(style="whitegrid")

    n_plots = len(cat_rels)
    if ncol == 0:
        ncol = n_plots
    if nrow == 0:
        # Ceiling division: just enough rows to hold every relation. A non-positive
        # ncol is left for plt.subplots to reject below.
        nrow = max(1, -(-n_plots // ncol)) if ncol > 0 else 1

    if subplot_titles is not None and len(subplot_titles) != n_plots:
        raise ValueError(
            "The length of subplot_titles must be equal to the length of cat_rels."
        )

    if nrow * ncol < n_plots:
        # Without this check a too-small grid either fails with an IndexError after
        # the figure has been created or (for a 1x1 grid) silently draws every
        # relation onto the same axis.
        raise ValueError(
            f"A grid of {nrow} row(s) and {ncol} column(s) cannot hold "
            f"{n_plots} plots. Increase nrow and/or ncol."
        )

    if separator is not None and separator == "cids":
        raise ValueError(
            "Separator 'cids' is not permitted in multiple_reg_scatter. "
            "To get a plot across multiple cids, please specify separator as cids "
            "inside reg_scatter."
        )
    single_scatter = color_cids
    separator = "cids" if color_cids else separator

    # squeeze=False always yields a 2-D array of axes, so no special-casing of
    # single-row, single-column or single-cell grids is needed.
    fig, axes = plt.subplots(
        nrows=nrow,
        ncols=ncol,
        figsize=figsize,
        sharex=share_axes,
        sharey=share_axes,
        squeeze=False,
    )
    fig.suptitle(title, x=title_xadj, y=title_yadj, fontsize=title_fontsize)
    fig.supxlabel(xlab, fontsize=label_fontsize)
    fig.supylabel(ylab, fontsize=label_fontsize)

    # Character budget for wrapping subplot titles, scaled by the width available per
    # column. Forced to a positive int because textwrap.wrap rejects a width of 0
    # (which occurs when figsize[0] < ncol).
    wrap_width = max(1, int(figsize[0] // ncol) * 6)

    # axes.flat iterates row by row, i.e. plot i lands in cell (i // ncol, i % ncol).
    for i, (ax, cat_rel) in enumerate(zip(axes.flat, cat_rels)):
        ax.set_facecolor("white")

        if subplot_titles is not None:
            subplot_title = subplot_titles[i]
        elif cat_rel.years is None:
            dates = (
                cat_rel.df.index.get_level_values("real_date")
                .to_series()
                .dt.strftime("%Y-%m-%d")
            )
            subplot_title = (
                f"{cat_rel.xcats[0]} and {cat_rel.xcats[1]} "
                f"from {dates.min()} to {dates.max()}"
            )
        else:
            subplot_title = f"{cat_rel.xcats[0]} and {cat_rel.xcats[1]}"

        wrapped_title = "\n".join(textwrap.wrap(subplot_title, width=wrap_width))

        # Per-subplot axis labels are suppressed; the figure-level labels set above
        # are used instead.
        cat_rel.reg_scatter(
            title=wrapped_title,
            labels=False,
            xlab="",
            ylab="",
            fit_reg=fit_reg,
            reg_ci=reg_ci,
            reg_order=reg_order,
            reg_robust=reg_robust,
            coef_box=coef_box,
            coef_box_size=coef_box_size,
            coef_box_font_size=coef_box_font_size,
            prob_est=prob_est,
            single_chart=single_chart,
            separator=separator,
            ax=ax,
            single_scatter=single_scatter,
            title_fontsize=subplot_title_fontsize,
            remove_zero_predictor=remove_zero_predictor,
        )

    # NOTE(review): tight_layout recomputes the subplot parameters, so the preceding
    # subplots_adjust call probably has no lasting effect. The order is kept as-is to
    # avoid changing existing layouts; confirm the intent before reordering.
    plt.subplots_adjust(top=title_yadj - 0.01)
    plt.tight_layout()

    if return_fig:
        return fig
    plt.show()


if __name__ == "__main__":
    # Demo: simulate a small quantamental panel, build six category relations on it
    # and draw them with multiple_reg_scatter in two different grid configurations.
    cids = ["AUD", "CAD", "GBP", "NZD", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # Simulation settings per cross section. "BRL" is not part of `cids`, so its row
    # is appended to the frame when assigned.
    cid_settings = {
        "AUD": ["2000-01-01", "2020-12-31", 0.1, 1],
        "CAD": ["2001-01-01", "2020-11-30", 0, 1],
        "BRL": ["2001-01-01", "2020-11-30", -0.1, 2],
        "GBP": ["2002-01-01", "2020-11-30", 0, 2],
        "NZD": ["2002-01-01", "2020-09-30", -0.1, 2],
        "USD": ["2003-01-01", "2020-12-31", -0.1, 2],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid, settings in cid_settings.items():
        df_cids.loc[cid] = settings

    # Simulation settings per category.
    xcat_settings = {
        "XR": ["2000-01-01", "2020-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-10-30", 1, 2, 0.95, 1],
        "GROWTH": ["2001-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFL": ["2001-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(
        index=xcats,
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat, settings in xcat_settings.items():
        df_xcats.loc[xcat] = settings

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(len(dfd))
    black = {"AUD": ["2000-01-01", "2003-12-31"], "GBP": ["2018-01-01", "2100-01-01"]}

    # Drop every AUD/GROWTH and NZD/INFL row from the simulated panel.
    is_aud_growth = (dfd["cid"] == "AUD") & (dfd["xcat"] == "GROWTH")
    is_nzd_infl = (dfd["cid"] == "NZD") & (dfd["xcat"] == "INFL")
    dfdx = dfd.loc[~is_aud_growth & ~is_nzd_infl].copy()

    # Tag each row with an era label based on the year of its date.
    dfdx["ERA"] = "before 2007"
    after_2007 = dfdx["real_date"].dt.year > 2007
    dfdx.loc[after_2007, "ERA"] = "from 2010"

    cidx = ["AUD", "CAD", "GBP", "USD"]

    def _make_relation(xcat_pair, freq):
        # All demo relations share every setting except the category pair and the
        # sampling frequency.
        return CategoryRelations(
            dfdx,
            xcats=xcat_pair,
            freq=freq,
            lag=1,
            cids=cidx,
            xcat_aggs=["mean", "sum"],
            start="2001-01-01",
            blacklist=black,
            years=None,
        )

    # (category pair, frequency) for each of the six panels, in plotting order.
    relation_specs = [
        (["CRY", "XR"], "M"),
        (["CRY", "GROWTH"], "M"),
        (["CRY", "INFL"], "M"),
        (["CRY", "INFL"], "Q"),
        (["CRY", "INFL"], "Q"),
        (["CRY", "INFL"], "Q"),
    ]
    relations = [_make_relation(pair, freq) for pair, freq in relation_specs]

    # 2 x 3 grid, cross sections colored, independent axes per panel.
    multiple_reg_scatter(
        relations,
        title="Growth trend and subsequent sectoral equity returns.",
        xlab="Real technical growth trend",
        ylab="Excess Return",
        ncol=3,
        nrow=2,
        coef_box="upper right",
        color_cids=True,
        single_chart=True,
        share_axes=False,
    )

    # 2 x 6 grid with all remaining options left at their defaults.
    multiple_reg_scatter(
        relations,
        title="Growth trend and subsequent sectoral equity returns.",
        xlab="Real technical growth trend",
        ylab="Excess Return",
        ncol=6,
        nrow=2,
    )