# Source code for macrosynergy.panel.return_beta

"""
Functions for calculating the hedge ratios of a panel of returns with respect to a
single return.
"""

import warnings
import numpy as np
import pandas as pd
from typing import List, Tuple

from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import (
    reduce_df,
    _map_to_business_day_frequency,
    standardise_dataframe,
)
import matplotlib.pyplot as plt


def date_alignment(
    unhedged_return: pd.Series, benchmark_return: pd.Series
) -> Tuple[pd.Timestamp, pd.Timestamp]:
    """
    Find the date range shared by two return series, so that the sample data for
    the endogenous & exogenous variables can be matched throughout the re-estimation
    calculation.

    Only the first and last index entries of each series are inspected, so both
    indexes are expected to be non-empty and sorted in ascending order.

    Parameters
    ----------
    unhedged_return : ~pandas.Series
        the return series of the asset that is being hedged.
    benchmark_return : ~pandas.Series
        the return series of the asset being used to hedge against the main asset.

    Returns
    -------
    ~pandas.Timestamp, ~pandas.Timestamp
        the shared start and end date across the two series: the later of the two
        first dates and the earlier of the two last dates.
    """

    ma_dates = unhedged_return.index
    ha_dates = benchmark_return.index

    ma_first, ha_first = ma_dates[0], ha_dates[0]
    ma_last, ha_last = ma_dates[-1], ha_dates[-1]

    # The overlap starts at the later first date and ends at the earlier last date.
    start_date = ma_first if ma_first > ha_first else ha_first
    end_date = ha_last if ma_last > ha_last else ma_last

    return start_date, end_date


def hedge_calculator(
    unhedged_return: pd.Series,
    benchmark_return: pd.Series,
    rdates: List[pd.Timestamp],
    meth: str = "ols",
    half_life: int = 11,
    min_obs: int = 24,
    max_obs: int = 1000,
) -> pd.DataFrame:
    """
    Calculate the hedge ratios of a single return series with respect to the
    benchmark return. At every re-estimation date the ratio is the slope of a
    (weighted) regression, with intercept, of the unhedged return on the benchmark
    return, using all jointly available observations up to and including that date,
    capped at the latest `max_obs` observations.

    Parameters
    ----------
    unhedged_return : ~pandas.Series
        the return series of the asset that is being hedged. Its index must be named
        'real_date'.
    benchmark_return : ~pandas.Series
        the return series of the asset being used to hedge against the main asset.
    rdates : List[~pandas.Timestamp]
        the dates controlling the frequency of re-estimation.
    meth : str
        method to estimate the hedge ratio. Valid options are ``'ols'`` for ordinary
        least squares and ``'twls'`` for time-weighted least squares.
    half_life: int
        half-life of the exponential decay function used to calculate the time weights
        when ``meth='twls'``. Must be positive in that case.
    min_obs : int
        a hedge ratio will only be computed if the number of overlapping observations
        has reached the integer held by the parameter.
    max_obs : int
        the maximum number of latest observations allowed in order to estimate a hedge
        ratio. The default value is 1000.

    Returns
    -------
    ~pandas.DataFrame
        dataframe with the columns 'real_date' and 'value' holding the hedge ratios
        for the respective cross-section. Estimates are forward-filled between
        re-estimation dates and lagged by one observation, so the first row is NaN.

    Raises
    ------
    ValueError
        if `meth`, `min_obs`, `max_obs` or (for ``'twls'``) `half_life` are invalid,
        if the two series share fewer than `min_obs` observations, or if any
        re-estimation date precedes the date of the `min_obs`-th shared observation.
    """
    # Input validation
    if meth not in ("ols", "twls"):
        raise ValueError(f"meth must be 'ols' or 'twls', got {meth!r}")
    if min_obs < 1:
        raise ValueError(f"min_obs must be >= 1, got {min_obs}")
    if max_obs < min_obs:
        raise ValueError(f"max_obs ({max_obs}) must be >= min_obs ({min_obs})")
    # A zero half-life divides by zero and a negative one inverts the decay, so both
    # are rejected. The parameter is ignored for OLS and therefore not checked there.
    if meth == "twls" and not half_life > 0:
        raise ValueError(f"half_life must be > 0 for meth='twls', got {half_life}")

    # Align returns: keep only dates on which both series are observed.
    benchmark_return = benchmark_return.dropna()
    unhedged_return = unhedged_return.dropna()

    common_idx_sorted = (
        benchmark_return.index
        .intersection(unhedged_return.index)
        .sort_values()
    )

    benchmark_return = benchmark_return[common_idx_sorted]
    unhedged_return = unhedged_return[common_idx_sorted]

    if len(common_idx_sorted) < min_obs:
        raise ValueError(
            f"Only {len(common_idx_sorted)} overlapping observations between the "
            f"unhedged and benchmark returns; at least min_obs = "
            f"{min_obs} are required."
        )

    # First date where we have min_obs observations
    min_obs_date = common_idx_sorted[min_obs - 1]

    # Check all re-estimation dates are valid
    rdates = np.array(rdates)
    invalid_rdate = rdates < min_obs_date
    if invalid_rdate.any():
        raise ValueError(
            f"Re-estimation dates {rdates[invalid_rdate]} are invalid as there is not "
            f"at least {min_obs} observations of data."
        )

    hedge_ratios = {}
    for rdate in rdates:
        # Inclusive of the re-estimation date, capped at the latest max_obs points.
        yvar = unhedged_return.loc[:rdate].values[-max_obs:]
        xvar = benchmark_return.loc[:rdate].values[-max_obs:].reshape(-1, 1)

        if meth == "ols":
            weights = np.ones_like(yvar)
        else:
            # 'twls': weight 1 on the latest observation, halving every `half_life`
            # observations further back in time.
            weights = np.power(2, -np.arange(yvar.shape[0]) / half_life)[::-1]

        betas = weighted_least_squares(
            X=np.column_stack((np.ones(xvar.shape[0]), xvar)),
            y=yvar,
            weights=weights,
        )

        # betas[0] is the intercept; the slope on the benchmark is the hedge ratio.
        hedge_ratios[rdate] = betas[1]

    # Create hedge ratios dataframe
    df_hrat = pd.DataFrame.from_dict(hedge_ratios, orient="index", columns=["value"])
    df_hrat.index.name = "real_date"
    df_hrat = df_hrat.reset_index(level=0)

    # Create unhedged return df
    df_ur = unhedged_return.to_frame(name="returns").reset_index()

    # Merge to convert to the re-estimation frequency. The intermediary dates, daily
    # business days between re-estimation dates, will be populated with np.nan values.
    df_hr = df_ur.merge(df_hrat, on="real_date", how="left")

    df_hr = df_hr.drop("returns", axis=1)
    df_hr = df_hr.ffill()
    # Accounts for the application of the minimum number of observations required and
    # merging the two DataFrames. Drop the np.nan values prior to the application of the
    # shift (able to validate the logic).
    df_hr = df_hr.dropna(axis=0, how="any")

    # Lag by one observation so a ratio only applies after its estimation date.
    df_hr = df_hr.set_index("real_date", drop=True).shift(1).reset_index(level=0)

    return df_hr


def adjusted_returns(
    benchmark_return: pd.Series, df_hedge: pd.DataFrame, dfw: pd.DataFrame
) -> pd.DataFrame:
    """
    Compute the hedged (adjusted) returns across all cross-sections in the panel: the
    benchmark return scaled by each cross-section's hedge ratio is subtracted from
    that cross-section's own return. For instance, if using US Equity to hedge
    Australia FX: AUD_FXXR_NSA_H = AUD_FXXR_NSA - HR_AUD * USD_EQXR_NSA.

    Parameters
    ----------
    benchmark_return : ~pandas.Series
        the return series of the asset being used to hedge against the main asset,
        indexed by date.
    df_hedge : ~pandas.DataFrame
        standardised dataframe with the hedge ratios. Must have the columns
        'real_date', 'cid' and 'value'.
    dfw : ~pandas.DataFrame
        pivoted dataframe of the relevant returns (dates as index, cross-sections as
        columns).

    Returns
    -------
    ~pandas.DataFrame
        long dataframe of adjusted returns with the columns 'real_date', 'cid' and
        'value'. Date / cross-section pairs without both a return and a hedge ratio
        are omitted.
    """

    hedge_pivot = df_hedge.pivot(index="real_date", columns="cid", values="value")

    # Scale the benchmark return by every cross-section's hedge ratio; rows are
    # aligned on the date index.
    hedge_leg = hedge_pivot.mul(benchmark_return, axis=0)

    # Columns of `dfw` without a hedge ratio become entirely NaN here.
    adj_rets = dfw - hedge_leg
    if isinstance(df_hedge["cid"].dtype, pd.CategoricalDtype):
        adj_rets.columns = pd.Categorical(adj_rets.columns)

    # Drop missing values explicitly instead of relying on the implicit NaN removal
    # of the legacy `stack` implementation.
    df_stack = adj_rets.stack().dropna().reset_index(name="value")
    df_stack.columns = ["real_date", "cid", "value"]

    return df_stack


def return_beta(
    df: QuantamentalDataFrame,
    xcat: str = None,
    cids: List[str] = None,
    benchmark_return: str = None,
    start: str = None,
    end: str = None,
    blacklist: dict = None,
    meth: str = "ols",
    oos: bool = True,
    refreq: str = "M",
    min_obs: int = 24,
    max_obs: int = 1000,
    hedged_returns: bool = False,
    ratio_name: str = "_HR",
    hr_name: str = "H",
    half_life: int = 11,
) -> QuantamentalDataFrame:
    """
    Estimate sensitivities (betas) of return category with respect to single return.

    Parameters
    ----------
    df : QuantamentalDataFrame
        standardized DataFrame with the necessary columns: 'cid', 'xcat', 'real_date'
        and 'value'.
    xcat : str
        return category based on the type of positions that are to be hedged.
    cids : List[str]
        cross-sections of the returns for which hedge ratios are to be calculated.
        Default is all that are available in the dataframe for `xcat`. A single
        cross-section may also be passed as a string.
    benchmark_return : str
        ticker of return of the hedge asset or basket. This is a single series, e.g.
        U.S. equity index returns ("USD_EQXR_NSA").
    start : str
        earliest date in ISO format. Default is None: earliest date in df is used.
    end : str
        latest date in ISO format. Default is None: latest date in df is used.
    blacklist : dict
        cross-sections with date ranges that should be excluded from the sample of data
        used for estimating hedge ratios. The estimated ratios during blacklist periods will
        be set equal to the last valid estimate.
    meth : str
        method to estimate the hedge ratio. Valid options are ``'ols'`` for ordinary
        least squares and ``'twls'`` for time-weighted least squares.
    oos : bool
        if True (default) hedge ratios are calculated out-of-sample, i.e. for the period
        following the estimation period at the given re-estimation frequency.
        Currently not implemented: the argument is ignored and ratios are always
        out of sample.
    refreq : str
        re-estimation frequency. This is period after which hedge ratios are re-
        estimated. The re-estimation is conducted at the end of the period and used as hedge
        ratio for all days of the following period. Re-estimation can have weekly, monthly,
        and quarterly frequency with the notations 'W', 'M', and 'Q' respectively. The
        default frequency is monthly.
    min_obs : int
        the minimum number of observations required in order to estimate a hedge ratio.
        The default value is 24 days. The permissible minimum is 10.
    max_obs : int
        the maximum number of latest observations allowed in order to estimate a hedge
        ratio. The default value is 1000.
    hedged_returns : bool
        If True the function appends the hedged returns to the dataframe of hedge
        ratios. Default is False.
    ratio_name : str
        hedge ratio label that will be appended to the category name. The default is
        "_HR". For instance, 'xcat' + "_HR".
    hr_name : str
        label used to distinguish the hedged returns in the DataFrame. The label is
        appended to the category being hedged. The default is "H".
    half_life: int
        half-life of the exponential decay function used to calculate the time weights
        when ``meth='twls'``.

    Returns
    -------
    QuantamentalDataFrame
        DataFrame with hedge ratio estimates that update at the chosen re-estimation
        frequency. Additionally, the dataframe can include the hedged returns if the
        parameter `hedged_returns` has been set to True.

    Raises
    ------
    ValueError
        if the benchmark ticker or the category is not in the dataframe, if `xcat` is
        not a string, if `min_obs` / `max_obs` are invalid, or if no unhedged returns
        remain after applying the cross-section, date and blacklist filters.
    RuntimeError
        if none of the requested cross-sections reaches `min_obs` observations at any
        re-estimation date.


    .. note::
        Each cross-section of this category uses the same hedge asset/basket.

    .. note::
        A return beta is the estimated sensitivity of the main return with respect to the
        asset used for hedging. The ratio is recorded for the period after the estimation
        sample up until the next re-estimation date.
    """

    # Value checks
    df: QuantamentalDataFrame = QuantamentalDataFrame(df)

    all_tix = df.list_tickers()
    bm_error = f"Benchmark return ticker {benchmark_return} is not in the DataFrame."
    if benchmark_return not in all_tix:
        raise ValueError(bm_error)

    error_xcat = (
        f"The field, xcat, must be a string but received <{type(xcat)}>. Only"
        f" a single category is used to hedge against the main asset."
    )
    if not isinstance(xcat, str):
        raise ValueError(error_xcat)

    available_categories = df["xcat"].unique()
    error_hedging = (
        f"The return category used to be hedged, {xcat}, is "
        f"not defined in the dataframe."
    )
    if xcat not in list(available_categories):
        raise ValueError(error_hedging)

    min_obs_error = (
        "The number of minimum observations required to compute a hedge "
        "ratio is 10 business days, or two weeks. Please provide an integer "
        "value greater than 10."
    )
    if not isinstance(min_obs, int) or min_obs < 10:
        raise ValueError(min_obs_error)

    if not isinstance(max_obs, int) or max_obs < min_obs:
        raise ValueError("`max_obs` must be an integer >= `min_obs`.")

    # Resolve the cross-sections: the documented default is every cross-section
    # available for the category. Always work on a private copy of the list.
    if cids is None:
        cids = sorted(str(c) for c in df.loc[df["xcat"] == xcat, "cid"].unique())
    elif isinstance(cids, str):
        cids = [cids]
    else:
        cids = list(cids)

    # Information on hedge return and potential panel adjustment.

    cid_hedge, _, xcat_hedge = benchmark_return.partition("_")
    # A return cannot be hedged against itself. Warn only when something was
    # actually removed from the panel.
    if xcat_hedge == xcat and cid_hedge in cids:
        cids = [cid for cid in cids if cid != cid_hedge]
        warnings.warn(
            f"Return to be hedged for cross section {cid_hedge} is the hedge "
            f"return and has been removed from the panel."
        )

    # Wide time series DataFrame of unhedged and benchmark returns.

    # --- Time series DataFrame of unhedged returns.

    dfp = reduce_df(
        df, xcats=[xcat], cids=cids, start=start, end=end, blacklist=blacklist
    )
    dfp_w = dfp.pivot(index="real_date", columns="cid", values="value")
    dfp_w = dfp_w.dropna(axis=0, how="all")

    if dfp_w.empty:
        raise ValueError(
            f"No returns of category {xcat} are available for cross sections {cids} "
            f"over the requested sample."
        )

    # --- Time series DataFrame of benchmark return for relevant dates.

    # The asset being used as the hedge could only be defined over a shorter time-period.
    dfh = reduce_df(
        df,
        xcats=[xcat_hedge],
        cids=cid_hedge,
        start=dfp_w.index[0],
        end=dfp_w.index[-1],
    )
    dfh_w = dfh.pivot(index="real_date", columns="cid", values="value")
    dfh_w.columns = ["hedge"]

    # --- Merge time series and calculate re-balancing dates.
    if isinstance(dfp_w.columns.dtype, pd.CategoricalDtype):
        dfp_w.columns = dfp_w.columns.astype("object")

    dfw = pd.merge(dfp_w, dfh_w, how="inner", on="real_date")
    br = dfw["hedge"]

    rf = _map_to_business_day_frequency(freq=refreq, valid_freqs=["W", "M", "Q"])
    dates_re = dfw.asfreq(rf).index.values

    # Requested cross-sections without any data in the sample have no column in the
    # wide frame; they are reported as unsatisfied below instead of raising KeyError.
    available_cids = [c for c in cids if c in dfw.columns]
    if not available_cids:
        raise RuntimeError(f"None of {cids} satisfy min_obs {min_obs}")

    # Find first re-balancing date with at least min_obs in the cid and hedge.
    # Satisfaction is judged at the re-estimation dates, since hedge ratios are only
    # ever estimated there - a cross-section that reaches min_obs only after the final
    # re-estimation date cannot be estimated and must be skipped, not crash the run.
    hedge_valid = dfw["hedge"].notna()
    min_obs_satisfied = (
        dfw[available_cids].notna().mul(hedge_valid, axis=0).cumsum().ge(min_obs)
    )
    # A re-estimation date that is missing from the data takes the status of the
    # latest earlier observation: the cumulative count cannot have changed since.
    # This also keeps the frame boolean so that `idxmax` below is well defined.
    satisfied_at_rdate = min_obs_satisfied.asfreq(rf, method="ffill")

    satisfied_cids = satisfied_at_rdate.columns[satisfied_at_rdate.any(axis=0)].tolist()
    unsatisfied_cids = [c for c in cids if c not in satisfied_cids]

    if len(unsatisfied_cids) == len(cids):
        raise RuntimeError(f"None of {cids} satisfy min_obs {min_obs}")
    if unsatisfied_cids:
        warnings.warn(
            f"Cannot calculate beta for the following cross sections: "
            f"{unsatisfied_cids} as there are no re-estimation dates where "
            f"these cross sections satisfy min_obs = {min_obs}."
        )

    # First re-estimation date at which each cross-section has enough observations.
    first_rdate_map = satisfied_at_rdate[satisfied_cids].idxmax().to_dict()

    # Cross-section-wise hedge ratio estimation.

    aggregate = []
    for c in satisfied_cids:
        df_hr = hedge_calculator(
            unhedged_return=dfw[c],
            benchmark_return=br,
            rdates=dates_re[dates_re >= first_rdate_map[c]],
            meth=meth,
            half_life=half_life,
            min_obs=min_obs,
            max_obs=max_obs,
        )
        df_hr = QuantamentalDataFrame.from_long_df(df_hr, cid=c, xcat=xcat + ratio_name)
        aggregate.append(df_hr)

    df_hedge = QuantamentalDataFrame.from_qdf_list(aggregate)

    if hedged_returns:
        df_hreturn = adjusted_returns(df_hedge=df_hedge, dfw=dfw, benchmark_return=br)
        df_hreturn = QuantamentalDataFrame.from_long_df(
            df=df_hreturn, xcat=xcat + "_" + hr_name
        )
        df_hedge = df_hedge.update_df(df_hreturn)

    return standardise_dataframe(df_hedge)


def beta_display(df_hedge: pd.DataFrame, subplots: bool = False, hr_name: str = "H"):
    """
    Method used to visualise the hedging ratios across the panel: assumes a single
    category is used to hedge the primary asset.

    Categories whose name ends with "_" + `hr_name` (the hedged returns) are excluded,
    the remaining rows are pivoted to one column per cross-section and plotted.

    Parameters
    ----------
    df_hedge : ~pandas.DataFrame
        DataFrame with hedge ratios. Must have the columns 'real_date', 'cid', 'xcat'
        and 'value'.
    subplots : bool
        matplotlib parameter to determine if each hedging series is displayed on
        separate subplots.
    hr_name : str
        label used to distinguish the hedged returns in the DataFrame. Comparable to
        return_beta() method, the default is "H".
    """

    # Match on the full suffix rather than on the last "_"-separated token, so that
    # labels which themselves contain an underscore are recognised as well.
    hedged_suffix = "_" + hr_name
    is_hedged_return = df_hedge["xcat"].astype(str).str.endswith(hedged_suffix)
    df_ratios = df_hedge.loc[~is_hedged_return]

    dfw_ratios = df_ratios.pivot(index="real_date", columns="cid", values="value")

    dfw_ratios.plot(subplots=subplots, title="Hedging Ratios.", legend=True)
    plt.xlabel("real_date, years")
    plt.show()


def weighted_least_squares(X, y, weights):
    """
    Find the coefficient vector ``beta`` that minimizes the weighted
    sum of squared residuals ``sum_i weights[i] * (y[i] - X[i] @ beta) ** 2``.

    The problem is solved as an ordinary least-squares problem after scaling every
    row of ``X`` and every entry of ``y`` by the square root of its weight.

    Parameters
    ----------
    X : ndarray of shape (n, p)
        Design matrix, where ``n`` is the number of observations and
        ``p`` is the number of predictors (including any intercept column).
    y : ndarray of shape (n,)
        Response vector.
    weights : ndarray of shape (n,)
        Non-negative weight for each observation. A higher weight gives
        that observation more influence on the fitted coefficients.

    Returns
    -------
    beta : ndarray of shape (p,)
        Estimated coefficient vector.

    Raises
    ------
    ValueError
        If any weight is negative.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    weights = np.asarray(weights)

    # The square root of a negative weight is NaN and would silently poison the fit.
    if np.any(weights < 0):
        raise ValueError("weights must be non-negative.")

    W = np.sqrt(weights)          # square-root weights
    X_w = X * W[:, np.newaxis]    # scale each row of X
    y_w = y * W                   # scale y the same way

    beta, _, _, _ = np.linalg.lstsq(X_w, y_w, rcond=None)
    return beta


if __name__ == "__main__":
    # Simulated panel: five Emerging Market Asian cross-sections plus the US, whose
    # equity return serves as the benchmark. Each specification row reads
    # [earliest, latest, mean_add, sd_mult].
    cid_specs = {
        "IDR": ["2010-01-01", "2020-12-31", 0.5, 2],
        "INR": ["2011-01-01", "2020-11-30", 0, 1],
        "KRW": ["2012-01-01", "2020-11-30", -0.2, 0.5],
        "MYR": ["2013-01-01", "2020-09-30", -0.2, 0.5],
        "PHP": ["2002-01-01", "2020-09-30", -0.1, 2],
        "USD": ["2000-01-01", "2022-03-14", 0, 1.25],
    }
    # Each row reads [earliest, latest, mean_add, sd_mult, ar_coef, back_coef].
    xcat_specs = {
        "FXXR_NSA": ["2010-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "GROWTHXR_NSA": ["2012-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFLXR_NSA": ["2013-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
        "EQXR_NSA": ["2010-01-01", "2022-03-14", 0.5, 2, 0, 0.2],
    }

    df_cids = pd.DataFrame(
        index=list(cid_specs), columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid_label, cid_row in cid_specs.items():
        df_cids.loc[cid_label] = cid_row

    df_xcats = pd.DataFrame(
        index=list(xcat_specs),
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat_label, xcat_row in xcat_specs.items():
        df_xcats.loc[xcat_label] = xcat_row

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    black = {"IDR": ["2010-01-01", "2014-01-04"], "INR": ["2010-01-01", "2013-12-31"]}

    # Settings shared by both examples below.
    shared_kwargs = dict(
        df=dfd,
        benchmark_return="USD_EQXR_NSA",  # S&P500.
        start="2010-01-01",
        end="2020-10-30",
        blacklist=black,
        meth="ols",
        oos=True,
        min_obs=24,
    )

    # First example: weekly betas of the Asian equity returns with respect to the
    # US equity return, with the hedged returns appended.
    df_hedge = return_beta(
        xcat="EQXR_NSA",
        cids=["IDR", "INR", "KRW", "MYR", "PHP"],
        refreq="w",
        hedged_returns=True,
        **shared_kwargs,
    )

    print(df_hedge)
    beta_display(df_hedge=df_hedge, subplots=False)

    # Second example: monthly betas of the USD FX return (FXXR_NSA) with respect to
    # the same US equity benchmark.
    xcat_hedge_two = return_beta(
        xcat="FXXR_NSA",
        cids=["USD"],
        refreq="m",
        **shared_kwargs,
    )
    print(xcat_hedge_two)