# Source code for macrosynergy.panel.historic_vol

"""
Function for calculating historic volatility of quantamental data.
"""

import numpy as np
import pandas as pd
from typing import List, Optional, Dict, Any
import warnings
from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import reduce_df, standardise_dataframe, get_eops
from macrosynergy.management.types import QuantamentalDataFrame


def historic_vol(
    df: pd.DataFrame,
    xcat: str = None,
    cids: List[str] = None,
    lback_periods: int = 21,
    lback_meth: str = "ma",
    half_life=11,
    start: str = None,
    end: str = None,
    est_freq: str = "D",
    blacklist: dict = None,
    remove_zeros: bool = True,
    postfix="ASD",
    nan_tolerance: float = 0.25,
):
    """
    Estimate historic annualized standard deviations of asset returns. The function can
    calculate the volatility using a flat moving average, an exponential moving
    average, or an exponentially weighted standard deviation.

    Parameters
    ----------
    df : ~pandas.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date' and 'value'. Will contain all of the data across all macroeconomic
        fields.
    xcat : str
        extended category denoting the return series for which volatility should be
        calculated. Note: in JPMaQS returns are represented in %, i.e. 5 means 5%.
    cids : List[str]
        cross sections for which volatility is calculated; default is all available for
        the category. A single cross section may also be passed as a string.
    lback_periods : int
        Number of lookback periods over which volatility is calculated. Default is 21.
    lback_meth : str
        Lookback method to calculate the volatility. Options are 'ma' for the flat
        mean of absolute returns, 'xma' for the exponentially weighted mean of absolute
        returns, and 'sq' for the exponentially weighted standard deviation around the
        weighted mean. Default is 'ma'.
    half_life : int
        Half-life (in periods) of the exponential weights used by 'xma' and 'sq'. Must
        be positive and shorter than `lback_periods`. Ignored for 'ma'. Default is 11.
    start : str
        earliest date in ISO format. Default is None and earliest date in df is used.
    end : str
        latest date in ISO format. Default is None and latest date in df is used.
    est_freq : str
        Frequency of (re-)estimation of volatility. Options are 'D' for end of each day
        (default), 'W' for end of each work week, 'M' for end of each month, and 'Q' for
        end of each quarter. Case-insensitive.
    blacklist : dict
        cross sections with date ranges that should be excluded from the data frame. If
        one cross section has several blacklist periods append numbers to the cross section
        code.
    remove_zeros : bool
        if True (default) returns that are exact zeros are dropped from the lookback
        window before the volatility is computed.
    postfix : str
        string appended to category name for output; default is "ASD".
    nan_tolerance : float
        maximum share (between 0 and 1) of a lookback window that may be unusable
        before the resulting volatility is set to NaN. NaNs, exact zeros and periods
        missing from a short window all count as unusable. Default is 0.25. This
        check is only applied when `est_freq` is not 'D'.

    Returns
    -------
    ~pandas.DataFrame
        standardized DataFrame with the columns 'cid', 'xcat', 'real_date' and 'value',
        holding the estimated annualized standard deviations of the chosen category
        under the name `xcat + postfix`. If the input 'value' is in % (as is the
        standard in JPMaQS) then the output will also be in %.

    Raises
    ------
    TypeError
        if `xcat` is not a string.
    AssertionError
        if `lback_meth`, `half_life`, `est_freq` or `nan_tolerance` are invalid.
    """

    df: QuantamentalDataFrame = QuantamentalDataFrame(df)
    if not isinstance(xcat, str):
        # Fail early: otherwise the error only surfaces at `xcat + postfix` below.
        raise TypeError("`xcat` must be a string.")
    est_freq = est_freq.lower()
    lback_meth = lback_meth.lower()
    assert lback_meth in ["xma", "ma", "sq"], (
        "Lookback method must be either 'xma' "
        "(exponential moving average), 'sq' (exponentially weighted std), or 'ma' (moving average)."
    )
    if lback_meth in ["xma", "sq"]:
        assert (
            lback_periods > half_life
        ), "Half life must be shorter than lookback period."
        assert half_life > 0, "Half life must be greater than 0."
    assert est_freq in [
        "d",
        "w",
        "m",
        "q",
    ], "Estimation frequency must be one of 'D', 'W', 'M', or 'Q'."

    # nan_tolerance is a share of the lookback window: a number in [0, 1].
    assert isinstance(
        nan_tolerance, (int, float)
    ), "nan_tolerance must be an int or float."
    assert (
        0 <= nan_tolerance <= 1
    ), "nan_tolerance must be between 0.0 and 1.0 inclusive."

    # A bare string would otherwise be iterated character by character below.
    if isinstance(cids, str):
        cids = [cids]

    df = reduce_df(
        df, xcats=[xcat], cids=cids, start=start, end=end, blacklist=blacklist
    )

    # Documented default: use every cross section available for the category.
    if cids is None:
        cids = list(df["cid"].unique())

    # Wide format: one row per date, one column per cross section.
    dfw = df.pivot(index="real_date", columns="cid", values="value")

    # Dates on which volatility is (re-)estimated for the non-daily frequencies.
    trigger_indices = get_eops(
        dates=pd.DataFrame(dfw.index),
        freq=est_freq,
    )

    def single_calc(
        row,
        dfw: pd.DataFrame,
        lback_periods: int,
        nan_tolerance: float,
        roll_func: callable,
        remove_zeros: bool,
        weights: Optional[np.ndarray] = None,
    ):
        """
        Helper function to calculate the historic volatility for a single row in the
        DataFrame, i.e. for one estimation date across all cross sections.
        """
        # Last `lback_periods` observations up to and including the estimation date.
        target_df: pd.DataFrame = dfw.loc[: row["real_date"]].tail(lback_periods)

        if weights is None:
            out = np.sqrt(252) * target_df.agg(roll_func, remove_zeros=remove_zeros)
        else:
            if len(weights) == len(target_df):
                out = np.sqrt(252) * target_df.agg(
                    roll_func, w=weights, remove_zeros=remove_zeros
                )
            else:
                # Not enough history to match the weight vector yet.
                return pd.Series(np.nan, index=target_df.columns)

        mask = (
            (
                target_df.isna().sum(axis=0)
                + (target_df == 0).sum(axis=0)
                + (lback_periods - len(target_df))
            )
            / lback_periods
        ) <= nan_tolerance
        # NOTE: dates with NaNs, dates with missing entries, and dates with 0s
        # are all treated as missing data and trigger a NaN in the output
        out[~mask] = np.nan

        return out

    expo_weights_arr: Optional[np.ndarray] = None
    if lback_meth in ["xma", "sq"]:
        expo_weights_arr = expo_weights(lback_periods, half_life)

    lback_meth_funcs = {
        "xma": expo_std,
        "sq": sq_std,
        "ma": flat_std,
    }
    if est_freq == "d":
        # Daily estimation: plain rolling window. `nan_tolerance` is not applied on
        # this path; completeness is left to the rolling window itself.
        _args: Dict[str, Any] = dict(remove_zeros=remove_zeros)
        if expo_weights_arr is not None:
            _args["w"] = expo_weights_arr

        dfwa = np.sqrt(252) * dfw.rolling(window=lback_periods).agg(
            func=lback_meth_funcs[lback_meth], **_args
        )
    else:
        # Lower frequencies: estimate only on the trigger dates, then carry the
        # estimate forward until the next one.
        dfwa = pd.DataFrame(index=dfw.index, columns=dfw.columns)
        _args: Dict[str, Any] = dict(
            lback_periods=lback_periods,
            nan_tolerance=nan_tolerance,
            remove_zeros=remove_zeros,
            roll_func=lback_meth_funcs[lback_meth],
        )
        if expo_weights_arr is not None:
            _args["weights"] = expo_weights_arr

        dfwa.loc[trigger_indices, :] = (
            dfwa.loc[trigger_indices, :]
            # expose the date index as a 'real_date' column for `single_calc`
            .reset_index(level=0)
            .apply(
                lambda row: single_calc(
                    row=row,
                    dfw=dfw,
                    **_args,
                ),
                axis=1,
            )
            .set_index(trigger_indices)
        )

        # Maximum number of rows an estimate is carried forward, per frequency.
        fills = {"d": 1, "w": 5, "m": 24, "q": 64}
        dfwa = dfwa.astype(float).reindex(dfw.index).ffill(limit=fills[est_freq])

    df_out = dfwa.unstack().reset_index().rename({0: "value"}, axis=1)

    # Create an initial mask for all rows to keep
    keep_mask = pd.Series(False, index=df_out.index)

    # Iterate over each cid and mark valid rows
    for cid in cids:
        # Get the date range for the current 'cid' in the original df
        loc_bools = df["cid"] == cid
        if df[loc_bools].empty:
            warnings.warn(f"No data for {cid}_{xcat}. Skipping.")
            continue
        min_date = df.loc[loc_bools, "real_date"].min()
        max_date = df.loc[loc_bools, "real_date"].max()

        # Generate valid date range for the current 'cid'
        valid_dates = pd.bdate_range(start=min_date, end=max_date)

        # Update the keep_mask for rows corresponding to current 'cid' with valid dates
        sel_bools = df_out["cid"] == cid
        sel_dts = df_out["real_date"].isin(valid_dates)

        keep_mask |= sel_bools & sel_dts

    # Apply the mask to df_out
    df_out = df_out[keep_mask].reset_index(drop=True)

    df_out = QuantamentalDataFrame.from_long_df(
        df=df_out,
        xcat=xcat + postfix,
        categorical=df.InitializedAsCategorical,
    )
    return standardise_dataframe(df_out)


def expo_weights(lback_periods: int = 21, half_life: int = 11):
    """
    Calculates exponential series weights for finite horizon, normalized to 1.

    Parameters
    ----------
    lback_periods : int
        Number of lookback periods, i.e. the length of the returned weight vector.
        Default is 21.
    half_life : int
        Number of periods over which a weight halves. Must be greater than 0.
        Default is 11.

    Returns
    -------
    ~numpy.ndarray
        Array of `lback_periods` weights summing to 1, ordered from the oldest
        observation (smallest weight) to the most recent one (largest weight).

    Raises
    ------
    ValueError
        if `half_life` is not greater than 0.

    Notes
    -----
    The weight of an observation is half the weight of the observation `half_life`
    periods after it.
    """

    if half_life <= 0:
        # 0 would divide by zero; a negative value would silently invert the decay.
        raise ValueError("half_life must be greater than 0.")

    # Per-period decay factor: decf ** half_life == 0.5.
    decf = 2 ** (-1 / half_life)
    # Exponents run from (lback_periods - 1) down to 0, so the last entry is the
    # most recent period. The (1 - decf) factor cancels in the normalization below
    # and is kept only so results stay numerically identical to earlier versions.
    weights = (1 - decf) * np.array(
        [decf**lag for lag in range(lback_periods - 1, -1, -1)]
    )
    weights = weights / sum(weights)

    return weights


def expo_std(x: np.ndarray, w: np.ndarray, remove_zeros: bool = True):
    """
    Estimate volatility via the exponentially weighted mean absolute return.
    Uses weighted absolute deviations from zero as a proxy for standard deviation.

    Parameters
    ----------
    x : ~numpy.ndarray
        array of returns
    w : ~numpy.ndarray
        array of exponential weights (same length as x); will be normalized to 1.
    remove_zeros : bool
        removes zeroes as invalid entries and shortens the effective window. The
        first `len(x)` weights (after removal) are kept and re-normalized.

    Returns
    -------
    float
        exponentially weighted mean absolute value (as proxy of return standard
        deviation). NaN if no observation is left in the window.

    Raises
    ------
    AssertionError
        if `x` and `w` differ in length.
    """

    assert len(x) == len(w), "weights and window must have same length"
    if remove_zeros:
        x = x[x != 0]
    if len(x) == 0:
        # No usable observation: report "unknown" rather than a volatility of 0,
        # in line with what `flat_std` returns for an empty window.
        return np.nan
    if remove_zeros:
        # Truncate the weight vector to the number of surviving observations.
        head = w[0 : len(x)]
        w = head / sum(head)
    w = w / sum(w)  # weights are normalized
    mabs = np.sum(np.multiply(w, np.abs(x)))
    return mabs


def sq_std(x: np.ndarray, w: np.ndarray, remove_zeros: bool = True):
    """
    Estimate volatility via the exponentially weighted root mean squared.
    Uses weighted squared deviations from the weighted mean (true std definition).

    Parameters
    ----------
    x : numpy.ndarray
        Array of returns.
    w : numpy.ndarray
        Array of exponential weights (must be the same length as `x`).
        The weights are normalized internally to sum to 1.
    remove_zeros : bool, default=True
        If True, zero returns are excluded from the calculation, and the
        weight vector is truncated to its first `len(x)` entries (after removal)
        and re-normalized.

    Returns
    -------
    float
        Exponentially weighted standard deviation of returns. NaN if no
        observation is left in the window.

    Raises
    ------
    AssertionError
        If `x` and `w` differ in length.
    """

    assert len(x) == len(w), "weights and window must have same length"
    if remove_zeros:
        x = x[x != 0]
    if len(x) == 0:
        # No usable observation: report "unknown" rather than a volatility of 0,
        # in line with what `flat_std` returns for an empty window.
        return np.nan
    if remove_zeros:
        # Truncate the weight vector to the number of surviving observations.
        head = w[0 : len(x)]
        w = head / sum(head)
    w = w / sum(w)  # weights are normalized
    weighted_mean = np.sum(w * x)
    sqstd = np.sqrt(np.sum(w * (x - weighted_mean) ** 2))
    return sqstd


def flat_std(x: np.ndarray, remove_zeros: bool = True):
    """
    Estimate standard deviation of returns based on the equally (flat) weighted
    mean of absolute values.

    Parameters
    ----------
    x : ~numpy.ndarray
        array of returns
    remove_zeros : bool
        removes zeroes as invalid entries and shortens the effective window.

    Returns
    -------
    float
        flat weighted mean absolute value (as proxy of return standard deviation).
        NaN if no observation is left in the window.
    """

    if remove_zeros:
        x = x[x != 0]
    if len(x) == 0:
        # Same result numpy would give for an empty mean, without its
        # "Mean of empty slice" RuntimeWarning.
        return np.nan
    mabs = np.mean(np.abs(x))
    return mabs


if __name__ == "__main__":
    # Small simulated panel used to demonstrate `historic_vol`.
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # One settings row per cross section, in the column order of `df_cids`.
    cid_settings = {
        "AUD": ["2010-01-01", "2020-12-31", 0.5, 2],
        "CAD": ["2011-01-01", "2020-11-30", 0, 1],
        "GBP": ["2012-01-01", "2020-10-30", -0.2, 0.5],
        "USD": ["2013-01-01", "2020-09-30", -0.2, 0.5],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid_code, cid_row in cid_settings.items():
        df_cids.loc[cid_code] = cid_row

    # One settings row per category, in the column order of `df_xcats`.
    xcat_settings = {
        "XR": ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3],
        "CRY": ["2011-01-01", "2020-10-30", 1, 2, 0.9, 0.5],
        "GROWTH": ["2012-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFL": ["2013-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(
        index=xcats,
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat_code, xcat_row in xcat_settings.items():
        df_xcats.loc[xcat_code] = xcat_row

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(len(dfd))

    # Run the same weekly estimation once per lookback method.
    demo_runs = (
        ("Calculating historic volatility with the moving average method", "ma"),
        (
            "Calculating historic volatility with the exponential moving average method",
            "xma",
        ),
    )
    for banner, method in demo_runs:
        print(banner)
        df = historic_vol(
            dfd,
            cids=cids,
            xcat="XR",
            lback_periods=7,
            lback_meth=method,
            est_freq="w",
            half_life=3,
            remove_zeros=True,
        )

        print(df.head(10))