Source code for macrosynergy.panel.historic_vol

"""
Function for calculating historic volatility of quantamental data.
"""

import numpy as np
import pandas as pd
from typing import List, Optional, Dict, Any
import warnings
from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import reduce_df, standardise_dataframe, get_eops
from macrosynergy.management.types import QuantamentalDataFrame


def historic_vol(
    df: pd.DataFrame,
    xcat: str = None,
    cids: List[str] = None,
    lback_periods: int = 21,
    lback_meth: str = "ma",
    half_life=11,
    start: str = None,
    end: str = None,
    est_freq: str = "D",
    blacklist: dict = None,
    remove_zeros: bool = True,
    postfix="ASD",
    nan_tolerance: float = 0.25,
):
    """
    Estimate historic annualized standard deviations of asset returns.

    The volatility proxy is computed over a trailing window of `lback_periods`
    observations and annualized by multiplying with sqrt(252).

    Parameters
    ----------
    df : ~pandas.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date' and 'value'.
    xcat : str
        extended category denoting the return series for which volatility should be
        calculated. Note: in JPMaQS returns are represented in %, i.e. 5 means 5%.
    cids : List[str]
        cross sections for which volatility is calculated; default (None) is all
        cross sections available for the category after filtering. A single string
        is treated as a one-element list.
    lback_periods : int
        Number of lookback periods over which volatility is calculated. Default is
        21.
    lback_meth : str
        Lookback method to calculate the volatility (case-insensitive). Options are
        'ma' for the equally weighted mean absolute return, 'xma' for the
        exponentially weighted mean absolute return, and 'sq' for the exponentially
        weighted standard deviation around the weighted mean. Default is 'ma'.
    half_life : int
        Half-life (in periods) of the exponential weights used by 'xma' and 'sq'.
        Must be positive and shorter than `lback_periods`. Ignored for 'ma'.
        Default is 11.
    start : str
        earliest date in ISO format. Default is None and earliest date in df is
        used.
    end : str
        latest date in ISO format. Default is None and latest date in df is used.
    est_freq : str
        Frequency of (re-)estimation of volatility (case-insensitive). Options are
        'D' for end of each day (default), 'W' for end of each work week, 'M' for
        end of each month, and 'Q' for end of each quarter. Between estimation
        dates the last estimate is forward-filled for a limited number of days.
    blacklist : dict
        cross sections with date ranges that should be excluded from the data
        frame. If one cross section has several blacklist periods append numbers to
        the cross section code.
    remove_zeros : bool
        if True (default) returns that are exact zeros are dropped from the
        lookback window before the estimate is computed, which shortens the
        effective window.
    postfix : str
        string appended to category name for output; default is "ASD".
    nan_tolerance : float
        maximum share of a lookback window (between 0 and 1) that may consist of
        NaNs, exact zeros or missing dates; if exceeded the resulting volatility is
        set to NaN. Only applied when `est_freq` is not daily. Default is 0.25.

    Returns
    -------
    ~pandas.DataFrame
        standardized DataFrame with the estimated annualized standard deviations of
        the chosen category, with columns 'cid', 'xcat', 'real_date' and 'value'.
        The output category is named `xcat + postfix`. If the input 'value' is in %
        (as is the standard in JPMaQS) then the output will also be in %.

    Raises
    ------
    AssertionError
        if `lback_meth`, `half_life`, `est_freq` or `nan_tolerance` are invalid.
    """
    df: QuantamentalDataFrame = QuantamentalDataFrame(df)
    est_freq = est_freq.lower()
    lback_meth = lback_meth.lower()

    assert lback_meth in ["xma", "ma", "sq"], (
        "Lookback method must be either 'xma' "
        "(exponential moving average), 'sq' (exponentially weighted std), or 'ma' (moving average)."
    )
    if lback_meth in ["xma", "sq"]:
        assert (
            lback_periods > half_life
        ), "Half life must be shorter than lookback period."
        assert half_life > 0, "Half life must be greater than 0."
    assert est_freq in [
        "d",
        "w",
        "m",
        "q",
    ], "Estimation frequency must be one of 'D', 'W', 'M', or 'Q'."

    # nan_tolerance is a share of the lookback window and must lie in [0, 1].
    assert isinstance(
        nan_tolerance, (int, float)
    ), "nan_tolerance must be an int or float."
    assert (
        0 <= nan_tolerance <= 1
    ), "nan_tolerance must be between 0.0 and 1.0 inclusive."

    # A bare string would otherwise be iterated character by character below.
    if isinstance(cids, str):
        cids = [cids]

    df = reduce_df(
        df, xcats=[xcat], cids=cids, start=start, end=end, blacklist=blacklist
    )
    dfw = df.pivot(index="real_date", columns="cid", values="value")

    # The documented default for `cids` is "all available": resolve it here so
    # that the per-cross-section trimming loop at the end can iterate over it.
    if cids is None:
        cids = list(dfw.columns)

    trigger_indices = get_eops(
        dates=pd.DataFrame(dfw.index),
        freq=est_freq,
    )

    def single_calc(
        row,
        dfw: pd.DataFrame,
        lback_periods: int,
        nan_tolerance: float,
        roll_func: callable,
        remove_zeros: bool,
        weights: Optional[np.ndarray] = None,
    ):
        """
        Helper function to calculate the historic volatility for a single row in the
        DataFrame, i.e. for a single (re-)estimation date.
        """
        # Trailing window ending at (and including) the estimation date.
        target_df: pd.DataFrame = dfw.loc[: row["real_date"]].tail(lback_periods)

        if weights is None:
            out = np.sqrt(252) * target_df.agg(roll_func, remove_zeros=remove_zeros)
        else:
            if len(weights) == len(target_df):
                out = np.sqrt(252) * target_df.agg(
                    roll_func, w=weights, remove_zeros=remove_zeros
                )
            else:
                # Not enough history to match the weight vector yet.
                return pd.Series(np.nan, index=target_df.columns)

        mask = (
            (
                target_df.isna().sum(axis=0)
                + (target_df == 0).sum(axis=0)
                + (lback_periods - len(target_df))
            )
            / lback_periods
        ) <= nan_tolerance
        # NOTE: dates with NaNs, dates with missing entries, and dates with 0s
        # are all treated as missing data and trigger a NaN in the output
        out[~mask] = np.nan

        return out

    expo_weights_arr: Optional[np.ndarray] = None
    if lback_meth in ["xma", "sq"]:
        expo_weights_arr = expo_weights(lback_periods, half_life)

    lback_meth_funcs = {
        "xma": expo_std,
        "sq": sq_std,
        "ma": flat_std,
    }

    if est_freq == "d":
        # Daily estimation: a plain rolling window over every date.
        _args: Dict[str, Any] = dict(remove_zeros=remove_zeros)
        if lback_meth in ["xma", "sq"]:
            _args["w"] = expo_weights_arr
        _args["func"] = lback_meth_funcs[lback_meth]

        dfwa = np.sqrt(252) * dfw.rolling(window=lback_periods).agg(**_args)
    else:
        # Lower frequency: estimate only on the period-end dates.
        dfwa = pd.DataFrame(index=dfw.index, columns=dfw.columns)
        _args: Dict[str, Any] = dict(
            lback_periods=lback_periods,
            nan_tolerance=nan_tolerance,
            remove_zeros=remove_zeros,
        )
        if lback_meth in ["xma", "sq"]:
            _args["weights"] = expo_weights_arr
        _args["roll_func"] = lback_meth_funcs[lback_meth]

        dfwa.loc[trigger_indices, :] = (
            dfwa.loc[trigger_indices, :]
            .reset_index(False)
            .apply(
                lambda row: single_calc(
                    row=row,
                    dfw=dfw,
                    **_args,
                ),
                axis=1,
            )
            .set_index(trigger_indices)
        )

    # Maximum number of dates an estimate is carried forward, per frequency.
    fills = {"d": 1, "w": 5, "m": 24, "q": 64}
    dfwa = dfwa.astype(float).reindex(dfw.index).ffill(limit=fills[est_freq])

    df_out = dfwa.unstack().reset_index().rename({0: "value"}, axis=1)

    # Create an initial mask for all rows to keep
    keep_mask = pd.Series(False, index=df_out.index)

    # Iterate over each cid and mark valid rows
    for cid in cids:
        # Get the date range for the current 'cid' in the original df
        loc_bools = df["cid"] == cid
        if df[loc_bools].empty:
            warnings.warn(f"No data for {cid}_{xcat}. Skipping.")
            continue
        min_date = df.loc[loc_bools, "real_date"].min()
        max_date = df.loc[loc_bools, "real_date"].max()

        # Generate valid date range for the current 'cid'
        valid_dates = pd.bdate_range(start=min_date, end=max_date)

        # Update the keep_mask for rows corresponding to current 'cid' with valid dates
        sel_bools = df_out["cid"] == cid
        sel_dts = df_out["real_date"].isin(valid_dates)
        keep_mask |= sel_bools & sel_dts

    # Apply the mask to df_out
    df_out = df_out[keep_mask].reset_index(drop=True)

    df_out = QuantamentalDataFrame.from_long_df(
        df=df_out,
        xcat=xcat + postfix,
        categorical=df.InitializedAsCategorical,
    )
    return standardise_dataframe(df_out)
def expo_weights(lback_periods: int = 21, half_life: int = 11):
    """
    Build exponentially decaying weights over a finite window, normalized to 1.

    Parameters
    ----------
    lback_periods : int
        Number of lookback periods, i.e. the length of the returned array.
        Default is 21.
    half_life : int
        Number of periods over which a weight halves when moving backwards
        through the window. Default is 11.

    Returns
    -------
    ~numpy.ndarray
        Array of length `lback_periods` that sums to 1. Weights increase with
        the index, so the last element carries the largest weight.
    """
    # Per-period decay factor: decay ** half_life == 0.5.
    decay = 2 ** (-1 / half_life)

    # Exponents count down to zero so that the final entry is the heaviest.
    powers = range(lback_periods - 1, -1, -1)
    raw = (1 - decay) * np.array([decay**p for p in powers])

    return raw / sum(raw)
def expo_std(x: np.ndarray, w: np.ndarray, remove_zeros: bool = True):
    """
    Exponentially weighted mean absolute return, used as a volatility proxy.

    Parameters
    ----------
    x : ~numpy.ndarray
        array of returns.
    w : ~numpy.ndarray
        array of weights with the same length as `x`; rescaled internally so
        that the weights in use sum to 1.
    remove_zeros : bool
        if True (default), exact zeros are dropped from `x` and only the first
        `len(x)` weights are kept, which shortens the effective window.

    Returns
    -------
    float
        weighted mean of the absolute values of `x`.

    Raises
    ------
    AssertionError
        if `x` and `w` differ in length.
    """
    assert len(x) == len(w), "weights and window must have same length"

    if remove_zeros:
        x = x[x != 0]
        # Keep as many weights as there are surviving observations.
        head = w[0 : len(x)]
        w = head / sum(head)

    unit_w = w / sum(w)  # weights are normalized
    return np.sum(np.multiply(unit_w, np.abs(x)))
def sq_std(x: np.ndarray, w: np.ndarray, remove_zeros: bool = True):
    """
    Exponentially weighted standard deviation of returns.

    The result is the square root of the weighted mean of squared deviations
    from the weighted mean of `x`.

    Parameters
    ----------
    x : numpy.ndarray
        Array of returns.
    w : numpy.ndarray
        Array of weights with the same length as `x`; rescaled internally so
        that the weights in use sum to 1.
    remove_zeros : bool, default=True
        If True, exact zeros are dropped from `x` and only the first `len(x)`
        weights are kept, which shortens the effective window.

    Returns
    -------
    float
        Weighted standard deviation of `x`.

    Raises
    ------
    AssertionError
        If `x` and `w` differ in length.
    """
    assert len(x) == len(w), "weights and window must have same length"

    if remove_zeros:
        x = x[x != 0]
        # Keep as many weights as there are surviving observations.
        head = w[0 : len(x)]
        w = head / sum(head)

    w = w / sum(w)  # weights are normalized

    weighted_mean = np.sum(w * x)
    weighted_var = np.sum(w * (x - weighted_mean) ** 2)
    return np.sqrt(weighted_var)
def flat_std(x: np.ndarray, remove_zeros: bool = True):
    """
    Equally weighted mean absolute return, used as a volatility proxy.

    Parameters
    ----------
    x : ~numpy.ndarray
        array of returns.
    remove_zeros : bool
        if True (default), exact zeros are dropped from `x` before averaging,
        which shortens the effective window.

    Returns
    -------
    float
        flat weighted mean of the absolute values of `x`; NaN if no
        observations are left to average.
    """
    if remove_zeros:
        x = x[x != 0]

    # np.mean of an empty window yields NaN but also emits a RuntimeWarning
    # ("Mean of empty slice"); return NaN explicitly to keep the output quiet.
    if len(x) == 0:
        return np.nan

    return np.mean(np.abs(x))
if __name__ == "__main__":
    # Demo: simulate a small quantamental panel and estimate weekly
    # re-estimated volatility of the "XR" category with two lookback methods.
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # Simulation settings per cross section.
    cid_specs = {
        "AUD": ["2010-01-01", "2020-12-31", 0.5, 2],
        "CAD": ["2011-01-01", "2020-11-30", 0, 1],
        "GBP": ["2012-01-01", "2020-10-30", -0.2, 0.5],
        "USD": ["2013-01-01", "2020-09-30", -0.2, 0.5],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid_name, cid_spec in cid_specs.items():
        df_cids.loc[cid_name] = cid_spec

    # Simulation settings per category.
    xcat_specs = {
        "XR": ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3],
        "CRY": ["2011-01-01", "2020-10-30", 1, 2, 0.9, 0.5],
        "GROWTH": ["2012-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFL": ["2013-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(
        index=xcats,
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat_name, xcat_spec in xcat_specs.items():
        df_xcats.loc[xcat_name] = xcat_spec

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])

    # Same call for each method; only the banner and `lback_meth` differ.
    demo_runs = (
        ("Calculating historic volatility with the moving average method", "ma"),
        (
            "Calculating historic volatility with the exponential moving average method",
            "xma",
        ),
    )
    for banner, method in demo_runs:
        print(banner)
        df = historic_vol(
            dfd,
            cids=cids,
            xcat="XR",
            lback_periods=7,
            lback_meth=method,
            est_freq="w",
            half_life=3,
            remove_zeros=True,
        )
        print(df.head(10))