# Source code for macrosynergy.management.utils.math

"""
Contains mathematical utility functions used across the package.
"""

import itertools
from numbers import Number
from typing import List

import numpy as np
import pandas as pd

from macrosynergy.management.simulate import make_qdf


def expanding_mean_with_nan(
    dfw: pd.DataFrame, absolute: bool = False
) -> np.ndarray:
    """
    Calculate the expanding mean of a DataFrame's values across rows, ignoring NaNs.

    For every row ``t`` the result is the mean of all non-NaN elements found in
    rows ``0..t`` (all columns pooled together). If `absolute` is True, the
    absolute values of the elements are averaged instead.

    Parameters
    ----------
    dfw : pd.DataFrame
        A wide DataFrame of numeric data whose index is a datetime index or is
        convertible to one via `pd.to_datetime`. The DataFrame is not modified.
    absolute : bool, optional
        If True, computes the expanding mean using the absolute values of the
        DataFrame's elements, by default False.

    Returns
    -------
    np.ndarray
        One-dimensional float array with one entry per row of `dfw`. Entries
        for rows that precede the first non-NaN observation are NaN.

    Raises
    ------
    TypeError
        If `dfw` is not a DataFrame, if its index cannot be converted to
        timestamps, or if `absolute` is not a boolean.
    """
    if not isinstance(dfw, pd.DataFrame):
        raise TypeError("Method expects to receive a pd.DataFrame.")

    # Only validate that the index is datetime-like; assigning the converted
    # index back would silently mutate the caller's DataFrame.
    try:
        pd.to_datetime(dfw.index)
    except Exception as exc:
        raise TypeError(
            "The index of the DataFrame must be of type `<pd.Timestamp>`."
        ) from exc

    if not isinstance(absolute, bool):
        raise TypeError("The parameter `absolute` must be of type `<bool>`.")

    data: np.ndarray = dfw.to_numpy(dtype=np.float64)
    if absolute:
        data = np.abs(data)

    # Running total of the non-NaN values, accumulated one row at a time
    # instead of re-summing the whole prefix for every row.
    cumulative_sum = np.cumsum(np.nansum(data, axis=1))

    # Running count of non-NaN observations (active cross-sections) up to and
    # including each row; this is the denominator of the expanding mean.
    cumulative_count = np.cumsum(np.sum(~np.isnan(data), axis=1))

    # Rows with no observation so far keep NaN rather than triggering a
    # divide-by-zero warning.
    result = np.full(cumulative_sum.shape, np.nan, dtype=np.float64)
    np.divide(cumulative_sum, cumulative_count, out=result, where=cumulative_count > 0)
    return result


def ewm_sum(df: pd.DataFrame, halflife: Number):
    """
    Compute the exponentially weighted moving sum of a DataFrame.

    The exponentially weighted moving mean of `df` is multiplied, row by row,
    by the cumulative exponential weights returned by
    `calculate_cumulative_weights`.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame in the wide format for which to calculate the weighted sum.
    halflife : Number
        The halflife of the exponential decay.

    Returns
    -------
    pd.DataFrame
        The exponentially weighted moving mean of `df` scaled by the
        cumulative weights along the index.

    Raises
    ------
    TypeError
        If `df` is not a DataFrame or `halflife` is not a `Number`.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Method expects to receive a pd.DataFrame.")
    if not isinstance(halflife, Number):
        raise TypeError("The parameter `halflife` must be of type `<Number>`.")

    # The weights are computed before the moving mean, so an unusable
    # halflife fails in the weight calculation first.
    cumulative_weights = calculate_cumulative_weights(df, halflife)
    moving_mean = df.ewm(halflife=halflife).mean()
    return moving_mean.mul(cumulative_weights, axis=0)


def calculate_cumulative_weights(df: pd.DataFrame, halflife: Number):
    """
    Calculate the cumulative moving exponential weights for a DataFrame.

    The weight at lag ``i`` is ``0.5 ** (i / halflife)``; the returned value
    at position ``k`` is the sum of the weights for lags ``0..k``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame in the wide format for which to calculate weights. Only its
        length and index are used.
    halflife : Number
        The halflife of the exponential decay.

    Returns
    -------
    pd.Series
        Cumulative weights, indexed like `df`.
    """
    # One decay factor per row: 1 at lag 0, halving every `halflife` rows.
    decay_factors = [0.5 ** (lag / halflife) for lag in range(len(df))]
    return pd.Series(np.cumsum(decay_factors), index=df.index)


if __name__ == "__main__":
    # Demo: build a small simulated panel and compute the expanding mean of
    # the "XR" category across all cross-sections.
    cids = ["AUD", "CAD", "GBP", "USD", "NZD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # The cross-sections start on different dates so that the pivoted
    # DataFrame includes NaN values: a more realistic test case.
    cid_settings = {
        "AUD": ["2022-01-01", "2022-02-01", 0.5, 2],
        "CAD": ["2022-01-10", "2022-02-01", 0.5, 2],
        "GBP": ["2022-01-20", "2022-02-01", -0.2, 0.5],
        "USD": ["2022-01-01", "2022-02-01", -0.2, 0.5],
        "NZD": ["2022-01-05", "2022-02-01", -0.1, 2],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid, settings in cid_settings.items():
        df_cids.loc[cid] = settings

    xcat_settings = {
        "XR": ["2010-01-01", "2022-02-01", 0, 1, 0, 0.3],
        "CRY": ["2011-01-01", "2022-02-01", 1, 2, 0.9, 0.5],
        "GROWTH": ["2012-01-01", "2022-02-01", 1, 2, 0.9, 1],
        "INFL": ["2013-01-01", "2022-02-01", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(
        index=xcats,
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat, settings in xcat_settings.items():
        df_xcats.loc[xcat] = settings

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    # Keep only the "XR" category and pivot to wide format: one row per
    # date, one column per cross-section.
    is_xr = dfd["xcat"] == "XR"
    dfd_xr = dfd[is_xr]
    dfw = dfd_xr.pivot(index="real_date", columns="cid", values="value")
    no_rows = dfw.shape[0]

    ret_mean = expanding_mean_with_nan(dfw)