# Source code for macrosynergy.management.utils.math

"""
Contains mathematical utility functions used across the package.
"""

import itertools
from numbers import Number
from typing import List

import numpy as np
import pandas as pd

from macrosynergy.management.simulate import make_qdf


def expanding_mean_with_nan(
    dfw: pd.DataFrame, absolute: bool = False
) -> np.ndarray:
    """
    Calculate the expanding mean of a DataFrame's values across rows, ignoring NaNs.

    For every row `t`, the result is the mean of all non-NaN elements contained in
    rows `0..t` (all columns pooled together). NaN values contribute neither to the
    running sum nor to the running count. If `absolute` is True, the absolute values
    of the elements are averaged instead.

    The input DataFrame is not modified: the index is only checked for being
    convertible to timestamps, the converted index is not written back.

    Parameters
    ----------
    dfw : pd.DataFrame
        A DataFrame with a datetime index (or an index convertible to datetime) and
        numeric data across its columns.
    absolute : bool, optional
        If True, computes the expanding mean using the absolute values of the
        DataFrame's elements, by default False.

    Returns
    -------
    np.ndarray
        One-dimensional float array with one entry per row of `dfw`, holding the
        expanding mean up to and including that row. Entries are NaN for as long as
        no non-NaN value has been observed.

    Raises
    ------
    TypeError
        If `dfw` is not a DataFrame, if its index cannot be converted to timestamps,
        or if `absolute` is not a boolean.
    """

    if not isinstance(dfw, pd.DataFrame):
        raise TypeError("Method expects to receive a pd.DataFrame.")

    # Validate that the index is date-like. The conversion result is deliberately
    # discarded so that the caller's DataFrame keeps its original index.
    try:
        pd.to_datetime(dfw.index)
    except Exception as exc:
        raise TypeError(
            "The index of the DataFrame must be of type `<pd.Timestamp>`."
        ) from exc

    if not isinstance(absolute, bool):
        raise TypeError("The parameter `absolute` must be of type `<bool>`.")

    # Work on a float64 copy so that missing values are uniformly represented as NaN.
    values: np.ndarray = dfw.to_numpy(dtype=np.float64, na_value=np.nan)
    if absolute:
        values = np.absolute(values)

    # Running total of the non-NaN values and of how many of them have been seen.
    # Accumulating per-row aggregates avoids re-summing the whole history per row.
    cumulative_sum = np.cumsum(np.nansum(values, axis=1))
    cumulative_count = np.cumsum(np.sum(~np.isnan(values), axis=1))

    # Rows for which no value has been observed yet stay NaN instead of dividing by 0.
    expanding_mean = np.full(cumulative_sum.shape, np.nan, dtype=np.float64)
    np.divide(
        cumulative_sum,
        cumulative_count,
        out=expanding_mean,
        where=cumulative_count > 0,
    )

    return expanding_mean


def ewm_sum(df: pd.DataFrame, halflife: Number) -> pd.DataFrame:
    """
    Compute the exponentially weighted moving sum of a DataFrame.

    The sum is obtained by multiplying the exponentially weighted moving mean
    (`df.ewm(halflife=halflife).mean()`) row-wise by the cumulative weights returned
    by `calculate_cumulative_weights`, i.e. by the running sum of
    `(1 / 2) ** (i / halflife)`.

    NOTE(review): the cumulative weights are computed from the row position only.
    If `df` contains NaNs, pandas normalises the moving mean over the observed values
    only, so the product is then not a plain weighted sum of the observed values -
    confirm that this is the intended treatment of missing data.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame in the wide format for which to calculate the weighted sum.
    halflife : Number
        The halflife of the exponential decay. Must be strictly positive.

    Returns
    -------
    pd.DataFrame
        DataFrame with the same index and columns as `df`, holding the exponentially
        weighted moving sum.

    Raises
    ------
    TypeError
        If `df` is not a DataFrame or `halflife` is not a number.
    ValueError
        If `halflife` is not strictly positive.
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("Method expects to receive a pd.DataFrame.")
    if not isinstance(halflife, Number):
        raise TypeError("The parameter `halflife` must be of type `<Number>`.")
    # Fail early with a clear message instead of a ZeroDivisionError raised while the
    # weights are being computed.
    if halflife <= 0:
        raise ValueError("The parameter `halflife` must be strictly positive.")

    weights = calculate_cumulative_weights(df, halflife)
    weighted_mean = df.ewm(halflife=halflife).mean()

    # mean * sum-of-weights recovers the weighted sum; align the weights on the rows.
    return weighted_mean.mul(weights, axis=0)


def calculate_cumulative_weights(df: pd.DataFrame, halflife: Number) -> pd.Series:
    """
    Calculate the cumulative moving exponential weights for a DataFrame.

    The raw weight attached to a lag of `i` rows is `(1 / 2) ** (i / halflife)`. The
    value returned for row `t` is the sum of the raw weights for lags `0..t`.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame in the wide format for which to calculate weights. Only its length
        and its index are used.
    halflife : Number
        The halflife of the exponential decay. Must be strictly positive.

    Returns
    -------
    pd.Series
        Cumulative weights, indexed like `df`.

    Raises
    ------
    ValueError
        If `halflife` is not strictly positive.
    """

    # Coerce once so that any real number type (int, float, Fraction, ...) works.
    decay = float(halflife)
    if decay <= 0:
        raise ValueError("The parameter `halflife` must be strictly positive.")

    lags = np.arange(len(df), dtype=np.float64)
    raw_weights = np.power(0.5, lags / decay)

    return pd.Series(np.cumsum(raw_weights), index=df.index)


if __name__ == "__main__":
    # Small demonstration run: build a panel via `make_qdf`, pivot one category into
    # the wide format and feed it to `expanding_mean_with_nan`.
    cids = ["AUD", "CAD", "GBP", "USD", "NZD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # The cross-sections start on different dates so that the pivoted DataFrame
    # includes NaN values: more realistic testcase.
    cid_columns = ["earliest", "latest", "mean_add", "sd_mult"]
    cid_rows = {
        "AUD": ["2022-01-01", "2022-02-01", 0.5, 2],
        "CAD": ["2022-01-10", "2022-02-01", 0.5, 2],
        "GBP": ["2022-01-20", "2022-02-01", -0.2, 0.5],
        "USD": ["2022-01-01", "2022-02-01", -0.2, 0.5],
        "NZD": ["2022-01-05", "2022-02-01", -0.1, 2],
    }
    df_cids = pd.DataFrame(index=cids, columns=cid_columns)
    for cid, row in cid_rows.items():
        df_cids.loc[cid] = row

    xcat_columns = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]
    xcat_rows = {
        "XR": ["2010-01-01", "2022-02-01", 0, 1, 0, 0.3],
        "CRY": ["2011-01-01", "2022-02-01", 1, 2, 0.9, 0.5],
        "GROWTH": ["2012-01-01", "2022-02-01", 1, 2, 0.9, 1],
        "INFL": ["2013-01-01", "2022-02-01", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=xcat_columns)
    for xcat, row in xcat_rows.items():
        df_xcats.loc[xcat] = row

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    # Keep only the "XR" category and reshape it: one row per date, one column per
    # cross-section.
    is_xr = dfd["xcat"] == "XR"
    dfd_xr = dfd[is_xr]
    dfw = dfd_xr.pivot(index="real_date", columns="cid", values="value")
    no_rows = dfw.shape[0]

    ret_mean = expanding_mean_with_nan(dfw)