Source code for macrosynergy.panel.imputers

import warnings
import numpy as np
import pandas as pd

from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management.simulate import make_qdf


def impute_panel(
    df: pd.DataFrame,
    cids: list,
    xcats: list,
    threshold: float = 0.5,
    start_date: str = None,
    impute_empty_tickers: bool = False,
) -> pd.DataFrame:
    """
    Impute missing values in a long-format panel with cross-sectional means.

    For every category in `xcats` and every date, a missing value of a
    cross-section in `cids` is replaced by the mean across the cross-sections in
    `cids` that do have a value on that date. The replacement is only made when
    the share of cross-sections with a value on that date is strictly greater
    than `threshold`.

    .. deprecated::
        This function emits a ``FutureWarning`` on every call; the warning
        points users to the panel_imputer class.

    Parameters
    ----------
    df : ~pandas.DataFrame
        the long-format panel dataset. The columns "cid", "xcat", "real_date"
        and "value" are used.
    cids : list
        the cross-sections to be considered in the imputation. Only rows of
        these cross-sections are returned.
    xcats : list
        the categories to be imputed. Only rows of these categories are
        returned.
    threshold : float
        fraction, between 0 and 1, that the share of available cross-sections on
        a date must strictly exceed for imputation to take place on that date.
    start_date : str
        if given, rows with a "real_date" earlier than this are dropped before
        imputation.
    impute_empty_tickers : bool
        if True, the date grid is built for every cross-section in `cids`, so
        that cross-sections without any row in `df` are added and imputed. If
        False, the grid is built from the cross-sections present in the index of
        `df` only.

    Returns
    -------
    ~pandas.DataFrame
        a QuantamentalDataFrame holding the requested `cids` and `xcats` on a
        common date grid, with missing values imputed where the threshold
        condition is met.

    Raises
    ------
    TypeError
        if `df` is not a DataFrame, `xcats` is not a list of strings,
        `threshold` is not a float, `start_date` is neither a string nor None,
        or `impute_empty_tickers` is not a boolean.
    ValueError
        if `threshold` lies outside the interval [0, 1].
    """
    # stacklevel=2 attributes the warning to the caller's line.
    warnings.warn(
        "This function is deprecated and will be removed very soon, please use the "
        "panel_imputer class instead.",
        FutureWarning,
        stacklevel=2,
    )

    # Checks
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input `df` must be a pandas DataFrame.")
    if not isinstance(xcats, list):
        raise TypeError("The input `xcats` must be a list.")
    if not all(isinstance(xcat, str) for xcat in xcats):
        raise TypeError("The elements of `xcats` must be strings.")
    if not isinstance(threshold, float):
        raise TypeError("The input `threshold` must be a float.")
    if not 0 <= threshold <= 1:
        raise ValueError("The input `threshold` must be between 0 and 1.")
    if start_date is not None and not isinstance(start_date, str):
        raise TypeError("The input `start_date` must be a string.")
    if not isinstance(impute_empty_tickers, bool):
        raise TypeError("The input `impute_empty_tickers` must be a boolean.")

    if start_date is not None:
        df = df[df["real_date"] >= start_date]

    complete_df = QuantamentalDataFrame(df)
    # Remember the input flavour so the output is built the same way.
    _as_categorical = complete_df.InitializedAsCategorical
    complete_df = complete_df.set_index(["cid", "real_date", "xcat"])

    if impute_empty_tickers:
        # Use the requested cids so that tickers without any data get rows too.
        cids_series = pd.Series(cids, name="cid")
    else:
        cids_series = complete_df.index.levels[0]

    # One (cid x real_date) grid per category, stacked category by category.
    full_idx = pd.MultiIndex.from_frame(
        pd.concat(
            [
                pd.MultiIndex.from_product(
                    [
                        cids_series,
                        complete_df.index.levels[1],
                        pd.Series([xcat], name="xcat"),
                    ]
                ).to_frame(index=False)
                for xcat in complete_df.index.levels[2].unique()
            ],
            axis=0,
            ignore_index=True,
        ),
    )

    # reindexing to align all the CIDs on the same timeseries of dates
    complete_df = complete_df.reindex(full_idx).reset_index(drop=False)

    # subsetting to keep only the relevant XCATs across CIDs and dates
    incomplete_mask = (complete_df["xcat"].isin(xcats)) & (
        complete_df["cid"].isin(cids)
    )
    # Explicit copy: helper columns are added below and must not be written
    # through a slice of `complete_df`.
    incomplete_df = complete_df.loc[incomplete_mask, :].copy()

    # computing the data availability stats per (category, date)
    values_by_date = incomplete_df.groupby(["xcat", "real_date"])["value"]
    incomplete_df["mean_val"] = values_by_date.transform("mean")
    incomplete_df["tot"] = values_by_date.transform("size")  # all rows
    incomplete_df["avail"] = values_by_date.transform("count")  # non-NaN rows

    # Filling CID-specific values only for the appropriate cids and conditional
    # on a representative sample
    mask = (incomplete_df["avail"].div(incomplete_df["tot"]) > threshold) & (
        incomplete_df["value"].isna()
    )
    incomplete_df.loc[mask, "value"] = incomplete_df.loc[mask, "mean_val"]

    # Selecting the original columns drops the helper columns again.
    return QuantamentalDataFrame(
        incomplete_df[complete_df.columns].reset_index(drop=True),
        categorical=_as_categorical,
    )
if __name__ == "__main__":
    # Demo: simulate a small panel with make_qdf, then impute the "XR" category.
    # "BRL" is passed to impute_panel without being simulated, so it only
    # appears in the output through impute_empty_tickers=True.
    cids = ["AUD", "CAD", "GBP", "NZD", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # Per cross-section simulation settings, in the order of the columns below.
    cid_settings = {
        "AUD": ["2000-01-01", "2020-12-31", 0.1, 1],
        "CAD": ["2001-01-01", "2020-11-30", 0, 1],
        "GBP": ["2002-01-01", "2024-12-30", 0, 2],
        "NZD": ["2002-01-01", "2024-12-30", -0.1, 2],
        "USD": ["2003-01-01", "2024-12-31", -0.1, 2],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid_name, cid_row in cid_settings.items():
        df_cids.loc[cid_name] = cid_row

    # Per category simulation settings, in the order of `xcat_columns`.
    xcat_columns = [
        "earliest",
        "latest",
        "mean_add",
        "sd_mult",
        "ar_coef",
        "back_coef",
    ]
    xcat_settings = {
        "XR": ["2000-01-01", "2024-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-10-30", 1, 2, 0.95, 1],
        "GROWTH": ["2001-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFL": ["2001-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=xcat_columns)
    for xcat_name, xcat_row in xcat_settings.items():
        df_xcats.loc[xcat_name] = xcat_row

    simulated = make_qdf(df_cids, df_xcats, back_ar=0.75)
    imputed = impute_panel(
        simulated,
        cids + ["BRL"],
        ["XR"],
        threshold=0.0,
        start_date="2000-01-01",
        impute_empty_tickers=True,
    )
    print(imputed)