import warnings
import numpy as np
import pandas as pd
from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management.simulate import make_qdf
def impute_panel(
    df: pd.DataFrame,
    cids: list,
    xcats: list,
    threshold: float = 0.5,
    start_date: str = None,
    impute_empty_tickers: bool = False,
) -> pd.DataFrame:
    """
    Impute missing values in a long-format panel with the cross-sectional mean.

    For every category in `xcats` and every date, a missing value of a cross-section
    in `cids` is replaced by the mean over the available cross-sections in `cids` on
    that date, provided that the share of available cross-sections strictly exceeds
    `threshold`.

    Parameters
    ----------
    df : ~pandas.DataFrame
        the long-format panel dataset. It is indexed internally by the columns
        "cid", "real_date" and "xcat" and imputed on the column "value".
    cids : list
        the list of cross sections to be considered in the imputation
    xcats : list
        the list of categories to be imputed
    threshold : float
        the fraction of available cross-sections at each date that must be exceeded
        for the imputation to take place. Must be a float between 0 and 1.
    start_date : str
        the starting date for the imputation. Rows of `df` dated before it are
        dropped before any processing.
    impute_empty_tickers : bool
        boolean flag for whether to impute missing values for empty tickers. If True,
        the date grid is built for every element of `cids`, including those without
        any row in `df`; if False, it is built only for the cross sections found in
        `df`.

    Returns
    -------
    ~pandas.DataFrame
        the imputed long-format panel data, restricted to the requested `cids` and
        `xcats`, with every retained cross section aligned on the same set of dates.

    Raises
    ------
    TypeError
        if `df` is not a DataFrame, `xcats` is not a list of strings, `threshold` is
        not a float, `start_date` is neither a string nor None, or
        `impute_empty_tickers` is not a boolean.
    ValueError
        if `threshold` lies outside the interval [0, 1].

    .. note::
        This function is deprecated; a FutureWarning is emitted on every call.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than to this
    # module, which is where the deprecated usage has to be changed.
    warnings.warn(
        "This function is deprecated and will be removed very soon, please use the "
        "panel_imputer class instead.",
        FutureWarning,
        stacklevel=2,
    )

    # Checks
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input `df` must be a pandas DataFrame.")
    if not isinstance(xcats, list):
        raise TypeError("The input `xcats` must be a list.")
    for xcat in xcats:
        if not isinstance(xcat, str):
            raise TypeError("The elements of `xcats` must be strings.")
    if not isinstance(threshold, float):
        raise TypeError("The input `threshold` must be a float.")
    if not 0 <= threshold <= 1:
        raise ValueError("The input `threshold` must be between 0 and 1.")
    if not isinstance(start_date, str) and start_date is not None:
        raise TypeError("The input `start_date` must be a string.")
    if not isinstance(impute_empty_tickers, bool):
        raise TypeError("The input `impute_empty_tickers` must be a boolean.")

    if start_date is not None:
        df = df[df["real_date"] >= start_date]

    complete_df = QuantamentalDataFrame(df)
    # Remember how the frame was initialised so the output can be built the same way.
    _as_categorical = complete_df.InitializedAsCategorical
    complete_df = complete_df.set_index(["cid", "real_date", "xcat"])

    if impute_empty_tickers:
        # Use the requested cross sections so that those absent from `df` still get
        # rows (initially all-NaN) that can be imputed.
        cids_series = pd.Series(cids, name="cid")
    else:
        cids_series = complete_df.index.levels[0]

    # Full (cid, real_date, xcat) grid, assembled one category at a time.
    full_idx = pd.MultiIndex.from_frame(
        pd.concat(
            [
                pd.MultiIndex.from_product(
                    [
                        cids_series,
                        complete_df.index.levels[1],
                        pd.Series([xcat], name="xcat"),
                    ]
                ).to_frame(index=False)
                for xcat in complete_df.index.levels[2].unique()
            ],
            axis=0,
            ignore_index=True,
        ),
    )

    # reindexing to align all the CIDs on the same timeseries of dates
    complete_df = complete_df.reindex(full_idx).reset_index(drop=False)

    # subsetting to keep only the relevant XCATs across CIDs and dates; the explicit
    # copy makes the later assignment operate on an independent frame instead of on
    # a slice of `complete_df`.
    incomplete_mask = (complete_df["xcat"].isin(xcats)) & (
        complete_df["cid"].isin(cids)
    )
    incomplete_df = complete_df.loc[incomplete_mask, :].copy()

    # computing the data availability stats per (category, date) cross-section
    by_xcat_date = incomplete_df.groupby(["xcat", "real_date"])["value"]
    mean_val = by_xcat_date.transform("mean")
    tot = by_xcat_date.transform("size")  # rows in the group, NaN included
    avail = by_xcat_date.transform("count")  # non-NaN rows in the group

    # Filling CID-specific values only where the value is missing and the sample of
    # available cross sections is representative enough.
    mask = (avail.div(tot) > threshold) & incomplete_df["value"].isna()
    incomplete_df.loc[mask, "value"] = mean_val[mask]

    return QuantamentalDataFrame(
        incomplete_df.reset_index(drop=True),
        categorical=_as_categorical,
    )
if __name__ == "__main__":
    # Demo: simulate a small panel and impute the "XR" category across it.
    cids = ["AUD", "CAD", "GBP", "NZD", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]

    # Per-cross-section simulation settings: earliest, latest, mean_add, sd_mult.
    # "BRL" has no entry here, yet it is passed to `impute_panel` below together
    # with `impute_empty_tickers=True` (presumably so that it acts as a ticker with
    # no data of its own - confirm against `make_qdf`).
    cid_settings = {
        "AUD": ["2000-01-01", "2020-12-31", 0.1, 1],
        "CAD": ["2001-01-01", "2020-11-30", 0, 1],
        "GBP": ["2002-01-01", "2024-12-30", 0, 2],
        "NZD": ["2002-01-01", "2024-12-30", -0.1, 2],
        "USD": ["2003-01-01", "2024-12-31", -0.1, 2],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid, settings in cid_settings.items():
        df_cids.loc[cid] = settings

    # Per-category simulation settings, in the column order given by `cols`.
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]
    xcat_settings = {
        "XR": ["2000-01-01", "2024-12-31", 0.1, 1, 0, 0.3],
        "CRY": ["2000-01-01", "2020-10-30", 1, 2, 0.95, 1],
        "GROWTH": ["2001-01-01", "2020-10-30", 1, 2, 0.9, 1],
        "INFL": ["2001-01-01", "2020-10-30", 1, 2, 0.8, 0.5],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    for xcat, settings in xcat_settings.items():
        df_xcats.loc[xcat] = settings

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    dfx = impute_panel(
        dfd,
        cids + ["BRL"],
        ["XR"],
        threshold=0.0,
        start_date="2000-01-01",
        impute_empty_tickers=True,
    )
    print(dfx)