import warnings
from typing import List, Optional
import numpy as np
import pandas as pd
from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management import reduce_df
def extend_history(
    df: pd.DataFrame,
    new_xcat: str,
    cids: Optional[List[str]] = None,
    hierarchy: List[str] = [],
    backfill: bool = False,
    start: str = None,
):
    """
    Build a new category by stitching together a hierarchy of categories.

    For every cross section the first category in `hierarchy` that has rows is
    taken in full; each following (inferior) category only contributes rows dated
    strictly before the earliest date collected so far. The combined series is
    relabelled as `new_xcat`.

    .. deprecated::
        `extend_history` is deprecated and will be removed in a future release. Use
        `merge_categories` instead — it provides per-date hierarchy fill plus the
        same `backfill`/`start` options.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing the categories that are to be extended.
    new_xcat : str
        The name of the new xcat.
    cids : List[str], optional
        The cross sections to extend. If None, all cids available for any category
        in `hierarchy` are extended. Requested cids without data are ignored with a
        warning.
    hierarchy : List[str]
        Categories ordered from best to worst representation of the concept.
        Inferior categories are only used to extend the history of superior ones.
        The default list is never mutated by this function.
    backfill : bool, optional
        If True, the first non-missing value is repeated on every business day
        from `start` up to (excluding) the first date with a non-missing value,
        provided that date lies after `start`. Rows dated before that first
        non-missing value are dropped in that case.
    start : str, optional
        The start date of the new xcat. Required when `backfill` is True. When
        `backfill` is False and `start` is given, rows before `start` are dropped.

    Returns
    -------
    ~pandas.DataFrame
        Standardized DataFrame for the new xcat with extended history, with the
        columns: 'cid', 'xcat', 'real_date' and 'value'.

    Raises
    ------
    TypeError
        If an argument has the wrong type (see `_extend_history_checks`).
    ValueError
        If `backfill` is True without `start`, or if no data remains for the
        requested cids and categories.
    """
    warnings.warn(
        "`extend_history` is deprecated and will be removed in a future release. "
        "Use `merge_categories(df, hierarchy=..., new_xcat=..., backfill=..., "
        "start=...)` instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    df = QuantamentalDataFrame(df)
    # Remember the input flavour so the result can be returned the same way.
    result_as_categorical = df.InitializedAsCategorical

    _extend_history_checks(
        df=df,
        new_xcat=new_xcat,
        cids=cids,
        hierarchy=hierarchy,
        backfill=backfill,
        start=start,
    )
    # `pd.to_datetime(None)` is None, so the "no start" case is preserved.
    start = pd.to_datetime(start)

    df, _, cids_in_df = reduce_df(df=df, xcats=hierarchy, cids=cids, out_all=True)
    if df.empty:
        raise ValueError("No data available for the specified cids and categories.")

    if cids is not None:
        # Sorted so the warning text is deterministic (set order is arbitrary).
        missing_cids = sorted(set(cids) - set(cids_in_df))
        if missing_cids:
            warnings.warn(
                f"Warning: cids {missing_cids} do not exist for any category in hierarchy. They will be ignored."
            )
    # Only cross sections that actually have data are processed.
    cids = cids_in_df

    extended_results = []
    for cid in cids:
        cid_df = df[df["cid"] == cid]
        extended_series = pd.DataFrame()

        for category in hierarchy:
            cat_df = cid_df[cid_df["xcat"] == category].sort_values("real_date")
            if extended_series.empty:
                # First category with rows for this cid: taken in full.
                extended_series = cat_df.copy()
            else:
                # Inferior category: only rows older than everything collected.
                min_real_date = extended_series["real_date"].min()
                inferior_values = cat_df[cat_df["real_date"] < min_real_date]
                extended_series = pd.concat([extended_series, inferior_values])

        extended_series = extended_series.sort_values("real_date")
        # Label the rows once, before any row filtering below. Re-assigning the
        # columns on a boolean-filtered slice afterwards is redundant and can
        # trigger pandas' SettingWithCopyWarning.
        extended_series["xcat"] = new_xcat
        extended_series["cid"] = cid

        if backfill:
            valid = extended_series.dropna(subset=["value"])
            if not valid.empty:
                first_valid_date = valid["real_date"].min()
                first_valid_value = valid.loc[
                    valid["real_date"] == first_valid_date, "value"
                ].iloc[0]
                if first_valid_date > start:
                    # Repeat the first valid value on each business day from
                    # `start` up to the day before the first valid observation.
                    backfilled_data = pd.DataFrame(
                        {
                            "real_date": pd.bdate_range(
                                start=start,
                                end=first_valid_date - pd.Timedelta(days=1),
                            ),
                            "value": first_valid_value,
                            "cid": cid,
                            "xcat": new_xcat,
                        }
                    )
                    # Drop rows preceding the first valid value; the backfilled
                    # rows take their place.
                    extended_series = extended_series[
                        extended_series["real_date"] >= first_valid_date
                    ]
                    extended_series = pd.concat([backfilled_data, extended_series])
        elif start is not None:
            extended_series = extended_series[extended_series["real_date"] >= start]

        extended_results.append(extended_series)

    extended_df = pd.concat(extended_results, ignore_index=True)
    extended_df = extended_df.sort_values(["cid", "real_date"])
    return QuantamentalDataFrame(extended_df, categorical=result_as_categorical)
def _extend_history_checks(
    df: pd.DataFrame,
    new_xcat: str,
    cids: Optional[List[str]] = None,
    hierarchy: List[str] = [],
    backfill: bool = False,
    start: str = None,
):
    """
    Validate the arguments passed to `extend_history`.

    The checks run in a fixed order and the first failing one raises; nothing is
    returned when all of them pass.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing categories that are to be extended.
    new_xcat : str
        The name of the new xcat.
    cids : List[str], optional
        The cross sections to extend. None skips the cid checks.
    hierarchy : List[str]
        Categories ordered from best to worst representation of the concept.
    backfill : bool, optional
        Whether the new xcat is to be backfilled to `start`.
    start : str, optional
        The start date of the new xcat. Must be given when `backfill` is True.

    Raises
    ------
    TypeError
        If any argument is not of the expected type.
    ValueError
        If `backfill` is True but `start` is None.
    """

    def _require(condition, message, error=TypeError):
        # Raise `error(message)` unless `condition` holds.
        if not condition:
            raise error(message)

    _require(isinstance(df, pd.DataFrame), "df must be a pandas DataFrame")
    _require(isinstance(new_xcat, str), "new_xcat must be a string")

    if cids is not None:
        _require(isinstance(cids, list), "cids must be a list")
        _require(
            all(isinstance(entry, str) for entry in cids),
            "cids must be a list of strings",
        )

    _require(isinstance(hierarchy, list), "hierarchy must be a list")
    _require(isinstance(backfill, bool), "backfill must be a boolean")
    _require(start is None or isinstance(start, str), "start must be a string")
    # Element types of `hierarchy` are deliberately checked after `backfill` and
    # `start`, which keeps the order in which errors surface unchanged.
    _require(
        all(isinstance(entry, str) for entry in hierarchy),
        "hierarchy must be a list of strings",
    )
    _require(
        start is not None or not backfill,
        "start must be provided if backfill is True",
        error=ValueError,
    )
if __name__ == "__main__":
    # Manual demo: build a simulated panel with `make_qdf` and extend INFL with
    # the longer INFL0 history.
    cids = ["AUD", "CAD", "GBP", "USD", "NZD"]
    xcats = ["INFL", "INFL0"]

    # Per-cross-section settings. "EUR" is not part of `cids`; its row is added to
    # the frame by label assignment, but it is not requested for extension below.
    cid_columns = ["earliest", "latest", "mean_add", "sd_mult"]
    cid_settings = {
        "AUD": ["2010-01-01", "2020-12-31", 0.5, 2],
        "CAD": ["2011-01-01", "2020-11-30", 0, 1],
        "GBP": ["2012-01-01", "2020-11-30", -0.2, 0.5],
        "USD": ["2010-01-01", "2020-12-30", -0.2, 0.5],
        "NZD": ["2002-01-01", "2020-09-30", -0.1, 2],
        "EUR": ["2002-01-01", "2020-09-30", -0.2, 2],
    }
    df_cids = pd.DataFrame(index=cids, columns=cid_columns)
    for cid_label, cid_row in cid_settings.items():
        df_cids.loc[cid_label] = cid_row

    # Per-category settings; INFL0 starts ten years before INFL.
    xcat_columns = [
        "earliest",
        "latest",
        "mean_add",
        "sd_mult",
        "ar_coef",
        "back_coef",
    ]
    xcat_settings = {
        "INFL": ["2010-01-01", "2020-09-30", 1, 2, 0.8, 0.5],
        "INFL0": ["2000-01-01", "2020-09-30", 1, 3, 0.5, 0.2],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=xcat_columns)
    for xcat_label, xcat_row in xcat_settings.items():
        df_xcats.loc[xcat_label] = xcat_row

    # Fixed seed so the simulated data is reproducible.
    np.random.seed(0)
    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    df = extend_history(
        df=dfd,
        new_xcat="INFL1",
        cids=cids,
        hierarchy=["INFL", "INFL0"],
        backfill=False,
        start=None,
    )