Source code for macrosynergy.panel.extend_history

import warnings
from typing import List, Optional

import numpy as np
import pandas as pd

from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management import reduce_df


def extend_history(
    df: pd.DataFrame,
    new_xcat: str,
    cids: Optional[List[str]] = None,
    hierarchy: List[str] = [],
    backfill: bool = False,
    start: str = None,
):
    """
    Extends the history of a dataframe by creating a new xcat by combining hierarchical categories.
    The method prioritizes superior categories for the new xcat and supplements with inferior ones
    where superior category data is unavailable.

    .. deprecated::
        `extend_history` is deprecated and will be removed in a future release. Use
        `merge_categories` instead — it provides per-date hierarchy fill plus the same
        `backfill`/`start` options.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing categories that are to be extended.
    new_xcat : str
        The name of the new xcat.
    cids : List[str], optional
        The cross sections to extend. If None, all cids available for any category in
        'hierarchy' are extended.
    hierarchy : List[str]
        List of categories from best to worst for representation of the concept.
        Inferior categories are only used to extend the history of the superior ones:
        for each cross section, an inferior category contributes only the rows dated
        strictly before the earliest date already covered by the categories above it.
    backfill : bool, optional
        If True, the earliest non-missing value of the new xcat is repeated backwards
        on business days down to the 'start' date. Requires 'start'.
    start : str, optional
        The start date of the new xcat. If backfill is False, rows before this date
        are dropped. If backfill is True, values are backfilled up to this date.

    Returns
    -------
    ~pandas.DataFrame
        standardized DataFrame for the new xcat with extended history, with the columns:
        'cid', 'xcat', 'real_date' and 'value'.

    Raises
    ------
    TypeError
        If any argument has the wrong type (see `_extend_history_checks`).
    ValueError
        If `backfill` is True but `start` is None, or if no data is left after
        restricting `df` to the requested cids and the categories in `hierarchy`.

    Warns
    -----
    DeprecationWarning
        Always, since the function is deprecated.
    UserWarning
        If some of the requested `cids` have no data for any category in `hierarchy`.

    Notes
    -----
    With ``backfill=True`` the series is only modified when its first non-missing
    value is dated after `start`: leading rows with missing values are then replaced
    by the backfilled block. If the first non-missing value is on or before `start`,
    the series is returned as is, i.e. it is *not* truncated to `start`.
    """

    warnings.warn(
        "`extend_history` is deprecated and will be removed in a future release. "
        "Use `merge_categories(df, hierarchy=..., new_xcat=..., backfill=..., "
        "start=...)` instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    df = QuantamentalDataFrame(df)
    # Remember the caller's representation so the result can be returned the same way.
    result_as_categorical = df.InitializedAsCategorical
    _extend_history_checks(
        df=df,
        new_xcat=new_xcat,
        cids=cids,
        hierarchy=hierarchy,
        backfill=backfill,
        start=start,
    )
    # `pd.to_datetime(None)` stays None, so the `start is not None` test below still works.
    start = pd.to_datetime(start)

    df, _, cids_in_df = reduce_df(df=df, xcats=hierarchy, cids=cids, out_all=True)

    if df.empty:
        raise ValueError("No data available for the specified cids and categories.")

    if cids is not None:
        missing_cids = list(set(cids) - set(cids_in_df))
        if missing_cids:
            warnings.warn(
                f"Warning: cids {missing_cids} do not exist for any category in hierarchy. They will be ignored.",
                stacklevel=2,
            )
    # From here on only cross sections that actually have data are processed.
    cids = cids_in_df

    extended_results = []

    for cid in cids:

        cid_df = df[df["cid"] == cid]

        extended_series = pd.DataFrame()

        for category in hierarchy:

            cat_df = cid_df[cid_df["xcat"] == category].sort_values("real_date")

            if extended_series.empty:
                # Nothing collected yet: the best available category is the base.
                extended_series = cat_df.copy()
            else:
                # Inferior data may only extend the history backwards.
                min_real_date = extended_series["real_date"].min()
                inferior_values = cat_df[cat_df["real_date"] < min_real_date]
                extended_series = pd.concat([extended_series, inferior_values])

        extended_series = extended_series.sort_values("real_date")

        # Label the combined series once, before any boolean filtering below, so
        # that no column is ever assigned on a filtered slice of another frame.
        extended_series["xcat"] = new_xcat
        extended_series["cid"] = cid

        if backfill:
            valid = extended_series.dropna(subset=["value"])
            if not valid.empty:
                first_valid_date = valid["real_date"].min()
                first_valid_value = valid.loc[
                    valid["real_date"] == first_valid_date, "value"
                ].iloc[0]
                if first_valid_date > start:
                    # Repeat the first valid value on every business day from
                    # `start` up to the day before the first valid observation.
                    backfilled_data = pd.DataFrame(
                        {
                            "real_date": pd.bdate_range(
                                start=start,
                                end=first_valid_date - pd.Timedelta(days=1),
                            ),
                            "value": first_valid_value,
                            "cid": cid,
                            "xcat": new_xcat,
                        }
                    )
                    # Drop leading rows with missing values; the backfilled
                    # block takes their place.
                    extended_series = extended_series[
                        extended_series["real_date"] >= first_valid_date
                    ]
                    extended_series = pd.concat([backfilled_data, extended_series])
        elif start is not None:
            extended_series = extended_series[extended_series["real_date"] >= start]

        extended_results.append(extended_series)

    extended_df = pd.concat(extended_results, ignore_index=True)
    extended_df = extended_df.sort_values(["cid", "real_date"])

    return QuantamentalDataFrame(extended_df, categorical=result_as_categorical)


def _extend_history_checks(
    df: pd.DataFrame,
    new_xcat: str,
    cids: Optional[List[str]] = None,
    hierarchy: List[str] = [],
    backfill: bool = False,
    start: str = None,
):
    """
    Validate the arguments passed to `extend_history`.

    The checks run in a fixed order and stop at the first failure, so only one
    problem is reported per call. Nothing is returned when all checks pass.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing categories that are to be extended.
    new_xcat : str
        The name of the new xcat.
    cids : List[str], optional
        The cross sections to extend. Only validated when not None.
    hierarchy : List[str]
        List of categories from best to worst for representation of the concept.
    backfill : bool, optional
        Whether the new xcat is to be backfilled to the 'start' date.
    start : str, optional
        The start date of the new xcat. Mandatory when `backfill` is True.

    Raises
    ------
    TypeError
        If `df` is not a pandas DataFrame, `new_xcat` is not a string, `cids` is
        neither None nor a list of strings, `hierarchy` is not a list of strings,
        `backfill` is not a boolean, or `start` is neither None nor a string.
    ValueError
        If `backfill` is True and `start` is None.
    """

    def _require(is_valid: bool, message: str) -> None:
        # Single place where a failed type check is turned into a TypeError.
        if not is_valid:
            raise TypeError(message)

    _require(isinstance(df, pd.DataFrame), "df must be a pandas DataFrame")
    _require(isinstance(new_xcat, str), "new_xcat must be a string")

    if cids is not None:
        # The container check must pass before the elements are inspected.
        _require(isinstance(cids, list), "cids must be a list")
        _require(
            not any(not isinstance(cid, str) for cid in cids),
            "cids must be a list of strings",
        )

    _require(isinstance(hierarchy, list), "hierarchy must be a list")
    _require(isinstance(backfill, bool), "backfill must be a boolean")
    _require(start is None or isinstance(start, str), "start must be a string")
    _require(
        not any(not isinstance(category, str) for category in hierarchy),
        "hierarchy must be a list of strings",
    )

    # Backfilling needs a target date to fill back to.
    if start is None and backfill:
        raise ValueError("start must be provided if backfill is True")


if __name__ == "__main__":
    # Small demo: simulate a panel with two inflation categories and splice
    # them into one category with a longer history.
    cids = ["AUD", "CAD", "GBP", "USD", "NZD"]
    xcats = ["INFL", "INFL0"]

    # Simulation settings per cross section. "EUR" is added to the table although
    # it is not in `cids`, so it is simulated but not requested for extension.
    cid_settings = {
        "AUD": ["2010-01-01", "2020-12-31", 0.5, 2],
        "CAD": ["2011-01-01", "2020-11-30", 0, 1],
        "GBP": ["2012-01-01", "2020-11-30", -0.2, 0.5],
        "USD": ["2010-01-01", "2020-12-30", -0.2, 0.5],
        "NZD": ["2002-01-01", "2020-09-30", -0.1, 2],
        "EUR": ["2002-01-01", "2020-09-30", -0.2, 2],
    }
    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    for cid_name, cid_row in cid_settings.items():
        df_cids.loc[cid_name] = cid_row

    # Simulation settings per category: "INFL0" starts ten years before "INFL".
    xcat_settings = {
        "INFL": ["2010-01-01", "2020-09-30", 1, 2, 0.8, 0.5],
        "INFL0": ["2000-01-01", "2020-09-30", 1, 3, 0.5, 0.2],
    }
    df_xcats = pd.DataFrame(
        index=xcats,
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat_name, xcat_row in xcat_settings.items():
        df_xcats.loc[xcat_name] = xcat_row

    # Fixed seed so that the simulated panel is reproducible.
    np.random.seed(0)
    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    # "INFL" is the preferred category; "INFL0" only supplies the earlier history.
    df = extend_history(
        df=dfd,
        new_xcat="INFL1",
        cids=cids,
        hierarchy=["INFL", "INFL0"],
        backfill=False,
        start=None,
    )