Source code for macrosynergy.management.utils.check_availability

"""
Module for checking the availability of data availabity from a Quantamental DataFrame.
Includes functions for checking start years and end dates of a DataFrame, as well as
visualizing the results.
"""

import numpy as np
import pandas as pd
from typing import List, Tuple

from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import reduce_df
from macrosynergy.management.types import QuantamentalDataFrame
import macrosynergy.visuals as msv


[docs]def check_availability(
    df: pd.DataFrame,
    xcats: List[str] = None,
    cids: List[str] = None,
    start: str = None,
    start_size: Tuple[float] = None,
    end_size: Tuple[float] = None,
    start_years: bool = True,
    missing_recent: bool = True,
    use_last_businessday: bool = True,
    title: str = None,
    title_fontsize: int = None,
    xcat_labels: dict = None,
    sort_labels: bool = False,
):
    """
    Wrapper for visualizing start and end dates of a filtered DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.
    xcats : List[str]
        extended categories to be checked on. Default is all in the DataFrame.
    cids : List[str]
        cross sections to be checked on. Default is all in the DataFrame.
    start : str
        string representing earliest considered date. Default is None, which reverts to
        earliest date in the dataframe.
    start_size : Tuple[float]
        tuple of floats with width / length of the start years heatmap. Default is None
        (format adjusted to data).
    end_size : Tuple[float]
        tuple of floats with width/length of the end dates heatmap. Default is None
        (format adjusted to data).
    start_years : bool
        boolean indicating whether or not to display a chart of starting years for each
        cross-section and indicator. Default is True (display start years).
    missing_recent : bool
        boolean indicating whether or not to display a chart of missing date numbers for
        each cross-section and indicator. Default is True (display missing days).
    use_last_businessday : bool
        boolean indicating whether or not to use the last business day before today as
        the end date. Default is True.
    title : str
        A string to be used as the title of the heatmap. If None, a default header will be
        used.
    title_fontsize : int
        Font size for the title of the heatmap. Default is None (automatic sizing).
    xcat_labels : dict
        dictionary with xcat labels. Default is None (no labels).
    sort_labels : bool
        boolean indicating whether to sort the `xcats` in the heatmap alphabetically.
        The sorting is done based on the `xcats` list, with the labels from `xcat_labels`
        simply used for display (not regarded for sorting at all). Default is False (no
        sorting, ordered as provided in `xcats`).
    """

    for bvar, varname in zip(
        [start_years, missing_recent, sort_labels],
        ["start_years", "missing_recent", "sort_labels"],
    ):
        if not isinstance(bvar, bool):
            raise TypeError(f"`{varname}` must be a `bool` and not {type(bvar)}.")

    df = QuantamentalDataFrame(df)

    dfx = reduce_df(df, xcats=xcats, cids=cids, start=start)

    if xcats is None:
        xcats = sorted(dfx["xcat"].unique())

    if sort_labels:
        xcats = sorted(xcats)

    if xcat_labels is not None:
        dfx = dfx.rename_xcats(xcat_labels)

    if dfx.empty:
        raise ValueError(
            "No data available for the selected cross-sections and categories."
        )
    if start_years:
        dfs = check_startyears(dfx)
        row_order = get_heatmap_row_order(xcats=xcats, xcat_labels=xcat_labels)
        visual_paneldates(
            dfs,
            size=start_size,
            use_last_businessday=use_last_businessday,
            title=title,
            title_fontsize=title_fontsize,
            row_order=row_order,
        )
    if missing_recent:
        dfe = check_enddates(dfx)
        row_order = get_heatmap_row_order(xcats=xcats, xcat_labels=xcat_labels)
        visual_paneldates(
            dfe,
            size=end_size,
            use_last_businessday=use_last_businessday,
            title=title,
            title_fontsize=title_fontsize,
            row_order=row_order,
        )


[docs]def missing_in_df(
    df: QuantamentalDataFrame,
    xcats: List[str] = None,
    cids: List[str] = None,
):
    """
    Print missing cross-sections and categories

    Parameters
    ----------
    df : QuantamentalDataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.
    xcats : List[str]
        extended categories to be checked on. Default is all in the DataFrame.
    cids : List[str]
        cross sections to be checked on. Default is all in the DataFrame.
    """

    if not isinstance(df, QuantamentalDataFrame):
        raise TypeError("`df` must be a QuantamentalDataFrame/pd.DataFrame")

    if df.empty:
        raise ValueError("`df` is empty.")

    for lst, name in zip([xcats, cids], ["xcats", "cids"]):
        if (lst is not None) and not (
            isinstance(lst, list) and all(isinstance(x, str) for x in lst)
        ):
            raise TypeError(f"`{name}` should be a `List[str]` and not {type(lst)}.")

    missing_across_df = list(set(xcats) - set(df["xcat"]))
    if len(missing_across_df) > 0:
        print("Missing XCATs across DataFrame: ", missing_across_df)
    else:
        print("No missing XCATs across DataFrame.")

    cids = df["cid"].unique() if cids is None else cids
    xcats_used = sorted(list(set(xcats).intersection(set(df["xcat"]))))
    if len(xcats_used) == 0:
        print("No XCATs found in the DataFrame.")
        return

    max_xcat_len = max(map(len, xcats_used))
    for xcat in xcats_used:
        cids_xcat = df.loc[df["xcat"] == xcat, "cid"].unique()
        missing_cids = sorted(set(cids) - set(cids_xcat))
        msg = f"Missing cids for {xcat}: " + " " * (max_xcat_len - len(xcat))
        print(msg, missing_cids)


[docs]def check_startyears(df: pd.DataFrame) -> pd.DataFrame:
    """
    DataFrame with starting years across all extended categories and cross-sections

    Parameters
    ----------
    df : pd.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.

    Returns
    -------
    pd.DataFrame
        DataFrame consisting of starting years for all series.
    """

    df: pd.DataFrame = df.copy()
    df = df.dropna(how="any")
    df_starts = (
        df[["cid", "xcat", "real_date"]].groupby(["cid", "xcat"], observed=True).min()
    )
    df_starts["real_date"] = pd.DatetimeIndex(df_starts.loc[:, "real_date"]).year

    return df_starts.unstack().loc[:, "real_date"].astype(int, errors="ignore")


[docs]def check_enddates(df: pd.DataFrame) -> pd.DataFrame:
    """
    DataFrame with end dates across all extended categories and cross sections.

    Parameters
    ----------
    df : pd.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.

    Returns
    -------
    pd.DataFrame
        DataFrame consisting of end dates for all series.
    """

    df: pd.DataFrame = df.copy()
    df = df.dropna(how="any")
    df_ends = (
        df[["cid", "xcat", "real_date"]].groupby(["cid", "xcat"], observed=True).max()
    )
    df_ends["real_date"] = df_ends["real_date"].dt.strftime("%Y-%m-%d")

    return df_ends.unstack().loc[:, "real_date"]


[docs]def business_day_dif(df: pd.DataFrame, maxdate: pd.Timestamp) -> pd.DataFrame:
    """
    Number of business days between two respective business dates.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame cross-sections rows and category columns. Each cell in the DataFrame
        will correspond to the start date of the respective series.
    maxdate : pd.Timestamp
        maximum release date found in the received DataFrame. In principle, all series
        should have values up until the respective business date. The difference will
        represent possible missing values.

    Returns
    -------
    pd.DataFrame
        DataFrame consisting of business day differences for all series.
    """

    year_df = (maxdate.year - df.apply(lambda x: x.dt.isocalendar().year)) * 52
    week_df = maxdate.week - df.apply(lambda x: x.dt.isocalendar().week)
    # Account for difference over a year.
    week_df += year_df
    # Account for weekends.
    week_df *= 2
    df = (maxdate - df).apply(lambda x: x.dt.days)
    # set to zero if the difference is negative.
    df = df - week_df
    return df.where(df >= 0, 0)


[docs]def get_heatmap_row_order(xcats: List[str], xcat_labels: dict = None) -> List[str]:
    if not xcat_labels:
        return xcats
    missing = set(xcats) - set(xcat_labels.keys())
    if missing:
        raise ValueError(
            f"Missing labels for xcats: {sorted(missing)}. "
            "Ensure all specified `xcats` are present in the `xcat_labels` dictionary."
        )

    return [xcat_labels[xcat] for xcat in xcats] if xcat_labels else xcats


[docs]def visual_paneldates(
    df: pd.DataFrame,
    size: Tuple[float] = None,
    use_last_businessday: bool = True,
    title: str = None,
    row_order: List[str] = None,
    title_fontsize: int = None,
):
    """
    Visualize panel dates with color codes.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame cross sections rows and category columns.
    size : Tuple[float]
        tuple of floats with width/length of displayed heatmap.
    use_last_businessday : bool
        boolean indicating whether or not to use the last business day before today as
        the end date. Default is True.
    title : str
        A string to be used as the title of the heatmap. If None, a default header will be
        used.
    title_fontsize : int
        Font size for the title of the heatmap. Default is None (automatic sizing).
    row_order : List[str]
        A list of strings specifying the order of rows in the heatmap. These rows
        correspond to the columns of the input DataFrame. If None, the default order
        used by Seaborn will be applied.
    """

    msv.view_panel_dates(
        df=df,
        size=size,
        use_last_businessday=use_last_businessday,
        header=title,
        row_order=row_order,
        title_fontsize=title_fontsize,
    )


if __name__ == "__main__":
    cids = ["AUD", "CAD", "GBP"]
    xcats = ["XR", "CRY"]

    cols_1 = ["earliest", "latest", "mean_add", "sd_mult"]
    df_cids = pd.DataFrame(index=cids, columns=cols_1)
    df_cids.loc["AUD",] = ["2010-01-01", "2020-12-31", 0.5, 2]
    df_cids.loc["CAD",] = ["2010-01-01", "2020-11-30", 0, 1]
    df_cids.loc["GBP",] = ["2012-01-01", "2020-11-30", -0.2, 0.5]

    cols_2 = cols_1 + ["ar_coef", "back_coef"]
    df_xcats = pd.DataFrame(index=xcats, columns=cols_2)
    df_xcats.loc["XR",] = ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3]
    df_xcats.loc["CRY",] = ["2011-01-01", "2020-10-30", 1, 2, 0.9, 0.5]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    filt_na = (dfd["cid"] == "CAD") & (dfd["real_date"] < "2011-01-01")
    dfd.loc[filt_na, "value"] = np.nan

    xxcats = xcats + ["TREND"]
    xxcids = cids + ["USD"]

    check_availability(
        df=dfd,
        xcats=xcats,
        cids=cids,
        start_size=(10, 5),
        end_size=(10, 8),
        xcat_labels={"XR": "Exchange Rate", "CRY": "Commodity"},
    )