Source code for macrosynergy.management.utils.check_availability

"""
Module for checking the availability of data in a Quantamental DataFrame.
Includes functions for checking start years and end dates of a DataFrame, as well as
visualizing the results.
"""

import numpy as np
import pandas as pd
from typing import List, Tuple

from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import reduce_df
from macrosynergy.management.types import QuantamentalDataFrame
import macrosynergy.visuals as msv


def check_availability(
    df: pd.DataFrame,
    xcats: List[str] = None,
    cids: List[str] = None,
    start: str = None,
    start_size: Tuple[float] = None,
    end_size: Tuple[float] = None,
    start_years: bool = True,
    missing_recent: bool = True,
    use_last_businessday: bool = True,
    title: str = None,
    title_fontsize: int = None,
    xcat_labels: dict = None,
    sort_labels: bool = False,
):
    """
    Display heatmaps of the start years and end dates of a filtered panel.

    The input is converted to a `QuantamentalDataFrame`, filtered with
    `reduce_df`, optionally relabelled with `xcat_labels`, and then passed to
    `check_startyears` / `check_enddates`, whose results are plotted with
    `visual_paneldates`.

    Parameters
    ----------
    df : pd.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.
    xcats : List[str]
        extended categories to be checked on. Default is all in the DataFrame.
    cids : List[str]
        cross sections to be checked on. Default is all in the DataFrame.
    start : str
        string representing earliest considered date. Default is None, which reverts
        to earliest date in the dataframe.
    start_size : Tuple[float]
        tuple of floats with width / length of the start years heatmap. Default is
        None (format adjusted to data).
    end_size : Tuple[float]
        tuple of floats with width / length of the end dates heatmap. Default is None
        (format adjusted to data).
    start_years : bool
        whether to display a chart of starting years for each cross-section and
        indicator. Default is True.
    missing_recent : bool
        whether to display a chart of missing date numbers for each cross-section and
        indicator. Default is True.
    use_last_businessday : bool
        whether to use the last business day before today as the end date. Default
        is True.
    title : str
        title passed to both heatmaps. If None, a default header will be used.
    title_fontsize : int
        font size for the heatmap title. Default is None (automatic sizing).
    xcat_labels : dict
        dictionary mapping xcats to display labels. Default is None (no labels).
    sort_labels : bool
        whether to sort the `xcats` alphabetically. Sorting is based on the `xcats`
        list itself; the labels in `xcat_labels` are only used for display. Default
        is False (ordered as provided in `xcats`).

    Raises
    ------
    TypeError
        if `start_years`, `missing_recent` or `sort_labels` is not a `bool`.
    ValueError
        if no data remains after filtering.
    """
    flag_values = {
        "start_years": start_years,
        "missing_recent": missing_recent,
        "sort_labels": sort_labels,
    }
    for varname, bvar in flag_values.items():
        if not isinstance(bvar, bool):
            raise TypeError(f"`{varname}` must be a `bool` and not {type(bvar)}.")

    dfx = reduce_df(QuantamentalDataFrame(df), xcats=xcats, cids=cids, start=start)

    # Without an explicit selection, fall back to every category left after filtering.
    if xcats is None:
        xcats = sorted(dfx["xcat"].unique())
    if sort_labels:
        xcats = sorted(xcats)

    if xcat_labels is not None:
        dfx = dfx.rename_xcats(xcat_labels)

    if dfx.empty:
        raise ValueError(
            "No data available for the selected cross-sections and categories."
        )

    # (enabled?, table builder, figure size) for each of the two heatmaps.
    panels = (
        (start_years, check_startyears, start_size),
        (missing_recent, check_enddates, end_size),
    )
    for enabled, build_table, panel_size in panels:
        if not enabled:
            continue
        visual_paneldates(
            build_table(dfx),
            size=panel_size,
            use_last_businessday=use_last_businessday,
            title=title,
            title_fontsize=title_fontsize,
            row_order=get_heatmap_row_order(xcats=xcats, xcat_labels=xcat_labels),
        )
def missing_in_df(
    df: QuantamentalDataFrame,
    xcats: List[str] = None,
    cids: List[str] = None,
):
    """
    Print missing cross-sections and categories.

    First reports which of the requested `xcats` do not occur in `df` at all,
    then, for every requested category that does occur, prints the requested
    `cids` for which that category has no rows.

    Parameters
    ----------
    df : QuantamentalDataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.
    xcats : List[str]
        extended categories to be checked on. Default is all in the DataFrame.
    cids : List[str]
        cross sections to be checked on. Default is all in the DataFrame.

    Raises
    ------
    TypeError
        if `df` is not a QuantamentalDataFrame, or if `xcats` / `cids` is given but
        is not a list of strings.
    ValueError
        if `df` is empty.
    """
    if not isinstance(df, QuantamentalDataFrame):
        raise TypeError("`df` must be a QuantamentalDataFrame/pd.DataFrame")

    if df.empty:
        raise ValueError("`df` is empty.")

    for lst, name in zip([xcats, cids], ["xcats", "cids"]):
        if (lst is not None) and not (
            isinstance(lst, list) and all(isinstance(x, str) for x in lst)
        ):
            raise TypeError(f"`{name}` should be a `List[str]` and not {type(lst)}.")

    # Deduplicate once via `unique()` instead of iterating over every row.
    available_xcats = set(df["xcat"].unique())

    # `xcats=None` means "all categories in the DataFrame"; without this default
    # the set operations below would raise a TypeError on None.
    if xcats is None:
        xcats = sorted(available_xcats)
    requested_xcats = set(xcats)

    # Sorted so the printed list is deterministic.
    missing_across_df = sorted(requested_xcats - available_xcats)
    if missing_across_df:
        print("Missing XCATs across DataFrame: ", missing_across_df)
    else:
        print("No missing XCATs across DataFrame.")

    cids = df["cid"].unique() if cids is None else cids
    xcats_used = sorted(requested_xcats & available_xcats)
    if not xcats_used:
        print("No XCATs found in the DataFrame.")
        return

    # Pad the messages so the lists of missing cids line up in one column.
    max_xcat_len = max(map(len, xcats_used))
    requested_cids = set(cids)
    for xcat in xcats_used:
        cids_xcat = df.loc[df["xcat"] == xcat, "cid"].unique()
        missing_cids = sorted(requested_cids - set(cids_xcat))
        msg = f"Missing cids for {xcat}: " + " " * (max_xcat_len - len(xcat))
        print(msg, missing_cids)
def check_startyears(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a table of the first observed year for every (cid, xcat) series.

    Parameters
    ----------
    df : pd.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.

    Returns
    -------
    pd.DataFrame
        DataFrame indexed by 'cid' with one column per 'xcat', holding the year of
        the earliest 'real_date' among rows without any missing value. The years are
        cast to int where possible; if the cast fails the frame is returned as is.
    """
    group_keys = ["cid", "xcat"]

    # Rows with a missing entry in any column do not count as available data.
    complete_rows = df.copy().dropna(how="any")

    first_dates = (
        complete_rows[group_keys + ["real_date"]]
        .groupby(group_keys, observed=True)
        .min()
    )
    first_dates["real_date"] = pd.DatetimeIndex(first_dates.loc[:, "real_date"]).year

    # Pivot the inner group level ('xcat') into columns.
    wide = first_dates.unstack()
    start_years = wide.loc[:, "real_date"]
    return start_years.astype(int, errors="ignore")
def check_enddates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a table of the last observed date for every (cid, xcat) series.

    Parameters
    ----------
    df : pd.DataFrame
        standardized DataFrame with the following necessary columns: 'cid', 'xcat',
        'real_date'.

    Returns
    -------
    pd.DataFrame
        DataFrame indexed by 'cid' with one column per 'xcat', holding the latest
        'real_date' among rows without any missing value, formatted as a
        'YYYY-MM-DD' string.
    """
    group_keys = ["cid", "xcat"]

    # Rows with a missing entry in any column do not count as available data.
    complete_rows = df.copy().dropna(how="any")

    last_dates = (
        complete_rows[group_keys + ["real_date"]]
        .groupby(group_keys, observed=True)
        .max()
    )
    last_dates["real_date"] = last_dates["real_date"].dt.strftime("%Y-%m-%d")

    # Pivot the inner group level ('xcat') into columns.
    wide = last_dates.unstack()
    return wide.loc[:, "real_date"]
def business_day_dif(df: pd.DataFrame, maxdate: pd.Timestamp) -> pd.DataFrame:
    """
    Number of business days between each date in `df` and `maxdate`.

    Weekdays (Monday to Friday) are counted from each cell's date up to, but not
    including, `maxdate`. The count is done with `numpy.busday_count` rather than
    with ISO week arithmetic, so that dates around year boundaries and in 53-week
    ISO years are handled correctly.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with cross-sections as rows and categories as columns. Each cell
        holds the date of the respective series; values are parsed with
        `pd.to_datetime`.
    maxdate : pd.Timestamp
        maximum release date found in the received DataFrame. In principle, all
        series should have values up until the respective business date. The
        difference will represent possible missing values.

    Returns
    -------
    pd.DataFrame
        DataFrame of non-negative integer business day differences with the same
        index and columns as `df`. Cells dated after `maxdate` and cells with a
        missing date are set to zero.
    """
    end_day = pd.Timestamp(maxdate).to_datetime64().astype("datetime64[D]")

    def _column_gap(col: pd.Series) -> pd.Series:
        # Business-day gap for one category column.
        dates = pd.to_datetime(col)
        known = dates.notna().to_numpy()
        gaps = np.zeros(len(dates), dtype=np.int64)
        if known.any():
            # `busday_count` rejects NaT, so only the known dates are passed in.
            start_days = dates.to_numpy()[known].astype("datetime64[D]")
            gaps[known] = np.busday_count(start_days, end_day)
        # Dates beyond `maxdate` give negative counts; report them as zero.
        return pd.Series(np.maximum(gaps, 0), index=col.index)

    return df.apply(_column_gap)
def get_heatmap_row_order(xcats: List[str], xcat_labels: dict = None) -> List[str]:
    """
    Translate an ordered list of xcats into the row order used by the heatmap.

    Parameters
    ----------
    xcats : List[str]
        extended categories in the desired display order.
    xcat_labels : dict
        dictionary mapping xcats to display labels. If None or empty, `xcats` is
        returned unchanged.

    Returns
    -------
    List[str]
        `xcats` itself when no labels are given, otherwise the label of each xcat
        in the order of `xcats`.

    Raises
    ------
    ValueError
        if labels are given but at least one entry of `xcats` has no label.
    """
    if not xcat_labels:
        return xcats

    missing = set(xcats) - set(xcat_labels)
    if missing:
        raise ValueError(
            f"Missing labels for xcats: {sorted(missing)}. "
            "Ensure all specified `xcats` are present in the `xcat_labels` dictionary."
        )

    return [xcat_labels[xcat] for xcat in xcats]
def visual_paneldates(
    df: pd.DataFrame,
    size: Tuple[float] = None,
    use_last_businessday: bool = True,
    title: str = None,
    row_order: List[str] = None,
    title_fontsize: int = None,
):
    """
    Visualize panel dates with color codes.

    Thin wrapper that forwards all arguments to `msv.view_panel_dates`, passing
    `title` as its `header` argument.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with cross sections as rows and categories as columns.
    size : Tuple[float]
        tuple of floats with width / length of the displayed heatmap.
    use_last_businessday : bool
        whether to use the last business day before today as the end date. Default
        is True.
    title : str
        title of the heatmap. If None, a default header will be used.
    row_order : List[str]
        order of the rows in the heatmap. These rows correspond to the columns of
        the input DataFrame. If None, the default order used by Seaborn will be
        applied.
    title_fontsize : int
        font size for the heatmap title. Default is None (automatic sizing).
    """
    plot_options = {
        "df": df,
        "size": size,
        "use_last_businessday": use_last_businessday,
        "header": title,
        "row_order": row_order,
        "title_fontsize": title_fontsize,
    }
    msv.view_panel_dates(**plot_options)
if __name__ == "__main__":
    # Demo: simulate a small panel, blank out part of it, and plot availability.
    cids = ["AUD", "CAD", "GBP"]
    xcats = ["XR", "CRY"]

    # Simulation parameters per cross-section, passed to `make_qdf`.
    cols_1 = ["earliest", "latest", "mean_add", "sd_mult"]
    cid_params = {
        "AUD": ["2010-01-01", "2020-12-31", 0.5, 2],
        "CAD": ["2010-01-01", "2020-11-30", 0, 1],
        "GBP": ["2012-01-01", "2020-11-30", -0.2, 0.5],
    }
    df_cids = pd.DataFrame(index=cids, columns=cols_1)
    for cid_key, cid_row in cid_params.items():
        df_cids.loc[cid_key,] = cid_row

    # Simulation parameters per category, passed to `make_qdf`.
    cols_2 = cols_1 + ["ar_coef", "back_coef"]
    xcat_params = {
        "XR": ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3],
        "CRY": ["2011-01-01", "2020-10-30", 1, 2, 0.9, 0.5],
    }
    df_xcats = pd.DataFrame(index=xcats, columns=cols_2)
    for xcat_key, xcat_row in xcat_params.items():
        df_xcats.loc[xcat_key,] = xcat_row

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)

    # Blank out CAD values before 2011 so those rows carry NaN values.
    filt_na = (dfd["cid"] == "CAD") & (dfd["real_date"] < "2011-01-01")
    dfd.loc[filt_na, "value"] = np.nan

    # NOTE(review): `xxcats` / `xxcids` are defined but not used below.
    xxcats = xcats + ["TREND"]
    xxcids = cids + ["USD"]

    check_availability(
        df=dfd,
        xcats=xcats,
        cids=cids,
        start_size=(10, 5),
        end_size=(10, 8),
        xcat_labels={"XR": "Exchange Rate", "CRY": "Commodity"},
    )