"""
Module for checking the availability of data availabity from a Quantamental DataFrame.
Includes functions for checking start years and end dates of a DataFrame, as well as
visualizing the results.
"""
import numpy as np
import pandas as pd
from typing import List, Tuple
from macrosynergy.management.simulate import make_qdf
from macrosynergy.management.utils import reduce_df
from macrosynergy.management.types import QuantamentalDataFrame
import macrosynergy.visuals as msv
[docs]def check_availability(
df: pd.DataFrame,
xcats: List[str] = None,
cids: List[str] = None,
start: str = None,
start_size: Tuple[float] = None,
end_size: Tuple[float] = None,
start_years: bool = True,
missing_recent: bool = True,
use_last_businessday: bool = True,
title: str = None,
title_fontsize: int = None,
xcat_labels: dict = None,
sort_labels: bool = False,
):
"""
Wrapper for visualizing start and end dates of a filtered DataFrame.
Parameters
----------
df : pd.DataFrame
standardized DataFrame with the following necessary columns: 'cid', 'xcat',
'real_date'.
xcats : List[str]
extended categories to be checked on. Default is all in the DataFrame.
cids : List[str]
cross sections to be checked on. Default is all in the DataFrame.
start : str
string representing earliest considered date. Default is None, which reverts to
earliest date in the dataframe.
start_size : Tuple[float]
tuple of floats with width / length of the start years heatmap. Default is None
(format adjusted to data).
end_size : Tuple[float]
tuple of floats with width/length of the end dates heatmap. Default is None
(format adjusted to data).
start_years : bool
boolean indicating whether or not to display a chart of starting years for each
cross-section and indicator. Default is True (display start years).
missing_recent : bool
boolean indicating whether or not to display a chart of missing date numbers for
each cross-section and indicator. Default is True (display missing days).
use_last_businessday : bool
boolean indicating whether or not to use the last business day before today as
the end date. Default is True.
title : str
A string to be used as the title of the heatmap. If None, a default header will be
used.
title_fontsize : int
Font size for the title of the heatmap. Default is None (automatic sizing).
xcat_labels : dict
dictionary with xcat labels. Default is None (no labels).
sort_labels : bool
boolean indicating whether to sort the `xcats` in the heatmap alphabetically.
The sorting is done based on the `xcats` list, with the labels from `xcat_labels`
simply used for display (not regarded for sorting at all). Default is False (no
sorting, ordered as provided in `xcats`).
"""
for bvar, varname in zip(
[start_years, missing_recent, sort_labels],
["start_years", "missing_recent", "sort_labels"],
):
if not isinstance(bvar, bool):
raise TypeError(f"`{varname}` must be a `bool` and not {type(bvar)}.")
df = QuantamentalDataFrame(df)
dfx = reduce_df(df, xcats=xcats, cids=cids, start=start)
if xcats is None:
xcats = sorted(dfx["xcat"].unique())
if sort_labels:
xcats = sorted(xcats)
if xcat_labels is not None:
dfx = dfx.rename_xcats(xcat_labels)
if dfx.empty:
raise ValueError(
"No data available for the selected cross-sections and categories."
)
if start_years:
dfs = check_startyears(dfx)
row_order = get_heatmap_row_order(xcats=xcats, xcat_labels=xcat_labels)
visual_paneldates(
dfs,
size=start_size,
use_last_businessday=use_last_businessday,
title=title,
title_fontsize=title_fontsize,
row_order=row_order,
)
if missing_recent:
dfe = check_enddates(dfx)
row_order = get_heatmap_row_order(xcats=xcats, xcat_labels=xcat_labels)
visual_paneldates(
dfe,
size=end_size,
use_last_businessday=use_last_businessday,
title=title,
title_fontsize=title_fontsize,
row_order=row_order,
)
[docs]def missing_in_df(
df: QuantamentalDataFrame,
xcats: List[str] = None,
cids: List[str] = None,
):
"""
Print missing cross-sections and categories
Parameters
----------
df : QuantamentalDataFrame
standardized DataFrame with the following necessary columns: 'cid', 'xcat',
'real_date'.
xcats : List[str]
extended categories to be checked on. Default is all in the DataFrame.
cids : List[str]
cross sections to be checked on. Default is all in the DataFrame.
"""
if not isinstance(df, QuantamentalDataFrame):
raise TypeError("`df` must be a QuantamentalDataFrame/pd.DataFrame")
if df.empty:
raise ValueError("`df` is empty.")
for lst, name in zip([xcats, cids], ["xcats", "cids"]):
if (lst is not None) and not (
isinstance(lst, list) and all(isinstance(x, str) for x in lst)
):
raise TypeError(f"`{name}` should be a `List[str]` and not {type(lst)}.")
missing_across_df = list(set(xcats) - set(df["xcat"]))
if len(missing_across_df) > 0:
print("Missing XCATs across DataFrame: ", missing_across_df)
else:
print("No missing XCATs across DataFrame.")
cids = df["cid"].unique() if cids is None else cids
xcats_used = sorted(list(set(xcats).intersection(set(df["xcat"]))))
if len(xcats_used) == 0:
print("No XCATs found in the DataFrame.")
return
max_xcat_len = max(map(len, xcats_used))
for xcat in xcats_used:
cids_xcat = df.loc[df["xcat"] == xcat, "cid"].unique()
missing_cids = sorted(set(cids) - set(cids_xcat))
msg = f"Missing cids for {xcat}: " + " " * (max_xcat_len - len(xcat))
print(msg, missing_cids)
[docs]def check_startyears(df: pd.DataFrame) -> pd.DataFrame:
"""
DataFrame with starting years across all extended categories and cross-sections
Parameters
----------
df : pd.DataFrame
standardized DataFrame with the following necessary columns: 'cid', 'xcat',
'real_date'.
Returns
-------
pd.DataFrame
DataFrame consisting of starting years for all series.
"""
df: pd.DataFrame = df.copy()
df = df.dropna(how="any")
df_starts = (
df[["cid", "xcat", "real_date"]].groupby(["cid", "xcat"], observed=True).min()
)
df_starts["real_date"] = pd.DatetimeIndex(df_starts.loc[:, "real_date"]).year
return df_starts.unstack().loc[:, "real_date"].astype(int, errors="ignore")
[docs]def check_enddates(df: pd.DataFrame) -> pd.DataFrame:
"""
DataFrame with end dates across all extended categories and cross sections.
Parameters
----------
df : pd.DataFrame
standardized DataFrame with the following necessary columns: 'cid', 'xcat',
'real_date'.
Returns
-------
pd.DataFrame
DataFrame consisting of end dates for all series.
"""
df: pd.DataFrame = df.copy()
df = df.dropna(how="any")
df_ends = (
df[["cid", "xcat", "real_date"]].groupby(["cid", "xcat"], observed=True).max()
)
df_ends["real_date"] = df_ends["real_date"].dt.strftime("%Y-%m-%d")
return df_ends.unstack().loc[:, "real_date"]
[docs]def business_day_dif(df: pd.DataFrame, maxdate: pd.Timestamp) -> pd.DataFrame:
"""
Number of business days between two respective business dates.
Parameters
----------
df : pd.DataFrame
DataFrame cross-sections rows and category columns. Each cell in the DataFrame
will correspond to the start date of the respective series.
maxdate : pd.Timestamp
maximum release date found in the received DataFrame. In principle, all series
should have values up until the respective business date. The difference will
represent possible missing values.
Returns
-------
pd.DataFrame
DataFrame consisting of business day differences for all series.
"""
year_df = (maxdate.year - df.apply(lambda x: x.dt.isocalendar().year)) * 52
week_df = maxdate.week - df.apply(lambda x: x.dt.isocalendar().week)
# Account for difference over a year.
week_df += year_df
# Account for weekends.
week_df *= 2
df = (maxdate - df).apply(lambda x: x.dt.days)
# set to zero if the difference is negative.
df = df - week_df
return df.where(df >= 0, 0)
[docs]def get_heatmap_row_order(xcats: List[str], xcat_labels: dict = None) -> List[str]:
if not xcat_labels:
return xcats
missing = set(xcats) - set(xcat_labels.keys())
if missing:
raise ValueError(
f"Missing labels for xcats: {sorted(missing)}. "
"Ensure all specified `xcats` are present in the `xcat_labels` dictionary."
)
return [xcat_labels[xcat] for xcat in xcats] if xcat_labels else xcats
[docs]def visual_paneldates(
df: pd.DataFrame,
size: Tuple[float] = None,
use_last_businessday: bool = True,
title: str = None,
row_order: List[str] = None,
title_fontsize: int = None,
):
"""
Visualize panel dates with color codes.
Parameters
----------
df : pd.DataFrame
DataFrame cross sections rows and category columns.
size : Tuple[float]
tuple of floats with width/length of displayed heatmap.
use_last_businessday : bool
boolean indicating whether or not to use the last business day before today as
the end date. Default is True.
title : str
A string to be used as the title of the heatmap. If None, a default header will be
used.
title_fontsize : int
Font size for the title of the heatmap. Default is None (automatic sizing).
row_order : List[str]
A list of strings specifying the order of rows in the heatmap. These rows
correspond to the columns of the input DataFrame. If None, the default order
used by Seaborn will be applied.
"""
msv.view_panel_dates(
df=df,
size=size,
use_last_businessday=use_last_businessday,
header=title,
row_order=row_order,
title_fontsize=title_fontsize,
)
if __name__ == "__main__":
cids = ["AUD", "CAD", "GBP"]
xcats = ["XR", "CRY"]
cols_1 = ["earliest", "latest", "mean_add", "sd_mult"]
df_cids = pd.DataFrame(index=cids, columns=cols_1)
df_cids.loc["AUD",] = ["2010-01-01", "2020-12-31", 0.5, 2]
df_cids.loc["CAD",] = ["2010-01-01", "2020-11-30", 0, 1]
df_cids.loc["GBP",] = ["2012-01-01", "2020-11-30", -0.2, 0.5]
cols_2 = cols_1 + ["ar_coef", "back_coef"]
df_xcats = pd.DataFrame(index=xcats, columns=cols_2)
df_xcats.loc["XR",] = ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3]
df_xcats.loc["CRY",] = ["2011-01-01", "2020-10-30", 1, 2, 0.9, 0.5]
dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
filt_na = (dfd["cid"] == "CAD") & (dfd["real_date"] < "2011-01-01")
dfd.loc[filt_na, "value"] = np.nan
xxcats = xcats + ["TREND"]
xxcids = cids + ["USD"]
check_availability(
df=dfd,
xcats=xcats,
cids=cids,
start_size=(10, 5),
end_size=(10, 8),
xcat_labels={"XR": "Exchange Rate", "CRY": "Commodity"},
)