Source code for macrosynergy.panel.make_blacklist

"""
Module with functions for processing "blacklist" data for cross-sections in a
quantamental DataFrame.
"""

import numpy as np
import pandas as pd
from typing import List
from itertools import groupby
from macrosynergy.management.utils import reduce_df
from macrosynergy.management.simulate import make_qdf_black, make_qdf
from macrosynergy.management.types import QuantamentalDataFrame


def make_blacklist(
    df: QuantamentalDataFrame,
    xcat: str,
    cids: List[str] = None,
    start: str = None,
    end: str = None,
    nan_black: bool = False,
):
    """
    Convert a binary category of a standardized dataframe into a standardized
    dictionary that can serve as a blacklist for cross-sections in further analyses.

    Parameters
    ----------
    df : QuantamentalDataFrame
        standardized DataFrame with the following columns: 'cid', 'xcat', 'real_date'
        and 'value'.
    xcat : str
        category with binary values, where 1 means blacklisting and 0 means not
        blacklisting.
    cids : List[str]
        list of cross-sections that are considered in the formation of the blacklist.
        Per default, all available cross-sections are considered.
    start : str
        earliest date in ISO format. Default is None and the earliest date for which
        the respective category is available is used.
    end : str
        latest date in ISO format. Default is None and the latest date for which the
        respective category is available is used.
    nan_black : bool
        if True, NaNs are blacklisted (converted to ones). Default is False, i.e. NaNs
        are converted to zeroes.

    Returns
    -------
    dict
        standardized dictionary with cross-sections as keys and tuples of start and
        end dates of the blacklist periods as values. The dates are the entries of
        the pivoted `real_date` index, as returned by `startend`. If one cross-section
        has multiple blacklist periods, numbers are added to the keys (i.e. TRY_1,
        TRY_2, etc.).

    Raises
    ------
    TypeError
        if `df` is not a QuantamentalDataFrame.
    ValueError
        if the reduced dataframe has no `value` column, or if the category holds
        non-missing values other than 0 and 1.
    """
    if not isinstance(df, QuantamentalDataFrame):
        raise TypeError("df must be a standardized quantamental dataframe")

    qdf = QuantamentalDataFrame(df)
    dfd = reduce_df(df=qdf, xcats=[xcat], cids=cids, start=start, end=end)

    if "value" not in dfd.columns:
        raise ValueError("`value` column not found in df")

    observed = dfd["value"].dropna().unique()
    if not np.isin(observed, [0, 1]).all():
        raise ValueError("blacklist values must all be 0/1")

    # One column per cross-section, one row per date.
    wide = dfd.pivot(index="real_date", columns="cid", values="value")
    dates = wide.index
    cross_sections = list(wide.columns)

    # Missing observations become 1 (blacklisted) or 0, depending on `nan_black`.
    missing = wide.isna()
    wide[missing] = int(nan_black)

    dates_dict = {}
    for cid in cross_sections:
        flags = wide[cid].to_numpy()
        n_periods = 0  # blacklist periods found so far for this cross-section
        offset = 0  # positional index at which the current run starts

        # groupby yields maximal runs of identical consecutive values.
        for flag, run in groupby(flags):
            run_length = sum(1 for _ in run)
            if flag == 1:
                period = startend(dates, offset, run_length)
                n_periods += 1
                if n_periods == 1:
                    # A single period is stored under the plain cross-section key.
                    dates_dict[cid] = period
                else:
                    if n_periods == 2:
                        # A second period appeared: rename the first key to "<cid>_1".
                        dates_dict[cid + "_1"] = dates_dict.pop(cid)
                    dates_dict[cid + "_" + str(n_periods)] = period
            offset += run_length

    return dates_dict
def startend(dti, start, length):
    """
    Return start and end dates of a sequence as tuple

    Parameters
    ----------
    dti : DateTimeIndex
        datetime series of working days. Only positional integer indexing is used.
    start : int
        index of start
    length : int
        number of sequential days. Must be at least 1.

    Returns
    -------
    Tuple[pd.Timestamp, pd.Timestamp]
        tuple of start and end date, i.e. the entries of `dti` at positions `start`
        and `start + length - 1`.

    Raises
    ------
    ValueError
        if `length` is smaller than 1.
    """
    # A non-positive length would put the end position before the start; for
    # start == 0 it would even wrap around to the end of `dti` via negative indexing.
    if length < 1:
        raise ValueError("length must be a positive integer")

    return dti[start], dti[start + length - 1]
if __name__ == "__main__":
    # Demo: simulate a panel with a binary non-tradability category, then recover
    # the blacklist dictionary from it.

    # Cross-section specifications for the simulated panel.
    cid_specs = {
        "AUD": ["2010-01-01", "2020-12-31", 0, 1],
        "GBP": ["2011-01-01", "2020-11-30", 0, 1],
        "CAD": ["2011-01-01", "2021-11-30", 0, 1],
        "USD": ["2011-01-01", "2020-12-30", 0, 1],
    }
    df_cid1 = pd.DataFrame(
        index=list(cid_specs),
        columns=["earliest", "latest", "mean_add", "sd_mult"],
    )
    for cid, spec in cid_specs.items():
        df_cid1.loc[cid] = spec

    # Category specifications for the simulated return/carry data.
    xcat_specs = {
        "FXXR_NSA": ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3],
        "FXCRY_NSA": ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3],
    }
    df_xcat1 = pd.DataFrame(
        index=list(xcat_specs),
        columns=["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"],
    )
    for xcat, spec in xcat_specs.items():
        df_xcat1.loc[xcat] = spec

    df1 = make_qdf(df_cid1, df_xcat1, back_ar=0.05)

    # Specification of the binary blacklist category.
    df_xcat2 = pd.DataFrame(index=["FXNONTRADE_NSA"], columns=["earliest", "latest"])
    df_xcat2.loc["FXNONTRADE_NSA"] = ["2010-01-01", "2021-11-30"]

    # Blackout periods used to generate the binary category.
    black = {
        "AUD": ("2010-01-12", "2010-06-14"),
        "USD": ("2011-08-17", "2011-09-20"),
        "CAD_1": ("2011-01-04", "2011-01-23"),
        "CAD_2": ("2013-01-09", "2013-04-10"),
        "CAD_3": ("2015-01-12", "2015-03-12"),
        "CAD_4": ("2021-11-01", "2021-11-20"),
    }
    print(black)

    df2 = make_qdf_black(df_cid1, df_xcat2, blackout=black)
    df = pd.concat([df1, df2]).reset_index()

    dates_dict = make_blacklist(
        df, xcat="FXNONTRADE_NSA", cids=None, start=None, end=None
    )

    # The recovered dictionary may differ from `black` by a date or two. Per the
    # original author's note, make_qdf_black() accounts for blackout dates that
    # fall on a weekend: a start or end date on a Saturday or Sunday is shifted to
    # the following Monday, so `dates_dict` need not align exactly with `black`.
    print(dates_dict)