"""
Module with functions for processing "blacklist" data for cross-sections in a
quantamental DataFrame.
"""
import numpy as np
import pandas as pd
from typing import List
from itertools import groupby
from macrosynergy.management.utils import reduce_df
from macrosynergy.management.simulate import make_qdf_black, make_qdf
from macrosynergy.management.types import QuantamentalDataFrame
def make_blacklist(
    df: QuantamentalDataFrame,
    xcat: str,
    cids: List[str] = None,
    start: str = None,
    end: str = None,
    nan_black: bool = False,
) -> dict:
    """
    Convert a binary category of a standardized dataframe into a standardized
    dictionary that can serve as a blacklist for cross-sections in further analyses.

    Parameters
    ----------
    df : QuantamentalDataFrame
        standardized DataFrame with following columns: 'cid', 'xcat', 'real_date' and
        'value'.
    xcat : str
        category with binary values, where 1 means blacklisting and 0 means not
        blacklisting.
    cids : List[str]
        list of cross-sections that are considered in the formation of the blacklist.
        Per default, all available cross sections are considered.
    start : str
        earliest date in ISO format. Default is None and earliest date for which the
        respective category is available is used.
    end : str
        latest date in ISO format. Default is None and latest date for which the
        respective category is available is used.
    nan_black : bool
        if True NaNs are blacklisted (converted to ones). Default is False, i.e. NaNs
        are converted to zeroes.

    Returns
    -------
    dict
        standardized dictionary with cross-sections as keys and tuples of start and
        end dates of the blacklist periods as values. The dates are the entries of
        the pivoted 'real_date' index and are not converted to strings. If one cross
        section has multiple blacklist periods, numbers are added to the keys
        (i.e. TRY_1, TRY_2, etc.)

    Raises
    ------
    TypeError
        if `df` is not a standardized quantamental dataframe.
    ValueError
        if the reduced dataframe has no 'value' column, or if the category holds
        values other than 0, 1 or NaN.
    """
    if not isinstance(df, QuantamentalDataFrame):
        raise TypeError("df must be a standardized quantamental dataframe")
    df = QuantamentalDataFrame(df)

    dfd = reduce_df(df=df, xcats=[xcat], cids=cids, start=start, end=end)

    if "value" not in dfd.columns:
        raise ValueError("`value` column not found in df")

    if not np.isin(dfd["value"].dropna().unique(), [0, 1]).all():
        raise ValueError("blacklist values must all be 0/1")

    # One column per cross-section, one row per date.
    df_pivot = dfd.pivot(index="real_date", columns="cid", values="value")
    dates = df_pivot.index

    # Missing observations count as blacklisted (1) only if `nan_black` is set.
    df_pivot = df_pivot.fillna(int(nan_black))

    dates_dict = {}
    for cid in df_pivot.columns:
        # Collect every run of consecutive ones for this cross-section first, so the
        # key naming below can depend on how many periods were found.
        periods = []
        si = 0  # positional index of the first row of the current run
        for flag, run in groupby(df_pivot[cid].to_numpy()):
            length = sum(1 for _ in run)
            if flag == 1:  # blacklist period
                periods.append(startend(dates, si, length))
            si += length

        if len(periods) == 1:
            # A single period keeps the plain cross-section name as key.
            dates_dict[cid] = periods[0]
        else:
            # Several periods are numbered from 1: <cid>_1, <cid>_2, ...
            for number, period in enumerate(periods, start=1):
                dates_dict[f"{cid}_{number}"] = period

    return dates_dict
def startend(dti: pd.DatetimeIndex, start: int, length: int) -> tuple:
    """
    Return the first and last dates of a run of consecutive entries as a tuple.

    Parameters
    ----------
    dti : DateTimeIndex
        datetime index of working days.
    start : int
        positional index of the first entry of the run.
    length : int
        number of sequential entries in the run; must be at least 1.

    Returns
    -------
    Tuple[pd.Timestamp, pd.Timestamp]
        tuple of start and end date.

    Raises
    ------
    ValueError
        if `length` is smaller than 1.
    """
    if length < 1:
        # An empty run has no end date: `start + length - 1` would point before
        # `start`, or wrap around to the end of the index via a negative position.
        raise ValueError("length must be a positive integer")

    return dti[start], dti[start + length - 1]
if __name__ == "__main__":
    # Demo: simulate a panel with a binary "non-tradable" category and recover the
    # blackout periods from it.
    cids = ["AUD", "GBP", "CAD", "USD"]

    # Availability window and simulation parameters per cross-section.
    cols = ["earliest", "latest", "mean_add", "sd_mult"]
    df_cid1 = pd.DataFrame(index=cids, columns=cols)
    df_cid1.loc["AUD"] = ["2010-01-01", "2020-12-31", 0, 1]
    df_cid1.loc["GBP"] = ["2011-01-01", "2020-11-30", 0, 1]
    df_cid1.loc["CAD"] = ["2011-01-01", "2021-11-30", 0, 1]
    df_cid1.loc["USD"] = ["2011-01-01", "2020-12-30", 0, 1]

    # Availability window and simulation parameters per category.
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]
    df_xcat1 = pd.DataFrame(index=["FXXR_NSA", "FXCRY_NSA"], columns=cols)
    df_xcat1.loc["FXXR_NSA"] = ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3]
    df_xcat1.loc["FXCRY_NSA"] = ["2010-01-01", "2020-12-31", 0, 1, 0, 0.3]
    df1 = make_qdf(df_cid1, df_xcat1, back_ar=0.05)

    df_xcat2 = pd.DataFrame(index=["FXNONTRADE_NSA"], columns=["earliest", "latest"])
    df_xcat2.loc["FXNONTRADE_NSA"] = ["2010-01-01", "2021-11-30"]

    # Blackout periods used to build the binary category.
    black = {
        "AUD": ("2010-01-12", "2010-06-14"),
        "USD": ("2011-08-17", "2011-09-20"),
        "CAD_1": ("2011-01-04", "2011-01-23"),
        "CAD_2": ("2013-01-09", "2013-04-10"),
        "CAD_3": ("2015-01-12", "2015-03-12"),
        "CAD_4": ("2021-11-01", "2021-11-20"),
    }
    print(black)

    df2 = make_qdf_black(df_cid1, df_xcat2, blackout=black)

    # drop=True discards the old row labels instead of turning them into a spurious
    # "index" column of the combined dataframe.
    df = pd.concat([df1, df2]).reset_index(drop=True)

    dates_dict = make_blacklist(
        df, xcat="FXNONTRADE_NSA", cids=None, start=None, end=None
    )

    # If the dictionary printed below differs from the one defined above, it should
    # be by a date or two only: when building the dataframe, make_qdf_black()
    # accounts for dictionary dates that fall on weekends. If the start or end of a
    # blackout period is a Saturday or Sunday, the date is shifted to the following
    # Monday, which breaks the exact alignment between "black" and "dates_dict".
    print(dates_dict)