import warnings
import datetime
from typing import (
Dict,
List,
NamedTuple,
Optional,
Union,
)
import pandas as pd
from macrosynergy.management.utils import reduce_df, is_valid_iso_date
from macrosynergy.management.utils.df_utils import standardise_dataframe
class QDFArgs(NamedTuple):
    """
    Contains the QuantamentalDataFrame and associated arguments.

    This is the return type of `validate_and_reduce_qdf`. Being a NamedTuple, it
    can be unpacked positionally in the field order declared below or accessed
    by attribute name.
    """

    # The DataFrame after validation and filtering.
    df: pd.DataFrame
    # Cross-section identifiers associated with `df`.
    cids: List[str]
    # Extended categories associated with `df`.
    xcats: List[str]
    # Metric column names associated with `df`.
    metrics: List[str]
    # Start date as a date string.
    start: str
    # End date as a date string.
    end: str
def validate_and_reduce_qdf(
    df: pd.DataFrame,
    cids: Optional[List[str]] = None,
    xcats: Optional[List[str]] = None,
    metrics: Optional[List[str]] = None,
    intersect: Optional[bool] = False,
    tickers: Optional[List[str]] = None,
    blacklist: Optional[Dict[str, List[str]]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
):
    """
    Validates the inputs to a function that takes a DataFrame as its first argument. The
    DataFrame is then reduced according to the inputs.

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame with the following columns: 'cid', 'xcat', 'real_date', and at least
        one metric from - 'value', 'grading', 'eop_lag', or 'mop_lag'. The input is
        copied; the caller's DataFrame is not modified.
    cids : List[str]
        A list of cids to select from the DataFrame. If None, all cids are selected.
        A single string is treated as a one-element list. The caller's list is never
        modified.
    xcats : List[str]
        A list of xcats to select from the DataFrame. If None, all xcats are selected.
        A single string is treated as a one-element list. The caller's list is never
        modified.
    metrics : List[str]
        A list of metrics to select from the DataFrame. If None, all metrics are
        selected.
    intersect : bool
        if True only retains cids that are available for all xcats. Default is False.
    tickers : List[str]
        A list of tickers that will be selected from the DataFrame if they exist,
        regardless of start, end, blacklist, and intersect arguments.
    blacklist : dict
        cross-sections with date ranges that should be excluded from the data frame. If
        one cross-section has several blacklist periods append numbers to the cross-section
        code.
    start : str
        ISO-8601 formatted date string. Select data from this date onwards. If None, all
        dates are selected.
    end : str
        ISO-8601 formatted date string. Select data up to and including this date. If
        None, all dates are selected.

    Returns
    -------
    QDFArgs
        A NamedTuple that contains the validated arguments.

    Raises
    ------
    ValueError
        If `intersect` is True and no cids or no xcats remain after filtering, or if
        the filtered DataFrame is empty.

    Warns
    -----
    UserWarning
        If `intersect` is False and requested cids/xcats that exist in `df` are absent
        from the filtered result.
    """
    df = df.copy()
    df = standardise_dataframe(df=df)
    metrics = _validate_metrics(df=df, metrics=metrics)

    cids_provided: bool = cids is not None
    xcats_provided: bool = xcats is not None

    # Work on private lists: a bare string becomes a one-element list (otherwise the
    # membership checks below would iterate over its characters), and any provided
    # sequence is copied so that pruning entries later never mutates the caller's list.
    if isinstance(cids, str):
        cids = [cids]
    elif cids is not None:
        cids = list(cids)
    if isinstance(xcats, str):
        xcats = [xcats]
    elif xcats is not None:
        xcats = list(xcats)

    missing_cids: List[str]
    missing_xcats: List[str]
    cids, missing_cids = _set_or_find_missing_in_df(
        df=df, col_name="cid", values=cids, param_name="cids"
    )
    xcats, missing_xcats = _set_or_find_missing_in_df(
        df=df, col_name="xcat", values=xcats, param_name="xcats"
    )
    start, end = _validate_start_and_end_dates(df, start, end)

    # Ticker rows are taken from the unreduced frame, so they bypass the
    # start/end/blacklist/intersect filters applied by `reduce_df` below.
    ticker_df: pd.DataFrame = pd.DataFrame()
    if tickers is not None:
        ticker_df = _get_ticker_df(df=df, tickers=tickers, metrics=metrics)

    r_xcats: List[str]
    r_cids: List[str]
    df, r_xcats, r_cids = reduce_df(
        df=df,
        cids=cids if isinstance(cids, list) else [cids],
        xcats=xcats if isinstance(xcats, list) else [xcats],
        intersect=intersect,
        start=start,
        end=end,
        blacklist=blacklist,
        out_all=True,
    )

    # Appending an empty frame adds no rows, so only concatenate when there is data.
    if not ticker_df.empty:
        df = pd.concat([df, ticker_df], axis=0)
    df = df.drop_duplicates()

    xcats_mismatch: bool = xcats_provided and (
        len(r_xcats) != len(xcats) - len(missing_xcats)
    )
    cids_mismatch: bool = cids_provided and (
        len(r_cids) != len(cids) - len(missing_cids)
    )

    if (xcats_mismatch or cids_mismatch) and not intersect:
        # Entries that exist in `df` (i.e. are not in `missing_*`) but were dropped by
        # the filtering. Input order is preserved so the warning text is deterministic.
        kept_cids = set(r_cids)
        absent_cids = set(missing_cids)
        m_cids: List[str] = [
            c
            for c in dict.fromkeys(cids)
            if c not in kept_cids and c not in absent_cids
        ]
        kept_xcats = set(r_xcats)
        absent_xcats = set(missing_xcats)
        m_xcats: List[str] = [
            x
            for x in dict.fromkeys(xcats)
            if x not in kept_xcats and x not in absent_xcats
        ]
        warnings.warn(
            "The provided arguments resulted in a DataFrame that does not "
            "contain all the requested cids and xcats. "
            + (f"Missing cids: {m_cids}. " if m_cids else "")
            + (f"Missing xcats: {m_xcats}. " if m_xcats else "")
        )
        # Rebuild instead of calling `.remove`, which drops only the first occurrence.
        dropped_cids = set(m_cids)
        dropped_xcats = set(m_xcats)
        cids = [c for c in cids if c not in dropped_cids]
        xcats = [x for x in xcats if x not in dropped_xcats]
    elif intersect:
        if len(r_cids) == 0:
            raise ValueError(
                "The arguments provided resulted in an empty DataFrame when "
                "filtered. There are no intersecting cids."
            )
        if len(r_xcats) == 0:
            raise ValueError(
                "The arguments provided resulted in an empty DataFrame when "
                "filtered. There are no intersecting xcats."
            )
        cids = r_cids
        xcats = r_xcats

    if df.empty:
        raise ValueError(
            "The arguments provided resulted in an "
            "empty DataFrame when filtered (see `reduce_df`)."
        )

    return QDFArgs(df, cids, xcats, metrics, start, end)
def _get_ticker_df(df: pd.DataFrame, tickers: List[str], metrics: Optional[List[str]]):
    """
    Filters a QuantamentalDataFrame by tickers.

    Each ticker is split at its first underscore into a cid and an xcat, and the
    rows of `df` matching that pair are collected. Rows are returned in the order
    in which the tickers are given.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame with at least the columns 'real_date', 'cid' and 'xcat'.
    tickers : List[str]
        a list of tickers, each of the form `<cid>_<xcat>`. A single string is
        treated as a one-element list.
    metrics : List[str]
        a list of metrics. Only 'real_date', 'cid', 'xcat' and these columns are
        returned. If None, all columns of `df` are returned.

    Returns
    -------
    pd.DataFrame
        the filtered DataFrame. An empty DataFrame without columns is returned when
        `tickers` is empty.

    Raises
    ------
    ValueError
        If a ticker does not contain an underscore.
    """
    if isinstance(tickers, str):
        tickers = [tickers]

    if metrics is None:
        columns: List[str] = list(df.columns)
    else:
        columns = ["real_date", "cid", "xcat"] + list(metrics)

    selections: List[pd.DataFrame] = []
    for ticker in tickers:
        # Only the first underscore separates cid from xcat; the xcat may itself
        # contain underscores.
        _cid, separator, _xcat = ticker.partition("_")
        if not separator:
            raise ValueError(
                f"Invalid ticker `{ticker}`: expected the format `<cid>_<xcat>`."
            )
        selections.append(
            df.loc[(df["cid"] == _cid) & (df["xcat"] == _xcat), columns]
        )

    # `pd.concat` rejects an empty list, so handle "no tickers" explicitly rather
    # than seeding the list with an empty frame, which would make the result depend
    # on how concat treats empty entries.
    if not selections:
        return pd.DataFrame()
    return pd.concat(selections, axis=0)
def _validate_start_and_end_dates(df: pd.DataFrame, start: str, end: str):
    """
    Determines start and end dates for a DataFrame.

    A bound that is None is replaced by the corresponding extreme of the
    'real_date' column, formatted as `YYYY-MM-DD`. Both bounds are then checked
    with `is_valid_iso_date`.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to be filtered. Its 'real_date' column is only read when `start`
        or `end` is None.
    start : str
        ISO-8601 formatted date string. If None, the earliest date in the DataFrame is
        used.
    end : str
        ISO-8601 formatted date string. If None, the latest date in the DataFrame is
        used.

    Returns
    -------
    Tuple[str, str]
        Tuple of start and end dates.

    Raises
    ------
    ValueError
        If either bound is rejected by `is_valid_iso_date`.
    """
    iso_format = "%Y-%m-%d"

    if start is None:
        earliest = df["real_date"].min()
        start = pd.Timestamp(earliest).strftime(iso_format)
    if end is None:
        latest = df["real_date"].max()
        end = pd.Timestamp(latest).strftime(iso_format)

    # Validate in a fixed order so an invalid `start` is reported before `end`.
    bounds = {"start": start, "end": end}
    for label, candidate in bounds.items():
        if not is_valid_iso_date(candidate):
            raise ValueError(f"`{label}` must be a valid ISO date string")

    return start, end
def _validate_metrics(df: pd.DataFrame, metrics: List[str]):
    """
    Validates the metrics passed to a function.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    metrics : List[str]
        a list of metrics to be checked. If None, every column of `df` other than
        'real_date', 'cid' and 'xcat' is used, in the DataFrame's column order. A
        single string is treated as a one-element list.

    Returns
    -------
    List[str]
        a list of metrics to be used.

    Raises
    ------
    ValueError
        If `df` lacks 'real_date', 'cid', 'xcat' or any of the requested metrics.
    """
    index_columns: List[str] = ["real_date", "cid", "xcat"]

    if metrics is None:
        # Follow the DataFrame's column order; a set difference would yield an
        # order that can change between interpreter runs.
        metrics = [col for col in df.columns if col not in index_columns]
    elif isinstance(metrics, str):
        # A bare string would otherwise be expanded into single characters.
        metrics = [metrics]

    required_columns: List[str] = index_columns + list(metrics)
    if not set(required_columns).issubset(set(df.columns)):
        raise ValueError(
            f"DataFrame must contain the following columns: {required_columns}"
        )
    return metrics
def _set_or_find_missing_in_df(
    df: pd.DataFrame, col_name: str, values: Optional[List], param_name: str
):
    """
    Resolves the values to use for a column and reports which of them are absent.

    When `values` is None, every unique entry of `df[col_name]` is used and nothing
    is reported as missing. Otherwise `values` is returned unchanged together with
    the entries that `_find_missing_in_df` does not find in `df[col_name]`.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    col_name : str
        name of column in the DataFrame.
    values : List
        list of values to be checked, or None to use all values in the column.
    param_name : str
        name of parameter passed to function; forwarded to `_find_missing_in_df`.

    Returns
    -------
    Tuple[List, List]
        the values to use, and the list of values that are not in the DataFrame.
    """
    if values is not None:
        not_found: List = _find_missing_in_df(
            df=df, col_name=col_name, values=values, param_name=param_name
        )
        return values, not_found

    # Nothing was requested explicitly: fall back to everything in the column.
    every_value: List = df[col_name].unique().tolist()
    return every_value, []
def _find_missing_in_df(df: pd.DataFrame, col_name: str, values: List, param_name: str):
    """
    Finds values in a list that are not in a specific column of a DataFrame.

    A warning naming the missing values is issued when at least one is missing.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    col_name : str
        name of column in the DataFrame.
    values : List
        list of values to be checked.
    param_name : str
        name of parameter passed to function; used in the warning text.

    Returns
    -------
    List
        list of values that are not in the DataFrame, without duplicates and in the
        order in which they first appear in `values`.
    """
    # Build the lookup set once instead of once per membership/difference check.
    present = set(df[col_name].unique())
    # `dict.fromkeys` de-duplicates while keeping input order, so both the returned
    # list and the warning text are deterministic.
    missing: List = [value for value in dict.fromkeys(values) if value not in present]
    if missing:
        warnings.warn(
            f"The following {col_name}(s), passed in `{param_name}`,"
            " are not in the DataFrame `df`: "
            f"{missing}."
        )
    return missing