"""
Implementation of linear_composite() function as a module.
"""
import numpy as np
import pandas as pd
from typing import List, Dict, Union, Optional, Tuple, Type, Set
import warnings
from packaging import version
from macrosynergy.management.utils import reduce_df, is_valid_iso_date
from macrosynergy.management.simulate import make_test_df
from macrosynergy.management.utils.core import _map_to_business_day_frequency
from macrosynergy.management.utils.df_utils import get_sops
from macrosynergy.management.types import QuantamentalDataFrame
# Container types that are accepted wherever a list-like argument (cids, xcats,
# weights, signs) is validated in this module.
listtypes: Tuple[Type, ...] = (list, np.ndarray, pd.Series, tuple)
# Keyword arguments splatted into `.stack(...)` calls further down: the
# `future_stack=True` form is chosen for pandas versions strictly newer than
# 2.1.0, the legacy `dropna=False` form otherwise.
# NOTE(review): both spellings are presumably meant to keep NaN entries when
# stacking -- confirm against the pandas `DataFrame.stack` documentation.
PD_FUTURE_STACK = (
    dict(future_stack=True)
    if version.parse(pd.__version__) > version.parse("2.1.0")
    else dict(dropna=False)
)
def linear_composite(
    df: pd.DataFrame,
    xcats: Union[str, List[str]],
    cids: Optional[List[str]] = None,
    weights: Optional[Union[List[float], str]] = None,
    normalize_weights: bool = True,
    signs: Optional[List[float]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    blacklist: Optional[Dict[str, List[str]]] = None,
    complete_xcats: bool = False,
    complete_cids: bool = False,
    new_xcat: str = "NEW",
    new_cid: str = "GLB",
    weight_lag: int = 0,
    rebal_freq: str = "D",
    thresh: Optional[float] = None,
):
    """
    Weighted linear combinations of cross sections or categories

    Parameters
    ----------
    df : ~pandas.DataFrame
        standardized JPMaQS DataFrame with the necessary columns: 'cid', 'xcat',
        'real_date' and 'value'.
    xcats : Union[str, List[str]]
        One or more categories to be combined. If a single category is given the linear
        combination is calculated across cross-sections. This results in a single series
        to which a new cross-sectional identifier is assigned. If more than one category
        string is given the output will be a new category, i.e. a panel that is a linear
        combination of the categories specified.
    cids : List[str]
        cross-sections for which the linear combinations are calculated. Default is all
        cross-sections available.
    weights : Union[List[float], str]
        This specifies how categories or cross sections are combined. There are three
        principal options. The first (default) is None, in which case equal weights are
        given to all categories or cross sections that are available. The second case is a
        set of fixed coefficients, in which case these very coefficients are applied to all
        available categories or cross sections. Per default the coefficients are normalized
        so that they add up to one for each period. This can be changed with the argument
        `normalize_weights`. The third case is the assignment of a weighting category. This
        only applies to combinations of cross sections. In this case the weighting category
        is multiplied for each period with the corresponding value of main category of the
        same cross section. Per default the weight category values are normalized so that
        they add up to one for each period. This can be changed with the argument
        `normalize_weights`.
    normalize_weights : bool
        If True (default) the weights are normalized to sum to 1. If False the weights
        are used as specified.
    signs : List[float]
        An array consisting of +1s or -1s, with one entry per element being combined
        (per category when several categories are given, per cross-section when a
        single category is given). Values other than +1/-1 are coerced to their sign
        with a warning. Default is None and all signs are set to +1.
    start : str
        earliest date in ISO format. Default is None and earliest date for which the
        respective category is available is used.
    end : str
        latest date in ISO format. Default is None and latest date for which the
        respective category is available is used.
    blacklist : Dict[str, List[str]]
        Forwarded unchanged to `reduce_df` when the input is filtered. Must be a
        dictionary if given. Default is None.
    complete_xcats : bool
        If True combinations are only calculated for observation dates on
        which all categories are available. If False (default) a combination of the
        available categories is used. Not relevant when aggregating over
        cross-sections, i.e. when a single category is given in `xcats`.
    complete_cids : bool
        If True combinations are only calculated for observation dates on
        which all cross-sections are available. If False (default) a combination of
        the available cross-sections is used. Not relevant when aggregating over
        categories, i.e. when multiple categories are given in `xcats`.
    new_xcat : str
        Name of new composite category when aggregating over categories for a given
        cross-section. Default is "NEW".
    new_cid : str
        Name of new composite cross-section when aggregating over cross-sections for a
        given category. Default is "GLB".
    weight_lag : int
        Number of business days to lag the weight series. Only applicable when
        `weights` is a category string. Default is 0 (no lag).
        A lag of N means that weights from N business days ago are used for each
        date. Not applicable to fixed weights (list).
    rebal_freq : str
        Rebalancing frequency for weights. Must be one of "D" (daily), "W" (weekly),
        "M" (monthly), or "Q" (quarterly). Default is "D" (daily rebalancing).
        When set to a coarser frequency, weights are only updated at the start of
        each period and held constant between rebalancing dates.
    thresh : float, optional
        Winsorization threshold for contributing series. If set, caps positive values
        at `thresh` and floors negative values at `-thresh` for all input series before
        calculating the composite. This reduces the impact of outliers. Default is None
        (no winsorization).

    Returns
    -------
    ~pandas.DataFrame
        standardized DataFrame with the composite values, with the columns:
        'cid', 'xcat', 'real_date' and 'value'.

    Raises
    ------
    ValueError
        If no cross-section is left after filtering, or if `complete_cids` is set
        and some requested cross-sections are lost when aggregating over
        cross-sections. Validation errors raised by `_check_args` propagate as well.
    """
    # Validate everything up front; `_check_args` returns normalised copies of the
    # arguments plus the aggregation mode that was derived from them.
    (
        df,
        xcats,
        cids,
        weights,
        normalize_weights,
        signs,
        start,
        end,
        blacklist,
        complete_xcats,
        complete_cids,
        new_xcat,
        new_cid,
        weight_lag,
        rebal_freq,
        thresh,
        _xcat_agg,
        mode,
    ) = _check_args(
        df=df,
        xcats=xcats,
        cids=cids,
        weights=weights,
        normalize_weights=normalize_weights,
        signs=signs,
        start=start,
        end=end,
        blacklist=blacklist,
        complete_xcats=complete_xcats,
        complete_cids=complete_cids,
        new_xcat=new_xcat,
        new_cid=new_cid,
        weight_lag=weight_lag,
        rebal_freq=rebal_freq,
        thresh=thresh,
    )

    # A weighting category has to survive the filtering too, so it is requested
    # alongside the categories that are being combined.
    _xcats: List[str] = xcats + ([weights] if isinstance(weights, str) else [])

    remaining_xcats: List[str]
    remaining_cids: List[str]
    # NOTE: the "remaining_*" variables will not be in the same order as the input
    # cids/xcats.
    # Do not use these for index based lookups/operations.
    df, remaining_xcats, remaining_cids = reduce_df(
        df=df,
        xcats=_xcats,
        cids=cids,
        start=start,
        end=end,
        blacklist=blacklist,
        intersect=False,
        out_all=True,
    )
    df = QuantamentalDataFrame(df)
    # Remember the input flavour so that the result can be returned the same way.
    result_as_categorical = df.InitializedAsCategorical

    # Fail when nothing is left at all, or when a cross-section aggregation was
    # asked to be complete but some cross-sections were filtered out.
    lost_required_cids: bool = (
        len(remaining_cids) < len(cids) and not _xcat_agg and complete_cids
    )
    if lost_required_cids or len(remaining_cids) == 0:
        missing_cids_xcats_str = _missing_cids_xcats_str(df=df, cids=cids, xcats=xcats)
        raise ValueError(
            "Not all `cids` have complete `xcat` data required for the calculation.\n"
            f"{missing_cids_xcats_str}"
        )

    if _xcat_agg:
        # Combine categories per cross-section; absent series are padded with NaNs
        # first so that every cross-section has every category column.
        df = _populate_missing_xcat_series(df)
        result_df: QuantamentalDataFrame = linear_composite_xcat_agg(
            df=df,
            xcats=xcats,
            weights=weights,
            signs=signs,
            normalize_weights=normalize_weights,
            complete_xcats=complete_xcats,
            new_xcat=new_xcat,
            thresh=thresh,
        )
    else:  # mode == "cid_agg" -- single xcat
        # Cross-sections lacking data are dropped, together with their weights
        # and signs, before aggregating across cross-sections.
        df, cids, _xcat, weights, signs = _check_df_for_missing_cid_data(
            df=df, cids=cids, weights=weights, signs=signs
        )
        result_df: QuantamentalDataFrame = linear_composite_cid_agg(
            df=df,
            xcat=_xcat,
            cids=cids,
            weights=weights,
            signs=signs,
            normalize_weights=normalize_weights,
            complete_cids=complete_cids,
            new_cid=new_cid,
            weight_lag=weight_lag,
            rebal_freq=rebal_freq,
            thresh=thresh,
        )

    return QuantamentalDataFrame(result_df, categorical=result_as_categorical)
def _missing_cids_xcats_str(
df: QuantamentalDataFrame,
cids: List[str],
xcats: List[str],
) -> str:
output_strs: List[str] = []
found_cids = df["cid"].unique().tolist()
found_xcats = df["xcat"].unique().tolist()
if set(cids) != set(found_cids):
missing_cids = list(set(cids) - set(found_cids))
else:
missing_cids = []
if set(xcats) != set(found_xcats):
missing_xcats = list(set(xcats) - set(found_xcats))
else:
missing_xcats = []
xcat_dict: Dict[str, str] = {}
for xc in sorted(xcats):
miss_cids = list(
set(cids) - set(df.loc[df["xcat"] == xc, "cid"].unique().tolist())
)
if miss_cids:
xcat_dict[xc] = miss_cids
if missing_cids:
output_strs.append(f"Missing cids: {missing_cids}")
if missing_xcats:
output_strs.append(f"Missing xcats: {missing_xcats}")
if xcat_dict:
output_strs.append(
"The following `cids` are missing for the respective `xcats`:"
)
longest_xc = max([len(xc) for xc in xcat_dict.keys()])
for _xc, _cids in xcat_dict.items():
msg = f"{_xc}: " + " " * (longest_xc - len(_xc)) + " " + str(sorted(_cids))
output_strs.append(msg)
return "\n".join(output_strs)
def _linear_composite_basic(
data_df: pd.DataFrame,
weights_df: pd.DataFrame,
normalize_weights: bool = True,
complete: bool = False,
mode: str = "xcat_agg",
):
"""Main calculation function for linear_composite()"""
# Create a boolean mask to help us work out the calcs
nan_mask: pd.DataFrame = data_df.isna() | weights_df.isna()
# Normalize weights (if requested)
if normalize_weights:
adj_weights_wide = weights_df[~nan_mask].div(
weights_df[~nan_mask].abs().sum(axis=1), axis=0
)
adj_weights_wide[nan_mask] = np.nan
assert np.allclose(
adj_weights_wide[~adj_weights_wide.isna().all(axis=1)].abs().sum(axis=1), 1
), "Weights do not sum to 1. Normalization failed."
weights_df = adj_weights_wide.copy()
# Multiply the weights by the target data
out_df = data_df * weights_df
# Sum across the columns
out_df = out_df.sum(axis="columns")
# NOTE: Using `axis` with strings, to make it more readable
# Remove periods with missing data (if requested) (rows with any NaNs)
if complete:
out_df[nan_mask.any(axis="columns")] = np.nan
# put NaNs back in, as sum() removes them
out_df[nan_mask.all(axis="columns")] = np.nan
# Reset index, rename columns and return
out_df = out_df.reset_index().rename(columns={0: "value"})
# TODO: out_df from cid_agg and xcat_agg are not in the same format...
return out_df
def _apply_weight_lag(
weights_df: pd.DataFrame,
weight_lag: int,
) -> pd.DataFrame:
"""
Apply weight lag by shifting values forward.
Parameters
----------
weights_df : pd.DataFrame
DataFrame with weights indexed by date
weight_lag : int
Number of periods to lag the weights
Returns
-------
pd.DataFrame
DataFrame with lagged weights
"""
if weight_lag == 0:
return weights_df
# Shift values forward (older weights apply to later dates)
return weights_df.shift(weight_lag)
def _apply_rebal_freq(
weights_df: pd.DataFrame,
rebal_freq: str,
) -> pd.DataFrame:
"""
Limit weight updates to rebalancing dates and forward-fill between periods.
Parameters
----------
weights_df : pd.DataFrame
DataFrame with weights indexed by date
rebal_freq : str
Rebalancing frequency: "D", "W", "M", or "Q"
Returns
-------
pd.DataFrame
DataFrame with weights only updated at rebalancing dates
"""
if rebal_freq.upper() == "D":
return weights_df # Daily rebalancing - no change
# Get start-of-period dates for rebalancing
all_dates = weights_df.index
rebal_dates_series = get_sops(dates=all_dates, freq=rebal_freq)
rebal_dates = pd.DatetimeIndex(rebal_dates_series)
# Find rebalancing dates that exist in weights
rebal_dates_in_weights = weights_df.index.intersection(rebal_dates)
if len(rebal_dates_in_weights) == 0:
raise ValueError(
f"No rebalancing dates found for frequency '{rebal_freq}'. "
"Consider using a coarser frequency or check date range."
)
# Keep only rebalancing dates, then reindex and forward-fill
weights_rebal = weights_df.loc[rebal_dates_in_weights].copy()
weights_rebal = weights_rebal.reindex(all_dates)
weights_rebal = weights_rebal.ffill()
return weights_rebal
def linear_composite_cid_agg(
    df: QuantamentalDataFrame,
    xcat: str,
    cids: List[str],
    weights: Union[str, List[float]],
    signs: List[float],
    normalize_weights: bool = True,
    complete_cids: bool = True,
    new_cid="GLB",
    weight_lag: int = 0,
    rebal_freq: str = "D",
    thresh: Optional[float] = None,
):
    """
    Linear composite of various cids for a given category across all periods.

    Parameters
    ----------
    df : QuantamentalDataFrame
        data holding the category `xcat` and, if `weights` is a string, the
        weighting category as well.
    xcat : str
        category that is aggregated across cross-sections.
    cids : List[str]
        cross-sections to combine; `signs` (and list `weights`) are aligned with it.
    weights : Union[str, List[float]]
        either the name of a weighting category in `df` or one fixed weight per
        cross-section.
    signs : List[float]
        one sign per cross-section, multiplied into the weights.
    normalize_weights : bool
        passed on to the core calculation. Default is True.
    complete_cids : bool
        if True, dates on which any cross-section is unusable yield NaN.
    new_cid : str
        cross-section identifier given to the composite. Default is "GLB".
    weight_lag : int
        number of rows by which category weights are lagged; only applied when
        `weights` is a category name. Default is 0.
    rebal_freq : str
        rebalancing frequency of the weights. Default is "D".
    thresh : float, optional
        if set, values of `xcat` are clipped to [-thresh, thresh] before
        aggregation.

    Returns
    -------
    ~pandas.DataFrame
        composite series with the columns 'real_date', 'value', 'cid' and 'xcat'
        (a QuantamentalDataFrame built from the series when `df` is categorical).
    """
    if isinstance(weights, str):
        # Weights come from a category: one column per cross-section, signed.
        weights_df: pd.DataFrame = df[(df["xcat"] == weights)]
        weights_df = weights_df.set_index(["real_date", "cid"])["value"].unstack(
            level=1
        )
        weights_df = weights_df[cids].mul(signs, axis=1)

        # Apply weight lag for dynamic weights
        if weight_lag > 0:
            weights_df = _apply_weight_lag(weights_df, weight_lag)
    else:
        # Fixed weights: the same signed weight row is used for every date.
        weights_series: pd.Series = pd.Series(
            np.array(weights) * np.array(signs),
            index=cids,
        )
        weights_df = pd.DataFrame(
            data=[weights_series.sort_index()],
            index=pd.to_datetime(df["real_date"].unique().tolist()),
            columns=df["cid"].unique(),
        )
        weights_df.index.names = ["real_date"]
        weights_df.columns.names = ["cid"]

    # Apply rebalancing frequency (applies to both fixed and dynamic)
    if rebal_freq.upper() != "D":
        weights_df = _apply_rebal_freq(weights_df, rebal_freq)

    # create the data_df
    data_df: pd.DataFrame = (
        df[(df["xcat"] == xcat)]
        .set_index(["real_date", "cid"])["value"]
        .unstack(level=1)
    )

    # Apply winsorization if thresh is specified
    if thresh is not None:
        data_df = data_df.clip(lower=-thresh, upper=thresh)

    # aligning the index of weights_df to the data one
    # so that we have the same set of dates and same set of CIDs -- thank you
    # @mikiinterfiore
    weights_df = (
        weights_df.stack(**PD_FUTURE_STACK)
        .reindex(data_df.stack(**PD_FUTURE_STACK).index)
        .unstack(1)
    )

    # assert that data_df and weights_df have the same shape, index and columns
    assert (
        (data_df.shape == weights_df.shape)
        and (data_df.index.equals(weights_df.index))
        and (data_df.columns.equals(weights_df.columns))
    ), (
        "Unexpected shape of `data_df` and `weights_df`. "
        "Unable to shape data for calculation."
    )

    # Calculate the linear combination
    out_df: pd.DataFrame = _linear_composite_basic(
        data_df=data_df,
        weights_df=weights_df,
        normalize_weights=normalize_weights,
        complete=complete_cids,
        mode="cid_agg",
    )

    # Label the composite with the new cross-section and the aggregated category.
    if df.is_categorical():
        out_df = QuantamentalDataFrame.from_timeseries(
            out_df.set_index("real_date")["value"], ticker=f"{new_cid}_{xcat}"
        )
    else:
        out_df["cid"] = new_cid
        out_df["xcat"] = xcat

    return out_df
def linear_composite_xcat_agg(
    df: QuantamentalDataFrame,
    xcats: List[str],
    weights: List[float],
    signs: List[float],
    normalize_weights: bool = True,
    complete_xcats: bool = True,
    new_xcat="NEW",
    thresh: Optional[float] = None,
):
    """
    Linear composite of various xcats across all cids and periods

    Parameters
    ----------
    df : QuantamentalDataFrame
        data holding the categories to combine.
    xcats : List[str]
        categories to combine; `weights` and `signs` are aligned with it.
    weights : List[float]
        one fixed weight per category.
    signs : List[float]
        one sign per category, multiplied into the weights.
    normalize_weights : bool
        passed on to the core calculation. Default is True.
    complete_xcats : bool
        if True, rows on which any category is unusable yield NaN.
    new_xcat : str
        category name given to the composite. Default is "NEW".
    thresh : float, optional
        if set, all values are clipped to [-thresh, thresh] before aggregation.

    Returns
    -------
    ~pandas.DataFrame
        composite panel with the columns 'cid', 'real_date', 'value' and 'xcat'
        (wrapped in a QuantamentalDataFrame when `df` is categorical).
    """
    # Create a weights series with the xcats as index
    weights_series: pd.Series = pd.Series(
        np.array(weights) * np.array(signs), index=xcats
    )

    # Create wide dataframes for the data and weights:
    # one row per (cid, real_date), one column per category.
    data_df = df.set_index(["cid", "real_date", "xcat"])["value"].unstack(level=2)

    # Apply winsorization if thresh is specified
    if thresh is not None:
        data_df = data_df.clip(lower=-thresh, upper=thresh)

    # The same signed weight row is used for every (cid, real_date) pair.
    weights_df = pd.DataFrame(
        data=[weights_series.sort_index()],
        index=data_df.index,
        columns=data_df.columns,
    )

    # Calculate the linear combination
    out_df: pd.DataFrame = _linear_composite_basic(
        data_df=data_df,
        weights_df=weights_df,
        normalize_weights=normalize_weights,
        complete=complete_xcats,
        mode="xcat_agg",
    )

    if df.is_categorical():
        # add a new column called xcat with the new_xcat value
        out_df["xcat"] = pd.Categorical.from_codes(
            codes=[0] * len(out_df), categories=[new_xcat]
        )
        out_df = QuantamentalDataFrame(out_df)
    else:
        out_df["xcat"] = new_xcat

    return out_df
def _populate_missing_xcat_series(
    df: QuantamentalDataFrame,
) -> QuantamentalDataFrame:
    """
    Pad `df` so that every cross-section carries every category found in it.

    For each cross-section that lacks one or more of the categories present in
    `df`, a warning is issued and an all-NaN series is added for each lacking
    category, covering the dates found in `df`.
    """
    all_cids: List[str] = df["cid"].unique().tolist()
    all_xcats: Set[str] = set(df["xcat"].unique().tolist())
    dates: pd.DatetimeIndex = pd.to_datetime(df["real_date"].unique())
    warning_template: str = (
        "{cidx} does not have complete xcat data for {missing_xcats}."
        " These will be filled with NaNs for the calculation."
    )

    for cidx in all_cids:
        xcats_of_cid = set(df.loc[df["cid"] == cidx, "xcat"].unique())
        absent = list(all_xcats - xcats_of_cid)
        if not absent:
            continue

        warnings.warn(warning_template.format(cidx=cidx, missing_xcats=absent))
        for xc in absent:
            if not df.is_categorical():
                # Plain string columns: append a NaN-valued block of rows.
                nan_block = pd.DataFrame(
                    data={
                        "cid": cidx,
                        "xcat": xc,
                        "real_date": dates,
                        "value": np.nan,
                    }
                )
                df = pd.concat([df, nan_block])
            else:
                # Categorical frame: use its own method for adding the series.
                df.add_nan_series(
                    ticker=f"{cidx}_{xc}",
                    start=dates.min(),
                    end=dates.max(),
                )

    return df
def _check_df_for_missing_cid_data(
    df: QuantamentalDataFrame,
    cids: List[str],
    weights: Union[str, List[float]],
    signs: List[float],
) -> Tuple[
    QuantamentalDataFrame, List[str], str, Union[str, List[float], None], List[float]
]:
    """
    Drop cross-sections that cannot take part in a cross-section aggregation.

    A cross-section is dropped (with a warning) when it has no rows in `df` at
    all, or when it lacks any of the categories present in `df`. Its entries in
    `signs` and, for list weights, in `weights` are dropped along with it, so the
    returned lists stay aligned position by position with the returned
    cross-sections. The input lists are not modified; new lists are returned.

    Parameters
    ----------
    df : QuantamentalDataFrame
        data holding the category to aggregate and, if applicable, the weighting
        category.
    cids : List[str]
        requested cross-sections; `signs` and list `weights` are aligned with it.
    weights : Union[str, List[float]]
        name of the weighting category or one fixed weight per cross-section.
    signs : List[float]
        one sign per cross-section.

    Returns
    -------
    Tuple
        `(df, cids, xcat, weights, signs)`: the data without the dropped
        cross-sections, the retained cross-sections in their requested order, the
        category to aggregate, and the weights and signs of the retained
        cross-sections.

    Raises
    ------
    ValueError
        If the weighting category is not in `df`, if `df` holds no category other
        than the weighting category, or if no cross-section is retained.
    """
    cids_in_df: Set[str] = set(df["cid"].unique().tolist())
    found_xcats: List[str] = df["xcat"].unique().tolist()
    found_xcats_set: Set[str] = set(found_xcats)
    wrn_msg: str = (
        "`cid` {cidx} does not have complete `xcat` data for {missing_xcats}."
        " These will be dropped from the calculation."
    )

    if isinstance(weights, str):
        if weights not in found_xcats:
            raise ValueError(
                f"Weight category {weights} not found in `df`. "
                f"Available categories are {found_xcats}."
            )
        if len(found_xcats_set - {weights}) == 0:
            raise ValueError(
                "None of the `xcats` are present in `df` other than the `weights`. "
                f"Available categories are {found_xcats}."
            )

    has_fixed_weights: bool = isinstance(weights, list)
    kept_cids: List[str] = []
    kept_signs: List[float] = []
    kept_weights: List[float] = []

    # Single pass over the requested cross-sections. Reading `signs`/`weights` at
    # the position of the cross-section in `cids` (instead of popping entries one
    # by one) keeps the alignment intact no matter how many are dropped.
    for pos, cidx in enumerate(cids):
        if cidx not in cids_in_df:
            warnings.warn(f"cid {cidx} not found in `df`. It will be ignored.")
            continue

        missing_xcats = list(
            found_xcats_set - set(df.loc[df["cid"] == cidx, "xcat"].unique())
        )
        if missing_xcats:
            # drop from df
            df = df.loc[df["cid"] != cidx, :]
            warnings.warn(wrn_msg.format(cidx=cidx, missing_xcats=missing_xcats))
            continue

        kept_cids.append(cidx)
        kept_signs.append(signs[pos])
        if has_fixed_weights:
            kept_weights.append(weights[pos])

    if len(kept_cids) == 0:
        raise ValueError(
            "No `cids` have complete `xcat` data required for the calculation."
        )

    # The category to aggregate is whatever is left besides the weighting category.
    _xcat: str = list(set(found_xcats) - {weights if isinstance(weights, str) else ""})[
        0
    ]

    out_weights = kept_weights if has_fixed_weights else weights
    return QuantamentalDataFrame(df), kept_cids, _xcat, out_weights, kept_signs
def _drop_entries_for_missing_labels(
    values,
    labels: List[str],
    missing: List[str],
    arg_name: str,
    label_name: str,
) -> list:
    """
    Return the entries of `values` whose positionally matching label is not missing.

    Raises a ValueError when `values` and `labels` differ in length, because the
    entries could then not be attributed to labels reliably.
    """
    values = list(values)
    if len(values) != len(labels):
        raise ValueError(
            f"`{arg_name}` must be a list of floats of the same length as "
            f"`{label_name}`."
        )
    return [val for val, label in zip(values, labels) if label not in missing]


def _check_args(
    df: QuantamentalDataFrame,
    xcats: Union[str, List[str]],
    cids: Optional[List[str]] = None,
    weights: Optional[Union[List[float], str]] = None,
    normalize_weights: bool = True,
    signs: Optional[List[float]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    blacklist: Optional[Dict[str, List[str]]] = None,
    complete_xcats: bool = False,
    complete_cids: bool = False,
    new_xcat="NEW",
    new_cid="GLB",
    weight_lag: int = 0,
    rebal_freq: str = "D",
    thresh: Optional[float] = None,
):
    """
    Check the arguments of linear_composite()

    Besides validating types and values, the arguments are normalised:
    `start`/`end` default to the first/last date in `df`; `xcats` and `cids`
    become lists restricted to what is available in `df` (with a warning, unless
    the respective `complete_*` flag asks for an error); `weights` defaults to
    equal weights; `signs` defaults to +1s and is coerced to +1/-1.

    The aggregation mode is derived from the categories as requested: more than
    one category, or a non-default `new_xcat`, means aggregation over categories.
    `weights` and `signs` are aligned with `xcats` in that mode and with `cids`
    otherwise, and entries are only dropped along the dimension they are aligned
    with.

    Returns
    -------
    tuple
        The normalised arguments in signature order, followed by the boolean
        `_xcat_agg` and the mode string ("xcat_agg" or "cid_agg").

    Raises
    ------
    TypeError
        If an argument has the wrong type.
    ValueError
        If an argument has an invalid value, or requested categories or
        cross-sections are unavailable and must not be skipped.
    """
    # df check
    if (
        (not isinstance(df, QuantamentalDataFrame))
        or ("value" not in df.columns)
        or (df["value"].isna().all())
    ):
        raise TypeError("`df` must be a standardized Quantamental DataFrame.")

    if start is None:
        start = df["real_date"].min().strftime("%Y-%m-%d")
    if end is None:
        end = df["real_date"].max().strftime("%Y-%m-%d")

    # dates check
    for varx, namex in zip([start, end], ["start", "end"]):
        if not (isinstance(varx, str) and is_valid_iso_date(varx)):
            raise ValueError(f"`{namex}` must be a valid ISO date string.")

    xcats_in_df = set(df["xcat"].values)
    cids_in_df = set(df["cid"].values)

    # check xcats
    if xcats is None:
        xcats = list(xcats_in_df)
    elif isinstance(xcats, str):
        xcats = [xcats]
    elif isinstance(xcats, listtypes):
        xcats = list(xcats)
    else:
        raise TypeError("`xcats` must be a string or list of strings.")

    # The mode follows from what was requested. Deciding it only after
    # unavailable categories have been dropped could turn a request for a
    # category composite into a cross-section aggregation.
    _xcat_agg: bool = len(xcats) > 1 or new_xcat != "NEW"
    mode: str = "xcat_agg" if _xcat_agg else "cid_agg"
    aligned_with: str = "xcats" if _xcat_agg else "cids"

    missing_xcats = list(dict.fromkeys(x for x in xcats if x not in xcats_in_df))
    if missing_xcats:
        if complete_xcats:
            raise ValueError("Not all `xcats` are available in `df`.")
        available_xcats = [xc for xc in xcats if xc not in missing_xcats]
        if not available_xcats:
            raise ValueError("None of the requested `xcats` are available in `df`.")
        warnings.warn(
            f"Not all `xcats` are available in `df`: {missing_xcats} "
            "The calculation will be performed with the available xcats."
        )
        # More than one category was requested here, so list-like `signs` and
        # `weights` are aligned with `xcats`.
        if isinstance(signs, listtypes):
            signs = _drop_entries_for_missing_labels(
                signs, xcats, missing_xcats, "signs", "xcats"
            )
        if isinstance(weights, listtypes):
            weights = _drop_entries_for_missing_labels(
                weights, xcats, missing_xcats, "weights", "xcats"
            )
        xcats = available_xcats

    # check cids
    if cids is None:
        cids = list(cids_in_df)
    elif isinstance(cids, str):
        cids = [cids]
    elif isinstance(cids, listtypes):
        cids = list(cids)
    else:
        raise TypeError("`cids` must be a string or list of strings.")

    # check cids in df
    missing_cids = list(dict.fromkeys(c for c in cids if c not in cids_in_df))
    if missing_cids:
        if complete_cids:
            raise ValueError("Not all `cids` are available in `df`.")
        warnings.warn(
            f"Not all `cids` are available in `df`: {missing_cids} "
            "The calculation will be performed with the available cids."
        )
        # Only in cross-section aggregation are `signs`/`weights` aligned with
        # `cids`; in category aggregation they must be left untouched.
        if not _xcat_agg:
            if isinstance(signs, listtypes):
                signs = _drop_entries_for_missing_labels(
                    signs, cids, missing_cids, "signs", "cids"
                )
            if isinstance(weights, listtypes):
                weights = _drop_entries_for_missing_labels(
                    weights, cids, missing_cids, "weights", "cids"
                )
        cids = [cid for cid in cids if cid not in missing_cids]

    if _xcat_agg and isinstance(weights, str):
        raise ValueError(
            "When aggregating over xcats, `weights` "
            "must be a list of floats or integers."
        )

    # check weights
    expc_weights_len: int = len(xcats) if _xcat_agg else len(cids)
    # numpy scalars are what list(ndarray) / list(Series) produce
    numeric_types = (float, int, np.integer, np.floating)
    if weights is None:
        weights = list(np.ones(expc_weights_len) / expc_weights_len)
    elif isinstance(weights, listtypes):
        weights = list(weights)
        if not all(isinstance(x, numeric_types) for x in weights):
            raise TypeError("`weights` must be a list of floats or integers.")
        if len(weights) != expc_weights_len:
            raise ValueError(
                "`weights` must be a list of floats of the same length as "
                f"`{aligned_with}`."
            )
        if any(x == 0.0 for x in weights):
            raise ValueError("`weights` must not contain any 0s.")
    elif isinstance(weights, str):
        if weights not in xcats_in_df:
            raise ValueError(
                "When using a category-string as `weights`"
                " it must be present in `df`."
            )
    else:
        raise TypeError("`weights` must be a list of floats, a string or None.")

    # check signs
    if signs is None:
        signs = [1.0] * expc_weights_len
    elif isinstance(signs, listtypes):
        signs = list(signs)
        if len(signs) != expc_weights_len:
            raise ValueError(
                "`signs` must be a list of floats of the same length as "
                f"`{aligned_with}`."
            )
        if not all(x in [-1.0, 1.0] for x in signs):
            if any(x == 0.0 for x in signs):
                raise ValueError("`signs` must not contain any 0s.")
            warnings.warn(
                "`signs` must be a list of +1s or -1s. "
                "`signs` will be coerced to +1s/-1s. "
                "(i.e. signs = abs(signs) / signs)"
            )
            signs = [abs(x) / x for x in signs]
    else:
        raise TypeError("`signs` must be a list of floats/ints or None.")

    if not isinstance(normalize_weights, bool):
        raise TypeError("`normalize_weights` must be a boolean.")
    if not isinstance(complete_xcats, bool):
        raise TypeError("`complete_xcats` must be a boolean.")
    if not isinstance(complete_cids, bool):
        raise TypeError("`complete_cids` must be a boolean.")
    if not isinstance(new_xcat, str):
        raise TypeError("`new_xcat` must be a string.")
    if not isinstance(new_cid, str):
        raise TypeError("`new_cid` must be a string.")

    if blacklist is not None:
        if not isinstance(blacklist, dict):
            raise TypeError("`blacklist` must be a dictionary.")

    # Validate weight_lag
    if not isinstance(weight_lag, int):
        raise TypeError("`weight_lag` must be an integer.")
    if weight_lag < 0:
        raise ValueError("`weight_lag` must be non-negative.")
    if weight_lag > 0 and not isinstance(weights, str):
        raise ValueError(
            "`weight_lag` can only be applied when `weights` is a category string."
        )

    # Validate rebal_freq (the helper is called for validation only; its return
    # value is not used)
    if not isinstance(rebal_freq, str):
        raise TypeError("`rebal_freq` must be a string.")
    _map_to_business_day_frequency(rebal_freq, valid_freqs=["D", "W", "M", "Q"])

    # Validate thresh
    if thresh is not None:
        if not isinstance(thresh, (int, float)):
            raise TypeError("`thresh` must be a numeric value (int or float).")
        if thresh <= 0:
            raise ValueError("`thresh` must be positive.")

    return (
        df,
        xcats,
        cids,
        weights,
        normalize_weights,
        signs,
        start,
        end,
        blacklist,
        complete_xcats,
        complete_cids,
        new_xcat,
        new_cid,
        weight_lag,
        rebal_freq,
        thresh,
        _xcat_agg,
        mode,
    )
# Demo run: builds a small synthetic panel and calls linear_composite() twice.
if __name__ == "__main__":
    cids = ["AUD", "CAD", "GBP"]
    xcats = ["XR", "CRY", "INFL"]

    # XR and CRY are generated with the "linear" style, INFL with the
    # "decreasing-linear" style, all over January 2000.
    df: pd.DataFrame = pd.concat(
        [
            make_test_df(
                cids=cids,
                xcats=xcats[:-1],
                start="2000-01-01",
                end="2000-02-01",
                style="linear",
            ),
            make_test_df(
                cids=cids,
                xcats=["INFL"],
                start="2000-01-01",
                end="2000-02-01",
                style="decreasing-linear",
            ),
        ]
    )

    # all infls are now decreasing-linear, while everything else is increasing-linear

    # Knock out two single observations to exercise the NaN handling.
    df.loc[
        (df["cid"] == "GBP")
        & (df["xcat"] == "INFL")
        & (df["real_date"] == "2000-01-17"),
        "value",
    ] = np.nan

    df.loc[
        (df["cid"] == "AUD")
        & (df["xcat"] == "CRY")
        & (df["real_date"] == "2000-01-17"),
        "value",
    ] = np.nan

    # there are now missing values for AUD-CRY and GBP-INFL on 2000-01-17

    # Single category with a weighting category: aggregation across cross-sections,
    # using the INFL values as (non-normalised) weights.
    lc_cid = linear_composite(
        df=df, xcats="XR", weights="INFL", normalize_weights=False
    )

    df = QuantamentalDataFrame(df)
    # NOTE(review): despite the variable name, a single category is passed here, so
    # with the default `new_xcat` this is again an aggregation across
    # cross-sections, now with fixed weights and signs per cross-section.
    lc_xcat = linear_composite(
        df=df,
        cids=["GBP", "AUD", "CAD"],
        xcats=["XR"],
        weights=[1, 2, 1],
        signs=[1, -1, 1],
        complete_xcats=True,
    )