# Source code for macrosynergy.panel.linear_composite

"""
Implementation of linear_composite() function as a module.
"""

import numpy as np
import pandas as pd
from typing import List, Dict, Union, Optional, Tuple, Type, Set
import warnings
from packaging import version
from macrosynergy.management.utils import reduce_df, is_valid_iso_date
from macrosynergy.management.simulate import make_test_df
from macrosynergy.management.utils.core import _map_to_business_day_frequency
from macrosynergy.management.utils.df_utils import get_sops
from macrosynergy.management.types import QuantamentalDataFrame

# Sequence-like types accepted wherever a list argument (`xcats`, `cids`, `weights`,
# `signs`) is expected; such inputs are converted with `list(...)` during validation.
listtypes: Tuple[Type, ...] = (list, np.ndarray, pd.Series, tuple)

# Keyword arguments unpacked into `.stack(**PD_FUTURE_STACK)` calls in this module.
# pandas versions newer than 2.1.0 get `future_stack=True`; older versions get
# `dropna=False`. Both are meant to keep NaN entries when stacking.
PD_FUTURE_STACK = (
    dict(future_stack=True)
    if version.parse(pd.__version__) > version.parse("2.1.0")
    else dict(dropna=False)
)


def linear_composite(
    df: pd.DataFrame,
    xcats: Union[str, List[str]],
    cids: Optional[List[str]] = None,
    weights: Optional[Union[List[float], str]] = None,
    normalize_weights: bool = True,
    signs: Optional[List[float]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    blacklist: Dict[str, List[str]] = None,
    complete_xcats: bool = False,
    complete_cids: bool = False,
    new_xcat="NEW",
    new_cid="GLB",
    weight_lag: int = 0,
    rebal_freq: str = "D",
    thresh: Optional[float] = None,
):
    """
    Weighted linear combinations of cross sections or categories

    Parameters
    ----------
    df : ~pandas.DataFrame
        standardized JPMaQS DataFrame with the necessary columns: 'cid', 'xcat',
        'real_date' and 'value'.
    xcats : Union[str, List[str]]
        One or more categories to be combined. If a single category is given the linear
        combination is calculated across cross-sections. This results in a single series
        to which a new cross-sectional identifier is assigned. If more than one category
        string is given the output will be a new category, i.e. a panel that is a linear
        combination of the categories specified.
    cids : List[str]
        cross-sections for which the linear combinations are calculated. Default is all
        cross-sections available.
    weights : Union[List[float], str]
        This specifies how categories or cross sections are combined. There are three
        principal options. The first (default) is None, in which case equal weights are
        given to all categories or cross sections that are available. The second case is a
        set of fixed coefficients, in which case these very coefficients are applied to all
        available categories or cross sections. Per default the coefficients are normalized
        so that they add up to one for each period. This can be changed with the argument
        `normalize_weights`. The third case is the assignment of a weighting category. This
        only applies to combinations of cross sections. In this case the weighting category
        is multiplied for each period with the corresponding value of main category of the
        same cross section. Per default the weight category values are normalized so that
        they add up to one for each period. This can be changed with the argument
        `normalize_weights`.
    normalize_weights : bool
        If True (default) the weights are normalized to sum to 1. If False the weights
        are used as specified.
    signs : List[float]
        An array consisting of +1s or -1s, of the same length as the number of
        categories in `xcats` to indicate whether the respective category should be added or
        subtracted from the linear combination. Not relevant when aggregating over cross-
        sections, i.e. when a single category is given in `xcats`. Default is None and all
        signs are set to +1.
    start : str
        earliest date in ISO format. Default is None and earliest date for which the
        respective category is available is used.
    end : str
        latest date in ISO format. Default is None and latest date for which the
        respective category is available is used.
    blacklist : dict
        Dictionary forwarded unchanged to `reduce_df` to exclude data from the
        calculation (conventionally cross-sections mapped to date ranges). Default is
        None.
    complete_xcats : bool
        If True combinations are only calculated for observation dates on which all
        categories are available. If False (default) a combination of the available
        categories is used. Not relevant when aggregating over cross-sections, i.e.
        when a single category is given in `xcats`.
    complete_cids : bool
        If True combinations are only calculated for observation dates on which all
        cross-sections are available. If False (default) a combination of the available
        cross-sections is used. Not relevant when aggregating over categories, i.e. when
        multiple categories are given in `xcats`.
    new_xcat : str
        Name of new composite category when aggregating over categories for a given
        cross-section. Default is "NEW".
    new_cid : str
        Name of new composite cross-section when aggregating over cross-sections for a
        given category. Default is "GLB".
    weight_lag : int
        Number of business days to lag the weight series. Only applicable when
        `weights` is a category string. Default is 0 (no lag).
        A lag of N means that weights from N business days ago are used for each
        date. Not applicable to fixed weights (list).
    rebal_freq : str
        Rebalancing frequency for weights. Must be one of "D" (daily), "W" (weekly),
        "M" (monthly), or "Q" (quarterly). Default is "D" (daily rebalancing).
        When set to a coarser frequency, weights are only updated at the start of
        each period and held constant between rebalancing dates.
    thresh : float, optional
        Winsorization threshold for contributing series. If set, caps positive values
        at `thresh` and floors negative values at `-thresh` for all input series before
        calculating the composite. This reduces the impact of outliers. Default is None
        (no winsorization).

    Returns
    -------
    ~pandas.DataFrame
        standardized DataFrame with the composite values, with the columns:
        'cid', 'xcat', 'real_date' and 'value'.

    Raises
    ------
    ValueError
        If no cross-section is left after reducing `df`, or if cross-sections dropped
        out while aggregating over cross-sections with `complete_cids=True`.
    """

    # Validation returns normalised copies of every argument plus the aggregation
    # mode flag. `_mode` is the string form of `_xcat_agg` and is not needed here.
    (
        df,
        xcats,
        cids,
        weights,
        normalize_weights,
        signs,
        start,
        end,
        blacklist,
        complete_xcats,
        complete_cids,
        new_xcat,
        new_cid,
        weight_lag,
        rebal_freq,
        thresh,
        _xcat_agg,
        _mode,
    ) = _check_args(
        df=df,
        xcats=xcats,
        cids=cids,
        weights=weights,
        normalize_weights=normalize_weights,
        signs=signs,
        start=start,
        end=end,
        blacklist=blacklist,
        complete_xcats=complete_xcats,
        complete_cids=complete_cids,
        new_xcat=new_xcat,
        new_cid=new_cid,
        weight_lag=weight_lag,
        rebal_freq=rebal_freq,
        thresh=thresh,
    )

    # A weighting category must survive the reduction alongside the main categories.
    _xcats: List[str] = xcats + ([weights] if isinstance(weights, str) else [])

    remaining_xcats: List[str]
    remaining_cids: List[str]
    # NOTE: the "remaining_*" variables will not be in the same order as the input
    # cids/xcats.
    # Do not use these for index based lookups/operations.
    df, remaining_xcats, remaining_cids = reduce_df(
        df=df,
        xcats=_xcats,
        cids=cids,
        start=start,
        end=end,
        blacklist=blacklist,
        intersect=False,
        out_all=True,
    )

    df = QuantamentalDataFrame(df)
    # Remember the input representation so the result can be returned the same way.
    result_as_categorical = df.InitializedAsCategorical

    # Cross-section aggregation with `complete_cids` cannot tolerate dropped cids;
    # an empty panel is an error in every mode.
    cid_panel_incomplete: bool = (
        len(remaining_cids) < len(cids) and not _xcat_agg and complete_cids
    )
    if cid_panel_incomplete or len(remaining_cids) == 0:
        missing_cids_xcats_str = _missing_cids_xcats_str(df=df, cids=cids, xcats=xcats)
        raise ValueError(
            "Not all `cids` have complete `xcat` data required for the calculation.\n"
            f"{missing_cids_xcats_str}"
        )

    if _xcat_agg:
        df = _populate_missing_xcat_series(df)

        result_df: QuantamentalDataFrame = linear_composite_xcat_agg(
            df=df,
            xcats=xcats,
            weights=weights,
            signs=signs,
            normalize_weights=normalize_weights,
            complete_xcats=complete_xcats,
            new_xcat=new_xcat,
            thresh=thresh,
        )

    else:  # mode == "cid_agg" -- single xcat
        df, cids, _xcat, weights, signs = _check_df_for_missing_cid_data(
            df=df, cids=cids, weights=weights, signs=signs
        )

        result_df: QuantamentalDataFrame = linear_composite_cid_agg(
            df=df,
            xcat=_xcat,
            cids=cids,
            weights=weights,
            signs=signs,
            normalize_weights=normalize_weights,
            complete_cids=complete_cids,
            new_cid=new_cid,
            weight_lag=weight_lag,
            rebal_freq=rebal_freq,
            thresh=thresh,
        )

    return QuantamentalDataFrame(result_df, categorical=result_as_categorical)

def _missing_cids_xcats_str(
    df: QuantamentalDataFrame,
    cids: List[str],
    xcats: List[str],
) -> str:
    output_strs: List[str] = []

    found_cids = df["cid"].unique().tolist()
    found_xcats = df["xcat"].unique().tolist()

    if set(cids) != set(found_cids):
        missing_cids = list(set(cids) - set(found_cids))
    else:
        missing_cids = []

    if set(xcats) != set(found_xcats):
        missing_xcats = list(set(xcats) - set(found_xcats))
    else:
        missing_xcats = []

    xcat_dict: Dict[str, str] = {}
    for xc in sorted(xcats):
        miss_cids = list(
            set(cids) - set(df.loc[df["xcat"] == xc, "cid"].unique().tolist())
        )
        if miss_cids:
            xcat_dict[xc] = miss_cids

    if missing_cids:
        output_strs.append(f"Missing cids: {missing_cids}")
    if missing_xcats:
        output_strs.append(f"Missing xcats: {missing_xcats}")

    if xcat_dict:
        output_strs.append(
            "The following `cids` are missing for the respective `xcats`:"
        )
        longest_xc = max([len(xc) for xc in xcat_dict.keys()])
        for _xc, _cids in xcat_dict.items():
            msg = f"{_xc}: " + " " * (longest_xc - len(_xc)) + " " + str(sorted(_cids))
            output_strs.append(msg)

    return "\n".join(output_strs)


def _linear_composite_basic(
    data_df: pd.DataFrame,
    weights_df: pd.DataFrame,
    normalize_weights: bool = True,
    complete: bool = False,
    mode: str = "xcat_agg",
):
    """Main calculation function for linear_composite()"""

    # Create a boolean mask to help us work out the calcs
    nan_mask: pd.DataFrame = data_df.isna() | weights_df.isna()

    # Normalize weights (if requested)
    if normalize_weights:
        adj_weights_wide = weights_df[~nan_mask].div(
            weights_df[~nan_mask].abs().sum(axis=1), axis=0
        )
        adj_weights_wide[nan_mask] = np.nan

        assert np.allclose(
            adj_weights_wide[~adj_weights_wide.isna().all(axis=1)].abs().sum(axis=1), 1
        ), "Weights do not sum to 1. Normalization failed."

        weights_df = adj_weights_wide.copy()

    # Multiply the weights by the target data
    out_df = data_df * weights_df

    # Sum across the columns
    out_df = out_df.sum(axis="columns")

    # NOTE: Using `axis` with strings, to make it more readable
    # Remove periods with missing data (if requested) (rows with any NaNs)
    if complete:
        out_df[nan_mask.any(axis="columns")] = np.nan

    # put NaNs back in, as sum() removes them
    out_df[nan_mask.all(axis="columns")] = np.nan

    # Reset index, rename columns and return
    out_df = out_df.reset_index().rename(columns={0: "value"})

    # TODO: out_df from cid_agg and xcat_agg are not in the same format...

    return out_df


def _apply_weight_lag(
    weights_df: pd.DataFrame,
    weight_lag: int,
) -> pd.DataFrame:
    """
    Apply weight lag by shifting values forward.

    Parameters
    ----------
    weights_df : pd.DataFrame
        DataFrame with weights indexed by date
    weight_lag : int
        Number of periods to lag the weights

    Returns
    -------
    pd.DataFrame
        DataFrame with lagged weights
    """
    if weight_lag == 0:
        return weights_df

    # Shift values forward (older weights apply to later dates)
    return weights_df.shift(weight_lag)


def _apply_rebal_freq(
    weights_df: pd.DataFrame,
    rebal_freq: str,
) -> pd.DataFrame:
    """
    Limit weight updates to rebalancing dates and forward-fill between periods.

    Parameters
    ----------
    weights_df : pd.DataFrame
        DataFrame with weights indexed by date
    rebal_freq : str
        Rebalancing frequency: "D", "W", "M", or "Q"

    Returns
    -------
    pd.DataFrame
        DataFrame with weights only updated at rebalancing dates
    """
    if rebal_freq.upper() == "D":
        return weights_df  # Daily rebalancing - no change

    # Get start-of-period dates for rebalancing
    all_dates = weights_df.index
    rebal_dates_series = get_sops(dates=all_dates, freq=rebal_freq)
    rebal_dates = pd.DatetimeIndex(rebal_dates_series)

    # Find rebalancing dates that exist in weights
    rebal_dates_in_weights = weights_df.index.intersection(rebal_dates)

    if len(rebal_dates_in_weights) == 0:
        raise ValueError(
            f"No rebalancing dates found for frequency '{rebal_freq}'. "
            "Consider using a coarser frequency or check date range."
        )

    # Keep only rebalancing dates, then reindex and forward-fill
    weights_rebal = weights_df.loc[rebal_dates_in_weights].copy()
    weights_rebal = weights_rebal.reindex(all_dates)
    weights_rebal = weights_rebal.ffill()

    return weights_rebal


def linear_composite_cid_agg(
    df: QuantamentalDataFrame,
    xcat: str,
    cids: List[str],
    weights: Union[str, List[float]],
    signs: List[float],
    normalize_weights: bool = True,
    complete_cids: bool = True,
    new_cid="GLB",
    weight_lag: int = 0,
    rebal_freq: str = "D",
    thresh: Optional[float] = None,
):
    """
    Linear composite of various cids for a given category across all periods.

    Parameters
    ----------
    df : QuantamentalDataFrame
        Data containing `xcat` and, if `weights` is a string, the weighting category.
    xcat : str
        Category whose cross-sections are combined.
    cids : List[str]
        Cross-sections to combine; `weights` (if a list) and `signs` are positionally
        aligned with this list.
    weights : Union[str, List[float]]
        Either the name of a weighting category in `df` or one fixed weight per cid.
    signs : List[float]
        One sign per cid; multiplied into the weights.
    normalize_weights : bool
        Passed to the core calculation; rescales weights per date if True.
    complete_cids : bool
        If True, dates on which any cross-section is missing yield NaN.
    new_cid : str
        Cross-section identifier assigned to the composite series.
    weight_lag : int
        Number of rows by which category-based weights are lagged. Ignored for fixed
        weights.
    rebal_freq : str
        Rebalancing frequency of the weights ("D" leaves them unchanged).
    thresh : float, optional
        If given, values of `xcat` are clipped to [-thresh, thresh] before combining.

    Returns
    -------
    pd.DataFrame
        Long-format frame with the composite series for `new_cid` and `xcat`.
    """
    if isinstance(weights, str):
        # Dynamic weights: pivot the weighting category to dates x cids.
        weights_df: pd.DataFrame = df[(df["xcat"] == weights)]
        weights_df = weights_df.set_index(["real_date", "cid"])["value"].unstack(
            level=1
        )
        # Order the columns like `cids` so the positional `signs` line up.
        weights_df = weights_df[cids].mul(signs, axis=1)

        # Apply weight lag for dynamic weights
        if weight_lag > 0:
            weights_df = _apply_weight_lag(weights_df, weight_lag)

    else:
        # Fixed weights: one signed weight per cid, repeated for every date.
        weights_series: pd.Series = pd.Series(
            np.array(weights) * np.array(signs),
            index=cids,
        )
        # The single-row data is aligned to the columns by label and repeated over
        # the date index by the DataFrame constructor.
        weights_df = pd.DataFrame(
            data=[weights_series.sort_index()],
            index=pd.to_datetime(df["real_date"].unique().tolist()),
            columns=df["cid"].unique(),
        )

        weights_df.index.names = ["real_date"]
        weights_df.columns.names = ["cid"]

    # Apply rebalancing frequency (applies to both fixed and dynamic)
    if rebal_freq.upper() != "D":
        weights_df = _apply_rebal_freq(weights_df, rebal_freq)

    # create the data_df (dates x cids) for the target category
    data_df: pd.DataFrame = (
        df[(df["xcat"] == xcat)]
        .set_index(["real_date", "cid"])["value"]
        .unstack(level=1)
    )

    # Apply winsorization if thresh is specified
    if thresh is not None:
        data_df = data_df.clip(lower=-thresh, upper=thresh)

    # aligning the index of weights_df to the data one
    # so that we have the same set of dates and same set of CIDs -- thank you
    # @mikiinterfiore
    weights_df = (
        weights_df.stack(**PD_FUTURE_STACK)
        .reindex(data_df.stack(**PD_FUTURE_STACK).index)
        .unstack(1)
    )

    # assert that data_df and weights_df have the same shape, index and columns
    assert (
        (data_df.shape == weights_df.shape)
        and (data_df.index.equals(weights_df.index))
        and (data_df.columns.equals(weights_df.columns))
    ), (
        "Unexpected shape of `data_df` and `weights_df`. "
        "Unable to shape data for calculation."
    )

    # Calculate the linear combination
    out_df: pd.DataFrame = _linear_composite_basic(
        data_df=data_df,
        weights_df=weights_df,
        normalize_weights=normalize_weights,
        complete=complete_cids,
        mode="cid_agg",
    )

    if df.is_categorical():
        # Build the categorical frame directly from the composite time series.
        out_df = QuantamentalDataFrame.from_timeseries(
            out_df.set_index("real_date")["value"], ticker=f"{new_cid}_{xcat}"
        )
    else:
        out_df["cid"] = new_cid
        out_df["xcat"] = xcat

    return out_df


def linear_composite_xcat_agg(
    df: QuantamentalDataFrame,
    xcats: List[str],
    weights: List[float],
    signs: List[float],
    normalize_weights: bool = True,
    complete_xcats: bool = True,
    new_xcat="NEW",
    thresh: Optional[float] = None,
):
    """
    Linear composite of various xcats across all cids and periods

    Parameters
    ----------
    df : QuantamentalDataFrame
        Data containing the categories to be combined.
    xcats : List[str]
        Categories to combine; `weights` and `signs` are positionally aligned with
        this list.
    weights : List[float]
        One fixed weight per category.
    signs : List[float]
        One sign per category; multiplied into the weights.
    normalize_weights : bool
        Passed to the core calculation; rescales weights per row if True.
    complete_xcats : bool
        If True, (cid, date) rows on which any category is missing yield NaN.
    new_xcat : str
        Category name assigned to the composite.
    thresh : float, optional
        If given, all values are clipped to [-thresh, thresh] before combining.

    Returns
    -------
    pd.DataFrame
        Long-format frame with one composite value per (cid, real_date) and the
        'xcat' column set to `new_xcat`.
    """

    # Signed weight per category, labelled by category name.
    weights_series: pd.Series = pd.Series(
        np.array(weights) * np.array(signs), index=xcats
    )

    # Wide data: one row per (cid, real_date), one column per category.
    data_df = df.set_index(["cid", "real_date", "xcat"])["value"].unstack(level=2)

    # Apply winsorization if thresh is specified
    if thresh is not None:
        data_df = data_df.clip(lower=-thresh, upper=thresh)

    # The single-row data is aligned to the category columns by label and repeated
    # over every (cid, real_date) row by the DataFrame constructor.
    weights_df = pd.DataFrame(
        data=[weights_series.sort_index()],
        index=data_df.index,
        columns=data_df.columns,
    )

    # Calculate the linear combination
    out_df: pd.DataFrame = _linear_composite_basic(
        data_df=data_df,
        weights_df=weights_df,
        normalize_weights=normalize_weights,
        complete=complete_xcats,
        mode="xcat_agg",
    )
    if df.is_categorical():
        # A one-category categorical column holding `new_xcat` in every row.
        out_df["xcat"] = pd.Categorical.from_codes(
            codes=[0] * len(out_df), categories=[new_xcat]
        )
        out_df = QuantamentalDataFrame(out_df)
    else:
        out_df["xcat"] = new_xcat

    return out_df


def _populate_missing_xcat_series(
    df: QuantamentalDataFrame,
) -> QuantamentalDataFrame:
    """
    Populate missing xcat series with NaNs
    """
    found_cids: List[str] = df["cid"].unique().tolist()
    found_xcats: List[str] = df["xcat"].unique().tolist()
    found_xcats_set: Set[str] = set(found_xcats)
    dt_range: pd.DatetimeIndex = pd.to_datetime(df["real_date"].unique())
    wrn_msg: str = (
        "{cidx} does not have complete xcat data for {missing_xcats}."
        " These will be filled with NaNs for the calculation."
    )

    for cidx in found_cids:
        missing_xcats = list(
            found_xcats_set - set(df.loc[df["cid"] == cidx, "xcat"].unique())
        )
        if missing_xcats:
            warnings.warn(wrn_msg.format(cidx=cidx, missing_xcats=missing_xcats))
            for xc in missing_xcats:
                if df.is_categorical():
                    df.add_nan_series(
                        ticker=f"{cidx}_{xc}",
                        start=dt_range.min(),
                        end=dt_range.max(),
                    )
                else:
                    dct = {
                        "cid": cidx,
                        "xcat": xc,
                        "real_date": dt_range,
                        "value": np.nan,
                    }
                    df = pd.concat([df, pd.DataFrame(data=dct)])

    return df


def _check_df_for_missing_cid_data(
    df: QuantamentalDataFrame,
    cids: List[str],
    weights: Union[str, List[float]],
    signs: List[float],
) -> Tuple[
    QuantamentalDataFrame, List[str], str, Union[str, List[float], None], List[float]
]:
    """
    Check the DataFrame for missing `cid` data and drop them if necessary and return the
    DataFrame with the missing `cid` data dropped.
    """

    found_cids: List[str] = df["cid"].unique().tolist()
    found_cids = [cid for cid in cids if cid in found_cids]
    found_xcats: List[str] = df["xcat"].unique().tolist()
    found_xcats_set: Set[str] = set(found_xcats)
    wrn_msg: str = (
        "`cid` {cidx} does not have complete `xcat` data for {missing_xcats}."
        " These will be dropped from the calculation."
    )
    if isinstance(weights, str):
        if weights not in found_xcats:
            raise ValueError(
                f"Weight category {weights} not found in `df`. "
                f"Available categories are {found_xcats}."
            )

        if len(found_xcats_set - {weights}) == 0:
            raise ValueError(
                "None of the `xcats` are present in `df` other than the `weights`. "
                f"Available categories are {found_xcats}."
            )

    if set(cids) - set(found_cids) != set():
        for cid in set(cids) - set(found_cids):
            # Cids has already been removed since it uses
            warnings.warn(f"cid {cid} not found in `df`. It will be ignored.")
            signs.pop(cids.index(cid))
            if isinstance(weights, list):
                weights.pop(cids.index(cid))

    ctr = 0
    for cidx in found_cids.copy():  # copy to allow modification of `cids`
        missing_xcats = list(
            found_xcats_set - set(df.loc[df["cid"] == cidx, "xcat"].unique())
        )
        if missing_xcats:
            found_cids.pop(ctr)
            signs.pop(ctr)
            if isinstance(weights, list):
                weights.pop(ctr)
            # drop from df
            df = df.loc[df["cid"] != cidx, :]
            warnings.warn(wrn_msg.format(cidx=cidx, missing_xcats=missing_xcats))
        else:
            ctr += 1

    if len(found_cids) == 0:
        raise ValueError(
            "No `cids` have complete `xcat` data required for the calculation."
        )

    _xcat: str = list(set(found_xcats) - {weights if isinstance(weights, str) else ""})[
        0
    ]

    rcids = [c for c in cids if c in found_cids]  # to preserve order
    return QuantamentalDataFrame(df), rcids, _xcat, weights, signs

def _drop_unavailable(values, keep: List[bool], name: str, dim_name: str):
    """
    Restrict a positional argument to the entries whose flag in `keep` is True.

    Inputs that are not list-like (None, a weighting-category string or an invalid
    type) are returned unchanged so that the regular validation deals with them.

    Raises
    ------
    ValueError
        If `values` is list-like but its length differs from `keep`.
    """
    if not isinstance(values, listtypes):
        return values
    if len(values) != len(keep):
        raise ValueError(
            f"`{name}` must be a list of floats of the same length as `{dim_name}`."
        )
    return [value for value, kept in zip(values, keep) if kept]


def _check_args(
    df: QuantamentalDataFrame,
    xcats: Union[str, List[str]],
    cids: Optional[List[str]] = None,
    weights: Optional[Union[List[float], str]] = None,
    normalize_weights: bool = True,
    signs: Optional[List[float]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
    blacklist: Dict[str, List[str]] = None,
    complete_xcats: bool = False,
    complete_cids: bool = False,
    new_xcat="NEW",
    new_cid="GLB",
    weight_lag: int = 0,
    rebal_freq: str = "D",
    thresh: Optional[float] = None,
):
    """
    Check the arguments of linear_composite()

    Validates types and values, fills in defaults (`start`, `end`, `xcats`, `cids`,
    `weights`, `signs`), removes categories / cross-sections that are not in `df`
    (unless the respective `complete_*` flag demands an error) and determines the
    aggregation mode.

    `weights` (if list-like) and `signs` describe categories when more than one
    category is requested or `new_xcat` differs from "NEW"; otherwise they describe
    cross-sections. Entries are only dropped along the dimension they describe.

    Returns
    -------
    tuple
        The sixteen arguments in signature order (normalised), followed by
        `_xcat_agg` (True when aggregating over categories) and `mode`
        ("xcat_agg" or "cid_agg").

    Raises
    ------
    TypeError
        If an argument has the wrong type.
    ValueError
        If an argument has an invalid value, or if requested categories /
        cross-sections are unavailable while the matching `complete_*` flag is True.
    """

    # df check
    if (
        (not isinstance(df, QuantamentalDataFrame))
        or ("value" not in df.columns)
        or (df["value"].isna().all())
    ):
        raise TypeError("`df` must be a standardized Quantamental DataFrame.")

    if start is None:
        start = df["real_date"].min().strftime("%Y-%m-%d")
    if end is None:
        end = df["real_date"].max().strftime("%Y-%m-%d")

    # dates check (both are set by now, either by the caller or from `df`)
    for varx, namex in zip([start, end], ["start", "end"]):
        if not (isinstance(varx, str) and is_valid_iso_date(varx)):
            raise ValueError(f"`{namex}` must be a valid ISO date string.")

    xcats_in_df = set(df["xcat"].values)
    cids_in_df = set(df["cid"].values)

    # check xcats
    if xcats is None:
        xcats = list(xcats_in_df)
    elif isinstance(xcats, str):
        xcats = [xcats]
    elif isinstance(xcats, listtypes):
        xcats = list(xcats)
    else:
        raise TypeError("`xcats` must be a string or list of strings.")

    # Decide, from the request as given, which dimension `weights`/`signs` describe.
    inputs_follow_xcats: bool = len(xcats) > 1 or new_xcat != "NEW"

    xcat_available: List[bool] = [xc in xcats_in_df for xc in xcats]
    if not all(xcat_available):
        if complete_xcats:
            raise ValueError("Not all `xcats` are available in `df`.")
        missing_xcats = list(
            dict.fromkeys(xc for xc in xcats if xc not in xcats_in_df)
        )
        warnings.warn(
            f"Not all `xcats` are available in `df`: {missing_xcats} "
            "The calculation will be performed with the available xcats."
        )
        if inputs_follow_xcats:
            signs = _drop_unavailable(signs, xcat_available, "signs", "xcats")
            weights = _drop_unavailable(weights, xcat_available, "weights", "xcats")
        xcats = [xc for xc, ok in zip(xcats, xcat_available) if ok]

    # check cids
    if cids is None:
        cids = list(cids_in_df)
    elif isinstance(cids, str):
        cids = [cids]
    elif isinstance(cids, listtypes):
        cids = list(cids)
    else:
        raise TypeError("`cids` must be a string or list of strings.")

    # check cids in df
    cid_available: List[bool] = [cid in cids_in_df for cid in cids]
    if not all(cid_available):
        if complete_cids:
            raise ValueError("Not all `cids` are available in `df`.")
        missing_cids = list(dict.fromkeys(c for c in cids if c not in cids_in_df))
        warnings.warn(
            f"Not all `cids` are available in `df`: {missing_cids} "
            "The calculation will be performed with the available cids."
        )
        if not inputs_follow_xcats:
            signs = _drop_unavailable(signs, cid_available, "signs", "cids")
            weights = _drop_unavailable(weights, cid_available, "weights", "cids")
        cids = [cid for cid, ok in zip(cids, cid_available) if ok]

    # The mode is determined from the categories that are actually available.
    _xcat_agg: bool = len(xcats) > 1 or new_xcat != "NEW"
    mode: str = "xcat_agg" if _xcat_agg else "cid_agg"

    if _xcat_agg and isinstance(weights, str):
        raise ValueError(
            "When aggregating over xcats, `weights` "
            "must be a list of floats or integers."
        )

    # check weights
    expc_weights_len: int = len(xcats) if _xcat_agg else len(cids)
    expc_dim_name: str = "xcats" if _xcat_agg else "cids"

    if weights is None:
        weights = list(np.ones(expc_weights_len) / expc_weights_len)
    elif isinstance(weights, listtypes):
        weights = list(weights)
        if not all([isinstance(x, (float, int)) for x in weights]):
            raise TypeError("`weights` must be a list of floats or integers.")
        if len(weights) != expc_weights_len:
            raise ValueError(
                "`weights` must be a list of floats of the same length as "
                f"`{expc_dim_name}`."
            )
        if any([x == 0.0 for x in weights]):
            raise ValueError("`weights` must not contain any 0s.")

    elif isinstance(weights, str):
        if weights not in xcats_in_df:
            raise ValueError(
                "When using a category-string as `weights`"
                " it must be present in `df`."
            )
    else:
        raise TypeError("`weights` must be a list of floats, a string or None.")

    # check signs
    if signs is None:
        signs = [1.0] * expc_weights_len
    elif isinstance(signs, listtypes):
        signs = list(signs)
        if len(signs) != expc_weights_len:
            raise ValueError(
                "`signs` must be a list of floats of the same length as "
                f"`{expc_dim_name}`."
            )
        if not all([x in [-1.0, 1.0] for x in signs]):
            if any([x == 0.0 for x in signs]):
                raise ValueError("`signs` must not contain any 0s.")
            warnings.warn(
                "`signs` must be a list of +1s or -1s. "
                "`signs` will be coerced to +1s/-1s. "
                "(i.e. signs = abs(signs) / signs)"
            )

            signs = [abs(x) / x for x in signs]

    else:
        raise TypeError("`signs` must be a list of floats/ints or None.")

    if not isinstance(normalize_weights, bool):
        raise TypeError("`normalize_weights` must be a boolean.")

    if not isinstance(complete_xcats, bool):
        raise TypeError("`complete_xcats` must be a boolean.")

    if not isinstance(complete_cids, bool):
        raise TypeError("`complete_cids` must be a boolean.")

    if not isinstance(new_xcat, str):
        raise TypeError("`new_xcat` must be a string.")

    if not isinstance(new_cid, str):
        raise TypeError("`new_cid` must be a string.")

    if blacklist is not None:
        if not isinstance(blacklist, dict):
            raise TypeError("`blacklist` must be a dictionary.")

    # Validate weight_lag
    if not isinstance(weight_lag, int):
        raise TypeError("`weight_lag` must be an integer.")
    if weight_lag < 0:
        raise ValueError("`weight_lag` must be non-negative.")
    if weight_lag > 0 and not isinstance(weights, str):
        raise ValueError(
            "`weight_lag` can only be applied when `weights` is a category string."
        )

    # Validate rebal_freq
    if not isinstance(rebal_freq, str):
        raise TypeError("`rebal_freq` must be a string.")

    # Called for validation only; the mapped frequency is not used here.
    _map_to_business_day_frequency(rebal_freq, valid_freqs=["D", "W", "M", "Q"])

    # Validate thresh
    if thresh is not None:
        if not isinstance(thresh, (int, float)):
            raise TypeError("`thresh` must be a numeric value (int or float).")
        if thresh <= 0:
            raise ValueError("`thresh` must be positive.")

    return (
        df,
        xcats,
        cids,
        weights,
        normalize_weights,
        signs,
        start,
        end,
        blacklist,
        complete_xcats,
        complete_cids,
        new_xcat,
        new_cid,
        weight_lag,
        rebal_freq,
        thresh,
        _xcat_agg,
        mode,
    )


if __name__ == "__main__":
    # Small demonstration on simulated data.
    cids = ["AUD", "CAD", "GBP"]
    xcats = ["XR", "CRY", "INFL"]

    # XR and CRY are simulated as increasing-linear, INFL as decreasing-linear.
    increasing_part = make_test_df(
        cids=cids,
        xcats=xcats[:-1],
        start="2000-01-01",
        end="2000-02-01",
        style="linear",
    )
    decreasing_part = make_test_df(
        cids=cids,
        xcats=["INFL"],
        start="2000-01-01",
        end="2000-02-01",
        style="decreasing-linear",
    )
    df: pd.DataFrame = pd.concat([increasing_part, decreasing_part])

    # Punch a hole into GBP-INFL and AUD-CRY on 2000-01-17.
    for gap_cid, gap_xcat in (("GBP", "INFL"), ("AUD", "CRY")):
        gap_rows = (
            (df["cid"] == gap_cid)
            & (df["xcat"] == gap_xcat)
            & (df["real_date"] == "2000-01-17")
        )
        df.loc[gap_rows, "value"] = np.nan

    # Cross-section composite of XR, weighted by the (unnormalized) INFL category.
    lc_cid = linear_composite(
        df=df, xcats="XR", weights="INFL", normalize_weights=False
    )

    # Same kind of call on a QuantamentalDataFrame, with fixed weights and signs.
    df = QuantamentalDataFrame(df)
    lc_xcat = linear_composite(
        df=df,
        cids=["GBP", "AUD", "CAD"],
        xcats=["XR"],
        weights=[1, 2, 1],
        signs=[1, -1, 1],
        complete_xcats=True,
    )