Source code for macrosynergy.management.validation

import warnings
import datetime
from typing import (
    Dict,
    List,
    NamedTuple,
    Optional,
    Union,
)
import pandas as pd

from macrosynergy.management.utils import reduce_df, is_valid_iso_date
from macrosynergy.management.utils.df_utils import standardise_dataframe


[docs]class QDFArgs(NamedTuple):
    """
    Contains the QuantamentalDataFrame and associated arguments.
    """

    df: pd.DataFrame
    cids: List[str]
    xcats: List[str]
    metrics: List[str]
    start: str
    end: str


[docs]def validate_and_reduce_qdf(
    df: pd.DataFrame,
    cids: Optional[List[str]] = None,
    xcats: Optional[List[str]] = None,
    metrics: Optional[List[str]] = None,
    intersect: Optional[bool] = False,
    tickers: Optional[List[str]] = None,
    blacklist: Optional[Dict[str, List[str]]] = None,
    start: Optional[str] = None,
    end: Optional[str] = None,
):
    """
    Validates the inputs to a function that takes a DataFrame as its first argument. The
    DataFrame is then reduced according to the inputs.

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame with the following columns: 'cid', 'xcat', 'real_date', and at least
        one metric from - 'value', 'grading', 'eop_lag', or 'mop_lag'.
    cids : List[str]
        A list of cids to select from the DataFrame. If None, all cids are selected.
    xcats : List[str]
        A list of xcats to select from the DataFrame. If None, all xcats are selected.
    metrics : List[str]
        A list of metrics to select from the DataFrame. If None, all metrics are
        selected.
    intersect : bool
        if True only retains cids that are available for all xcats. Default is False.
    tickers : List[str]
        A list of tickers that will be selected from the DataFrame if they exist,
        regardless of start, end, blacklist, and intersect arguments.
    blacklist : dict
        cross-sections with date ranges that should be excluded from the data frame. If
        one cross-section has several blacklist periods append numbers to the cross-section
        code.
    start : str
        ISO-8601 formatted date string. Select data from this date onwards. If None, all
        dates are selected.
    end : str
        ISO-8601 formatted date string. Select data up to and including this date. If
        None, all dates are selected.

    Returns
    -------
    QDFArgs
        A NamedTuple that contains the validated arguments.
    """

    df: pd.DataFrame = df.copy()
    df = standardise_dataframe(df=df)

    metrics = _validate_metrics(df=df, metrics=metrics)

    cids_provided: bool = cids is not None
    xcats_provided: bool = xcats is not None
    missing_cids: List[str]
    missing_xcats: List[str]

    cids, missing_cids = _set_or_find_missing_in_df(
        df=df, col_name="cid", values=cids, param_name="cids"
    )
    xcats, missing_xcats = _set_or_find_missing_in_df(
        df=df, col_name="xcat", values=xcats, param_name="xcats"
    )

    start, end = _validate_start_and_end_dates(df, start, end)

    ticker_df: pd.DataFrame = pd.DataFrame()
    if tickers is not None:
        ticker_df = _get_ticker_df(df=df, tickers=tickers, metrics=metrics)

    df: pd.DataFrame
    r_xcats: List[str]
    r_cids: List[str]
    df, r_xcats, r_cids = reduce_df(
        df=df,
        cids=cids if isinstance(cids, list) else [cids],
        xcats=xcats if isinstance(xcats, list) else [xcats],
        intersect=intersect,
        start=start,
        end=end,
        blacklist=blacklist,
        out_all=True,
    )

    df: pd.DataFrame = pd.concat([df, ticker_df], axis=0)
    df = df.drop_duplicates()

    if (
        ((len(r_xcats) != len(xcats) - len(missing_xcats)) and xcats_provided)
        or ((len(r_cids) != len(cids) - len(missing_cids)) and cids_provided)
    ) and not intersect:
        m_cids: List[str] = list(set(cids).difference(set(r_cids), set(missing_cids)))
        m_xcats: List[str] = list(
            set(xcats).difference(set(r_xcats), set(missing_xcats))
        )

        warnings.warn(
            "The provided arguments resulted in a DataFrame that does not "
            "contain all the requested cids and xcats. "
            + (f"Missing cids: {m_cids}. " if m_cids else "")
            + (f"Missing xcats: {m_xcats}. " if m_xcats else "")
        )
        for m_cid in m_cids:
            cids.remove(m_cid)
        for m_xcat in m_xcats:
            xcats.remove(m_xcat)
            
    elif intersect:
        if len(r_cids) == 0:
            raise ValueError(
                "The arguments provided resulted in an empty DataFrame when "
                "filtered. There are no intersecting cids."
            )
        
        if len(r_xcats) == 0:
            raise ValueError(
                "The arguments provided resulted in an empty DataFrame when "
                "filtered. There are no intersecting xcats."
            )

        cids = r_cids
        xcats = r_xcats

    if df.empty:
        raise ValueError(
            "The arguments provided resulted in an "
            "empty DataFrame when filtered (see `reduce_df`)."
        )

    return QDFArgs(df, cids, xcats, metrics, start, end)


def _get_ticker_df(df: pd.DataFrame, tickers: List[str], metrics: Optional[List[str]]):
    """
    Filters a QuantamentalDataFrame by tickers.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    metrics : List[str]
        a list of metrics.
    tickers : List[str]
        a list of tickers.

    Returns
    -------
    pd.DataFrame
        the filtered DataFrame.
    """

    df_tickers: List[pd.DataFrame] = [pd.DataFrame()]
    for ticker in tickers:
        _cid, _xcat = ticker.split("_", 1)
        df_tickers.append(
            df.loc[
                (df["cid"] == _cid) & (df["xcat"] == _xcat),
                ["real_date", "cid", "xcat"] + metrics,
            ]
        )
    ticker_df: pd.DataFrame = pd.concat(df_tickers, axis=0)
    return ticker_df


def _validate_start_and_end_dates(df: pd.DataFrame, start: str, end: str):
    """
    Determines start and end dates for a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to be filtered.
    start : str
        ISO-8601 formatted date string. If None, the earliest date in the DataFrame is
        used.
    end : str
        ISO-8601 formatted date string. If None, the latest date in the DataFrame is
        used.

    Returns
    -------
    Tuple[str]
        Tuple of start and end dates.
    """

    if start is None:
        start: str = pd.Timestamp(df["real_date"].min()).strftime("%Y-%m-%d")
    if end is None:
        end: str = pd.Timestamp(df["real_date"].max()).strftime("%Y-%m-%d")
    for var, name in [(start, "start"), (end, "end")]:
        if not is_valid_iso_date(var):
            raise ValueError(f"`{name}` must be a valid ISO date string")

    return start, end


def _validate_metrics(df: pd.DataFrame, metrics: List[str]):
    """
    Validates the metrics passed to a function.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    metrics : List[str]
        a list of metrics to be checked.

    Returns
    -------
    List[str]
        a list of metrics to be used.
    """

    required_columns: List[str] = ["real_date", "cid", "xcat"]
    if metrics is None:
        metrics: List[str] = list(set(df.columns) - set(required_columns))
    required_columns += metrics
    if not set(required_columns).issubset(set(df.columns)):
        raise ValueError(
            f"DataFrame must contain the following columns: {required_columns}"
        )
    return metrics


def _set_or_find_missing_in_df(
    df: pd.DataFrame, col_name: str, values: Optional[List], param_name: str
):
    """
    Returns the values passed to a function and a list of values that are not found in a
    specific column of a DataFrame. If values is None, all unique values in the
    DataFrame are returned.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    col_name : str
        name of column in the DataFrame.
    values : List
        list of values to be checked.
    param_name : str
        name of parameter passed to function.

    Returns
    -------
    List
        list of values that are not in the DataFrame.
    """

    missing_values: List = []
    if values is None:
        values = df[col_name].unique().tolist()
    else:
        missing_values = _find_missing_in_df(
            df=df, col_name=col_name, values=values, param_name=param_name
        )
    return values, missing_values


def _find_missing_in_df(df: pd.DataFrame, col_name: str, values: List, param_name: str):
    """
    Finds values in a list that are not in a specific column of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        a Pandas DataFrame.
    col_name : str
        name of column in the DataFrame.
    values : List
        list of values to be checked.
    param_name : str
        name of parameter passed to function.

    Returns
    -------
    List
        list of values that are not in the DataFrame.
    """

    missing: List = []
    if not set(values).issubset(set(df[col_name].unique())):
        # warn
        warnings.warn(
            f"The following {col_name}(s), passed in `{param_name}`,"
            " are not in the DataFrame `df`: "
            f"{list(set(values) - set(df[col_name].unique()))}."
        )
        missing = list(set(values) - set(df[col_name].unique()))

    return missing