# Source code for macrosynergy.panel.granger_causality_test

"""
Run Granger Causality Test on a standardized quantamental dataframe.  **NOTE: This
module is under development, and is not yet ready for production use.**
"""

import warnings
from typing import Any, Dict, List, Optional, Union

import pandas as pd
import statsmodels
from packaging import version
from statsmodels.tsa.stattools import grangercausalitytests

from macrosynergy.management.simulate import make_test_df
from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management.utils import (
    downsample_df_on_real_date,
    is_valid_iso_date,
    qdf_to_ticker_df,
    reduce_df_by_ticker,
)

import logging

logger = logging.getLogger(__name__)


def granger_causality_test(
    df: pd.DataFrame,
    tickers: Optional[List[str]] = None,
    cids: Optional[Union[str, List[str]]] = None,
    xcats: Optional[Union[str, List[str]]] = None,
    max_lag: Union[int, List[int]] = 4,
    add_constant: bool = False,
    freq: str = "M",
    agg: str = "mean",
    start: Optional[str] = None,
    end: Optional[str] = None,
    metric: str = "value",
) -> Dict[Any, Any]:
    """
    Run Granger Causality Test on a standardized quantamental dataframe. Since the Granger
    Causality Test is a pairwise test, only two tickers are permitted. Tickers may be
    specified directly using the `tickers` parameter, or formed by providing one
    cross-section (`cid`) and two extended categories (`xcats`), or two cross-sections
    (`cids`) and one extended category (`xcat`). Tickers are constructed from the
    specified `cids` and `xcats` in the provided order. The order of the tickers is
    preserved, with the first ticker being tested to determine whether it Granger-causes
    the second. The test evaluates whether the time series of the first ticker
    Granger-causes the time series of the second ticker.

    The data is reduced to the two tickers (and the `start`/`end` window), downsampled
    to `freq` using `agg`, pivoted to one column per ticker, and rows containing any
    NaN are dropped before the test is run.

    Parameters
    ----------
    df : pd.DataFrame
        A standardized quantamental dataframe.
    tickers : List[str]
        A list of tickers to run the test on. A maximum of two tickers can be specified.
    cids : Union[str, List[str]]
        One or two cross-sections to run the test on. If two cross-sections are specified,
        then only one extended category can be specified. If one cross-section is
        specified, then two categories must be specified.
    xcats : Union[str, List[str]]
        One or two extended categories to run the test on. If two categories are specified,
        then only one cross-sections can be specified. If one category is specified, then
        two cross-sections must be specified.
    max_lag : Union[int, List[int]]
        If `max_lag` is an integer, then the function computes the test for all lags up
        to `max_lag`. If `max_lag` is a list of integers, then the function computes the
        test only for lags specified in the list.
    add_constant : bool
        Whether to add a constant to the regression. The value is forwarded to the
        test backend. Default is False.
    freq : str
        The frequency to downsample the data to. Must be one of "D", "W", "M", "Q", "A".
        Default is "M".
    agg : str
        The aggregation method to use when downsampling the data. Must be one of "mean"
        (default), "median", "min", "max", "first" or "last".
    start : str
        The start date of the data. Must be a valid ISO date. If not specified, the
        earliest date in `df` is used.
    end : str
        The end date of the data. Must be a valid ISO date. If not specified, the latest
        date in `df` is used.
    metric : str
        The metric to run the test on. Must be a column in `df`. Default is "value".

    Raises
    ------
    TypeError
        If any of the inputs are of the wrong type.
    ValueError
        If any of the input values are invalid, or if no rows remain after rows with
        missing values have been dropped.
    AssertionError
        If the pivoted data does not contain exactly two columns.

    Returns
    -------
    Dict[Any, Any]
        A dictionary containing the results of the Granger Causality Test. The keys are
        the lags and the values are the results of the test.
    """

    ## Check inputs

    _type_checks(
        df=df,
        tickers=tickers,
        cids=cids,
        xcats=xcats,
        max_lag=max_lag,
        add_constant=add_constant,
        start=start,
        end=end,
        freq=freq,
        agg=agg,
        metric=metric,
    )
    ## value checks for `freq` and `agg` are implicitly checked in downstream functions

    ## Copy df to prevent side effects
    df = QuantamentalDataFrame(df)

    ## Construct tickers from the `cids` and `xcats` if `tickers` is not specified
    tickers = _get_tickers(tickers=tickers, cids=cids, xcats=xcats)

    ## Reduce df to the requested tickers and date range
    df = reduce_df_by_ticker(df=df, ticks=tickers, start=start, end=end)

    # Downsample df (frequency/aggregation names are normalised to the casing that
    # is passed on to the downsampling helper)
    freq = freq.upper()
    agg = agg.lower()
    df = downsample_df_on_real_date(
        df=df, groupby_columns=["cid", "xcat"], freq=freq, agg=agg
    )

    # Pivot df to one column per ticker
    df_wide: pd.DataFrame = QuantamentalDataFrame(df).to_wide(value_column=metric)

    # there must only be two columns in df_wide
    assert len(df_wide.columns) == 2, "df_wide must have only two columns"

    # NOTE(review): the column order used below is whatever `to_wide` returns; it is
    # not re-aligned to the order of `tickers` here. statsmodels documents
    # `grangercausalitytests` as testing whether the *second* column Granger-causes
    # the first - confirm that the column order and the direction stated in the
    # docstring/log message agree.
    logger.info(
        "Running Granger Causality Test: Testing whether %s Granger causes %s",
        df_wide.columns[0],
        df_wide.columns[1],
    )
    # NOTE: Since no NANs are allowed in the input data, we must drop them here
    # This may yield unexpected/unreliable results for tickers with large periods of
    # missing data

    # drop any rows with NANs
    df_wide = df_wide.dropna(how="any", axis=0)
    if df_wide.empty:
        raise ValueError(
            "The input data contains only NANs. "
            "Please check the input data for missing values or "
            "consider using a different downsampling frequency/date range."
        )

    # Forward `add_constant` explicitly: otherwise the backend's own default is used
    # and the caller's choice is silently ignored.
    gct: Dict[Any, Any] = _granger_causality_backend(
        data=df_wide,
        max_lag=max_lag,
        add_constant=add_constant,
    )

    return gct


def _statsmodels_compatibility_wrapper(
    x: Any = None, maxlag: Any = None, addconst: Any = None, verbose: Any = None
) -> Any:
    """
    Call `grangercausalitytests` in a way that works across statsmodels versions.

    For statsmodels releases older than 0.15.0 the function is called with a fourth
    positional argument of `False` while `FutureWarning`s are suppressed; for 0.15.0
    and later only `x`, `maxlag` and `addconst` are passed.

    Parameters
    ----------
    x : Any
        The data passed as the first argument to `grangercausalitytests`.
    maxlag : Any
        The lag specification passed as the second argument.
    addconst : Any
        The add-constant flag passed as the third argument.
    verbose : Any
        Accepted for signature compatibility only; the value is not used.

    Returns
    -------
    Any
        Whatever `grangercausalitytests` returns.
    """
    installed_version = version.parse(statsmodels.__version__)

    if installed_version >= version.parse("0.15.0"):
        return grangercausalitytests(x, maxlag, addconst)

    # Older releases: pass the fourth positional argument explicitly as False and
    # silence the FutureWarning that call may emit (presumably a deprecation
    # notice for that argument - see the statsmodels documentation).
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=FutureWarning)
        return grangercausalitytests(x, maxlag, addconst, False)


def _granger_causality_backend(
    data: pd.DataFrame, max_lag: Union[int, List[int]], add_constant: bool = True
) -> Dict[Any, Any]:
    """
    Sanity-check the prepared inputs and hand them to the statsmodels wrapper.

    Parameters
    ----------
    data : pd.DataFrame
        A dataframe with exactly two columns.
    max_lag : Union[int, List[int]]
        Either an integer or a non-empty list of integers.
    add_constant : bool
        Passed to the wrapper as `addconst`. Default is True.

    Raises
    ------
    AssertionError
        If `data` does not have two columns, if `max_lag` is neither an integer nor
        a non-empty list of integers, or if `add_constant` is not a boolean.

    Returns
    -------
    Dict[Any, Any]
        The result returned by `_statsmodels_compatibility_wrapper`.
    """
    n_columns = len(data.columns)
    assert n_columns == 2, "`data` must have only two columns"

    lag_is_int = isinstance(max_lag, int)
    lag_is_int_list = (
        isinstance(max_lag, list)
        and all(isinstance(lag, int) for lag in max_lag)
        and len(max_lag) > 0
    )
    assert (
        lag_is_int or lag_is_int_list
    ), "`max_lag` must be an integer or a list of integers"

    assert isinstance(add_constant, bool), "`add_constant` must be a boolean"

    return _statsmodels_compatibility_wrapper(
        x=data,
        maxlag=max_lag,
        addconst=add_constant,
    )


def _get_tickers(
    tickers: Optional[List[str]] = None,
    cids: Optional[List[str]] = None,
    xcats: Optional[List[str]] = None,
) -> List[str]:
    """
    Resolve the list of tickers to test.

    When `tickers` is given it is returned unchanged. Otherwise every cross-section
    in `cids` is combined with every category in `xcats` as `"<cid>_<xcat>"`,
    iterating over `cids` in the outer loop and `xcats` in the inner loop so the
    supplied order is kept.

    Parameters
    ----------
    tickers : List[str]
        A list of tickers.
    cids : Union[str, List[str]]
        One or two cross-sections. A single string is treated as a one-item list.
    xcats : Union[str, List[str]]
        One or two extended categories. A single string is treated as a one-item
        list.

    Returns
    -------
    List[str]
        The supplied `tickers`, or the tickers formed from `cids` and `xcats`.
    """

    if tickers is not None:
        return tickers

    cid_list = [cids] if isinstance(cids, str) else cids
    xcat_list = [xcats] if isinstance(xcats, str) else xcats

    formed: List[str] = []
    for cid in cid_list:
        for xcat in xcat_list:
            formed.append(f"{cid}_{xcat}")
    return formed


def _type_checks(
    df: pd.DataFrame,
    tickers: Optional[List[str]],
    cids: Optional[List[str]],
    xcats: Optional[List[str]],
    max_lag: Union[int, List[int]],
    add_constant: bool,
    start: Optional[str],
    end: Optional[str],
    freq: str,
    agg: str,
    metric: str,
) -> bool:
    """
    Does type checks on the inputs to `granger_causality_test`. All inputs are checked
    for type and value errors.

    The checks cover, in order: `df` and `metric`, `max_lag`, the `start`/`end`
    dates, the ticker specification (either `tickers`, or `cids` together with
    `xcats`, which must resolve to exactly two tickers that exist in `df`), and
    finally the types of `freq`, `agg` and `add_constant`.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate; must pass the `QuantamentalDataFrame` instance
        check.
    tickers : Optional[List[str]]
        Two distinct tickers present in `df`, or None/empty when `cids` and `xcats`
        are used instead.
    cids : Optional[List[str]]
        Cross-section(s); a single string is treated as a one-item list.
    xcats : Optional[List[str]]
        Extended category/categories; a single string is treated as a one-item list.
    max_lag : Union[int, List[int]]
        An integer or a list of integers.
    add_constant : bool
        Must be a boolean.
    start : Optional[str]
        None or a valid ISO date string.
    end : Optional[str]
        None or a valid ISO date string.
    freq : str
        Must be a string (its value is not checked here).
    agg : str
        Must be a string (its value is not checked here).
    metric : str
        Name of a column in `df`.

    Raises
    ------
    TypeError
        If any of the inputs are of the wrong type.
    ValueError
        If any of the input values are invalid.
    AssertionError
        If neither `tickers` nor `cids`/`xcats` are provided.

    Returns
    -------
    bool
        True if all type checks pass.
    """

    if not isinstance(df, QuantamentalDataFrame):
        raise TypeError("df must be a standardized quantamental dataframe")

    if not isinstance(metric, str):
        raise TypeError("`metric` must be a string")

    if metric not in df.columns:
        raise ValueError(f"`metric` '{metric}' not found in `df`")

    max_lag_msg: str = "`max_lag` must be an integer or a list of integers"
    if not isinstance(max_lag, (int, list)):
        raise TypeError(max_lag_msg)
    if isinstance(max_lag, list) and not all(isinstance(lag, int) for lag in max_lag):
        raise TypeError(max_lag_msg)

    for dt, nm in zip([start, end], ["start", "end"]):
        if dt is not None and not is_valid_iso_date(dt):
            raise ValueError(f"{nm} must be a valid ISO date")

    # Normalise single strings to one-item lists so the checks below are uniform
    if isinstance(cids, str):
        cids = [cids]
    if isinstance(xcats, str):
        xcats = [xcats]

    # Exactly one of the two being provided is an error
    if bool(cids) ^ bool(xcats):
        raise ValueError("`cids` and `xcats` must be specified together")

    bcidxcats: bool = bool(cids) and bool(xcats)

    if bool(tickers) and (bcidxcats):
        raise ValueError(
            "`tickers` cannot be specified if `cids` & `xcats` are specified"
        )

    found_tickers: List[str] = QuantamentalDataFrame(df).list_tickers()

    if bool(tickers):
        # check if there are only two
        if len(set(tickers)) != 2:
            raise ValueError("Only two tickers can be specified in `tickers`")

        if not all(isinstance(t, str) for t in tickers):
            raise TypeError("`tickers` must be a list of strings")

        if not set(tickers).issubset(set(found_tickers)):
            raise ValueError(
                "All tickers specified in `tickers` must be in `df`. "
                f"Missing tickers: {set(tickers) - set(found_tickers)}"
            )

    else:
        # Neither `tickers` nor `cids`/`xcats` were given
        assert bcidxcats, "Failed to resolve tickers"

    if bcidxcats:
        for lx, nm in zip([cids, xcats], ["cid", "xcat"]):
            if not (isinstance(lx, list) and all(isinstance(x, str) for x in lx)):
                raise TypeError(f"`{nm}` must be a list of strings")
            available = set(df[nm])
            if not set(lx).issubset(available):
                raise ValueError(
                    f"All '{nm}s' in `{nm}` specified must be in `df`. "
                    f"Missing {nm}s: {set(lx) - available}."
                )

        tks: List[str] = [f"{c}_{x}" for c in cids for x in xcats]
        if len(tks) != 2:
            # A single concatenated message: passing several comma-separated
            # strings to the exception would make it render as a tuple.
            raise ValueError(
                "The combination of `cids` & `xcats` must yield two tickers. "
                f"Found {len(tks)} tickers: {tks}, "
                f"from `cids` {cids} and `xcats` {xcats}"
            )
        if not set(tks).issubset(set(found_tickers)):
            raise ValueError(
                "All combinations of `cids` & `xcats` (i.e. tickers) specified must "
                "be in `df`. "
                f"Missing tickers: {set(tks) - set(found_tickers)}"
            )

    if not isinstance(freq, str):
        raise TypeError("`freq` must be a string")

    if not isinstance(agg, str):
        raise TypeError("`agg` must be a string")

    if not isinstance(add_constant, bool):
        raise TypeError("`add_constant` must be a boolean")

    return True


if __name__ == "__main__":
    # Example 1: one cross-section and two categories, tickers formed from
    # `cids` and `xcats`.
    single_cid = ["AUD"]
    two_xcats = ["FX", "EQ"]
    granger_causality_test(
        df=make_test_df(cids=single_cid, xcats=two_xcats),
        cids=single_cid,
        xcats=two_xcats,
    )

    # Example 2: two cross-sections and one category, tickers given explicitly
    # (AUD_FX, CAD_FX).
    demo_df = make_test_df(cids=["AUD", "CAD"], xcats="FX")
    result = granger_causality_test(df=demo_df, tickers=["AUD_FX", "CAD_FX"])

    print(result)