# Source code for macrosynergy.panel.granger_causality_test

"""
Run Granger Causality Test on a standardized quantamental dataframe.  **NOTE: This
module is under development, and is not yet ready for production use.**
"""

import warnings
from typing import Any, Dict, List, Optional, Union

import pandas as pd
import statsmodels
from packaging import version
from statsmodels.tsa.stattools import grangercausalitytests

from macrosynergy.management.simulate import make_test_df
from macrosynergy.management.types import QuantamentalDataFrame
from macrosynergy.management.utils import (
    downsample_df_on_real_date,
    is_valid_iso_date,
    qdf_to_ticker_df,
    reduce_df_by_ticker,
)

import logging

logger = logging.getLogger(__name__)


def granger_causality_test(
    df: pd.DataFrame,
    tickers: Optional[List[str]] = None,
    cids: Optional[Union[str, List[str]]] = None,
    xcats: Optional[Union[str, List[str]]] = None,
    max_lag: Union[int, List[int]] = 4,
    add_constant: bool = False,
    freq: str = "M",
    agg: str = "mean",
    start: Optional[str] = None,
    end: Optional[str] = None,
    metric: str = "value",
) -> Dict[Any, Any]:
    """
    Run Granger Causality Test on a standardized quantamental dataframe.

    Since the Granger Causality Test is a pairwise test, only two tickers are
    permitted. Tickers may be specified directly using the `tickers` parameter, or
    formed by providing one cross-section (`cids`) and two extended categories
    (`xcats`), or two cross-sections (`cids`) and one extended category (`xcats`).
    Tickers are constructed from the specified `cids` and `xcats` in the provided
    order.

    The order of the tickers is preserved: the test evaluates whether the time
    series of the first ticker Granger-causes the time series of the second ticker.

    Parameters
    ----------
    df : pd.DataFrame
        A standardized quantamental dataframe.
    tickers : List[str]
        A list of tickers to run the test on. Exactly two distinct tickers must be
        specified.
    cids : Union[str, List[str]]
        One or two cross-sections to run the test on. If two cross-sections are
        specified, then only one extended category can be specified. If one
        cross-section is specified, then two categories must be specified.
    xcats : Union[str, List[str]]
        One or two extended categories to run the test on. If two categories are
        specified, then only one cross-section can be specified. If one category is
        specified, then two cross-sections must be specified.
    max_lag : Union[int, List[int]]
        If `max_lag` is an integer, then the function computes the test for all lags
        up to `max_lag`. If `max_lag` is a list of integers, then the function
        computes the test only for lags specified in the list.
    add_constant : bool
        Whether to add a constant to the regression. The value is forwarded to the
        underlying statsmodels test. Default is False.
    freq : str
        The frequency to downsample the data to. Must be one of "D", "W", "M", "Q",
        "A". Default is "M".
    agg : str
        The aggregation method to use when downsampling the data. Must be one of
        "mean" (default), "median", "min", "max", "first" or "last".
    start : str
        The start date of the data. Must be a valid ISO date. If not specified, the
        earliest date in `df` is used.
    end : str
        The end date of the data. Must be a valid ISO date. If not specified, the
        latest date in `df` is used.
    metric : str
        The metric to run the test on. Must be a column in `df`. Default is "value".

    Raises
    ------
    TypeError
        If any of the inputs are of the wrong type.
    ValueError
        If any of the input values are invalid, if the two requested series are not
        both available after reducing and downsampling `df`, or if no rows remain
        once rows containing NaNs are dropped.

    Returns
    -------
    Dict[Any, Any]
        A dictionary containing the results of the Granger Causality Test. The keys
        are the lags and the values are the results of the test.
    """
    # Validate all arguments up front; this raises on the first problem found.
    _type_checks(
        df=df,
        tickers=tickers,
        cids=cids,
        xcats=xcats,
        max_lag=max_lag,
        add_constant=add_constant,
        start=start,
        end=end,
        freq=freq,
        agg=agg,
        metric=metric,
    )
    # NOTE: value checks for `freq` and `agg` are implicitly done in the downstream
    # functions.

    # Re-wrap the input before transforming it (intended to avoid side effects on
    # the caller's dataframe).
    df = QuantamentalDataFrame(df)

    # Construct tickers from `cids` and `xcats` if `tickers` is not specified.
    tickers = _get_tickers(tickers=tickers, cids=cids, xcats=xcats)

    # Keep only the requested tickers and date range.
    df = reduce_df_by_ticker(df=df, ticks=tickers, start=start, end=end)

    # Downsample to the requested frequency.
    df = downsample_df_on_real_date(
        df=df,
        groupby_columns=["cid", "xcat"],
        freq=freq.upper(),
        agg=agg.lower(),
    )

    # Pivot to one column per ticker.
    df_wide: pd.DataFrame = QuantamentalDataFrame(df).to_wide(value_column=metric)

    # De-duplicate while keeping the caller's order: first = cause, second = effect.
    pair: List[str] = list(dict.fromkeys(tickers))
    if len(pair) != 2:
        raise ValueError(
            f"Exactly two distinct tickers are required, found {len(pair)}: {pair}."
        )
    cause, effect = pair

    # Explicit exception rather than `assert`, which is stripped under `python -O`.
    absent: List[str] = [t for t in pair if t not in df_wide.columns]
    if absent or len(df_wide.columns) != 2:
        raise ValueError(
            "Expected exactly two series after reducing and downsampling `df`, "
            f"found columns {list(df_wide.columns)}; missing tickers: {absent}."
        )

    # statsmodels tests whether the series in the SECOND column Granger-causes the
    # series in the FIRST column. Order the columns explicitly so that the
    # documented contract (first ticker causes second ticker) holds regardless of
    # the column order produced by the pivot.
    df_wide = df_wide[[effect, cause]]

    logger.info(
        "Running Granger Causality Test: Testing whether %s Granger causes %s",
        cause,
        effect,
    )

    # NOTE: Since no NaNs are allowed in the input data, we must drop them here.
    # This may yield unexpected/unreliable results for tickers with large periods
    # of missing data.
    df_wide = df_wide.dropna(how="any", axis=0)
    if df_wide.empty:
        raise ValueError(
            "The input data contains only NANs. "
            "Please check the input data for missing values or "
            "consider using a different downsampling frequency/date range."
        )

    # Forward `add_constant`; otherwise the backend default silently overrides the
    # caller's choice.
    return _granger_causality_backend(
        data=df_wide,
        max_lag=max_lag,
        add_constant=add_constant,
    )


def _statsmodels_compatibility_wrapper(
    x: Any = None, maxlag: Any = None, addconst: Any = None, verbose: Any = None
) -> Any:
    """
    Wrapper function to handle compatibility issues with different versions of
    statsmodels.

    Parameters
    ----------
    x : Any
        Two-column data passed through to `grangercausalitytests`.
    maxlag : Any
        Lag specification passed through to `grangercausalitytests`.
    addconst : Any
        Constant flag passed through to `grangercausalitytests`.
    verbose : Any
        Accepted for signature compatibility only; it is not used. For statsmodels
        versions below 0.15.0 the fourth positional argument is always `False`.

    Returns
    -------
    Any
        Whatever `grangercausalitytests` returns.
    """
    if version.parse(statsmodels.__version__) < version.parse("0.15.0"):
        # Older releases take a fourth positional argument; FutureWarnings raised
        # by that call are silenced (presumably a deprecation notice - verify
        # against the statsmodels changelog).
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            return grangercausalitytests(x, maxlag, addconst, False)

    return grangercausalitytests(x, maxlag, addconst)


def _granger_causality_backend(
    data: pd.DataFrame, max_lag: Union[int, List[int]], add_constant: bool = True
) -> Dict[Any, Any]:
    """
    Validate the arguments and run the statsmodels Granger causality test.

    Parameters
    ----------
    data : pd.DataFrame
        A dataframe with exactly two columns.
    max_lag : Union[int, List[int]]
        An integer, or a non-empty list of integers.
    add_constant : bool
        Whether to add a constant to the regression. Default is True.

    Raises
    ------
    TypeError
        If `max_lag` or `add_constant` are of the wrong type.
    ValueError
        If `data` does not have exactly two columns, or `max_lag` is an empty list.

    Returns
    -------
    Dict[Any, Any]
        The result of `_statsmodels_compatibility_wrapper`.
    """
    # Explicit exceptions instead of `assert`, which is stripped under `python -O`.
    if len(data.columns) != 2:
        raise ValueError("`data` must have only two columns")

    if isinstance(max_lag, list):
        if not all(isinstance(lag, int) for lag in max_lag):
            raise TypeError("`max_lag` must be an integer or a list of integers")
        if not max_lag:
            raise ValueError("`max_lag` must not be an empty list")
    elif not isinstance(max_lag, int):
        raise TypeError("`max_lag` must be an integer or a list of integers")

    if not isinstance(add_constant, bool):
        raise TypeError("`add_constant` must be a boolean")

    return _statsmodels_compatibility_wrapper(
        x=data,
        maxlag=max_lag,
        addconst=add_constant,
    )


def _get_tickers(
    tickers: Optional[List[str]] = None,
    cids: Optional[List[str]] = None,
    xcats: Optional[List[str]] = None,
) -> List[str]:
    """
    Simply returns the tickers if they are specified. If they are not specified,
    then the function forms the list of tickers from the `cids` and `xcats`
    parameters such that the order of the formed tickers is preserved.

    Parameters
    ----------
    tickers : List[str]
        A list of tickers.
    cids : Union[str, List[str]]
        One or two cross-sections.
    xcats : Union[str, List[str]]
        One or two extended categories.

    Returns
    -------
    List[str]
        `tickers` unchanged when given, otherwise every `<cid>_<xcat>` combination
        with `cids` varying slowest.
    """
    if tickers is not None:
        return tickers

    # A bare string stands for a single-element list.
    cid_list: List[str] = [cids] if isinstance(cids, str) else cids
    xcat_list: List[str] = [xcats] if isinstance(xcats, str) else xcats
    return [f"{c}_{x}" for c in cid_list for x in xcat_list]


def _type_checks(
    df: pd.DataFrame,
    tickers: Optional[List[str]],
    cids: Optional[List[str]],
    xcats: Optional[List[str]],
    max_lag: Union[int, List[int]],
    add_constant: bool,
    start: Optional[str],
    end: Optional[str],
    freq: str,
    agg: str,
    metric: str,
) -> bool:
    """
    Does type checks on the inputs to `granger_causality_test`. All inputs are
    checked for type and value errors.

    Raises
    ------
    TypeError
        If any of the inputs are of the wrong type.
    ValueError
        If any of the input values are invalid, including the case where neither
        `tickers` nor the `cids`/`xcats` pair is specified.

    Returns
    -------
    bool
        True if all type checks pass.
    """
    if not isinstance(df, QuantamentalDataFrame):
        raise TypeError("df must be a standardized quantamental dataframe")

    if not isinstance(metric, str):
        raise TypeError("`metric` must be a string")
    if metric not in df.columns:
        raise ValueError(f"`metric` '{metric}' not found in `df`")

    if isinstance(max_lag, list):
        if not all(isinstance(lag, int) for lag in max_lag):
            raise TypeError("`max_lag` must be an integer or a list of integers")
        # An empty list passes the element check above but cannot be tested.
        if not max_lag:
            raise ValueError("`max_lag` must not be an empty list")
    elif not isinstance(max_lag, int):
        raise TypeError("`max_lag` must be an integer or a list of integers")

    for dt, nm in zip([start, end], ["start", "end"]):
        if dt is not None and not is_valid_iso_date(dt):
            raise ValueError(f"{nm} must be a valid ISO date")

    # A bare string stands for a single-element list.
    if isinstance(cids, str):
        cids = [cids]
    if isinstance(xcats, str):
        xcats = [xcats]

    if bool(cids) ^ bool(xcats):
        raise ValueError("`cids` and `xcats` must be specified together")

    bcidxcats: bool = bool(cids) and bool(xcats)
    if bool(tickers) and bcidxcats:
        raise ValueError(
            "`tickers` cannot be specified if `cids` & `xcats` are specified"
        )

    found_tickers: List[str] = QuantamentalDataFrame(df).list_tickers()

    if bool(tickers):
        # Check the container and element types first, so that e.g. a bare string
        # is not silently treated as a collection of one-character tickers.
        if not isinstance(tickers, (list, tuple)) or not all(
            isinstance(t, str) for t in tickers
        ):
            raise TypeError("`tickers` must be a list of strings")
        if len(set(tickers)) != 2:
            raise ValueError("Only two tickers can be specified in `tickers`")
        missing_tickers = set(tickers) - set(found_tickers)
        if missing_tickers:
            raise ValueError(
                "All tickers specified in `tickers` must be in `df`. "
                f"Missing tickers: {missing_tickers}"
            )
    elif not bcidxcats:
        # Explicit exception rather than `assert`, which is stripped under -O.
        raise ValueError(
            "Failed to resolve tickers: either `tickers`, or both `cids` and "
            "`xcats`, must be specified"
        )
    else:
        for lx, nm in zip([cids, xcats], ["cid", "xcat"]):
            if not (isinstance(lx, list) and all(isinstance(x, str) for x in lx)):
                raise TypeError(f"`{nm}` must be a list of strings")
            missing_values = set(lx) - set(df[nm])
            if missing_values:
                raise ValueError(
                    f"All '{nm}s' in `{nm}` specified must be in `df`. "
                    f"Missing {nm}s: {missing_values}."
                )

        tks: List[str] = [f"{c}_{x}" for c in cids for x in xcats]
        # Require two distinct tickers; the message is one string, not a tuple.
        if len(tks) != 2 or len(set(tks)) != 2:
            raise ValueError(
                "The combination of `cids` & `xcats` must yield two tickers. "
                f"Found {len(tks)} tickers: {tks}, "
                f"from `cids` {cids} and `xcats` {xcats}"
            )

        missing_tks = set(tks) - set(found_tickers)
        if missing_tks:
            raise ValueError(
                "All combinations of `cids` & `xcats` (i.e. tickers) specified must "
                "be in `df`. "
                f"Missing tickers: {missing_tks}"
            )

    if not isinstance(freq, str):
        raise TypeError("`freq` must be a string")

    if not isinstance(agg, str):
        raise TypeError("`agg` must be a string")

    if not isinstance(add_constant, bool):
        raise TypeError("`add_constant` must be a boolean")

    return True


if __name__ == "__main__":
    # Example 1: one cross-section, two categories (tickers AUD_FX, AUD_EQ).
    cids: List[str] = ["AUD"]
    xcats: List[str] = ["FX", "EQ"]
    df: pd.DataFrame = make_test_df(
        cids=cids,
        xcats=xcats,
    )

    gct: Dict[Any, Any] = granger_causality_test(
        df=df,
        cids=cids,
        xcats=xcats,
    )

    # Example 2: explicit tickers AUD_FX, CAD_FX.
    cids: List[str] = ["AUD", "CAD"]
    xcats: str = "FX"
    df: pd.DataFrame = make_test_df(
        cids=cids,
        xcats=xcats,
    )

    gct: Dict[Any, Any] = granger_causality_test(
        df=df,
        tickers=["AUD_FX", "CAD_FX"],
    )

    print(gct)