# Source code for macrosynergy.learning.forecasting.neighbors.nearest_neighbors

import numpy as np
import pandas as pd

from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.multiclass import check_classification_targets
class KNNClassifier(ClassifierMixin, BaseEstimator):
    """
    Nearest neighbors classifier.

    Parameters
    ----------
    n_neighbors : int, float, or str
        Number of neighbors to use. If int, the number of neighbors to use. If
        float, the fraction of the number of training samples to use (strictly
        between 0 and 1). If "sqrt", the square root of the number of training
        samples is used.
    weights : str
        Weight function used to aggregate neighbors. Possible values are
        "uniform" and "distance".

    Notes
    -----
    The class is a wrapper around the KNeighborsClassifier from scikit-learn.
    It has been implemented to allow for the use of a fraction of the number of
    samples as the number of neighbors to use. In addition, the square root of
    the number of samples is a common rule of thumb for the number of neighbors
    to use - we wanted to allow for this option in cross-validation.

    Attributes that are not found on the wrapper itself are looked up on the
    fitted scikit-learn model (see ``__getattr__``).
    """

    def __init__(self, n_neighbors="sqrt", weights="uniform"):
        """
        Validate and store the hyperparameters.

        Raises
        ------
        TypeError
            If `n_neighbors` or `weights` has an unsupported type.
        ValueError
            If `n_neighbors` or `weights` has an unsupported value.
        """
        self._check_init_params(n_neighbors, weights)

        self.n_neighbors = n_neighbors
        self.weights = weights

        # `knn_` holds the wrapped scikit-learn model once `fit` has run.
        self.knn_ = None
        # Default labels before fitting; `fit` replaces them with the labels
        # actually seen in the training target.
        self.classes_ = [-1, 1]

    def fit(self, X, y):
        """
        Fit method.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input feature matrix.
        y : pd.Series or np.ndarray
            The target variable.

        Returns
        -------
        self
            The fitted model.
        """
        self._check_fit_params(X, y)

        n_samples = len(X)
        if isinstance(self.n_neighbors, str) and self.n_neighbors == "sqrt":
            n = max(1, int(np.sqrt(n_samples)))
        elif isinstance(self.n_neighbors, float):
            # A small fraction of a small sample truncates to zero, which the
            # underlying classifier rejects; always use at least one neighbor.
            n = max(1, int(self.n_neighbors * n_samples))
        else:
            n = self.n_neighbors

        self.knn_ = KNeighborsClassifier(n_neighbors=n, weights=self.weights).fit(X, y)
        # Keep the advertised labels aligned with the columns returned by
        # `predict_proba`, whatever labels the training target contained.
        self.classes_ = self.knn_.classes_

        return self

    def predict(self, X):
        """
        Predict method.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input feature matrix.

        Returns
        -------
        np.ndarray
            The predicted values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been fitted yet.
        """
        self._check_predict_params(X)
        self._check_is_fitted()

        return self.knn_.predict(X)

    def predict_proba(self, X):
        """
        Predict probability method.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input feature matrix.

        Returns
        -------
        np.ndarray
            The predicted probabilities.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been fitted yet.
        """
        self._check_predict_params(X)
        self._check_is_fitted()

        return self.knn_.predict_proba(X)

    def __getattr__(self, attr):
        """
        Fall back to the wrapped scikit-learn model for unknown attributes.

        Python only calls this method when normal attribute lookup has failed.

        Parameters
        ----------
        attr : str
            The attribute to return.

        Returns
        -------
        Any
            The attribute, taken from the fitted underlying model.

        Raises
        ------
        AttributeError
            If there is no fitted model or it does not have the attribute.
        """
        # Read `knn_` from the instance dict rather than via `self.knn_`: if
        # `knn_` is missing (e.g. an instance created with `__new__`), normal
        # access would re-enter this method and recurse without end.
        knn = self.__dict__.get("knn_")
        if knn is not None:
            try:
                return getattr(knn, attr)
            except AttributeError:
                pass

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        ) from None

    def _check_is_fitted(self):
        """
        Raise a NotFittedError if `fit` has not been called yet.
        """
        if self.knn_ is None:
            # NotFittedError derives from both ValueError and AttributeError,
            # so callers that caught the previous AttributeError still work.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. Call "
                "'fit' with appropriate arguments before using this estimator."
            )

    def _check_init_params(self, n_neighbors, weights):
        """
        Check the parameters passed to the __init__ method.
        """
        # n_neighbors. NumPy integer scalars are accepted alongside built-in
        # ints; NumPy floats already subclass the built-in float.
        integer_types = (int, np.integer)
        if not isinstance(n_neighbors, integer_types + (float, str)):
            raise TypeError("n_neighbors must be an int, float, or str")
        if isinstance(n_neighbors, str) and n_neighbors != "sqrt":
            raise ValueError('n_neighbors must be "sqrt" if it is a str')
        if isinstance(n_neighbors, float) and not (0 < n_neighbors < 1):
            raise ValueError("n_neighbors must be between 0 and 1 if it is a float")
        if isinstance(n_neighbors, integer_types) and n_neighbors <= 0:
            raise ValueError("n_neighbors must be greater than 0 if it is an int")

        # weights
        if not isinstance(weights, str):
            raise TypeError("weights must be a str")
        if weights not in ["uniform", "distance"]:
            raise ValueError('weights must be "uniform" or "distance"')

    @staticmethod
    def _check_panel_index(index, name):
        """
        Check that a pandas index is a two-level (string, datetime) multi-index.

        Parameters
        ----------
        index : pd.Index
            The index to validate.
        name : str
            Name of the object owning the index, used in error messages.
        """
        if not isinstance(index, pd.MultiIndex):
            raise ValueError(f"{name} must have a multi-index")
        if index.nlevels != 2:
            raise ValueError(f"{name} must have a multi-index with two levels")
        if not index.get_level_values(0).dtype == "object":
            raise TypeError(f"The outer index of {name} must be strings.")
        if not index.get_level_values(1).dtype == "datetime64[ns]":
            raise TypeError(f"The inner index of {name} must be datetime.date.")

    def _check_feature_values(self, X):
        """
        Check the structure of a feature matrix whose type is already validated.
        """
        if isinstance(X, pd.DataFrame):
            self._check_panel_index(X.index, "X")
        if isinstance(X, np.ndarray):
            if X.ndim != 2:
                raise ValueError(
                    "When the input feature matrix for nearest neighbor forecasts is a "
                    "numpy array, it must have two dimensions."
                )

    def _check_fit_params(self, X, y):
        """
        Check the parameters passed to the fit method.
        """
        # Type checks
        if not isinstance(X, (pd.DataFrame, np.ndarray)):
            raise TypeError("X must be a pd.DataFrame or np.ndarray")
        if not isinstance(y, (pd.Series, np.ndarray)):
            raise TypeError("y must be a pd.Series or np.ndarray")

        # Value checks
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of samples")
        if isinstance(X, pd.DataFrame) and isinstance(y, pd.Series):
            if not X.index.equals(y.index):
                raise ValueError("X and y must have the same index")

        # Value checks for X
        self._check_feature_values(X)

        # Value checks for y
        if isinstance(y, pd.Series):
            self._check_panel_index(y.index, "y")
        if isinstance(y, np.ndarray):
            if y.ndim != 1:
                raise ValueError(
                    "When the target variable for nearest neighbor forecasts is a numpy "
                    "array, it must have one dimension."
                )

        check_classification_targets(y)

    def _check_predict_params(self, X):
        """
        Check the parameters passed to the predict method.
        """
        # Type checks
        if not isinstance(X, (pd.DataFrame, np.ndarray)):
            raise TypeError("X must be a pd.DataFrame or np.ndarray")

        # Value checks for X
        self._check_feature_values(X)
if __name__ == "__main__":
    # Demo: fit the classifier on a simulated panel and print its in-sample
    # predictions.
    from macrosynergy.management.simulate import make_qdf
    import macrosynergy.management as msm

    # Randomly generate an unbalanced panel dataset, multi-indexed by cross-section and
    # real_date
    cids = ["AUD", "CAD", "GBP", "USD"]
    xcats = ["XR", "CRY", "GROWTH", "INFL"]
    cols = ["earliest", "latest", "mean_add", "sd_mult", "ar_coef", "back_coef"]

    df_cids = pd.DataFrame(
        index=cids, columns=["earliest", "latest", "mean_add", "sd_mult"]
    )
    df_cids.loc["AUD"] = ["2002-01-01", "2020-12-31", 0, 1]
    df_cids.loc["CAD"] = ["2003-01-01", "2020-12-31", 0, 1]
    df_cids.loc["GBP"] = ["2000-01-01", "2020-12-31", 0, 1]
    df_cids.loc["USD"] = ["2000-01-01", "2020-12-31", 0, 1]

    df_xcats = pd.DataFrame(index=xcats, columns=cols)
    df_xcats.loc["XR"] = ["2000-01-01", "2020-12-31", 0, 1, 0, 3]
    df_xcats.loc["CRY"] = ["2000-01-01", "2020-12-31", 0, 1, 0, 0]
    df_xcats.loc["GROWTH"] = ["2000-01-01", "2020-12-31", 0, 1, -0.9, 0]
    df_xcats.loc["INFL"] = ["2000-01-01", "2020-12-31", 0, 1, 0.8, 0]

    dfd = make_qdf(df_cids, df_xcats, back_ar=0.75)
    dfd["grading"] = np.ones(dfd.shape[0])
    black = {"GBP": ["2009-01-01", "2012-06-30"], "CAD": ["2018-01-01", "2100-01-01"]}
    dfd = msm.reduce_df(df=dfd, cids=cids, xcats=xcats, blacklist=black)

    # Reshape to wide format: one row per (cid, real_date), one column per xcat.
    dfd = dfd.pivot(index=["cid", "real_date"], columns="xcat", values="value")
    X = dfd.drop(columns=["XR"])
    # Classification target: the sign of the "XR" column.
    y = np.sign(dfd["XR"])

    # Fit nearest neighbors classifier
    knn = KNNClassifier(n_neighbors=0.1)
    knn.fit(X, y)
    print(knn.predict(X))