Source code for nflwin.preprocessing

"""Tools to get raw data ready for modeling."""
from __future__ import print_function, division

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import NotFittedError

class ComputeElapsedTime(BaseEstimator):
    """Add a column holding the seconds elapsed since the start of the game.

    The elapsed time is the offset associated with the current quarter
    (looked up in ``quarter_to_second_mapping``) plus the time already
    elapsed within that quarter.

    Parameters
    ----------
    quarter_colname : string
        Which column indicates what quarter it is.
    quarter_time_colname : string
        Which column indicates how much time has elapsed in the current quarter.
    quarter_to_second_mapping : dict (default=``{"Q1": 0, "Q2": 900, "Q3": 1800,
                                "Q4": 2700, "OT": 3600, "OT2": 4500, "OT3": 5400}``)
        What mapping to use between the string values in the quarter column
        and the seconds they correspond to. Mostly useful if your data had
        quarters listed as something like "Quarter 1" or "q1" instead of the
        values from ``nfldb``.
    total_time_colname : string (default="total_elapsed_time")
        What column name to store the total elapsed time under.
    copy : boolean (default=True)
        If ``True``, ``transform`` adds the new column to a copy of the input
        DataFrame. If ``False``, the column is added to the input itself.
    """

    def __init__(self, quarter_colname, quarter_time_colname,
                 quarter_to_second_mapping={"Q1": 0, "Q2": 900, "Q3": 1800,
                                            "Q4": 2700, "OT": 3600,
                                            "OT2": 4500, "OT3": 5400},
                 total_time_colname="total_elapsed_time", copy=True):
        # Parameters are stored untouched, as scikit-learn estimators expect.
        self.copy = copy
        self.total_time_colname = total_time_colname
        self.quarter_to_second_mapping = quarter_to_second_mapping
        self.quarter_time_colname = quarter_time_colname
        self.quarter_colname = quarter_colname

    def fit(self, X, y=None):
        """Do nothing; present for compatibility with Scikit-learn's ``Pipeline``.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """Create the new column.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features + 1)
            The input DataFrame, with the new column added.

        Raises
        ------
        KeyError
            If ``quarter_colname`` or ``quarter_time_colname`` don't exist, or
            if ``total_time_colname`` **does** exist.
        TypeError
            If the total time elapsed is not a numeric column, which typically
            indicates that the mapping did not apply to every row.
        """
        existing_columns = X.columns
        if self.quarter_colname not in existing_columns:
            raise KeyError("ComputeElapsedTime: quarter_colname {0} does not exist in dataset."
                           .format(self.quarter_colname))
        if self.quarter_time_colname not in existing_columns:
            raise KeyError("ComputeElapsedTime: quarter_time_colname {0} does not exist in dataset."
                           .format(self.quarter_time_colname))
        if self.total_time_colname in existing_columns:
            raise KeyError("ComputeElapsedTime: total_time_colname {0} already exists in dataset."
                           .format(self.total_time_colname))

        frame = X.copy() if self.copy else X

        try:
            # Quarter labels left unmapped stay as strings, so the addition
            # below fails with a TypeError that is re-raised with a hint.
            quarter_offsets = frame[self.quarter_colname].replace(
                self.quarter_to_second_mapping)
            total_elapsed = quarter_offsets + frame[self.quarter_time_colname]
        except TypeError:
            raise TypeError("ComputeElapsedTime: Total time elapsed not numeric. Check your mapping from quarter name to time.")

        frame[self.total_time_colname] = total_elapsed
        return frame
class ComputeIfOffenseIsHome(BaseEstimator):
    """Determine if the team currently with possession is the home team.

    Parameters
    ----------
    offense_team_colname : string
        Which column indicates what team was on offense.
    home_team_colname : string
        Which column indicates what team was the home team.
    offense_home_team_colname : string (default="is_offense_home")
        What column to store whether or not the offense was the home team.
    copy : boolean (default=True)
        If ``True``, ``transform`` adds the new column to a copy of the input
        DataFrame. If ``False``, the column is added to the input itself.
    """

    def __init__(self, offense_team_colname, home_team_colname,
                 offense_home_team_colname="is_offense_home", copy=True):
        self.offense_team_colname = offense_team_colname
        self.home_team_colname = home_team_colname
        self.offense_home_team_colname = offense_home_team_colname
        self.copy = copy

    def fit(self, X, y=None):
        """Do nothing; present for compatibility with Scikit-learn's ``Pipeline``.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """Create the new column.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features + 1)
            The input DataFrame, with the new column added.

        Raises
        ------
        KeyError
            If ``offense_team_colname`` or ``home_team_colname`` don't exist, or
            if ``offense_home_team_colname`` **does** exist.
        """
        # The messages previously named a non-existent "ComputeIfOffenseWon"
        # class; they now name this class so failures can be traced.
        if self.home_team_colname not in X.columns:
            raise KeyError("ComputeIfOffenseIsHome: home_team_colname {0} does not exist in dataset."
                           .format(self.home_team_colname))
        if self.offense_team_colname not in X.columns:
            raise KeyError("ComputeIfOffenseIsHome: offense_team_colname {0} does not exist in dataset."
                           .format(self.offense_team_colname))
        if self.offense_home_team_colname in X.columns:
            raise KeyError("ComputeIfOffenseIsHome: offense_home_team_colname {0} already exists in dataset."
                           .format(self.offense_home_team_colname))

        if self.copy:
            X = X.copy()

        # Element-wise comparison: True wherever the offense is the home team.
        X[self.offense_home_team_colname] = (X[self.home_team_colname] ==
                                             X[self.offense_team_colname])
        return X
class MapToInt(BaseEstimator):
    """Map a column of values to integers.

    Mapping to integer is nice if you know a column only has a few specific
    values in it, but you need to convert it to integers before one-hot
    encoding it.

    Parameters
    ----------
    colname : string
        The name of the column to perform the mapping on.
    copy : boolean (default=True)
        If ``False``, apply the mapping in-place.

    Attributes
    ----------
    mapping : dict
        Keys are the unique non-missing values of the column, values are the
        integers those values will be mapped to. ``None`` until ``fit`` has
        been called.

    Note
    ----
    The ``transform`` method DOES NOT CHECK to see if the input DataFrame
    only contains values in ``mapping``. Any values not in ``mapping`` will be
    left alone, which can cause subtle bugs if you're not careful.
    """

    def __init__(self, colname, copy=True):
        self.colname = colname
        self.copy = copy
        self.mapping = None

    def fit(self, X, y=None):
        """Find all unique values and construct the mapping.

        Each unique value is assigned its position in ``X[colname].unique()``.
        Missing values (anything ``pandas.isnull`` reports as null) are left
        out of the mapping, so ``transform`` leaves them untouched.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.

        Raises
        ------
        KeyError
            If ``colname`` is not in ``X``.
        """
        if self.colname not in X.columns:
            raise KeyError("MapToInt: Required column {0} "
                           "not present in data".format(self.colname))

        unique_values = X[self.colname].unique()
        # Filter nulls with pd.isnull rather than ``del mapping[np.nan]``:
        # NaN never compares equal to itself, so a dict lookup only finds it
        # when the key is the very same object as ``np.nan``. Positions are
        # kept as assigned by enumerate so the integers given to non-missing
        # values are unchanged.
        self.mapping = {
            value: index
            for index, value in enumerate(unique_values)
            if not pd.isnull(value)
        }
        return self

    def transform(self, X, y=None):
        """Apply the mapping to the data.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            The input DataFrame, with the mapping applied.

        Raises
        ------
        NotFittedError
            If ``transform`` is called before ``fit``.
        KeyError
            If ``colname`` is not in ``X``.
        """
        # Compare against None: an empty mapping (e.g. a column with only
        # missing values) is still a fitted state.
        if self.mapping is None:
            raise NotFittedError("MapToInt: Must fit before transform.")

        if self.colname not in X.columns:
            raise KeyError("MapToInt: Required column {0} "
                           "not present in data".format(self.colname))

        if self.copy:
            X = X.copy()

        if self.mapping:
            # Assign the result back instead of calling replace(inplace=True)
            # on ``X[colname]``; the chained in-place form acts on a temporary
            # object and is not guaranteed to update ``X``.
            X[self.colname] = X[self.colname].replace(self.mapping)
        return X
class OneHotEncoderFromDataFrame(BaseEstimator):
    """One-hot encode a DataFrame.

    This cleaner wraps the standard scikit-learn OneHotEncoder, handling the
    transfer between column name and column index.

    Parameters
    ----------
    categorical_feature_names : "all" or array of column names.
        Specify what features are treated as categorical.

        * "all" (default): All features are treated as categorical.
        * array of column names: Array of categorical feature names.
    dtype : number type, default=float.
        Desired dtype of output.
    handle_unknown : str, "error" (default) or "ignore".
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform.
    copy : boolean (default=True)
        If ``False``, apply the encoding in-place.
    """

    # ``dtype`` and ``handle_unknown`` are properties so that changing them on
    # this object also updates the wrapped OneHotEncoder.
    @property
    def dtype(self):
        return self._dtype

    @dtype.setter
    def dtype(self, dtype):
        self._dtype = dtype
        self.onehot.dtype = self._dtype

    @property
    def handle_unknown(self):
        return self._handle_unknown

    @handle_unknown.setter
    def handle_unknown(self, handle_unknown):
        self._handle_unknown = handle_unknown
        self.onehot.handle_unknown = self._handle_unknown

    # The default dtype is the builtin ``float``: ``np.float`` was only an
    # alias for it and no longer exists in current NumPy releases.
    def __init__(self, categorical_feature_names="all", dtype=float,
                 handle_unknown="error", copy=True):
        # The encoder must exist before ``dtype``/``handle_unknown`` are
        # assigned, because their setters forward the value to it.
        # "all" here is correct because the DataFrame is subset before being
        # handed to the encoder.
        # NOTE(review): ``n_values``/``categorical_features`` (and ``sparse``)
        # are not accepted by newer scikit-learn releases -- confirm which
        # versions this module needs to support.
        self.onehot = OneHotEncoder(sparse=False, n_values="auto",
                                    categorical_features="all")
        self.categorical_feature_names = categorical_feature_names
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.copy = copy

    def fit(self, X, y=None):
        """Convert the column names to indices, then compute the one hot encoding.

        If ``categorical_feature_names`` is ``"all"`` it is replaced by the
        columns of ``X``, which ``transform`` then uses.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        names = self.categorical_feature_names
        # Only compare scalars against "all". Comparing an array/Index of
        # column names is element-wise and its truth value is ambiguous,
        # which made ``fit`` raise ValueError (including on a second call,
        # after "all" had been replaced by ``X.columns``).
        if np.ndim(names) == 0 and names == "all":
            self.categorical_feature_names = X.columns

        # Get all columns that need to be encoded:
        data_to_encode = X[self.categorical_feature_names]

        self.onehot.fit(data_to_encode)
        return self

    def transform(self, X, y=None):
        """Apply the encoding to the data.

        The categorical columns are dropped and replaced by columns named
        ``onehot_col1``, ``onehot_col2``, ...

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of new features)
            The input DataFrame, with the encoding applied.
        """
        if self.copy:
            X = X.copy()

        data_to_transform = X[self.categorical_feature_names]
        transformed_data = self.onehot.transform(data_to_transform)

        # TODO (AndrewRook): Find good column names for the encoded columns.
        colnames = ["onehot_col{0}".format(i + 1)
                    for i in range(transformed_data.shape[1])]

        # Create a dataframe from the transformed columns (setting the index
        # is critical for merging with data containing non-standard indexes)
        transformed_df = pd.DataFrame(transformed_data, columns=colnames,
                                      index=X.index)

        X.drop(self.categorical_feature_names, axis=1, inplace=True)
        X[transformed_df.columns] = transformed_df
        return X
class CreateScoreDifferential(BaseEstimator):
    """Turn home and away scores into a differential (offense - defense).

    Parameters
    ----------
    home_score_colname : string
        The name of the column containing the score of the home team.
    away_score_colname : string
        The name of the column containing the score of the away team.
    offense_home_colname : string
        The name of the column indicating if the offense is home.
    score_differential_colname : string (default=``"score_differential"``)
        The name of column containing the score differential. Must not already
        exist in the DataFrame.
    copy : boolean (default = ``True``)
        If ``False``, add the score differential in place.
    """

    def __init__(self, home_score_colname, away_score_colname,
                 offense_home_colname,
                 score_differential_colname="score_differential", copy=True):
        self.copy = copy
        self.score_differential_colname = score_differential_colname
        self.offense_home_colname = offense_home_colname
        self.away_score_colname = away_score_colname
        self.home_score_colname = home_score_colname

    def fit(self, X, y=None):
        """Do nothing; present for compatibility with Scikit-learn's ``Pipeline``.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """Create the score differential column.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features + 1)
            The input DataFrame, with the score differential column added.

        Raises
        ------
        KeyError
            If any of the three input columns is missing, or if
            ``score_differential_colname`` already exists.
        """
        try:
            home_margin = X[self.home_score_colname] - X[self.away_score_colname]
            # Maps the offense-is-home indicator onto a sign:
            # 1/True -> +1 and 0/False -> -1.
            offense_sign = 2 * X[self.offense_home_colname] - 1
            differential = home_margin * offense_sign
        except KeyError:
            raise KeyError("CreateScoreDifferential: data missing required column. Must "
                           "include columns named {0}, {1}, and {2}".format(self.home_score_colname,
                                                                            self.away_score_colname,
                                                                            self.offense_home_colname))

        if self.score_differential_colname in X.columns:
            raise KeyError("CreateScoreDifferential: column {0} already in DataFrame, and can't "
                           "be used for the score differential".format(self.score_differential_colname))

        frame = X.copy() if self.copy else X
        frame[self.score_differential_colname] = differential
        return frame
class CheckColumnNames(BaseEstimator):
    """Make sure user has the right column names, in the right order.

    This is a useful first step to make sure that nothing is going to break
    downstream, but can also be used effectively to drop columns that are no
    longer necessary.

    Parameters
    ----------
    column_names : ``None``, or list of strings
        A list of column names that need to be present in the scoring data.
        All other columns will be stripped out. The order of the columns will
        be applied to any scoring data as well, in order to handle the fact
        that pandas lets you play fast and loose with column order. If
        ``None``, will obtain every column in the DataFrame passed to the
        ``fit`` method.
    copy : boolean (default=``True``)
        If ``True``, ``transform`` copies the input DataFrame before selecting
        the columns.
    """

    def __init__(self, column_names=None, copy=True):
        self.column_names = column_names
        self.copy = copy
        # Explicit column names need no ``fit``; otherwise ``transform`` must
        # wait until ``fit`` has recorded the columns of its input.
        self.user_specified_columns = self.column_names is not None
        self._fit = self.user_specified_columns

    def fit(self, X, y=None):
        """Grab the column names from a Pandas DataFrame.

        Does nothing if ``column_names`` was given to the constructor.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        if not self.user_specified_columns:
            self.column_names = X.columns
            self._fit = True
        return self

    def transform(self, X, y=None):
        """Apply the column ordering to the data.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, ``len(column_names)``)
            The input DataFrame, properly ordered and with extraneous columns
            dropped

        Raises
        ------
        KeyError
            If the input data frame doesn't have all the columns specified by
            ``column_names``.
        NotFittedError
            If ``transform`` is called before ``fit``.
        """
        if not self._fit:
            raise NotFittedError("CheckColumnNames: Call 'fit' before 'transform'")

        if self.copy:
            X = X.copy()

        try:
            return X[self.column_names]
        except KeyError:
            raise KeyError("CheckColumnNames: DataFrame does not have required columns. "
                           "Must contain at least {0}".format(self.column_names))