Source code for nflwin.preprocessing
"""Tools to get raw data ready for modeling."""
from __future__ import print_function, division
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import NotFittedError
class ComputeElapsedTime(BaseEstimator):
    """Compute the total elapsed time from the start of the game.

    Parameters
    ----------
    quarter_colname : string
        Which column indicates what quarter it is.
    quarter_time_colname : string
        Which column indicates how much time has elapsed in the current quarter.
    quarter_to_second_mapping : dict (default=``{"Q1": 0, "Q2": 900, "Q3": 1800, "Q4": 2700,
                                               "OT": 3600, "OT2": 4500, "OT3": 5400}``)
        What mapping to use between the string values in the quarter column and the seconds they
        correspond to. Mostly useful if your data had quarters listed as something like "Quarter 1"
        or "q1" instead of the values from ``nfldb``.
    total_time_colname : string (default="total_elapsed_time")
        What column name to store the total elapsed time under.
    copy : boolean (default=True)
        If ``True``, add the new column to a copy of the input DataFrame.
        If ``False``, add it to the input DataFrame in place.
    """

    # NOTE: the default mapping below is a single dict shared by every
    # instance. This class only ever reads it (via ``Series.replace``), so the
    # sharing is harmless as long as callers do not mutate it either.
    def __init__(self, quarter_colname, quarter_time_colname,
                 quarter_to_second_mapping={"Q1": 0, "Q2": 900, "Q3": 1800, "Q4": 2700,
                                            "OT": 3600, "OT2": 4500, "OT3": 5400},
                 total_time_colname="total_elapsed_time", copy=True):
        self.quarter_colname = quarter_colname
        self.quarter_time_colname = quarter_time_colname
        self.quarter_to_second_mapping = quarter_to_second_mapping
        self.total_time_colname = total_time_colname
        self.copy = copy

    def fit(self, X, y=None):
        """Do nothing; this transformer has no state to learn.

        Present so the class can be used as a step in Scikit-learn's
        ``Pipeline``, which requires its transformer steps to expose ``fit``.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data. Ignored.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not. Ignored.

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        return self

    def transform(self, X, y=None):
        """Create the new column.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features + 1)
            The input DataFrame, with the new column added.

        Raises
        ------
        KeyError
            If ``quarter_colname`` or ``quarter_time_colname`` don't exist, or
            if ``total_time_colname`` **does** exist.
        TypeError
            If the total time elapsed is not a numeric column, which typically indicates
            that the mapping did not apply to every row.
        """
        if self.quarter_colname not in X.columns:
            raise KeyError("ComputeElapsedTime: quarter_colname {0} does not exist in dataset."
                           .format(self.quarter_colname))
        if self.quarter_time_colname not in X.columns:
            raise KeyError("ComputeElapsedTime: quarter_time_colname {0} does not exist in dataset."
                           .format(self.quarter_time_colname))
        if self.total_time_colname in X.columns:
            raise KeyError("ComputeElapsedTime: total_time_colname {0} already exists in dataset."
                           .format(self.total_time_colname))

        if self.copy:
            X = X.copy()

        try:
            # Seconds at the start of the quarter plus seconds elapsed within
            # it. Quarter labels missing from the mapping are left untouched
            # by ``replace``, so the addition fails with a TypeError.
            quarter_start = X[self.quarter_colname].replace(self.quarter_to_second_mapping)
            time_elapsed = quarter_start + X[self.quarter_time_colname]
        except TypeError:
            raise TypeError("ComputeElapsedTime: Total time elapsed not numeric. Check your mapping from quarter name to time.")

        X[self.total_time_colname] = time_elapsed
        return X
class ComputeIfOffenseIsHome(BaseEstimator):
    """Determine if the team currently with possession is the home team.

    Parameters
    ----------
    offense_team_colname : string
        Which column indicates what team was on offense.
    home_team_colname : string
        Which column indicates what team was the home team.
    offense_home_team_colname : string (default="is_offense_home")
        What column to store whether or not the offense was the home team.
    copy : boolean (default=True)
        If ``True``, add the new column to a copy of the input DataFrame.
        If ``False``, add it to the input DataFrame in place.
    """

    def __init__(self, offense_team_colname,
                 home_team_colname,
                 offense_home_team_colname="is_offense_home",
                 copy=True):
        self.offense_team_colname = offense_team_colname
        self.home_team_colname = home_team_colname
        self.offense_home_team_colname = offense_home_team_colname
        self.copy = copy

    def fit(self, X, y=None):
        """Do nothing; this transformer has no state to learn.

        Present so the class can be used as a step in Scikit-learn's
        ``Pipeline``, which requires its transformer steps to expose ``fit``.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data. Ignored.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not. Ignored.

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        return self

    def transform(self, X, y=None):
        """Create the new column.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features + 1)
            The input DataFrame, with the new column added.

        Raises
        ------
        KeyError
            If ``offense_team_colname`` or ``home_team_colname`` don't exist, or
            if ``offense_home_team_colname`` **does** exist.
        """
        # Error messages carry this class's real name so a failure inside a
        # multi-step pipeline can be traced to the right step.
        if self.home_team_colname not in X.columns:
            raise KeyError("ComputeIfOffenseIsHome: home_team_colname {0} does not exist in dataset."
                           .format(self.home_team_colname))
        if self.offense_team_colname not in X.columns:
            raise KeyError("ComputeIfOffenseIsHome: offense_team_colname {0} does not exist in dataset."
                           .format(self.offense_team_colname))
        if self.offense_home_team_colname in X.columns:
            raise KeyError("ComputeIfOffenseIsHome: offense_home_team_colname {0} already exists in dataset."
                           .format(self.offense_home_team_colname))

        if self.copy:
            X = X.copy()

        # Element-wise comparison: True on rows where the offense is the home team.
        X[self.offense_home_team_colname] = (X[self.home_team_colname] == X[self.offense_team_colname])
        return X
class MapToInt(BaseEstimator):
    """Map a column of values to integers.

    Mapping to integer is nice if you know a column
    only has a few specific values in it, but you need
    to convert it to integers before one-hot encoding it.

    Parameters
    ----------
    colname : string
        The name of the column to perform the mapping on.
    copy : boolean (default=True)
        If ``False``, apply the mapping in-place.

    Attributes
    ----------
    mapping : dict
        Keys are the unique values of the column, values are the
        integers those values will be mapped to. ``None`` until ``fit``
        has been called. NaN is never a key.

    Note
    ----
    The ``transform`` method DOES NOT CHECK to see if the input
    DataFrame only contains values in ``mapping``. Any values not
    in ``mapping`` will be left alone, which can cause subtle bugs
    if you're not careful.
    """

    def __init__(self, colname, copy=True):
        self.colname = colname
        self.copy = copy
        self.mapping = None

    @staticmethod
    def _is_nan(value):
        """Return True if ``value`` is a floating-point NaN."""
        return isinstance(value, (float, np.floating)) and bool(np.isnan(value))

    def fit(self, X, y=None):
        """Find all unique values and construct the mapping.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.

        Raises
        ------
        KeyError
            If ``colname`` is not in ``X``.
        """
        if self.colname not in X.columns:
            raise KeyError("MapToInt: Required column {0} "
                           "not present in data".format(self.colname))

        unique_values = X[self.colname].unique()
        # Each value maps to its position in ``unique_values``. NaN entries
        # are skipped but still consume their position, so the integers given
        # to the remaining values are unaffected by the filtering. NaN is
        # tested by value rather than with ``del mapping[np.nan]`` because NaN
        # never compares equal to itself: a dict lookup only finds it when the
        # key is the very same object as ``np.nan``.
        self.mapping = {value: index
                        for index, value in enumerate(unique_values)
                        if not self._is_nan(value)}
        return self

    def transform(self, X, y=None):
        """Apply the mapping to the data.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            The input DataFrame, with the mapping applied.

        Raises
        ------
        NotFittedError
            If ``transform`` is called before ``fit``.
        KeyError
            If ``colname`` is not in ``X``.
        """
        # ``None`` means "never fitted". An empty dict is a valid fitted state
        # (e.g. the fitted column held only NaN) and must not be rejected.
        if self.mapping is None:
            raise NotFittedError("MapToInt: Must fit before transform.")
        if self.colname not in X.columns:
            raise KeyError("MapToInt: Required column {0} "
                           "not present in data".format(self.colname))

        if self.copy:
            X = X.copy()

        if self.mapping:
            # Assign the result back rather than calling ``replace`` with
            # ``inplace=True`` on ``X[colname]``: that chained form acts on an
            # intermediate Series and is not guaranteed to update ``X``.
            X[self.colname] = X[self.colname].replace(self.mapping)
        return X
class OneHotEncoderFromDataFrame(BaseEstimator):
    """One-hot encode a DataFrame.

    This cleaner wraps the standard scikit-learn OneHotEncoder,
    handling the transfer between column name and column index.

    Parameters
    ----------
    categorical_feature_names : "all" or array of column names.
        Specify what features are treated as categorical.

        * "all" (default): All features are treated as categorical.
        * array of column names: Array of categorical feature names.

    dtype : number type, default=float.
        Desired dtype of output.
    handle_unknown : str, "error" (default) or "ignore".
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform.
    copy : boolean (default=True)
        If ``False``, apply the encoding in-place.
    """

    # ``dtype`` and ``handle_unknown`` are properties so that any assignment
    # to them is forwarded to the wrapped encoder as well.
    @property
    def dtype(self):
        return self._dtype

    @dtype.setter
    def dtype(self, dtype):
        self._dtype = dtype
        self.onehot.dtype = self._dtype

    @property
    def handle_unknown(self):
        return self._handle_unknown

    @handle_unknown.setter
    def handle_unknown(self, handle_unknown):
        self._handle_unknown = handle_unknown
        self.onehot.handle_unknown = self._handle_unknown

    # The default dtype is the builtin ``float``. ``np.float`` was only an
    # alias for it and no longer exists in current NumPy, where referencing
    # it here would fail as soon as the class is defined.
    def __init__(self,
                 categorical_feature_names="all",
                 dtype=float,
                 handle_unknown="error",
                 copy=True):
        # The encoder must exist before ``dtype``/``handle_unknown`` are set,
        # because their property setters write through to it.
        # NOTE(review): ``sparse``, ``n_values`` and ``categorical_features``
        # are keywords of older scikit-learn releases -- confirm against the
        # scikit-learn version this project pins before upgrading.
        self.onehot = OneHotEncoder(sparse=False, n_values="auto",
                                    categorical_features="all")  # We'll subset the DF
        self.categorical_feature_names = categorical_feature_names
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.copy = copy

    def fit(self, X, y=None):
        """Convert the column names to indices, then compute the one hot encoding.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        names = self.categorical_feature_names
        # Only compare against the "all" sentinel when ``names`` is a scalar.
        # If it is already a list/array/Index of column names (including the
        # Index stored by a previous ``fit``), ``== "all"`` would be evaluated
        # element-wise and could not be used as a single truth value.
        if np.ndim(names) == 0 and names == "all":
            self.categorical_feature_names = X.columns

        # Get all columns that need to be encoded:
        data_to_encode = X[self.categorical_feature_names]
        self.onehot.fit(data_to_encode)
        return self

    def transform(self, X, y=None):
        """Apply the encoding to the data.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of new features)
            The input DataFrame, with the encoding applied: the categorical
            columns are dropped and ``onehot_col1``, ``onehot_col2``, ...
            columns are added in their place.
        """
        if self.copy:
            X = X.copy()

        data_to_transform = X[self.categorical_feature_names]
        transformed_data = self.onehot.transform(data_to_transform)

        # TODO (AndrewRook): Find good column names for the encoded columns.
        colnames = ["onehot_col{0}".format(i + 1)
                    for i in range(transformed_data.shape[1])]

        # Create a dataframe from the transformed columns (setting the index is critical for
        # merging with data containing non-standard indexes)
        transformed_df = pd.DataFrame(transformed_data, columns=colnames, index=X.index)

        X.drop(self.categorical_feature_names, axis=1, inplace=True)
        X[transformed_df.columns] = transformed_df
        return X
class CreateScoreDifferential(BaseEstimator):
    """Convert offense and defense scores into a differential (offense - defense).

    Parameters
    ----------
    home_score_colname : string
        The name of the column containing the score of the home team.
    away_score_colname : string
        The name of the column containing the score of the away team.
    offense_home_colname : string
        The name of the column indicating if the offense is home.
    score_differential_colname : string (default=``"score_differential"``)
        The name of column containing the score differential. Must not already
        exist in the DataFrame.
    copy : boolean (default = ``True``)
        If ``False``, add the score differential in place.
    """

    def __init__(self, home_score_colname,
                 away_score_colname,
                 offense_home_colname,
                 score_differential_colname="score_differential",
                 copy=True):
        self.home_score_colname = home_score_colname
        self.away_score_colname = away_score_colname
        self.offense_home_colname = offense_home_colname
        self.score_differential_colname = score_differential_colname
        self.copy = copy

    def fit(self, X, y=None):
        """Do nothing; this transformer has no state to learn.

        Present so the class can be used as a step in Scikit-learn's
        ``Pipeline``, which requires its transformer steps to expose ``fit``.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data. Ignored.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not. Ignored.

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        return self

    def transform(self, X, y=None):
        """Create the score differential column.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, number of features + 1)
            The input DataFrame, with the score differential column added.

        Raises
        ------
        KeyError
            If any of ``home_score_colname``, ``away_score_colname`` or
            ``offense_home_colname`` is missing from ``X``, or if
            ``score_differential_colname`` already exists in ``X``.
        """
        try:
            # ``2 * flag - 1`` turns a 1/True flag into +1 and a 0/False flag
            # into -1, so (home - away) keeps its sign when the offense is at
            # home and is flipped to (away - home) otherwise.
            score_differential = ((X[self.home_score_colname] - X[self.away_score_colname]) *
                                  (2 * X[self.offense_home_colname] - 1))
        except KeyError:
            raise KeyError("CreateScoreDifferential: data missing required column. Must "
                           "include columns named {0}, {1}, and {2}".format(self.home_score_colname,
                                                                            self.away_score_colname,
                                                                            self.offense_home_colname))

        if self.score_differential_colname in X.columns:
            raise KeyError("CreateScoreDifferential: column {0} already in DataFrame, and can't "
                           "be used for the score differential".format(self.score_differential_colname))

        if self.copy:
            X = X.copy()

        X[self.score_differential_colname] = score_differential
        return X
class CheckColumnNames(BaseEstimator):
    """Make sure user has the right column names, in the right order.

    This is a useful first step to make sure that nothing
    is going to break downstream, but can also be used effectively
    to drop columns that are no longer necessary.

    Parameters
    ----------
    column_names : ``None``, or list of strings
        A list of column names that need to be present in the scoring
        data. All other columns will be stripped out. The order of the
        columns will be applied to any scoring
        data as well, in order to handle the fact that pandas lets
        you play fast and loose with column order. If ``None``,
        will obtain every column in the DataFrame passed to the
        ``fit`` method.
    copy : boolean (default=``True``)
        If ``True``, the returned DataFrame is explicitly copied so that it
        is independent of the input DataFrame.
    """

    def __init__(self, column_names=None, copy=True):
        self.column_names = column_names
        self.copy = copy
        # Explicitly supplied column names need no ``fit``; otherwise the
        # names are learned from the data and ``fit`` must run first.
        self.user_specified_columns = self.column_names is not None
        self._fit = self.user_specified_columns

    def fit(self, X, y=None):
        """Grab the column names from a Pandas DataFrame.

        Has no effect on ``column_names`` when they were supplied to the
        constructor.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        self : For compatibility with Scikit-learn's ``Pipeline``.
        """
        if not self.user_specified_columns:
            self.column_names = X.columns
        self._fit = True
        return self

    def transform(self, X, y=None):
        """Apply the column ordering to the data.

        Parameters
        ----------
        X : Pandas DataFrame, of shape(number of plays, number of features)
            NFL play data.
        y : Numpy array, with length = number of plays, or None
            1 if the home team won, 0 if not.
            (Used as part of Scikit-learn's ``Pipeline``)

        Returns
        -------
        X : Pandas DataFrame, of shape(number of plays, ``len(column_names)``)
            The input DataFrame, properly ordered and with extraneous
            columns dropped

        Raises
        ------
        KeyError
            If the input data frame doesn't have all the columns specified
            by ``column_names``.
        NotFittedError
            If ``transform`` is called before ``fit``.
        """
        if not self._fit:
            raise NotFittedError("CheckColumnNames: Call 'fit' before 'transform'")

        try:
            selected = X[self.column_names]
        except KeyError:
            raise KeyError("CheckColumnNames: DataFrame does not have required columns. "
                           "Must contain at least {0}".format(self.column_names))

        # Copy only the selected columns instead of the whole input frame:
        # columns that are about to be dropped never need duplicating.
        if self.copy:
            selected = selected.copy()
        return selected