Source code for nflwin.model

"""Tools for creating and running the model."""
from __future__ import print_function, division

import os

import numpy as np
from scipy import integrate
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import brier_score_loss
from sklearn.neighbors import KernelDensity
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import NotFittedError

import preprocessing
import utilities

class WPModel(object):
    """The object that computes win probabilities.

    In addition to holding the model itself, it defines some column names likely to be
    used in the model as parameters to allow other users to more easily figure out which
    columns go into the model.

    Parameters
    ----------
    copy_data : boolean (default=``True``)
        Whether or not to copy data when fitting and applying the model. Running the model
        in-place (``copy_data=False``) will be faster and have a smaller memory footprint,
        but if not done carefully can lead to data integrity issues.

    Attributes
    ----------
    model : A Scikit-learn pipeline (or equivalent)
        The actual model used to compute WP. Upon initialization it will be set to
        a default model, but can be overridden by the user.
    column_descriptions : dictionary
        A dictionary whose keys are the names of the columns used in the model, and the
        values are string descriptions of what the columns mean. Set at initialization to
        match the default model; if you create your own model you'll need to update this
        attribute manually.
    training_seasons : A list of ints, or ``None`` (default=``None``)
        If the model was trained using data downloaded from nfldb, a list of the seasons
        used to train the model. If nfldb was **not** used, an empty list. If no model
        has been trained yet, ``None``.
    training_season_types : A list of strings or ``None`` (default=``None``)
        Same as ``training_seasons``, except for the portions of the seasons used in
        training the model ("Preseason", "Regular", and/or "Postseason").
    validation_seasons : same as ``training_seasons``, but for validation data.
    validation_season_types : same as ``training_season_types``, but for validation data.
    sample_probabilities : A numpy array of floats or ``None`` (default=``None``)
        After the model has been validated, contains the sampled predicted probabilities
        used to compute the validation statistic.
    predicted_win_percents : A numpy array of floats or ``None`` (default=``None``)
        After the model has been validated, contains the actual probabilities in the test
        set at each probability in ``sample_probabilities``.
    num_plays_used : A numpy array of floats or ``None`` (default=``None``)
        After the model has been validated, contains the number of plays used to compute
        each element of ``predicted_win_percents``.
    model_directory : string
        The directory where all models will be saved to or loaded from.
    """
    model_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
    _default_model_filename = "default_model.nflwin"

    def __init__(self,
                 copy_data=True
                 ):
        self.copy_data = copy_data
        self.model = self.create_default_pipeline()
        self._training_seasons = None
        self._training_season_types = None
        self._validation_seasons = None
        self._validation_season_types = None
        self._sample_probabilities = None
        self._predicted_win_percents = None
        self._num_plays_used = None

    @property
    def training_seasons(self):
        return self._training_seasons

    @property
    def training_season_types(self):
        return self._training_season_types

    @property
    def validation_seasons(self):
        return self._validation_seasons

    @property
    def validation_season_types(self):
        return self._validation_season_types

    @property
    def sample_probabilities(self):
        return self._sample_probabilities

    @property
    def predicted_win_percents(self):
        return self._predicted_win_percents

    @property
    def num_plays_used(self):
        return self._num_plays_used
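    # A minimal construction sketch (illustrative, not part of the library's own docs):
    # instantiating WPModel builds the default pipeline immediately, so the expected
    # input columns can be inspected before any training has happened.
    #
    #   >>> from nflwin.model import WPModel
    #   >>> wp = WPModel()
    #   >>> for name, desc in wp.column_descriptions.items():
    #   ...     print("{0}: {1}".format(name, desc))
    #   >>> wp.training_seasons is None   # nothing has been fit yet
    #   True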
    def train_model(self,
                    source_data="nfldb",
                    training_seasons=[2009, 2010, 2011, 2012, 2013, 2014],
                    training_season_types=["Regular", "Postseason"],
                    target_colname="offense_won"):
        """Train the model.

        Once a modeling pipeline is set up (either the default or something
        custom-generated), historical data needs to be fed into it in order to
        "fit" the model so that it can then be used to predict future results.
        This method implements a simple wrapper around the core Scikit-learn functionality
        which does this.

        The default is to use data from the nfldb database, however that can be changed
        to a simple Pandas DataFrame if desired (for instance if you wish to use data
        from another source).

        There is no particular output from this function, rather the parameters governing
        the fit of the model are saved inside the model object itself. If you want to get
        an estimate of the quality of the fit, use the ``validate_model`` method after
        running this method.

        Notes
        -----
        If you are loading in the default model, **there is no need to re-run this method**.
        In fact, doing so will likely result in weird errors and could corrupt the model if
        you were to try to save it back to disk.

        Parameters
        ----------
        source_data : the string ``"nfldb"`` or a Pandas DataFrame (default=``"nfldb"``)
            The data to be used to train the model. If ``"nfldb"``, will query the nfldb
            database for the training data (note that this requires a correctly configured
            installation of nfldb's database).
        training_seasons : list of ints (default=``[2009, 2010, 2011, 2012, 2013, 2014]``)
            What seasons to use to train the model if getting data from the nfldb database.
            If ``source_data`` is not ``"nfldb"``, this argument will be ignored.
            **NOTE:** it is critical not to use all possible data in order to train the
            model - some will need to be reserved for a final validation (see the
            ``validate_model`` method). A good dataset to reserve for validation is the
            most recent one or two NFL seasons.
        training_season_types : list of strings (default=``["Regular", "Postseason"]``)
            If querying from the nfldb database, what parts of the seasons to use.
            Options are "Preseason", "Regular", and "Postseason". If ``source_data`` is not
            ``"nfldb"``, this argument will be ignored.
        target_colname : string or integer (default=``"offense_won"``)
            The name of the target variable column.

        Returns
        -------
        ``None``
        """
        self._training_seasons = []
        self._training_season_types = []
        if isinstance(source_data, basestring):
            if source_data == "nfldb":
                source_data = utilities.get_nfldb_play_data(season_years=training_seasons,
                                                            season_types=training_season_types)
                self._training_seasons = training_seasons
                self._training_season_types = training_season_types
            else:
                raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'")
        target_col = source_data[target_colname]
        feature_cols = source_data.drop(target_colname, axis=1)
        self.model.fit(feature_cols, target_col)
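    # A training sketch. The nfldb path assumes a working, correctly configured nfldb
    # database; the DataFrame path assumes ``df`` (a placeholder name) already holds the
    # pipeline's input columns plus the "offense_won" target.
    #
    #   >>> wp = WPModel()
    #   >>> wp.train_model(training_seasons=[2009, 2010, 2011, 2012, 2013, 2014])
    #   >>> # or, from a pre-built DataFrame:
    #   >>> wp.train_model(source_data=df, target_colname="offense_won")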
    def validate_model(self,
                       source_data="nfldb",
                       validation_seasons=[2015],
                       validation_season_types=["Regular", "Postseason"],
                       target_colname="offense_won"):
        """Validate the model.

        Once a modeling pipeline is trained, a different dataset must be fed into the
        trained model to validate the quality of the fit.
        This method implements a simple wrapper around the core Scikit-learn functionality
        which does this.

        The default is to use data from the nfldb database, however that can be changed
        to a simple Pandas DataFrame if desired (for instance if you wish to use data
        from another source).

        The validation is built around a p value which represents the confidence at which
        we can reject the null hypothesis that the model predicts the appropriate win
        probabilities. This number is computed by first smoothing the predicted win
        probabilities of both all test data and just the data where the offense won with a
        gaussian `kernel density estimate
        <http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity>`_
        with standard deviation = 0.01. Once the data is smooth, ratios at each percentage
        point from 1% to 99% are computed (i.e. what fraction of the time did the offense
        win when the model says they have a 1% chance of winning, 2% chance, etc.). Each of
        these ratios should be well approximated by the binomial distribution, since they
        are essentially independent (not perfectly, but hopefully close enough) weighted
        coin flips, giving a p value. From there `Fisher's method
        <https://en.wikipedia.org/wiki/Fisher%27s_method>`_ is used to combine the p values
        into a global p value. A p value close to zero means that the model is unlikely to
        be properly predicting the correct win probabilities. A p value close to one,
        **while not proof that the model is correct**, means that the model is at least not
        inconsistent with the hypothesis that it predicts good win probabilities.
        Note, however, that the p value computation is currently commented out (see below);
        what this method actually returns are two summary statistics of the same smoothed
        curve, described under Returns.

        Parameters
        ----------
        source_data : the string ``"nfldb"`` or a Pandas DataFrame (default=``"nfldb"``)
            The data to be used to validate the model. If ``"nfldb"``, will query the nfldb
            database for the validation data (note that this requires a correctly configured
            installation of nfldb's database).
        validation_seasons : list of ints (default=``[2015]``)
            What seasons to use to validate the model if getting data from the nfldb
            database. If ``source_data`` is not ``"nfldb"``, this argument will be ignored.
            **NOTE:** it is critical not to use the same data to validate the model as was
            used in the fit. Generally a good data set to use for validation is one from a
            time period more recent than was used to train the model. For instance, if the
            model was trained on data from 2009-2014, data from the 2015 season would be a
            sensible choice to validate the model.
        validation_season_types : list of strings (default=``["Regular", "Postseason"]``)
            If querying from the nfldb database, what parts of the seasons to use.
            Options are "Preseason", "Regular", and "Postseason". If ``source_data`` is not
            ``"nfldb"``, this argument will be ignored.
        target_colname : string or integer (default=``"offense_won"``)
            The name of the target variable column.

        Returns
        -------
        tuple of two floats
            ``(max_deviation, residual_area)``: the largest discrepancy between the
            predicted and observed win percentages at any sampled probability, and the
            total area under the curve of |predicted WP - observed WP|.

        Raises
        ------
        NotFittedError
            If the model hasn't been fit.

        Notes
        -----
        Probabilities are computed between 1 and 99 percent because a single incorrect
        prediction at 100% or 0% automatically drives the global p value to zero. Since the
        model is being smoothed this situation can occur even when there are no model
        predictions at those extreme values, and therefore leads to erroneous p values.

        While it seems reasonable (to me at least), I am not totally certain that this
        approach is entirely correct. It's certainly sub-optimal in that you would ideally
        reject the null hypothesis that the model predictions **aren't** appropriate, but
        that seems to be a much harder problem (and one that would need much more test data
        to beat down the uncertainties involved). I'm also not sure if using Fisher's method
        is appropriate here, and I wonder if it might be necessary to Monte Carlo this.
        I would welcome input from others on better ways to do this.
        """

        if self.training_seasons is None:
            raise NotFittedError("Must fit model before validating.")

        self._validation_seasons = []
        self._validation_season_types = []
        if isinstance(source_data, basestring):
            if source_data == "nfldb":
                source_data = utilities.get_nfldb_play_data(season_years=validation_seasons,
                                                            season_types=validation_season_types)
                self._validation_seasons = validation_seasons
                self._validation_season_types = validation_season_types
            else:
                raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'")
        target_col = source_data[target_colname]
        feature_cols = source_data.drop(target_colname, axis=1)
        predicted_probabilities = self.model.predict_proba(feature_cols)[:, 1]

        self._sample_probabilities, self._predicted_win_percents, self._num_plays_used = (
            WPModel._compute_predicted_percentages(target_col.values, predicted_probabilities))

        #Compute the maximal deviation from a perfect prediction as well as the area under the
        #curve of the residual between |predicted - perfect|:
        max_deviation, residual_area = self._compute_prediction_statistics(self.sample_probabilities,
                                                                           self.predicted_win_percents)
        return max_deviation, residual_area
        #Compute p-values for each where null hypothesis is that distributions are same, then combine
        #them all to make sure data is not inconsistent with accurate predictions.
        # combined_pvalue = self._test_distribution(self.sample_probabilities,
        #                                           self.predicted_win_percents,
        #                                           self.num_plays_used)
        # return combined_pvalue

    @staticmethod
    def _compute_prediction_statistics(sample_probabilities, predicted_win_percents):
        """Take the KDE'd model estimates, then compute statistics.

        Returns
        -------
        A tuple of (``max_deviation``, ``residual_area``), where ``max_deviation``
        is the largest discrepancy between the model and expectation at any WP,
        and ``residual_area`` is the total area under the curve of
        |predicted WP - expected WP|.
        """
        abs_deviations = np.abs(predicted_win_percents - sample_probabilities)
        max_deviation = np.max(abs_deviations)
        residual_area = integrate.simps(abs_deviations, sample_probabilities)
        return (max_deviation, residual_area)
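    # A validation sketch, assuming ``wp`` was trained on earlier seasons so the held-out
    # 2015 data is genuinely unseen:
    #
    #   >>> max_deviation, residual_area = wp.validate_model(validation_seasons=[2015])
    #
    # And a small worked example of the statistics themselves, with purely illustrative
    # numbers: if the sampled WPs are [10, 50, 90] percent and the observed percentages are
    # [12, 55, 91], the absolute deviations are [2, 5, 1], so ``max_deviation`` is 5 and
    # ``residual_area`` is the Simpson's-rule integral of the deviations over the 10-90
    # range, (40/3) * (2 + 4*5 + 1) = about 306.7:
    #
    #   >>> WPModel._compute_prediction_statistics(np.array([10., 50., 90.]),
    #   ...                                        np.array([12., 55., 91.]))
    #   (5.0, 306.66...)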
    def predict_wp(self, plays):
        """Estimate the win probability for a set of plays.

        Basically a simple wrapper around ``WPModel.model.predict_proba``,
        takes in a DataFrame and then spits out an array of predicted
        win probabilities.

        Parameters
        ----------
        plays : Pandas DataFrame
            The input data to use to make the predictions.

        Returns
        -------
        Numpy array, of length ``len(plays)``
            Predicted probability that the offensive team in each play
            will go on to win the game.

        Raises
        ------
        NotFittedError
            If the model hasn't been fit.
        """
        if self.training_seasons is None:
            raise NotFittedError("Must fit model before predicting WP.")

        return self.model.predict_proba(plays)[:, 1]
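    # A prediction sketch: ``plays`` is any DataFrame containing the raw input columns
    # described in ``column_descriptions`` (one row per play); the pipeline derives the
    # rest. The values below are purely illustrative - in particular the quarter/time
    # encoding has to match whatever the preprocessing steps expect:
    #
    #   >>> import pandas as pd
    #   >>> plays = pd.DataFrame({"offense_team": ["NE"], "home_team": ["NE"],
    #   ...                       "curr_home_score": [14], "curr_away_score": [10],
    #   ...                       "down": [2], "yards_to_go": [7], "quarter": ["Q3"],
    #   ...                       "seconds_elapsed": [240.0], "yardline": [15]})
    #   >>> wp.predict_wp(plays)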
    def plot_validation(self, axis=None, **kwargs):
        """Plot the validation data.

        Parameters
        ----------
        axis : matplotlib.pyplot.axis object or ``None`` (default=``None``)
            If provided, the validation line will be overlaid on ``axis``.
            Otherwise, a new figure and axis will be generated and plotted on.
        **kwargs
            Arguments to ``axis.plot``.

        Returns
        -------
        matplotlib.pyplot.axis
            The axis the plot was made on.

        Raises
        ------
        NotFittedError
            If the model hasn't been fit **and** validated.
        """

        if self.sample_probabilities is None:
            raise NotFittedError("Must validate model before plotting.")

        import matplotlib.pyplot as plt
        if axis is None:
            axis = plt.figure().add_subplot(111)
            axis.plot([0, 100], [0, 100], ls="--", lw=2, color="black")
            axis.set_xlabel("Predicted WP")
            axis.set_ylabel("Actual WP")
        axis.plot(self.sample_probabilities, self.predicted_win_percents, **kwargs)

        return axis
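    # A plotting sketch (requires matplotlib and a validated model; keyword arguments are
    # passed straight through to ``axis.plot``):
    #
    #   >>> ax = wp.plot_validation(ls="-", color="blue")
    #   >>> ax.figure.savefig("validation.png")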
    @staticmethod
    def _test_distribution(sample_probabilities, predicted_win_percents, num_plays_used):
        """Based on the assumption that the data at each probability follows a Bernoulli distribution."""

        #Get the p-values:
        p_values = [stats.binom_test(np.round(predicted_win_percents[i] * num_plays_used[i]),
                                     np.round(num_plays_used[i]),
                                     p=sample_probabilities[i])
                    for i in range(len(sample_probabilities))]

        combined_p_value = stats.combine_pvalues(p_values)[1]
        return combined_p_value

    @staticmethod
    def _compute_predicted_percentages(actual_results, predicted_win_probabilities):
        """Compute the sample percentages from a validation data set."""
        kde_offense_won = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
            (predicted_win_probabilities[(actual_results == 1)])[:, np.newaxis])
        kde_total = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
            predicted_win_probabilities[:, np.newaxis])
        sample_probabilities = np.linspace(0.01, 0.99, 99)
        number_density_offense_won = np.exp(kde_offense_won.score_samples(
            sample_probabilities[:, np.newaxis])) * np.sum(actual_results)
        number_density_total = np.exp(kde_total.score_samples(
            sample_probabilities[:, np.newaxis])) * len(actual_results)
        number_offense_won = (number_density_offense_won * np.sum(actual_results)
                              / np.sum(number_density_offense_won))
        number_total = number_density_total * len(actual_results) / np.sum(number_density_total)
        predicted_win_percents = number_offense_won / number_total

        return 100.*sample_probabilities, 100.*predicted_win_percents, number_total
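    # Sketch of what the KDE smoothing produces, using synthetic (illustrative) inputs:
    # given per-play predicted probabilities and 0/1 outcomes, it returns the 1-99% grid,
    # the smoothed observed win percentage at each grid point, and the KDE-estimated
    # number of plays contributing at each point:
    #
    #   >>> probs = np.random.uniform(size=1000)
    #   >>> outcomes = (np.random.uniform(size=1000) < probs).astype(int)
    #   >>> x, y, n = WPModel._compute_predicted_percentages(outcomes, probs)
    #   >>> x.shape, y.shape, n.shape
    #   ((99,), (99,), (99,))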
    def create_default_pipeline(self):
        """Create the default win probability estimation pipeline.

        Returns
        -------
        Scikit-learn pipeline
            The default pipeline, suitable for computing win probabilities
            but by no means the best possible model.

        This can be run any time a new default pipeline is required,
        and either set to the ``model`` attribute or used independently.
        """

        steps = []

        offense_team_colname = "offense_team"
        home_team_colname = "home_team"
        home_score_colname = "curr_home_score"
        away_score_colname = "curr_away_score"
        down_colname = "down"
        quarter_colname = "quarter"
        time_colname = "seconds_elapsed"
        yardline_colname = "yardline"
        yards_to_go_colname = "yards_to_go"

        self.column_descriptions = {
            offense_team_colname: "Abbreviation for the offensive team",
            home_team_colname: "Abbreviation for the home team",
            home_score_colname: "The home team's current score",
            away_score_colname: "The visiting team's current score",
            down_colname: "The current down",
            yards_to_go_colname: "Yards to a first down (or the endzone)",
            quarter_colname: "The quarter",
            time_colname: "Seconds elapsed in the quarter",
            yardline_colname: ("The yardline, given by (yards from own goalline - 50). "
                               "-49 is your own 1 while 49 is the opponent's 1.")
            }

        is_offense_home = preprocessing.ComputeIfOffenseIsHome(offense_team_colname,
                                                               home_team_colname,
                                                               copy=self.copy_data)
        steps.append(("compute_offense_home", is_offense_home))
        score_differential = preprocessing.CreateScoreDifferential(home_score_colname,
                                                                   away_score_colname,
                                                                   is_offense_home.offense_home_team_colname,
                                                                   copy=self.copy_data)
        steps.append(("create_score_differential", score_differential))
        steps.append(("map_downs_to_int", preprocessing.MapToInt(down_colname, copy=self.copy_data)))
        total_time_elapsed = preprocessing.ComputeElapsedTime(quarter_colname, time_colname,
                                                              copy=self.copy_data)
        steps.append(("compute_total_time_elapsed", total_time_elapsed))
        steps.append(("remove_unnecessary_columns", preprocessing.CheckColumnNames(
            column_names=[is_offense_home.offense_home_team_colname,
                          score_differential.score_differential_colname,
                          total_time_elapsed.total_time_colname,
                          yardline_colname,
                          yards_to_go_colname,
                          down_colname],
            copy=self.copy_data)))
        steps.append(("encode_categorical_columns", preprocessing.OneHotEncoderFromDataFrame(
            categorical_feature_names=[down_colname],
            copy=self.copy_data)))

        search_grid = {'base_estimator__penalty': ['l1', 'l2'],
                       'base_estimator__C': [0.01, 0.1, 1, 10, 100]
                       }
        base_model = LogisticRegression()
        calibrated_model = CalibratedClassifierCV(base_model, cv=2, method="isotonic")

        #grid_search_model = GridSearchCV(calibrated_model, search_grid,
        #                                 scoring=self._brier_loss_scorer)
        steps.append(("compute_model", calibrated_model))

        pipe = Pipeline(steps)
        return pipe
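    # The pipeline is meant to be swappable. A sketch of keeping the default preprocessing
    # steps but trying a different final estimator (``RandomForestClassifier`` is already
    # imported above; the parameters are illustrative, not a recommendation):
    #
    #   >>> wp = WPModel()
    #   >>> pipe = wp.create_default_pipeline()
    #   >>> new_model = CalibratedClassifierCV(RandomForestClassifier(n_estimators=100),
    #   ...                                    cv=2, method="isotonic")
    #   >>> wp.model = Pipeline(pipe.steps[:-1] + [("compute_model", new_model)])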
    def save_model(self, filename=None):
        """Save the WPModel instance to disk.

        All models are saved to the same place, inside the installed
        NFLWin library (given by ``WPModel.model_directory``).

        Parameters
        ----------
        filename : string (default=None):
            The filename to use for the saved model. If this parameter
            is not specified, save to the default filename. Note that if a model
            already exists with this filename, it will be overwritten. Note also that
            this is a filename only, **not** a full path. If a full path is specified
            it is likely (albeit not guaranteed) to cause errors.

        Returns
        -------
        ``None``
        """
        if filename is None:
            filename = self._default_model_filename
        joblib.dump(self, os.path.join(self.model_directory, filename))
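    # A save sketch: models always land in ``WPModel.model_directory`` inside the installed
    # package, so the running process needs write access there; pass a bare filename only:
    #
    #   >>> wp.save_model()                            # writes default_model.nflwin
    #   >>> wp.save_model(filename="my_model.nflwin")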
    @classmethod
    def load_model(cls, filename=None):
        """Load a saved WPModel.

        Parameters
        ----------
        Same as ``save_model``.

        Returns
        -------
        ``nflwin.WPModel`` instance.
        """
        if filename is None:
            filename = cls._default_model_filename

        return joblib.load(os.path.join(cls.model_directory, filename))
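    # A load-and-use sketch with the bundled default model (no retraining required;
    # ``plays`` is any DataFrame with the expected input columns):
    #
    #   >>> wp = WPModel.load_model()
    #   >>> win_probabilities = wp.predict_wp(plays)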
    @staticmethod
    def _brier_loss_scorer(estimator, X, y):
        """Use the Brier loss to estimate model score.

        For use in GridSearchCV, instead of accuracy.
        """
        predicted_positive_probabilities = estimator.predict_proba(X)[:, 1]
        return 1. - brier_score_loss(y, predicted_positive_probabilities)
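    # A quick numeric illustration of the scorer above: the Brier score is the mean squared
    # error between predicted probabilities and 0/1 outcomes, so lower is better; returning
    # 1 - Brier gives GridSearchCV something to maximize. For example, predictions
    # [0.9, 0.2] against outcomes [1, 0] give ((0.9 - 1)**2 + (0.2 - 0)**2) / 2 = 0.025,
    # i.e. a scorer value of 0.975:
    #
    #   >>> brier_score_loss(np.array([1, 0]), np.array([0.9, 0.2]))
    #   0.025...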