Source code for nflwin.utilities

"""Utility functions that don't fit in the main modules"""
from __future__ import print_function, division

import numpy as np
import pandas as pd


[docs]def connect_nfldb(): """Connect to the nfldb database. Rather than using the builtin method we make our own, since we're going to use SQLAlchemy as the engine. However, we can still make use of the information in the nfldb config file to get information like username and password, which means this function doesn't need any arguments. Parameters ---------- None Returns ------- SQLAlchemy engine object A connected engine, ready to be used to query the DB. Raises ------ IOError If it can't find the config file. """ import nfldb import sqlalchemy as sql db_config, paths_tried = nfldb.db.config() if db_config is None: raise IOError("get_play_data: could not find database config! Looked" " in these places: {0}".format(paths_tried)) db_config["drivername"] = "postgres" db_config["username"] = db_config["user"] del db_config["user"] del db_config["timezone"] engine = sql.create_engine(sql.engine.url.URL(**db_config)) return engine
[docs]def get_nfldb_play_data(season_years=None, season_types=["Regular", "Postseason"]): """Get play-by-play data from the nfldb database. We use a specialized query and then postprocessing because, while possible to do using the objects created by ``nfldb``, it is *orders of magnitude slower*. This is due to the more general nature of ``nfldb``, which is not really designed for this kind of data mining. Since we need to get a lot of data in a single way, it's much simpler to interact at a lower level with the underlying postgres database. Parameters ---------- season_years : list (default=None) A list of all years to get data for (earliest year in nfldb is 2009). If ``None``, get data from all available seasons. season_types : list (default=["Regular", "Postseason"]) A list of all parts of seasons to get data for (acceptable values are "Preseason", "Regular", and "Postseason"). If ``None``, get data from all three season types. Returns ------- Pandas DataFrame The play by play data, with the following columns: * **gsis_id:** The official NFL GSIS_ID for the game. * **drive_id:** The id of the drive, starts at 1 and increases by 1 for each new drive. * **play_id:** The id of the play in ``nfldb``. Note that sequential plays have increasing but not necessarily sequential values. With ``drive_id`` and ``gsis_id``, works as a unique identifier for a given play. * **quarter:** The quarter, prepended with "Q" (e.g. ``Q1`` means the first quarter). Overtime periods are denoted as ``OT``, ``OT2``, and theoretically ``OT3`` if one were to ever be played. * **seconds_elapsed:** seconds elapsed since the start of the quarter. * **offense_team:** The abbreviation of the team currently with possession of the ball. * **yardline:** The current field position. Goes from -49 to 49, where negative numbers indicate that the team with possession is on its own side of the field. * **down:** The down. kickoffs, extra points, and similar have a down of 0. * **yards_to_go:** How many yards needed in order to get a first down (or touchdown). * **home_team:** The abbreviation of the home team. * **away_team:** The abbreviation of the away team. * **curr_home_score:** The home team's score at the start of the play. * **curr_away_score:** The away team's score at the start of the play. * **offense_won:** A boolean - ``True`` if the offense won the game, ``False`` otherwise. (The database query skips tied games.) Notes ----- ``gsis_id``, ``drive_id``, and ``play_id`` are not necessary to make the model, but are included because they can be useful for computing things like WPA. """ engine = connect_nfldb() sql_string = _make_nfldb_query_string(season_years=season_years, season_types=season_types) plays_df = pd.read_sql(sql_string, engine) #Fix yardline, quarter and time elapsed: def yardline_time_fix(row): try: yardline = float(row['yardline'][1:-1]) except TypeError: yardline = np.nan split_time = row['time'].split(",") return yardline, split_time[0][1:], float(split_time[1][:-1]) plays_df[['yardline', 'quarter', 'seconds_elapsed']] = pd.DataFrame(plays_df.apply(yardline_time_fix, axis=1).values.tolist()) plays_df.drop('time', axis=1, inplace=True) #Set NaN downs (kickoffs, etc) to 0: plays_df['down'] = plays_df['down'].fillna(value=0).astype(np.int8) #Aggregate scores: plays_df = _aggregate_nfldb_scores(plays_df) return plays_df
def _aggregate_nfldb_scores(play_df): """Aggregate the raw nfldb data to get the score of every play.""" # First, add the yardline of the subsequent play to the df play_df['next_yardline'] = play_df['yardline'].shift(-1) #Set up the dictionary to keep track of things: curr_home_score = 0 curr_away_score = 0 curr_gsis_id = play_df.iloc[0].gsis_id argdict = {"curr_home_score": 0, "curr_away_score": 0, "curr_gsis_id": play_df.iloc[0].gsis_id} #Define an internal function to actually compute the score of a given play: def compute_current_scores(play, argdict): #If new game, set scores to zero: if play.gsis_id != argdict['curr_gsis_id']: argdict['curr_home_score'] = 0 argdict['curr_away_score'] = 0 argdict['curr_gsis_id'] = play.gsis_id #Get current score at start of play: home_score_to_return = argdict['curr_home_score'] away_score_to_return = argdict['curr_away_score'] #Check if an extra point is missing from the data: if play.offense_play_points == 6 and play.next_yardline < 0: play.offense_play_points += 1 if play.defense_play_points == 6 and play.next_yardline < 0: play.defense_play_points += 1 #Update scores, if necessary: if play.offense_team == play.home_team: argdict['curr_home_score'] += play.offense_play_points argdict['curr_away_score'] += play.defense_play_points else: argdict['curr_home_score'] += play.defense_play_points argdict['curr_away_score'] += play.offense_play_points return home_score_to_return, away_score_to_return #Apply function to data: #TODO (AndrewRook): Make the .apply function go faster, currently it's a large bottleneck aggregate_scores = play_df.apply(compute_current_scores, axis=1, args=(argdict,)) aggregate_scores = pd.DataFrame(aggregate_scores.values.tolist()) play_df[['curr_home_score', 'curr_away_score']] = aggregate_scores #Drop unnecessary columns: play_df.drop(labels=["next_yardline", "offense_play_points", "defense_play_points"], axis=1, inplace=True) return play_df def _make_nfldb_query_string(season_years=None, season_types=None): """Construct the query string to get all the play data. This way is a little more compact and robust than specifying the string in the function that uses it. """ play_fields = ['gsis_id', 'drive_id', 'play_id', 'time', 'pos_team AS offense_team', 'yardline', 'down', 'yards_to_go'] offense_play_points = ("GREATEST(" "(agg_play.fumbles_rec_tds * 6), " "(agg_play.kicking_rec_tds * 6), " "(agg_play.passing_tds * 6), " "(agg_play.receiving_tds * 6), " "(agg_play.rushing_tds * 6), " "(agg_play.kicking_xpmade * 1), " "(agg_play.passing_twoptm * 2), " "(agg_play.receiving_twoptm * 2), " "(agg_play.rushing_twoptm * 2), " "(agg_play.kicking_fgm * 3)) " "AS offense_play_points") defense_play_points = ("GREATEST(" "(agg_play.defense_frec_tds * 6), " "(agg_play.defense_int_tds * 6), " "(agg_play.defense_misc_tds * 6), " "(agg_play.kickret_tds * 6), " "(agg_play.puntret_tds * 6), " "(agg_play.defense_safe * 2)) " "AS defense_play_points") game_fields = ("game.home_team, game.away_team, " "((game.home_score > game.away_score AND play.pos_team = game.home_team) " "OR (game.away_score > game.home_score AND play.pos_team = game.away_team)) AS offense_won") where_clause = ("WHERE game.home_score != game.away_score " "AND game.finished = TRUE " "AND play.pos_team != 'UNK' " "AND (play.time).phase not in ('Pregame', 'Half', 'Final')") if season_years is not None: where_clause += " AND game.season_year" if len(season_years) == 1: where_clause += " = {0}".format(season_years[0]) else: where_clause += (" in ({0})" "".format(",".join([str(year) for year in season_years]))) if season_types is not None: where_clause += " AND game.season_type" if len(season_types) == 1: where_clause += " = '{0}'".format(season_types[0]) else: where_clause += " in ('{0}')".format("','".join(season_types)) query_string = "SELECT " query_string += "play." + ", play.".join(play_fields) query_string += ", " + offense_play_points query_string += ", " + defense_play_points query_string += ", " + game_fields query_string += " FROM play INNER JOIN agg_play" query_string += (" ON play.gsis_id = agg_play.gsis_id" " AND play.drive_id = agg_play.drive_id" " AND play.play_id = agg_play.play_id") query_string += " INNER JOIN game on play.gsis_id = game.gsis_id" query_string += " " + where_clause query_string += " ORDER BY play.gsis_id, play.drive_id, play.play_id;" return query_string