Predicting Goals in Hockey: An Expected Goals Model Using Machine Learning
Data Cleaning
import pandas as pd
# Read the raw play-by-play CSV export into a DataFrame and preview the
# first rows.
pbp_csv_path = "EH_pbp_query_20202021_2021-06-25.csv"
df_original = pd.read_csv(pbp_csv_path)
df_original.head()
import numpy as np
# Keep only the non-goal shots and draw a random sample of 5176 of them.
# NOTE(review): 5176 is presumably chosen to match the number of goals so the
# two classes are balanced - confirm against the value_counts() output below.
# NOTE(review): the sample is unseeded, so each run selects different rows.
df_no_goals = df_original[df_original["event_type"] == "SHOT"]
# Sample index labels without replacement so no shot is picked twice.
shot_rows_sample = np.random.choice(df_no_goals.index, size=5176, replace=False)
# Boolean mask over the shot rows; selecting with it keeps the original order.
shots_sample = df_no_goals.index.isin(shot_rows_sample)
df_shots_sample = df_no_goals[shots_sample]
# Get the goals-only rows and stack the sampled shots underneath them.
df_only_goals = df_original[df_original["event_type"] == "GOAL"]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same row order (goals first, then shots).
df_all = pd.concat([df_only_goals, df_shots_sample])
df_all.head()
df_all["event_type"].value_counts()
# Remove the columns the author identified as holding string data; they are
# not carried forward into the cleaned data set.
string_columns = [
    "season", "game_date", "session", "clock_time",
    "event_description", "event_team", "event_player_1",
    "home_on_1", "home_on_2", "home_on_3", "home_on_4",
    "home_on_5", "home_on_6", "home_on_7",
    "away_on_1", "away_on_2", "away_on_3", "away_on_4",
    "away_on_5", "away_on_6", "away_on_7",
    "home_goalie", "away_goalie", "home_team", "away_team",
    "game_score_state", "home_zone", "pred_goal",
]
df_all = df_all.drop(columns=string_columns)
df_all.head()

# One-hot encode the three categorical event columns into separate frames;
# each frame keeps df_all's index so it can be joined back later.
df_event_type = df_all["event_type"].str.get_dummies()
df_event_type.head()
df_shot_type = df_all["event_detail"].str.get_dummies()
df_shot_type.head()
df_event_zone = df_all["event_zone"].str.get_dummies()
df_event_zone.head()
def if_assist(column):
    """Turn a player-name column into a numeric assist indicator.

    Missing entries become 0 and every string containing a "." becomes 1.
    (Presumably the names are formatted like "FIRST.LAST", so the dot marks
    a real player name - confirm against the data.)  Any other value is left
    untouched and handed to ``pd.to_numeric``.

    Parameters
    ----------
    column : pandas.Series
        Column of player names, with NaN/None where no player is recorded.

    Returns
    -------
    pandas.Series
        Numeric Series with the same index and name as ``column``.

    Raises
    ------
    ValueError
        Raised by ``pd.to_numeric`` when a remaining value (for example a
        name without a ".") cannot be parsed as a number.
    """
    # Missing player -> "0", which pd.to_numeric later parses as 0.
    filled = column.fillna("0")
    # Single pass over the values instead of one full-column replace per
    # matching row; non-string values are left for pd.to_numeric to handle.
    flags = [
        1 if isinstance(value, str) and "." in value else value
        for value in filled
    ]
    # dtype=object keeps the mixed int/str values as-is until conversion.
    result = pd.Series(flags, index=column.index, name=column.name, dtype=object)
    return pd.to_numeric(result)
# Replace the second and third event-player columns with the numeric
# indicators produced by if_assist.
for assist_column in ("event_player_2", "event_player_3"):
    df_all[assist_column] = if_assist(df_all[assist_column])
df_all["event_player_3"].value_counts()
df_all.head()
# Attach the one-hot encoded frames to the main table, joining on the index.
for dummy_frame in (df_event_type, df_shot_type, df_event_zone):
    df_all = df_all.join(dummy_frame)
df_all.head()

# The categorical source columns are redundant once their dummies are joined.
encoded_source_columns = ["event_type", "event_detail", "event_zone"]
df_all = df_all.drop(columns=encoded_source_columns)
df_all.head()

# Final round of removing unnecessary columns.
unneeded_columns = [
    "game_id", "game_period", "game_seconds",
    "home_skaters", "away_skaters", "home_score", "away_score",
    "Def", "Neu", "Off",
    "event_index", "coords_x", "coords_y",
    "num_on", "num_off", "players_on", "players_off",
    "game_strength_state", "pbp_distance", "home_zonestart",
    "face_index", "pen_index", "shift_index",
    "SHOT",
]
df_all = df_all.drop(columns=unneeded_columns)
df_all.head()

# Discard every row that still has at least one missing value.
df_all = df_all.dropna(axis=0, how="any")
df_all.head()
# Take the "GOAL" column out of the frame and re-insert it at column
# position 12.
# NOTE(review): position 12 is hard-coded; if the intent is for GOAL to be
# the very first column this index should be 0 - confirm.
goal_col = "GOAL"
first_col = df_all.pop(goal_col)
df_all.insert(loc=12, column=goal_col, value=first_col)
df_all.head()

# Write the cleaned table to disk without the index column.
df_all.to_csv("hockey_data_1.csv", index=False)