# # March Madness Predictions
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the current directory tree and print the path of every file in it.
for root, _subdirs, files in os.walk('.'):
    for file_name in files:
        print(os.path.join(root, file_name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
def change_loc(loc):
    """Return the game location as seen from the other team's side.

    'H' becomes 'A', 'A' becomes 'H', and every other value yields 'N'.
    """
    if loc not in ('H', 'A'):
        return 'N'
    return 'A' if loc == 'H' else 'H'
# All competition tables used below are read from the same Stage-1 folder.
folder = "mens-march-mania-2022/MDataFiles_Stage1/"
seeds, conferences, regular_detail, tourney_compact = (
    pd.read_csv(folder + csv_name)
    for csv_name in (
        "MNCAATourneySeeds.csv",
        "MTeamConferences.csv",
        "MRegularSeasonDetailedResults.csv",
        "MNCAATourneyCompactResults.csv",
    )
)
# Reshape each detailed game row into two team-centric rows: one from the
# winner's point of view and one from the loser's, sharing the same columns.
stat_names = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
              'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
columns = (['Season', 'TeamID', 'Points', 'Opp_Points', 'Loc', 'NumOT']
           + stat_names
           + ['Opp_' + stat for stat in stat_names])


def _box_score_source_columns(own, opp):
    """List the raw result columns for the `own`-prefixed team facing `opp`."""
    return (['Season', own + 'TeamID', own + 'Score', opp + 'Score', 'WLoc', 'NumOT']
            + [own + stat for stat in stat_names]
            + [opp + stat for stat in stat_names])


# Winner rows: own stats come from the W* columns, opponent stats from L*.
win_teams = regular_detail[_box_score_source_columns('W', 'L')].copy()
win_teams.columns = columns
win_teams = win_teams.assign(Wins=1, Losses=0)

# Loser rows: the prefixes are swapped, and WLoc (recorded for the winner)
# is flipped so that Loc describes the losing team's location.
lose_teams = regular_detail[_box_score_source_columns('L', 'W')].copy()
lose_teams.columns = columns
lose_teams['Loc'] = lose_teams['Loc'].map(change_loc)
lose_teams = lose_teams.assign(Wins=0, Losses=1)

win_lose_teams = pd.concat([win_teams, lose_teams])
win_lose_teams
# Collapse the per-game rows into one row of season totals per (Season, TeamID).
season_team_groups = win_lose_teams.groupby(['Season', 'TeamID'])
combined_teams = season_team_groups.sum()
# Every game row carries exactly one win or one loss, so their sum is games played.
combined_teams["Total_Games"] = combined_teams[["Wins", "Losses"]].sum(axis=1)
display(combined_teams)
combined_teams.columns.values
# Per-team, per-season features derived from the season totals in combined_teams.
# Column order matters: later cells select features by position.
games = combined_teams['Total_Games']
regular_season_input = pd.DataFrame()
regular_season_input['Win_Ratio'] = combined_teams['Wins'] / games
regular_season_input['Points_Per_Game'] = combined_teams['Points'] / games
regular_season_input['Points_Opp_Per_Game'] = combined_teams['Opp_Points'] / games
regular_season_input['Points_Ratio'] = combined_teams['Points'] / combined_teams['Opp_Points']
regular_season_input['OT_Per_Game'] = combined_teams['NumOT'] / games
# any shots
regular_season_input['FG_Per_Game'] = combined_teams['FGM'] / games
regular_season_input['FG_Opp_Per_Game'] = combined_teams['Opp_FGM'] / games
regular_season_input['FG_Ratio'] = combined_teams['FGM'] / combined_teams['FGA']
# 3 pointers
regular_season_input['FG3_Per_Game'] = combined_teams['FGM3'] / games
regular_season_input['FG3_Opp_Per_Game'] = combined_teams['Opp_FGM3'] / games
regular_season_input['FG3_Ratio'] = combined_teams['FGM3'] / combined_teams['FGA3']
# free throws
regular_season_input['FT_Per_Game'] = combined_teams['FTM'] / games
regular_season_input['FT_Opp_Per_Game'] = combined_teams['Opp_FTM'] / games
regular_season_input['FT_Ratio'] = combined_teams['FTM'] / combined_teams['FTA']
# offensive rebounds: share of the rebounds available at the team's offensive end
regular_season_input['OR_Ratio'] = combined_teams['OR'] / (combined_teams['OR'] + combined_teams['Opp_DR'])
# defensive rebounds: share of the rebounds available at the team's defensive end.
# The denominator is own DR plus opponent OR (it previously reused the
# offensive-rebound denominator, OR + Opp_DR).
regular_season_input['DR_Ratio'] = combined_teams['DR'] / (combined_teams['DR'] + combined_teams['Opp_OR'])
# assists
regular_season_input['Ast_Per_Game'] = combined_teams['Ast'] / games
# turnovers
regular_season_input['TO_Per_Game'] = combined_teams['TO'] / games
# steals
regular_season_input['Stl_Per_Game'] = combined_teams['Stl'] / games
# blocks
regular_season_input['Blk_Per_Game'] = combined_teams['Blk'] / games
# personal fouls
regular_season_input['PF_Per_Game'] = combined_teams['PF'] / games
display(regular_season_input)
display(regular_season_input.describe())
display(seeds)
def get_seeds(team: str, seeds: list) -> list:
    """Append the numeric tournament seed of every row's `team` to `seeds`.

    Reads the module-level `tourney_input` (one row per game side) and
    `seed_dict` (seed table indexed by (Season, TeamID)).

    Args:
        team: name of the `tourney_input` column holding the team id,
            e.g. "Team_1" or "Team_2".
        seeds: list that receives the seeds; it is extended in place.
            Note that this parameter shadows the module-level `seeds` table.

    Returns:
        The same `seeds` list, with one int appended per row of
        `tourney_input`, in row order.

    Raises:
        KeyError: if a (Season, team id) pair has no entry in `seed_dict`.
    """
    # Iterate the values themselves rather than indexing by 0..n-1, so the
    # result does not depend on tourney_input having a default integer index.
    for season, team_id in zip(tourney_input['Season'], tourney_input[team]):
        raw_seed = seed_dict.loc[(season, team_id)].values[0]
        # Seed strings are one leading character, two digits and, for some
        # entries, one trailing character; characters 1-2 are the seed number
        # in both the 3- and 4-character forms.
        seeds.append(int(raw_seed[1:3]))
    return seeds
# Seed lookup table keyed by (Season, TeamID); used by get_seeds.
seed_dict = seeds.set_index(['Season', 'TeamID'])
win_ids = tourney_compact['WTeamID']
lose_ids = tourney_compact['LTeamID']
season = tourney_compact['Season']

# Every tournament game is listed twice: once with the winner as Team_1
# (Won? = True) and once with the loser as Team_1 (Won? = False).
matchup_columns = ['Season', 'Team_1', 'Team_2']

winners = tourney_compact[['Season', 'WTeamID', 'LTeamID']].copy()
winners.columns = matchup_columns
winners['Won?'] = True

losers = tourney_compact[['Season', 'LTeamID', 'WTeamID']].copy()
losers.columns = matchup_columns
losers['Won?'] = False

# Winner rows come first, then loser rows; only seasons from 2003 on are kept.
tourney_input = pd.concat([winners, losers])
tourney_input = tourney_input.loc[tourney_input['Season'] >= 2003].reset_index(drop=True)
for side in ("Team_1", "Team_2"):
    tourney_input[side + '_Seed'] = get_seeds(side, [])
# The sorted frame is not assigned, so tourney_input keeps its row order.
tourney_input.sort_values(by=["Season"], ascending=True)
# Hold out the most recent tournament season as the test set and train on all
# earlier seasons. The split used to be written as fixed row offsets
# (0:1115 / 1115:1181 for the winner rows, 1181:2296 / 2296:2362 for the loser
# rows), which only works for one exact file. Selecting by season picks the
# same rows as long as the latest season forms the last block of each half,
# and keeps working if the input data changes.
latest_season = tourney_input['Season'].max()
held_out = tourney_input['Season'] == latest_season
won = tourney_input['Won?'].astype(bool)

# Same four pieces as before: winner-perspective rows first, then loser rows.
training_data1 = tourney_input[~held_out & won]
training_data2 = tourney_input[~held_out & ~won]
test_data1 = tourney_input[held_out & won]
test_data2 = tourney_input[held_out & ~won]
a = [training_data1, training_data2]
b = [test_data1, test_data2]

# reset_index() keeps the original row labels in an "index" column.
training_data = pd.concat(a).reset_index()
test_data = pd.concat(b).reset_index()
tourney_input
def get_training_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season features plus seed for one side of a training game.

    Args:
        team: column of `training_data` naming the side ("Team_1" or "Team_2").
        x: row label of the game in `training_data`.

    Returns:
        The `regular_season_input` row for that (Season, team id), with an
        extra 'Seed' entry taken from the `<team>_Seed` column.
    """
    season = training_data.loc[x, 'Season']
    team_id = training_data.loc[x, team]
    team_score = regular_season_input.loc[(season, team_id)]
    team_score['Seed'] = training_data.loc[x, team + "_Seed"]
    return team_score
# Score differences: each training row is Team_1's feature vector minus
# Team_2's, followed by the game outcome in 'Won?'.
train_outcomes = training_data['Won?']
train_rows = []
for x in range(len(training_data)):
    team_1_scores = get_training_scores("Team_1", x)
    team_2_scores = get_training_scores("Team_2", x)
    train_output = team_1_scores - team_2_scores
    train_output['Won?'] = train_outcomes[x]
    train_rows.append(train_output)
train_output_scores = pd.DataFrame(train_rows)
display(train_output_scores)
display(train_output_scores.describe())
def get_test_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season features plus seed for one side of a test game.

    Args:
        team: column of `test_data` naming the side ("Team_1" or "Team_2").
        x: row label of the game in `test_data`.

    Returns:
        The `regular_season_input` row for that (Season, team id), with an
        extra 'Seed' entry taken from the `<team>_Seed` column.
    """
    season = test_data.loc[x, 'Season']
    team_id = test_data.loc[x, team]
    team_score = regular_season_input.loc[(season, team_id)]
    team_score['Seed'] = test_data.loc[x, team + "_Seed"]
    return team_score
# Score differences for the held-out games, built the same way as the
# training rows: Team_1 features minus Team_2 features, then 'Won?'.
test_outcomes = test_data['Won?']
test_rows = []
for x in range(len(test_data)):
    team_1_scores = get_test_scores("Team_1", x)
    team_2_scores = get_test_scores("Team_2", x)
    test_output = team_1_scores - team_2_scores
    test_output['Won?'] = test_outcomes[x]
    test_rows.append(test_output)
output_scores_test = pd.DataFrame(test_rows)
display(output_scores_test)
display(output_scores_test.describe())
# Rank the feature differences by how strongly they correlate with the game
# result (absolute correlation with 'Won?', closer to 1 = stronger).
correlation = train_output_scores.corr().round(2)
display(correlation['Won?'].abs().sort_values(ascending=False))
# ## Random Forest Classifier
# Random forest on the feature differences: train on the training seasons and
# score on the held-out games.
# 'Won?' is the last column of both frames; everything before it is a feature.
feature_columns = train_output_scores.columns[:-1]
X_train = train_output_scores[feature_columns].values
y_train = train_output_scores['Won?'].values
X_test = output_scores_test[feature_columns].values
y_test = output_scores_test['Won?'].values

# Normalization: learn the min/max from the training data only, then apply
# that same mapping to the test data. Re-fitting the scaler on the test set
# would rescale it with different bounds than the model was trained on.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
display(X_train)

model = RandomForestClassifier(random_state=1)
model = model.fit(X_train, y_train)
model.score(X_test, y_test)
# ## Elo Rankings
def win_prob(team1_elo, team2_elo):
    """Return team 1's win probability from the two Elo ratings.

    Uses the logistic Elo formula 1 / (1 + 10 ** ((team2 - team1) / 400)).
    """
    rating_gap = (team2_elo - team1_elo) / 400
    odds_against = 10 ** rating_gap
    return 1 / (1 + odds_against)
# 2021 Elo Rankings: https://www.warrennolan.com/basketball/2021/elo
# read_html returns every table on the page; the first one is used as the
# rankings table and saved to disk.
url = "https://www.warrennolan.com/basketball/2021/elo"
data = pd.read_html(url)
elo_data = data[0]
elo_data.to_csv("processed_datasets/elo_rankings.csv")
elo_data
# Name/id pairs for the teams whose LastD1Season is 2022.
seed_team_data = pd.read_csv("mens-march-mania-2022/MDataFiles_Stage1/MTeams.csv")
last_season_is_2022 = seed_team_data["LastD1Season"] == 2022
team_data = seed_team_data.loc[last_season_is_2022, ["TeamName", "TeamID"]]
display(team_data)
# Drop the saved row labels, then attach Team_1's name and, through the name,
# its row from the Elo table. Both merges use the default join type.
training_data = training_data.drop(columns="index")
display(training_data)

train_with_names = training_data.merge(team_data, left_on="Team_1", right_on="TeamID")
train_with_names = train_with_names.sort_values("Season").reset_index()

train_with_elo = train_with_names.merge(elo_data, left_on="TeamName", right_on="Team")
train_with_elo = train_with_elo.sort_values("Season").reset_index()

# The two reset_index() calls leave "index" and "level_0" helper columns behind.
new_train_df = train_with_elo.drop(columns=["level_0", "index"])
display(new_train_df)
def new_training_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season features plus seed for one side of a `new_train_df` game.

    Args:
        team: column of `new_train_df` naming the side ("Team_1" or "Team_2").
        x: row label of the game in `new_train_df`.

    Returns:
        The `regular_season_input` row for that (Season, team id), with an
        extra 'Seed' entry taken from the `<team>_Seed` column.
    """
    season = new_train_df.loc[x, 'Season']
    team_id = new_train_df.loc[x, team]
    team_score = regular_season_input.loc[(season, team_id)]
    team_score['Seed'] = new_train_df.loc[x, team + "_Seed"]
    return team_score
# Score differences for the Elo-augmented training games. Each row is the
# Team_1 minus Team_2 feature difference, then 'Won?', then the "ELO" value
# carried by the merged row (the merge was keyed on Team_1).
new_train_outcomes = new_train_df['Won?']
new_train_elo = new_train_df["ELO"]
new_train_rows = []
for x in range(len(new_train_df)):
    team_1_scores = new_training_scores("Team_1", x)
    team_2_scores = new_training_scores("Team_2", x)
    new_train_output = team_1_scores - team_2_scores
    new_train_output['Won?'] = new_train_outcomes[x]
    new_train_output["ELO"] = new_train_elo[x]
    new_train_rows.append(new_train_output)
new_output_scores = pd.DataFrame(new_train_rows)
display(new_output_scores)
display(new_output_scores.describe())
# Attach Team_1's name and Elo row to the held-out games, mirroring the
# training-side merge. test_data still carries an "index" column, so the first
# reset_index() adds "level_0"; both helper columns are dropped before the
# Elo merge, and the "index" column from the second reset is dropped after it.
test_with_names = test_data.merge(team_data, left_on="Team_1", right_on="TeamID")
test_with_names = test_with_names.sort_values("Season").reset_index()
test_with_names = test_with_names.drop(columns=["level_0", "index"])

test_with_elo = test_with_names.merge(elo_data, left_on="TeamName", right_on="Team")
test_with_elo = test_with_elo.sort_values("Season").reset_index()

new_test_df = test_with_elo.drop(columns=["index"])
display(new_test_df)
def new_test_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season features plus seed for one side of a `new_test_df` game.

    Args:
        team: column of `new_test_df` naming the side ("Team_1" or "Team_2").
        x: row label of the game in `new_test_df`.

    Returns:
        The `regular_season_input` row for that (Season, team id), with an
        extra 'Seed' entry taken from the `<team>_Seed` column.
    """
    season = new_test_df.loc[x, 'Season']
    team_id = new_test_df.loc[x, team]
    team_score = regular_season_input.loc[(season, team_id)]
    team_score['Seed'] = new_test_df.loc[x, team + "_Seed"]
    return team_score
# Score differences for the Elo-augmented held-out games, with the same
# column order as the training rows: feature differences, 'Won?', then "ELO".
new_test_outcomes = new_test_df['Won?']
new_test_elo = new_test_df["ELO"]
new_test_rows = []
for x in range(len(new_test_df)):
    team_1_scores = new_test_scores("Team_1", x)
    team_2_scores = new_test_scores("Team_2", x)
    new_test_output = team_1_scores - team_2_scores
    new_test_output['Won?'] = new_test_outcomes[x]
    new_test_output["ELO"] = new_test_elo[x]
    new_test_rows.append(new_test_output)
new_test_output_scores = pd.DataFrame(new_test_rows)
display(new_test_output_scores)
display(new_test_output_scores.describe())
# Random forest on the Elo-augmented feature differences.
# In these frames "ELO" is the last column and 'Won?' the one before it, so
# slicing off only the last column would drop the Elo feature and feed the
# label itself to the model. Select the features by name: every column
# except the label.
label_column = 'Won?'
feature_columns = [col for col in new_output_scores.columns if col != label_column]
X_train = new_output_scores[feature_columns].values
y_train = new_output_scores[label_column].values
X_test = new_test_output_scores[feature_columns].values
y_test = new_test_output_scores[label_column].values

# Normalization: fit the scaler on the training data only and reuse the same
# mapping for the test data.
# NOTE(review): assumes the "ELO" column from the scraped table is numeric - confirm.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(random_state=1)
model = model.fit(X_train, y_train)
model.score(X_test, y_test)
# ## Neural Network
import sys
import tensorflow as tf
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Dataset used for training and validating the network.
train_output_scores
# The first 22 columns are the inputs; column 22 is the target.
features = train_output_scores.iloc[:, :22]
y = train_output_scores.iloc[:, 22]
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, train_size=0.8)

# Keras model: 22 inputs -> 10 relu -> 5 relu -> 1 sigmoid output.
neuralNet = Sequential([
    Dense(10, input_dim=22, activation='relu'),
    Dense(5, activation='relu'),
    Dense(1, activation='sigmoid'),
])
neuralNet.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit on the 80% split, then report accuracy on the remaining 20%.
neuralNet.fit(features_train, y_train, epochs=50, batch_size=5)
_, accuracy = neuralNet.evaluate(features_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))
# ## Custom Algorithm
from xgboost import XGBRegressor
# Raw competition tables plus the pre-computed conference rankings.
stage1_dir = "mens-march-mania-2022/MDataFiles_Stage1/"
team_id_name = pd.read_csv(stage1_dir + "MTeams.csv")
region = pd.read_csv(stage1_dir + "MSeasons.csv")
seed = pd.read_csv(stage1_dir + "MNCAATourneySeeds.csv")
win_loss_season = pd.read_csv(stage1_dir + "MRegularSeasonCompactResults.csv")
win_loss_tourn = pd.read_csv(stage1_dir + "MNCAATourneyCompactResults.csv")
conference = pd.read_csv(stage1_dir + "MTeamConferences.csv")
conf_rankings = pd.read_csv("processed_datasets/conf_rankings.csv")
# Build one lookup of seed, team name and conference per (Season, TeamID)
# for the 2012-2019 seasons.
team_id_name.drop(columns=['FirstD1Season', 'LastD1Season'], inplace=True)
seed = seed[seed['Season'].between(2012, 2019)]
conference = conference[conference['Season'].between(2012, 2019)]
season_team_seed = pd.merge(seed, team_id_name, on="TeamID", how="left")
season_team_seed_conf = season_team_seed.merge(conference, on=["Season", "TeamID"], how="left")


def _attach_team_info(games, side):
    """Join seed, name and conference of the `side` ('W' or 'L') team onto `games`."""
    joined = games.merge(season_team_seed_conf,
                         left_on=['Season', side + 'TeamID'],
                         right_on=['Season', 'TeamID'])
    joined = joined.drop(columns=['TeamID'])
    return joined.rename(columns={"Seed": side + "TeamSeed",
                                  "TeamName": side + "TeamName",
                                  "ConfAbbrev": side + "ConfAbbrev"})


# Regular-season games for 2012-2019, annotated first for the winner and then
# for the loser. The default join keeps only games where the team has a row
# in the seed lookup.
win_loss_season = win_loss_season[win_loss_season['Season'].between(2012, 2019)]
season_data_temp = _attach_team_info(win_loss_season, 'W')
season_data = _attach_team_info(season_data_temp, 'L')
# Tournament games for 2012-2019: copy DayNum into GameType and drop WLoc.
in_window = win_loss_tourn['Season'].between(2012, 2019)
win_loss_tourn = (
    win_loss_tourn[in_window]
    .assign(GameType=lambda games: games['DayNum'])
    .drop(columns=['WLoc'])
)

# Translate day numbers into round labels.
game_type = {
    134: "PlayIn", 135: "PlayIn",
    136: "Round1", 137: "Round1",
    138: "Round2", 139: "Round2",
    143: "Round3", 144: "Round3",
    145: "Round4", 146: "Round4",
    152: "Round5",
    154: "Round6",
}
win_loss_tourn = win_loss_tourn.replace({"GameType": game_type})
# Day numbers 140, 147 and 148 have no label above; those games are discarded.
win_loss_tourn = win_loss_tourn[~win_loss_tourn['GameType'].isin([140, 147, 148])]

# Annotate each game with the winner's, then the loser's, seed/name/conference.
tourn_data_temp = (
    win_loss_tourn
    .merge(season_team_seed_conf, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .drop(columns=['TeamID'])
    .rename(columns={"Seed": "WTeamSeed", "TeamName": "WTeamName", "ConfAbbrev": "WConfAbbrev"})
)
tourn_data = (
    tourn_data_temp
    .merge(season_team_seed_conf, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .drop(columns=['TeamID'])
    .rename(columns={"Seed": "LTeamSeed", "TeamName": "LTeamName", "ConfAbbrev": "LConfAbbrev"})
)
# Map each conference to its "Strength of Opponent Ranking" value.
replace_conf = conf_rankings.set_index("Conference")["Strength of Opponent Ranking"].to_dict()

# Rows where either team's conference is 'aac' are removed, then every value
# found in the mapping is replaced by its ranking (the replacement is applied
# to the whole frame, not only to the conference columns).
for games in (season_data, tourn_data):
    for conf_column in ('WConfAbbrev', 'LConfAbbrev'):
        games.drop(games.index[games[conf_column] == 'aac'], inplace=True)
    games.replace(replace_conf, inplace=True)
def normalize(df_col):
    """Standardize `df_col`: subtract its mean and divide by its standard deviation.

    The mean and standard deviation are whatever `df_col.mean()` and
    `df_col.std()` return for the given pandas object.
    """
    centered = df_col - df_col.mean()
    return centered / df_col.std()
# Turn seeds and scores into standardized numeric features for both tables.
# The digit pattern is a raw string: '\d' in a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
SEED_DIGITS = r'(\d+)'

# Tournament games: seeds are parsed as ints, negated so that a better
# (numerically lower) seed gives a higher value, then standardized.
tourn_data.drop(['DayNum', 'NumOT', 'GameType'], axis=1, inplace=True)
for seed_column in ("WTeamSeed", "LTeamSeed"):
    # expand=False returns a Series for the single capture group.
    seed_number = tourn_data[seed_column].str.extract(SEED_DIGITS, expand=False).astype('int')
    tourn_data[seed_column] = normalize(-seed_number)
for score_column in ("WScore", "LScore"):
    tourn_data[score_column] = normalize(tourn_data[score_column])
tourn_data["Type"] = 1  # marks tournament games

# Regular-season games: same treatment, but seeds are parsed as floats.
season_data.drop(['DayNum', 'NumOT', 'WLoc'], axis=1, inplace=True)
for seed_column in ("WTeamSeed", "LTeamSeed"):
    seed_number = season_data[seed_column].str.extract(SEED_DIGITS, expand=False).astype('float')
    season_data[seed_column] = normalize(-seed_number)
for score_column in ("WScore", "LScore"):
    season_data[score_column] = normalize(season_data[score_column])
season_data["Type"] = 0  # marks regular-season games
# Stack regular-season and tournament games into one table.
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same
# row-wise stacking and works on older versions too.
all_data = pd.concat([season_data, tourn_data])
all_data["Type"] = normalize(all_data["Type"])
all_data["WConfAbbrev"] = all_data["WConfAbbrev"].astype('float')
all_data["LConfAbbrev"] = all_data["LConfAbbrev"].astype('float')

# Model inputs: everything except names, ids, scores and the game-type flag.
input_data = all_data.drop(
    ["WTeamName", "LTeamName", "Type", "WTeamID", "LTeamID", "LScore", "WScore"], axis=1)
# NOTE(review): the regression target is the standardized winner team ID -
# confirm this is the intended target.
X, y = input_data, normalize(all_data["WTeamID"])
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

# early_stopping_rounds is passed to the estimator constructor; recent
# XGBoost releases no longer accept it as a fit() argument.
model = XGBRegressor(n_estimators=1000, early_stopping_rounds=5)
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=False)
model.score(X_valid, y_valid)
input_data
# Save the three processed tables for reuse outside this notebook.
for frame, file_stem in (
    (season_team_seed_conf, "season_team_seed_conf"),
    (season_data, "season_data"),
    (tourn_data, "tournament_data"),
):
    frame.to_csv("processed_datasets/" + file_stem + ".csv")
# ## Further Experiments
# Join the strength-of-schedule table onto the tournament matchups via
# Team_1's (Season, TeamID), then order the result by season.
df = pd.read_csv("processed_datasets/season_team_seed_conf_w_SOS.csv")
sos_columns = ["Season", "TeamID", "Strength of Schedule", "ConfAbbrev", 'Average "Placement"']
df = df[sos_columns]
new_df = (
    tourney_input
    .merge(df, left_on=['Season', 'Team_1'], right_on=['Season', 'TeamID'])
    .sort_values('Season', ascending=True)
)
display(new_df)
# ## Final Test
# Test on the final model: the neural network.
# Dataset used for testing (held-out games from the 2021 season).
output_scores_test
# Same layout as the training frame: 22 input columns, then the target.
features_t = output_scores_test.iloc[:, :22]
y_t = output_scores_test.iloc[:, 22]
# Evaluate the network on the held-out data and report its accuracy.
_, accuracy = neuralNet.evaluate(features_t, y_t)
print('Accuracy: %.2f' % (accuracy*100))