March Madness Predictions

# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn.ensemble import RandomForestClassifier # Input data files are available in the read-only "../input/" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('.'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

def change_loc(loc):
    """Return the location code flipped to the other team's point of view.

    'H' becomes 'A', 'A' becomes 'H', and every other value yields 'N'.
    """
    for own_side, other_side in (('H', 'A'), ('A', 'H')):
        if loc == own_side:
            return other_side
    return 'N'

# Directory that holds the Stage 1 competition CSV files.
folder = "mens-march-mania-2022/MDataFiles_Stage1/"

# Load the four source tables; the file names are listed in the same order
# as the variables on the left-hand side.
seeds, conferences, regular_detail, tourney_compact = (
    pd.read_csv(folder + csv_name)
    for csv_name in (
        "MNCAATourneySeeds.csv",
        "MTeamConferences.csv",
        "MRegularSeasonDetailedResults.csv",
        "MNCAATourneyCompactResults.csv",
    )
)

# Target column names: one row per (game, team), with the team's own box
# score first and the opponent's box score under the 'Opp_' prefix.
columns = [
    'Season', 'TeamID', 'Points', 'Opp_Points', 'Loc', 'NumOT',
    'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
    'Ast', 'TO', 'Stl', 'Blk', 'PF',
    'Opp_FGM', 'Opp_FGA', 'Opp_FGM3', 'Opp_FGA3', 'Opp_FTM', 'Opp_FTA',
    'Opp_OR', 'Opp_DR', 'Opp_Ast', 'Opp_TO', 'Opp_Stl', 'Opp_Blk', 'Opp_PF',
]

# Winner's view of every game: W* columns are "own", L* columns are "opponent".
win_teams = regular_detail[[
    'Season', 'WTeamID', 'WScore', 'LScore', 'WLoc', 'NumOT',
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
    'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR',
    'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
]].copy()
win_teams.columns = columns
win_teams = win_teams.assign(Wins=1, Losses=0)

# Loser's view of the same games: L* columns are "own", W* are "opponent".
lose_teams = regular_detail[[
    'Season', 'LTeamID', 'LScore', 'WScore', 'WLoc', 'NumOT',
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR',
    'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
    'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
]].copy()
lose_teams.columns = columns
# 'Loc' was copied from WLoc (the winner's location), so flip it for the loser.
lose_teams['Loc'] = lose_teams['Loc'].map(change_loc)
lose_teams = lose_teams.assign(Wins=0, Losses=1)

# Stack both views: every game contributes two rows.
win_lose_teams = pd.concat([win_teams, lose_teams])
win_lose_teams

# Season totals for every (Season, TeamID) pair.
# numeric_only=True is stated explicitly so the text column 'Loc' is never
# aggregated: the default for this flag differs between pandas releases, and
# summing strings would add a meaningless concatenated 'Loc' column.
combined_teams = win_lose_teams.groupby(['Season', 'TeamID']).sum(numeric_only=True)
combined_teams["Total_Games"] = combined_teams["Wins"] + combined_teams["Losses"]
display(combined_teams)
combined_teams.columns.values

# Per-team, per-season features derived from the season totals in
# `combined_teams`. The result shares its (Season, TeamID) index.
# Column order matters: later cells select features by position.
regular_season_input = pd.DataFrame()
regular_season_input['Win_Ratio'] = combined_teams['Wins'] / combined_teams['Total_Games']
regular_season_input['Points_Per_Game'] = combined_teams['Points'] / combined_teams['Total_Games']
regular_season_input['Points_Opp_Per_Game'] = combined_teams['Opp_Points'] / combined_teams['Total_Games']
regular_season_input['Points_Ratio'] = combined_teams['Points'] / combined_teams['Opp_Points']
regular_season_input['OT_Per_Game'] = combined_teams['NumOT'] / combined_teams['Total_Games']
# any shots
regular_season_input['FG_Per_Game'] = combined_teams['FGM'] / combined_teams['Total_Games']
regular_season_input['FG_Opp_Per_Game'] = combined_teams['Opp_FGM'] / combined_teams['Total_Games']
regular_season_input['FG_Ratio'] = combined_teams['FGM'] / combined_teams['FGA']
# 3 pointers
regular_season_input['FG3_Per_Game'] = combined_teams['FGM3'] / combined_teams['Total_Games']
regular_season_input['FG3_Opp_Per_Game'] = combined_teams['Opp_FGM3'] / combined_teams['Total_Games']
regular_season_input['FG3_Ratio'] = combined_teams['FGM3'] / combined_teams['FGA3']
# free throws
regular_season_input['FT_Per_Game'] = combined_teams['FTM'] / combined_teams['Total_Games']
regular_season_input['FT_Opp_Per_Game'] = combined_teams['Opp_FTM'] / combined_teams['Total_Games']
regular_season_input['FT_Ratio'] = combined_teams['FTM'] / combined_teams['FTA']
# offensive rebounds: share of rebounds won at the team's offensive end
# (own offensive rebounds vs. the opponent's defensive rebounds)
regular_season_input['OR_Ratio'] = combined_teams['OR'] / (combined_teams['OR'] + combined_teams['Opp_DR'])
# defensive rebounds: share of rebounds won at the team's defensive end
# (own defensive rebounds vs. the opponent's offensive rebounds).
# The denominator previously reused OR + Opp_DR from the line above.
regular_season_input['DR_Ratio'] = combined_teams['DR'] / (combined_teams['DR'] + combined_teams['Opp_OR'])
# assists
regular_season_input['Ast_Per_Game'] = combined_teams['Ast'] / combined_teams['Total_Games']
# turnovers
regular_season_input['TO_Per_Game'] = combined_teams['TO'] / combined_teams['Total_Games']
# steals
regular_season_input['Stl_Per_Game'] = combined_teams['Stl'] / combined_teams['Total_Games']
# blocks
regular_season_input['Blk_Per_Game'] = combined_teams['Blk'] / combined_teams['Total_Games']
# personal fouls
regular_season_input['PF_Per_Game'] = combined_teams['PF'] / combined_teams['Total_Games']

# Show the engineered feature table, then its summary statistics.
display(regular_season_input, regular_season_input.describe())

# Show the tournament seed table as loaded from disk.
display(seeds)

def get_seeds(team: str, seeds: list) -> list:
    """Append the numeric tournament seed of each `team` entry to `seeds`.

    Walks the module-level `tourney_input` frame row by row, looks up the
    (Season, team id) pair in the module-level `seed_dict` frame and converts
    the seed string to an int.

    Args:
        team: Name of the `tourney_input` column holding team ids
            ("Team_1" or "Team_2").
        seeds: List the parsed seeds are appended to (mutated in place).

    Returns:
        The same `seeds` list, one new entry per row of `tourney_input`.

    Raises:
        KeyError: If a (Season, team id) pair has no entry in `seed_dict`.
    """
    for season_year, team_id in zip(tourney_input['Season'], tourney_input[team]):
        raw_seed = seed_dict.loc[(season_year, team_id)].values[0]
        # Characters 1-2 hold the digits for both the 3-character form and the
        # 4-character form with one trailing character (e.g. 'W01', 'X16a').
        seeds.append(int(raw_seed[1:3]))
    return seeds


# Seed strings keyed by (Season, TeamID) for direct lookup in get_seeds.
seed_dict = seeds.set_index(['Season', 'TeamID'])
win_ids = tourney_compact['WTeamID']
lose_ids = tourney_compact['LTeamID']
season = tourney_compact['Season']

# Every tournament game appears twice: once from the winner's side
# (Won? = True) and once from the loser's side (Won? = False).
winners = pd.DataFrame()
winners[['Season', 'Team_1', 'Team_2']] = tourney_compact[['Season', 'WTeamID', 'LTeamID']]
winners['Won?'] = True
losers = pd.DataFrame()
losers[['Season', 'Team_1', 'Team_2']] = tourney_compact[['Season', 'LTeamID', 'WTeamID']]
losers['Won?'] = False
tourney_input = pd.concat([winners, losers])
tourney_input = tourney_input[tourney_input['Season'] >= 2003].reset_index(drop=True)
tourney_input['Team_1_Seed'] = get_seeds("Team_1", [])
tourney_input['Team_2_Seed'] = get_seeds("Team_2", [])

# Hold out one season for testing and train on everything before it.
# This replaces a sort_values() call whose result was discarded and a set of
# hard-coded row offsets that only worked because the frame was never sorted.
# NOTE(review): assumes those offsets were meant to isolate the 2021 season,
# as the final-evaluation comment in this file states - confirm.
TEST_SEASON = 2021
training_data = tourney_input[tourney_input['Season'] < TEST_SEASON].reset_index()
test_data = tourney_input[tourney_input['Season'] == TEST_SEASON].reset_index()

tourney_input

def get_training_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season feature row, plus 'Seed', for one training game.

    Args:
        team: Column prefix of the side to look up ("Team_1" or "Team_2").
        x: Row label in the module-level `training_data` frame.

    Returns:
        The `regular_season_input` row for that (Season, team id) with an
        extra 'Seed' entry taken from `training_data`.
    """
    lookup_key = (training_data.at[x, 'Season'], training_data.at[x, team])
    team_score = regular_season_input.loc[lookup_key]
    team_score['Seed'] = training_data.at[x, team + "_Seed"]
    return team_score


#score differences
train_output_scores = []
for x in training_data.index:
    # Team_1 minus Team_2 for every feature, then attach the label.
    train_output = get_training_scores("Team_1", x) - get_training_scores("Team_2", x)
    train_output['Won?'] = training_data.at[x, 'Won?']
    train_output_scores.append(train_output)
train_output_scores = pd.DataFrame(train_output_scores)
display(train_output_scores, train_output_scores.describe())

def get_test_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season feature row, plus 'Seed', for one test game.

    Args:
        team: Column prefix of the side to look up ("Team_1" or "Team_2").
        x: Row label in the module-level `test_data` frame.

    Returns:
        The `regular_season_input` row for that (Season, team id) with an
        extra 'Seed' entry taken from `test_data`.
    """
    lookup_key = (test_data.at[x, 'Season'], test_data.at[x, team])
    team_score = regular_season_input.loc[lookup_key]
    team_score['Seed'] = test_data.at[x, team + "_Seed"]
    return team_score


#score differences
output_scores_test = []
for x in test_data.index:
    # Team_1 minus Team_2 for every feature, then attach the label.
    test_output = get_test_scores("Team_1", x) - get_test_scores("Team_2", x)
    test_output['Won?'] = test_data.at[x, 'Won?']
    output_scores_test.append(test_output)
output_scores_test = pd.DataFrame(output_scores_test)
display(output_scores_test, output_scores_test.describe())

# which ratios correlate highly with the game result (closer to 1 = strongly correlated)
correlation = train_output_scores.corr().round(2)
# Rank features by the magnitude of their correlation with the label.
display(correlation['Won?'].abs().sort_values(ascending=False))

Random Forest Classifier

# model using training data
# Features are every column except the label 'Won?'.
X_train = train_output_scores.drop(columns='Won?').values
y_train = train_output_scores['Won?'].values
X_test = output_scores_test.drop(columns='Won?').values
y_test = output_scores_test['Won?'].values

# normalization
# Fit the scaler on the training set only and reuse those min/max values for
# the test set. Re-fitting on the test set would scale the two sets
# differently and let test-set statistics influence the evaluation.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
display(X_train)

# Train a random forest with a fixed seed, then report its mean accuracy on
# the held-out test set.
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)
model.score(X_test, y_test)

Elo Rankings

def win_prob(team1_elo, team2_elo):
    """Return team 1's win probability given both teams' Elo ratings.

    Computes 1 / (1 + 10 ** ((team2_elo - team1_elo) / 400)).
    """
    scaled_gap = (team2_elo - team1_elo) / 400
    return 1 / (1 + 10 ** scaled_gap)


#2021 Elo Rankings: https://www.warrennolan.com/basketball/2021/elo
url = "https://www.warrennolan.com/basketball/2021/elo"
# read_html returns every table on the page; the first one is kept and cached.
data = pd.read_html(url)
elo_data = data[0]
elo_data.to_csv("processed_datasets/elo_rankings.csv")
elo_data

seed_team_data = pd.read_csv("mens-march-mania-2022/MDataFiles_Stage1/MTeams.csv")
# Keep the name/id pairs of teams whose LastD1Season is 2022.
team_data = seed_team_data.loc[
    seed_team_data["LastD1Season"] == 2022, ["TeamName", "TeamID"]
]
display(team_data)
# Remove the 'index' column that the earlier reset_index() call added.
training_data = training_data.drop("index", axis=1)
display(training_data)

# Attach Team_1's name, then the Elo table row for that name, to every
# training game. Both merges use the default (inner) join.
new_train_df = (
    pd.merge(training_data, team_data, left_on="Team_1", right_on="TeamID")
    .sort_values("Season")
    .reset_index()
)
new_train_df = (
    pd.merge(new_train_df, elo_data, left_on="TeamName", right_on="Team")
    .sort_values("Season")
    .reset_index()
    .drop(columns=["level_0", "index"])
)
display(new_train_df)


def new_training_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season feature row, plus 'Seed', for one merged training game.

    Args:
        team: Column prefix of the side to look up ("Team_1" or "Team_2").
        x: Row label in the module-level `new_train_df` frame.

    Returns:
        The `regular_season_input` row for that (Season, team id) with an
        extra 'Seed' entry taken from `new_train_df`.
    """
    lookup_key = (new_train_df.at[x, 'Season'], new_train_df.at[x, team])
    team_score = regular_season_input.loc[lookup_key]
    team_score['Seed'] = new_train_df.at[x, team + "_Seed"]
    return team_score


# score differences
new_output_scores = []
for x in new_train_df.index:
    new_train_output = new_training_scores("Team_1", x) - new_training_scores("Team_2", x)
    new_train_output['Won?'] = new_train_df.at[x, 'Won?']
    # ELO is appended after the label, so it becomes the last column.
    new_train_output["ELO"] = new_train_df.at[x, "ELO"]
    new_output_scores.append(new_train_output)
new_output_scores = pd.DataFrame(new_output_scores)
display(new_output_scores, new_output_scores.describe())

# Attach Team_1's name, then the Elo table row for that name, to every test
# game. Both merges use the default (inner) join.
new_test_df = (
    pd.merge(test_data, team_data, left_on="Team_1", right_on="TeamID")
    .sort_values("Season")
    .reset_index()
    .drop(columns=["level_0", "index"])
)
new_test_df = (
    pd.merge(new_test_df, elo_data, left_on="TeamName", right_on="Team")
    .sort_values("Season")
    .reset_index()
    .drop(columns=["index"])
)
display(new_test_df)


def new_test_scores(team: str, x: int) -> pd.Series:
    """Return the regular-season feature row, plus 'Seed', for one merged test game.

    Args:
        team: Column prefix of the side to look up ("Team_1" or "Team_2").
        x: Row label in the module-level `new_test_df` frame.

    Returns:
        The `regular_season_input` row for that (Season, team id) with an
        extra 'Seed' entry taken from `new_test_df`.
    """
    lookup_key = (new_test_df.at[x, 'Season'], new_test_df.at[x, team])
    team_score = regular_season_input.loc[lookup_key]
    team_score['Seed'] = new_test_df.at[x, team + "_Seed"]
    return team_score


# score differences
new_test_output_scores = []
for x in new_test_df.index:
    new_test_output = new_test_scores("Team_1", x) - new_test_scores("Team_2", x)
    new_test_output['Won?'] = new_test_df.at[x, 'Won?']
    # ELO is appended after the label, so it becomes the last column.
    new_test_output["ELO"] = new_test_df.at[x, "ELO"]
    new_test_output_scores.append(new_test_output)
new_test_output_scores = pd.DataFrame(new_test_output_scores)
display(new_test_output_scores, new_test_output_scores.describe())

# Select features by name. In these frames 'ELO' is the last column and the
# label 'Won?' sits just before it, so slicing with columns[:-1] dropped ELO
# and fed the label itself to the model as a feature.
X_train = new_output_scores.drop(columns='Won?').values
y_train = new_output_scores['Won?'].values
X_test = new_test_output_scores.drop(columns='Won?').values
y_test = new_test_output_scores['Won?'].values

# normalization
# Fit the scaler on the training set only and apply the same scaling to the
# test set, so both sets share one feature range.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(random_state=1)
model = model.fit(X_train, y_train)
model.score(X_test, y_test)

Neural Network

import sys
import tensorflow as tf
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#dataset I am training and validating with train_output_scores
# The first 22 columns are the features; column 22 is the label.
features = train_output_scores.iloc[:, 0:22]
y = train_output_scores.iloc[:, 22]
# A fixed random_state makes the 80/20 split, and so the reported validation
# accuracy, reproducible from run to run.
features_train, features_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)

#define the Keras model
# 22 inputs -> 10 ReLU units -> 5 ReLU units -> 1 sigmoid output.
neuralNet = Sequential([
    Dense(10, input_dim=22, activation='relu'),
    Dense(5, activation='relu'),
    Dense(1, activation='sigmoid'),
])

#compile Keras model
neuralNet.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

#fitting the Keras model
neuralNet.fit(x=features_train, y=y_train, batch_size=5, epochs=50)

#evaluate the Neural Network
# evaluate() returns the loss followed by the compiled metrics; only the
# accuracy is reported.
_, accuracy = neuralNet.evaluate(x=features_test, y=y_test)
print('Accuracy: %.2f' % (accuracy*100))

Custom Algorithm

from xgboost import XGBRegressor

# Load the raw competition tables; the file names are listed in the same
# order as the variables on the left-hand side.
team_id_name, region, seed, win_loss_season, win_loss_tourn, conference = (
    pd.read_csv("mens-march-mania-2022/MDataFiles_Stage1/" + csv_name)
    for csv_name in (
        "MTeams.csv",
        "MSeasons.csv",
        "MNCAATourneySeeds.csv",
        "MRegularSeasonCompactResults.csv",
        "MNCAATourneyCompactResults.csv",
        "MTeamConferences.csv",
    )
)
# Conference table prepared outside this notebook.
conf_rankings = pd.read_csv("processed_datasets/conf_rankings.csv")

team_id_name.drop(['FirstD1Season', 'LastD1Season'], axis=1, inplace=True)

# Restrict every table to the 2012-2019 seasons and build one lookup table
# with seed, team name and conference per (Season, TeamID).
seed = seed[seed['Season'].between(2012, 2019)]
conference = conference[conference['Season'].between(2012, 2019)]
season_team_seed = pd.merge(seed, team_id_name, on="TeamID", how="left")
season_team_seed_conf = season_team_seed.merge(conference, on=["Season", "TeamID"], how="left")

# Regular-season games: attach the winner's, then the loser's, seed/name/
# conference. The default inner joins keep only games where both teams appear
# in the seed table.
win_loss_season = win_loss_season[win_loss_season['Season'].between(2012, 2019)]
season_data_temp = win_loss_season.merge(
    season_team_seed_conf, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
season_data_temp.drop(['TeamID'], axis=1, inplace=True)
season_data_temp.rename(
    columns={"Seed": "WTeamSeed", "TeamName": "WTeamName", "ConfAbbrev": "WConfAbbrev"},
    inplace=True)
season_data = season_data_temp.merge(
    season_team_seed_conf, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
season_data.drop(['TeamID'], axis=1, inplace=True)
season_data.rename(
    columns={"Seed": "LTeamSeed", "TeamName": "LTeamName", "ConfAbbrev": "LConfAbbrev"},
    inplace=True)

# Tournament games. .copy() makes the filtered frame independent of the
# original, so the in-place edits below do not act on a slice.
win_loss_tourn = win_loss_tourn[win_loss_tourn['Season'].between(2012, 2019)].copy()
win_loss_tourn['GameType'] = win_loss_tourn['DayNum']
win_loss_tourn.drop(['WLoc'], axis=1, inplace=True)
# Map tournament day numbers to round labels.
game_type = {
    134: "PlayIn", 135: "PlayIn",
    136: "Round1", 137: "Round1",
    138: "Round2", 139: "Round2",
    143: "Round3", 144: "Round3",
    145: "Round4", 146: "Round4",
    152: "Round5",
    154: "Round6",
}
win_loss_tourn.replace({"GameType": game_type}, inplace=True)
# Day numbers 140, 147 and 148 are not in the mapping above; drop those games.
win_loss_tourn = win_loss_tourn[~win_loss_tourn['GameType'].isin([140, 147, 148])]
tourn_data_temp = win_loss_tourn.merge(
    season_team_seed_conf, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
tourn_data_temp.drop(['TeamID'], axis=1, inplace=True)
tourn_data_temp.rename(
    columns={"Seed": "WTeamSeed", "TeamName": "WTeamName", "ConfAbbrev": "WConfAbbrev"},
    inplace=True)
tourn_data = tourn_data_temp.merge(
    season_team_seed_conf, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
tourn_data.drop(['TeamID'], axis=1, inplace=True)
tourn_data.rename(
    columns={"Seed": "LTeamSeed", "TeamName": "LTeamName", "ConfAbbrev": "LConfAbbrev"},
    inplace=True)

# Replace conference abbreviations with their "Strength of Opponent Ranking".
# Games involving 'aac' are removed first.
# NOTE(review): presumably 'aac' has no entry in conf_rankings - confirm.
replace_conf = conf_rankings.set_index("Conference").to_dict()["Strength of Opponent Ranking"]
season_data.drop(season_data.index[season_data['WConfAbbrev'] == 'aac'], inplace=True)
season_data.drop(season_data.index[season_data['LConfAbbrev'] == 'aac'], inplace=True)
tourn_data.drop(tourn_data.index[tourn_data['WConfAbbrev'] == 'aac'], inplace=True)
tourn_data.drop(tourn_data.index[tourn_data['LConfAbbrev'] == 'aac'], inplace=True)
season_data.replace(replace_conf, inplace=True)
tourn_data.replace(replace_conf, inplace=True)


def normalize(df_col):
    """Return `df_col` standardized as (value - mean) / standard deviation.

    The mean and standard deviation come from `df_col.mean()` and
    `df_col.std()`.
    """
    mean_value = df_col.mean()
    std_value = df_col.std()
    return np.divide(np.subtract(df_col, mean_value), std_value)


# Seeds are reduced to their digits and negated before standardizing, so a
# numerically smaller seed ends up with a larger value. The regex is a raw
# string so that \d is not treated as an (invalid) string escape.
tourn_data.drop(['DayNum', 'NumOT', 'GameType'], axis=1, inplace=True)
tourn_data["WTeamSeed"] = normalize((tourn_data["WTeamSeed"].str.extract(r'(\d+)').astype('int'))*(-1))
tourn_data["LTeamSeed"] = normalize((tourn_data["LTeamSeed"].str.extract(r'(\d+)').astype('int'))*(-1))
tourn_data["WScore"] = normalize(tourn_data["WScore"])
tourn_data["LScore"] = normalize(tourn_data["LScore"])
tourn_data["Type"] = 1  # 1 marks tournament games

season_data.drop(['DayNum', 'NumOT', 'WLoc'], axis=1, inplace=True)
season_data["WTeamSeed"] = normalize((season_data["WTeamSeed"].str.extract(r'(\d+)').astype('float'))*(-1))
season_data["LTeamSeed"] = normalize((season_data["LTeamSeed"].str.extract(r'(\d+)').astype('float'))*(-1))
season_data["WScore"] = normalize(season_data["WScore"])
season_data["LScore"] = normalize(season_data["LScore"])
season_data["Type"] = 0  # 0 marks regular-season games

# Stack regular-season and tournament games. pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.x; the defaults keep
# the original row labels, exactly as append did.
all_data = pd.concat([season_data, tourn_data])
all_data["Type"] = normalize(all_data["Type"])
all_data["WConfAbbrev"] = all_data["WConfAbbrev"].astype('float')
all_data["LConfAbbrev"] = all_data["LConfAbbrev"].astype('float')

# Model inputs: everything except names, ids, scores and the game-type flag.
input_data = all_data.drop(
    ["WTeamName", "LTeamName", "Type", "WTeamID", "LTeamID", "LScore", "WScore"],
    axis=1)
# NOTE(review): the regression target is the standardized WTeamID - confirm
# that predicting a team id is the intended objective.
X, y = input_data, normalize(all_data["WTeamID"])
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

model = XGBRegressor(n_estimators=1000)
model.fit(X_train, y_train,
          early_stopping_rounds=5,
          eval_set=[(X_valid, y_valid)],
          verbose=False)
model.score(X_valid, y_valid)

input_data

# Write the prepared tables to the processed_datasets directory.
for csv_path, frame in (
    ("processed_datasets/season_team_seed_conf.csv", season_team_seed_conf),
    ("processed_datasets/season_data.csv", season_data),
    ("processed_datasets/tournament_data.csv", tourn_data),
):
    frame.to_csv(csv_path)

Further Experiments

# Load the table that includes strength-of-schedule data and keep only the
# columns used below.
df = pd.read_csv("processed_datasets/season_team_seed_conf_w_SOS.csv")
df = df.loc[:, ["Season", "TeamID", "Strength of Schedule", "ConfAbbrev", "Average \"Placement\""]]

# Attach Team_1's schedule data to every tournament row (default inner join),
# ordered by season.
new_df = pd.merge(
    tourney_input,
    df,
    left_on=['Season', 'Team_1'],
    right_on=['Season', 'TeamID'],
).sort_values('Season', ascending=True)
display(new_df)

Final Test

#TEST ON THE FINAL MODEL: Neural Network
#dataset I am testing with output_scores_test
# The first 22 columns are the features; column 22 is the label.
features_t = output_scores_test.iloc[:, :22]
y_t = output_scores_test.iloc[:, 22]

#evaluate the Neural Network on Test data (from 2021 season)
_, accuracy = neuralNet.evaluate(x=features_t, y=y_t)
print('Accuracy: %.2f' % (accuracy*100))