import os
import re
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.semi_supervised import LabelSpreading
from catboost import CatBoostClassifier
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Directory the CSV files are read from.
# (Alternative location kept for reference: '../input/ncaam-march-mania-2021/')
DATA_PATH = '/work/WDataFiles_Stage2/'

# Print the name of every entry found in the data directory.
for filename in os.listdir(DATA_PATH):
    print(filename)
# Output of the directory listing above:
#   WTeamConferences.csv
#   WNCAATourneyDetailedResults.csv
#   WNCAATourneySlots.csv
#   WRegularSeasonCompactResults.csv
#   WNCAATourneySeeds.csv
#   Conferences.csv
#   WNCAATourneyCompactResults.csv
#   WTeamSpellings.csv
#   WSampleSubmissionStage2.csv
#   WTeams.csv
#   WRegularSeasonDetailedResults.csv
#   WGameCities.csv
#   WSeasons.csv
#   Cities.csv
# Tournament seeds.
# (The "M"-prefixed counterpart of this file is "MNCAATourneySeeds.csv".)
df_seeds = pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv")
df_seeds.head()

# Regular-season compact results.
# (The "M"-prefixed counterpart is "MRegularSeasonCompactResults.csv".)
df_season_results = pd.read_csv(DATA_PATH + "WRegularSeasonCompactResults.csv")

# Overtime count and game location are not used further on.
df_season_results = df_season_results.drop(columns=['NumOT', 'WLoc'])

# Winning margin of each game: winner's score minus loser's score.
df_season_results['ScoreGap'] = df_season_results['WScore'].sub(df_season_results['LScore'])
df_season_results.head()
# ---------------------------------------------------------------------------
# Regular-season features, one row per (Season, TeamID):
#   WinRatio - share of games won
#   GapAvg   - average signed score margin (positive for wins, negative for
#              losses) over all games played
# ---------------------------------------------------------------------------

# Number of games won / lost by each team in each season.
num_win = (
    df_season_results.groupby(['Season', 'WTeamID'])['DayNum'].count()
    .reset_index()
    .rename(columns={"DayNum": "NumWins", "WTeamID": "TeamID"})
)
num_loss = (
    df_season_results.groupby(['Season', 'LTeamID'])['DayNum'].count()
    .reset_index()
    .rename(columns={"DayNum": "NumLosses", "LTeamID": "TeamID"})
)

# Mean score margin in the games a team won / lost (both are positive numbers,
# because ScoreGap is always winner minus loser).
gap_win = (
    df_season_results.groupby(['Season', 'WTeamID'])['ScoreGap'].mean()
    .reset_index()
    .rename(columns={"ScoreGap": "GapWins", "WTeamID": "TeamID"})
)
gap_loss = (
    df_season_results.groupby(['Season', 'LTeamID'])['ScoreGap'].mean()
    .reset_index()
    .rename(columns={"ScoreGap": "GapLosses", "LTeamID": "TeamID"})
)

# Every (Season, TeamID) pair that appears as a winner or as a loser.
df_features_season_w = num_win[['Season', 'TeamID']].copy()
df_features_season_l = num_loss[['Season', 'TeamID']].copy()

# `axis` must be passed by keyword: pandas 2.0 rejects positional arguments
# to pd.concat other than the list of objects.
df_features_season = (
    pd.concat([df_features_season_w, df_features_season_l], axis=0)
    .drop_duplicates()
    .sort_values(['Season', 'TeamID'])
    .reset_index(drop=True)
)

# Attach the four aggregates. A team with no win (or no loss) in a season has
# no row in the corresponding table, so the left merge leaves NaN there.
df_features_season = df_features_season.merge(num_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(num_loss, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(gap_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(gap_loss, on=['Season', 'TeamID'], how='left')

# Missing counts / margins mean "no such game": treat them as zero.
df_features_season = df_features_season.fillna(0)

# Every team listed here played at least one game, so the denominator is > 0.
df_features_season['WinRatio'] = df_features_season['NumWins'] / (
    df_features_season['NumWins'] + df_features_season['NumLosses']
)

# Wins contribute +margin and losses -margin to the season-long average.
df_features_season['GapAvg'] = (
    (df_features_season['NumWins'] * df_features_season['GapWins']
     - df_features_season['NumLosses'] * df_features_season['GapLosses'])
    / (df_features_season['NumWins'] + df_features_season['NumLosses'])
)

# Only the two derived features are kept.
df_features_season = df_features_season.drop(
    columns=['NumWins', 'NumLosses', 'GapWins', 'GapLosses']
)
# Tournament compact results.
# (The "M"-prefixed counterpart is "MNCAATourneyCompactResults.csv".)
df_tourney_results = pd.read_csv(DATA_PATH + "WNCAATourneyCompactResults.csv")

# Overtime count and game location are not used further on.
df_tourney_results = df_tourney_results.drop(columns=['NumOT', 'WLoc'])
def get_round(day):
    """Map a tournament ``DayNum`` value to a round index.

    Parameters
    ----------
    day : hashable
        Day number to look up in the day-to-round table.

    Returns
    -------
    int
        The round stored for ``day``. When ``day`` is not in the table a
        message is printed and ``0`` is returned.
    """
    # Alternative table kept for reference:
    # round_dic = {134: 0, 135: 0, 136: 1, 137: 1, 138: 2, 139: 2, 143: 3, 144: 3, 145: 4, 146: 4, 152: 5, 154: 6}
    round_dic = {137: 0, 138: 0, 139: 1, 140: 1, 141: 2, 144: 3, 145: 3, 146: 4, 147: 4, 148: 4, 151: 5, 153: 5, 155: 6}  # probably wrong but I don't use it anyways
    try:
        return round_dic[day]
    except KeyError:
        # Only a missing day is an expected condition; any other error
        # (e.g. an unhashable argument) is a real bug and must surface.
        print(f'Unknow day : {day}')
        return 0
# Tag every tournament game with the round derived from its DayNum.
df_tourney_results['Round'] = df_tourney_results['DayNum'].map(get_round)
df_tourney_results.head()

# Working table: games from season 2003 onwards, re-indexed from zero.
# Boolean selection followed by reset_index yields a frame independent of
# df_tourney_results.
df = df_tourney_results.loc[df_tourney_results['Season'] >= 2003].reset_index(drop=True)
df.head()
# Attach the winner's seed: match (Season, WTeamID) against the seeds table,
# drop the duplicated key column and name the seed after its role.
df = (
    df.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'WTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedW'})
)

# Same for the loser's seed, matched on (Season, LTeamID).
df = (
    df.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'LTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedL'})
)
def treat_seed(seed):
    """Return the numeric part of a seed label as an ``int``.

    Every character other than the ASCII digits 0-9 is stripped from ``seed``
    and what remains is parsed with ``int`` (e.g. ``"W16a"`` gives ``16``).
    """
    digits_only = re.sub("[^0-9]", "", seed)
    return int(digits_only)
# Turn the seed labels into plain integers.
df = df.assign(
    SeedW=df['SeedW'].map(treat_seed),
    SeedL=df['SeedL'].map(treat_seed),
)
df.head()

# Regular-season features of the winning team, suffixed with "W".
# Rename entries whose source column is absent are simply ignored by pandas.
df = (
    df.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'WTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns={
        'NumWins': 'NumWinsW',
        'NumLosses': 'NumLossesW',
        'GapWins': 'GapWinsW',
        'GapLosses': 'GapLossesW',
        'WinRatio': 'WinRatioW',
        'GapAvg': 'GapAvgW',
    })
    .drop(columns='TeamID')
)

# Regular-season features of the losing team, suffixed with "L".
df = (
    df.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'LTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns={
        'NumWins': 'NumWinsL',
        'NumLosses': 'NumLossesL',
        'GapWins': 'GapWinsL',
        'GapLosses': 'GapLossesL',
        'WinRatio': 'WinRatioL',
        'GapAvg': 'GapAvgL',
    })
    .drop(columns='TeamID')
)
df.head()
def add_loosing_matches(win_df):
    """Return every game twice: once from each team's point of view.

    The input has one row per game with winner ("W") and loser ("L") columns.
    The result stacks two renamed views of it:

    * first the rows where team A is the winner and team B the loser,
    * then the same rows mirrored, where team A is the loser.

    Parameters
    ----------
    win_df : pandas.DataFrame
        Games in winner/loser layout. Columns not listed in the rename table
        are carried over unchanged. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Twice as many rows as ``win_df``, in team A / team B layout, with the
        column order of the winner view. The original index labels are kept,
        so each label appears twice.
    """
    win_rename = {
        "WTeamID": "TeamIdA",
        "WScore": "ScoreA",
        "LTeamID": "TeamIdB",
        "LScore": "ScoreB",
        "SeedW": "SeedA",
        "SeedL": "SeedB",
        'WinRatioW': 'WinRatioA',
        'WinRatioL': 'WinRatioB',
        'GapAvgW': 'GapAvgA',
        'GapAvgL': 'GapAvgB',
        # "OrdinalRankW": "OrdinalRankA",
        # "OrdinalRankL": "OrdinalRankB",
    }
    # The loser view is the exact mirror: swap the trailing A/B of each target
    # name, so the two tables cannot drift apart.
    swap = {"A": "B", "B": "A"}
    lose_rename = {
        source: target[:-1] + swap[target[-1]]
        for source, target in win_rename.items()
    }

    # rename() returns new frames, so the caller's DataFrame stays untouched.
    win_view = win_df.rename(columns=win_rename)
    lose_view = win_df.rename(columns=lose_rename)

    # `axis` is passed by keyword: pandas 2.0 rejects positional arguments to
    # pd.concat other than the list of objects.
    return pd.concat([win_view, lose_view], axis=0, sort=False)
# Duplicate each game so it is seen from both teams' point of view.
df = add_loosing_matches(df)

# Pairwise difference features (team A minus team B).
df = df.assign(
    SeedDiff=df['SeedA'].sub(df['SeedB']),
    WinRatioDiff=df['WinRatioA'].sub(df['WinRatioB']),
    GapAvgDiff=df['GapAvgA'].sub(df['GapAvgB']),
    # OrdinalRankDiff=df['OrdinalRankA'].sub(df['OrdinalRankB']),
)
df.head()
# Sample submission: one row per matchup to predict.
# (An alternative file used previously: "MSampleSubmissionStage1.csv".)
df_test = pd.read_csv(DATA_PATH + "WSampleSubmissionStage2.csv")

# The ID is split on "_" and its three parts are read as integers:
# season, team A id, team B id.
df_test = df_test.assign(
    Season=df_test['ID'].map(lambda game_id: int(game_id.split('_')[0])),
    TeamIdA=df_test['ID'].map(lambda game_id: int(game_id.split('_')[1])),
    TeamIdB=df_test['ID'].map(lambda game_id: int(game_id.split('_')[2])),
)
df_test.head()
# Seed of team A: match (Season, TeamIdA) against the seeds table.
df_test = (
    df_test.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'TeamIdA'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedA'})
)

# Seed of team B: match (Season, TeamIdB) against the seeds table.
df_test = (
    df_test.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'TeamIdB'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedB'})
)

# Turn the seed labels into plain integers.
df_test = df_test.assign(
    SeedA=df_test['SeedA'].map(treat_seed),
    SeedB=df_test['SeedB'].map(treat_seed),
)
# Regular-season features of team A, suffixed with "A".
# Rename entries whose source column is absent are simply ignored by pandas.
df_test = (
    df_test.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'TeamIdA'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns={
        'NumWins': 'NumWinsA',
        'NumLosses': 'NumLossesA',
        'GapWins': 'GapWinsA',
        'GapLosses': 'GapLossesA',
        'WinRatio': 'WinRatioA',
        'GapAvg': 'GapAvgA',
    })
    .drop(columns='TeamID')
)

# Regular-season features of team B, suffixed with "B".
df_test = (
    df_test.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'TeamIdB'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns={
        'NumWins': 'NumWinsB',
        'NumLosses': 'NumLossesB',
        'GapWins': 'GapWinsB',
        'GapLosses': 'GapLossesB',
        'WinRatio': 'WinRatioB',
        'GapAvg': 'GapAvgB',
    })
    .drop(columns='TeamID')
)

# Pairwise difference features (team A minus team B), as on the training set.
df_test = df_test.assign(
    SeedDiff=df_test['SeedA'].sub(df_test['SeedB']),
    WinRatioDiff=df_test['WinRatioA'].sub(df_test['WinRatioB']),
    GapAvgDiff=df_test['GapAvgA'].sub(df_test['GapAvgB']),
    # OrdinalRankDiff=df_test['OrdinalRankA'].sub(df_test['OrdinalRankB']),
)
df_test.head()
# Targets: the signed score margin of team A, and whether team A won (1/0).
df['ScoreDiff'] = df['ScoreA'].sub(df['ScoreB'])
df['WinA'] = df['ScoreDiff'].gt(0).astype(int)

# Model inputs, in the order they are fed to the estimators.
# (OrdinalRankA / OrdinalRankB / OrdinalRankDiff are currently disabled.)
features = [
    'SeedA', 'SeedB',
    'WinRatioA', 'GapAvgA',
    'WinRatioB', 'GapAvgB',
    'SeedDiff', 'WinRatioDiff', 'GapAvgDiff',
]
def rescale(features, df_train, df_val, df_test=None):
    """Min-max scale the feature columns using statistics of the training set.

    The minimum and maximum of each feature are taken from ``df_train`` only
    and the same affine transform is applied to all frames, so validation and
    test values may fall outside ``[0, 1]``.

    A feature that is constant in ``df_train`` has a zero range; its range is
    treated as 1 so the column becomes ``value - min`` instead of NaN/inf.

    Parameters
    ----------
    features : list of str
        Names of the columns to rescale.
    df_train, df_val : pandas.DataFrame
        Training and validation frames; the feature columns are overwritten
        in place.
    df_test : pandas.DataFrame, optional
        Test frame, rescaled in place as well when given.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test)`` - the same objects that were passed
        in (``df_test`` is ``None`` when it was not given).
    """
    min_ = df_train[features].min()
    span = df_train[features].max() - min_
    # Guard against division by zero for constant features.
    span = span.where(span != 0, 1)

    df_train[features] = (df_train[features] - min_) / span
    df_val[features] = (df_val[features] - min_) / span
    if df_test is not None:
        df_test[features] = (df_test[features] - min_) / span
    return df_train, df_val, df_test
def _build_model(mode):
    """Create the untrained estimator selected by ``mode``."""
    if mode == "reg":
        return ElasticNet(alpha=1, l1_ratio=0.5)
    if mode == "lgbm":
        return LGBMClassifier(learning_rate=0.01, n_estimators=400, num_leaves=12,
                              random_state=33)
    if mode == "nb":
        return MultinomialNB(alpha=0.5)
    if mode == "cat":
        return CatBoostClassifier(iterations=200,
                                  learning_rate=0.3,
                                  depth=13, verbose=0)
    if mode == "ls":
        return LabelSpreading(kernel='rbf', n_neighbors=8, alpha=0.01, max_iter=500, tol=0.003)
    # Any other mode falls back to logistic regression.
    return LogisticRegression(C=10)


def _predict_win_proba(model, X, mode):
    """Return a win-probability-like score in [0, 1] for every row of ``X``."""
    if mode != "reg":
        return model.predict_proba(X)[:, 1]
    # The regressor predicts a score margin; squash it to [0, 1] by min-max.
    raw = model.predict(X)
    lo, hi = raw.min(), raw.max()
    if hi == lo:
        # All predictions identical: min-max would be 0/0, so stay neutral.
        return np.full(len(raw), 0.5)
    return (raw - lo) / (hi - lo)


def kfold_reg(df, df_test_=None, plot=False, verbose=0, mode="reg"):
    """Season-wise validation: train on earlier seasons, score the held-out one.

    The first 15 entries of ``df['Season'].unique()`` are never used as a
    validation season. For each remaining season a fresh model is fitted on
    all rows with a strictly smaller ``Season`` and scored with log loss on
    that season's ``WinA``. Features are rescaled per fold with ``rescale``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; must hold ``Season``, ``ScoreDiff``, ``WinA`` and all
        columns named in the module-level ``features`` list.
    df_test_ : pandas.DataFrame, optional
        Rows to predict with each fold's model. It is copied per fold, so the
        caller's frame is left untouched. ``None`` skips test prediction.
    plot : bool
        When true, show a scatter of predictions vs. ``ScoreDiff`` and a
        histogram of the predictions for every fold.
    verbose : int
        When truthy, print the validation season and its score.
    mode : str
        ``"reg"`` fits an ElasticNet on ``ScoreDiff``; ``"lgbm"``, ``"nb"``,
        ``"cat"`` and ``"ls"`` fit the matching classifier on ``WinA``; any
        other value fits a logistic regression on ``WinA``.

    Returns
    -------
    list
        One array of test predictions per fold (empty when ``df_test_`` is
        ``None``).
    """
    # NOTE(review): the slice below assumes seasons appear in chronological
    # order in `df` - confirm against the caller.
    seasons = df['Season'].unique()
    cvs = []
    pred_tests = []
    target = "ScoreDiff" if mode == "reg" else "WinA"

    for season in seasons[15:]:
        if verbose:
            print(f'\nValidating on season {season}')

        df_train = df[df['Season'] < season].reset_index(drop=True).copy()
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        # The test frame is optional; only copy it when one was supplied.
        df_test = df_test_.copy() if df_test_ is not None else None
        df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)

        model = _build_model(mode)
        model.fit(df_train[features], df_train[target])

        pred = _predict_win_proba(model, df_val[features], mode)
        if df_test is not None:
            pred_tests.append(_predict_win_proba(model, df_test[features], mode))

        if plot:
            plt.figure(figsize=(15, 6))
            plt.subplot(1, 2, 1)
            plt.scatter(pred, df_val['ScoreDiff'].values, s=5)
            plt.grid(True)
            plt.subplot(1, 2, 2)
            sns.histplot(pred)
            plt.show()

        loss = log_loss(df_val['WinA'].values, pred)
        cvs.append(loss)
        if verbose:
            print(f'\t -> Scored {loss:.3f}')

    # With no validation fold there is nothing to average; report nan
    # without triggering numpy's empty-mean warning.
    local_cv = np.mean(cvs) if cvs else float('nan')
    print(f'\n Local CV is {local_cv:.3f}')
    return pred_tests
# Run the season-wise validation with the "cat" mode and collect one array of
# test predictions per validation season.
pred_tests = kfold_reg(
    df,
    df_test,
    mode="cat",
    verbose=1,
    plot=False,
)
# Output of the call above:
#   Validating on season 2018
#       -> Scored 0.765
#   Validating on season 2019
#       -> Scored 0.564
#    Local CV is 0.664
# Average the per-fold test predictions row by row.
pred_test = np.mean(pred_tests, axis=0)

# Submission table: the ID column plus the averaged prediction in "Pred".
sub = df_test[['ID', 'Pred']].assign(Pred=pred_test)
sub.to_csv('submission.csv', index=False)

# Distribution of the submitted probabilities.
_ = sns.histplot(sub['Pred'])
sub.head()