import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
# from lightgbm import LGBMRegressor
# from lightgbm import LGBMClassifier
import seaborn as sns
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import log_loss,precision_score,auc,roc_auc_score
import warnings
warnings.filterwarnings('ignore')
# Run to view results
# Build the KenPom table keyed by TeamID.
# Team names are lower-cased and matched against the TeamNameSpelling column
# of the spellings file; rows that end up without a TeamID are discarded and
# the two name columns are removed once the join is done.
# NOTE(review): TeamID is presumed to come from MTeamSpellings.csv - confirm.
Teams = pd.read_csv('/work/NCAAMData/MTeamSpellings.csv', encoding='cp1251')
Kenpom = (
    pd.read_csv('/work/External Data/Mkenpom2021.csv')
    .assign(Team=lambda frame: frame['Team'].str.lower())
    .merge(Teams, how='left', left_on=['Team'], right_on=['TeamNameSpelling'])
    .dropna(subset=['TeamID'])
    .astype({'TeamID': int})
    .drop(columns=['TeamNameSpelling', 'Team'])
)
Kenpom.head()
# Run to view results
# FiveThirtyEight ratings; the TeamName column is discarded because later
# joins use (Season, TeamID) as the key.
men538 = pd.read_csv('/work/External Data/538ratingsMen.csv')
men538 = men538.drop(columns=['TeamName'])
men538.head()
# Run to view results
# Assemble one table of tournament and regular-season games.
# For each competition the detailed results are right-merged onto the compact
# results (joined on every compact column), so every compact row is kept.

# Tournament games.
MNCAATourneyDetailedResults = pd.read_csv('/work/NCAAMData/MNCAATourneyDetailedResults.csv')
MNCAATourneyCompactResults = pd.read_csv('/work/NCAAMData/MNCAATourneyCompactResults.csv')
tourney_keys = MNCAATourneyCompactResults.columns.to_list()
TourneyResults = MNCAATourneyDetailedResults.merge(
    MNCAATourneyCompactResults, on=tourney_keys, how='right'
)
TourneyResults['TypeCompetition'] = 'Tourney'

# Regular-season games.
MRegularSeasonDetailedResults = pd.read_csv('/work/NCAAMData/MRegularSeasonDetailedResults.csv')
MRegularSeasonCompactResults = pd.read_csv('/work/NCAAMData/MRegularSeasonCompactResults.csv')
season_keys = MRegularSeasonCompactResults.columns.to_list()
SeasonResults = MRegularSeasonDetailedResults.merge(
    MRegularSeasonCompactResults, on=season_keys, how='right'
)
SeasonResults['TypeCompetition'] = 'Season'

# Outer merge on every column stacks the two competitions into one frame.
X = TourneyResults.merge(SeasonResults, on=SeasonResults.columns.to_list(), how='outer')

# Only season, team ids, scores, location and competition type are kept.
box_score_stats = [
    'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
    'Ast', 'TO', 'Stl', 'Blk', 'PF',
]
unused_columns = ['DayNum', 'NumOT']
unused_columns += ['W' + stat for stat in box_score_stats]
unused_columns += ['L' + stat for stat in box_score_stats]
X = X.drop(columns=unused_columns)
X.head()
# Run to view results
# Re-express every game as "team A vs team B" instead of "winner vs loser".
#   WLoc == 'H': the winner becomes team A.
#   WLoc == 'A': the winner becomes team B (so the loser is team A).
#   WLoc == 'N': the winner starts as team A, then roles are swapped for a
#                random subset of rows so the winner is not always team A.
_WINNER_AS_A = {
    'WTeamID': 'ATeamID',
    'WScore': 'AScore',
    'LTeamID': 'BTeamID',
    'LScore': 'BScore',
}
_WINNER_AS_B = {
    'WTeamID': 'BTeamID',
    'WScore': 'BScore',
    'LTeamID': 'ATeamID',
    'LScore': 'AScore',
}

A = X[X['WLoc'] == 'A'].rename(columns=_WINNER_AS_B).drop('WLoc', axis=1)
H = X[X['WLoc'] == 'H'].rename(columns=_WINNER_AS_A).drop('WLoc', axis=1)
N = X[X['WLoc'] == 'N'].rename(columns=_WINNER_AS_A).drop('WLoc', axis=1)

# One coin flip per neutral-site row, drawn from the same `random` generator
# and in the same row order as a row-by-row loop would draw them.
swap_rows = np.array(
    [random.randint(0, 1) == 1 for _ in range(len(N))], dtype=bool
)
# Swap ids and scores of the selected rows in a single assignment;
# .to_numpy() stops pandas from re-aligning the values by column label.
N.loc[swap_rows, ['ATeamID', 'AScore', 'BTeamID', 'BScore']] = (
    N.loc[swap_rows, ['BTeamID', 'BScore', 'ATeamID', 'AScore']].to_numpy()
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# result (original index kept, columns aligned by name in H's order).
X = pd.concat([H, A, N], ignore_index=False, sort=False)
X.head()
# Run to view results
# Attach per-team ratings to both sides of every game.
# Each source is joined twice (team A first, then team B); on the second join
# pandas suffixes the overlapping rating columns with _x (team A) and _y
# (team B).
# Disabled: the same pair of joins against an `NCAA` frame that is not
# defined in the visible code.
#X = X.merge(NCAA, how='left', left_on=['Season','ATeamID'], right_on=['Season','TeamID']).drop(['TeamID'], axis=1)
#X = X.merge(NCAA, how='left', left_on=['Season','BTeamID'], right_on=['Season','TeamID']).drop(['TeamID'], axis=1)
for team_column in ('ATeamID', 'BTeamID'):
    X = X.merge(
        Kenpom,
        how='left',
        left_on=['Season', team_column],
        right_on=['Year', 'TeamID'],
    ).drop(['TeamID', 'Year'], axis=1)

for team_column in ('ATeamID', 'BTeamID'):
    X = X.merge(
        men538,
        how='left',
        left_on=['Season', team_column],
        right_on=['Season', 'TeamID'],
    ).drop(['TeamID'], axis=1)

# Keep only games where both teams have a Rank and a Seed.
X = X.dropna(subset=['Rank_x', 'Rank_y', 'Seed_x', 'Seed_y'])
X
# Run to view results
# Collapse each pair of per-team columns into one "team A minus team B"
# feature, then discard the per-team originals.
# Conference is not differenced (see the disabled line below); its two
# per-team columns are only dropped.
#X['Conference'] = X['Conference_x'] + X['Conference_y']
difference_features = [
    'Rank', 'Wins', 'Losses', 'Seed', 'Pyth',
    'AdjustO', 'AdjustO Rank',
    'AdjustD', 'AdjustD Rank',
    'AdjustT', 'AdjustT Rank',
    'Luck', 'Luck Rank',
    'SOS Pyth', 'SOS Pyth Rank',
    'SOS OppO', 'SOS OppO Rank',
    'SOS OppD', 'SOS OppD Rank',
    'NCSOS Pyth', 'NCSOS Pyth Rank',
    '538rating',
]
for feature in difference_features:
    X[feature] = X[feature + '_x'] - X[feature + '_y']

per_team_columns = [
    base + suffix
    for base in difference_features + ['Conference']
    for suffix in ('_x', '_y')
]
X = X.drop(columns=per_team_columns)
# Run to view results
# Target: 1 when team A outscored team B, otherwise 0.
team_a_won = X['AScore'] > X['BScore']
X['HomeWin'] = team_a_won.astype(int)
X.head()
# Run to view results
# The sample submission is read twice: `test` becomes the feature frame to
# predict on, `submission` is kept untouched to receive the predictions.
test = pd.read_csv('/work/NCAAMData/MSampleSubmissionStage2.csv')
submission = pd.read_csv('/work/NCAAMData/MSampleSubmissionStage2.csv')

# The ID is split on '_' into three integers: Season, ATeamID, BTeamID.
id_parts = test['ID'].str.split('_', expand=True)
for position, column in enumerate(('Season', 'ATeamID', 'BTeamID')):
    test[column] = id_parts[position].astype(int)

test = test.drop(columns=['Pred', 'ID'])
# Every row to predict is labelled as a tournament game.
test['TypeCompetition'] = 'Tourney'
test.head()
# Run to view results
# Attach the KenPom and 538 ratings for team A and then team B; the second
# join of each source produces the _x (team A) / _y (team B) suffixes.
for team_column in ('ATeamID', 'BTeamID'):
    test = test.merge(
        Kenpom,
        how='left',
        left_on=['Season', team_column],
        right_on=['Year', 'TeamID'],
    ).drop(['TeamID', 'Year'], axis=1)

for team_column in ('ATeamID', 'BTeamID'):
    test = test.merge(
        men538,
        how='left',
        left_on=['Season', team_column],
        right_on=['Season', 'TeamID'],
    ).drop(['TeamID'], axis=1)
test.head()
# Run to view results
# "Team A minus team B" features for the rows to predict.
# Seed and Conference are not differenced here (see the disabled lines);
# their per-team columns are only dropped.
#test['Conference'] = test['Conference_x'] + test['Conference_y']
#test['Seed'] = test['Seed_x'] - test['Seed_y']
difference_features = [
    'Rank', 'Wins', 'Losses', 'Pyth',
    'AdjustO', 'AdjustO Rank',
    'AdjustD', 'AdjustD Rank',
    'AdjustT', 'AdjustT Rank',
    'Luck', 'Luck Rank',
    'SOS Pyth', 'SOS Pyth Rank',
    'SOS OppO', 'SOS OppO Rank',
    'SOS OppD', 'SOS OppD Rank',
    'NCSOS Pyth', 'NCSOS Pyth Rank',
    '538rating',
]
for feature in difference_features:
    test[feature] = test[feature + '_x'] - test[feature + '_y']

per_team_columns = [
    base + suffix
    for base in difference_features + ['Conference', 'Seed']
    for suffix in ('_x', '_y')
]
test = test.drop(columns=per_team_columns)
# Run to view results
# Add the Seed feature (team A seed minus team B seed) to the test rows,
# using the seeds file instead of the KenPom seed columns.
Seeds = pd.read_csv('/work/NCAAMData/MNCAATourneySeeds.csv')
test = (
    test.merge(Seeds, how='left', left_on=['Season', 'ATeamID'], right_on=['Season', 'TeamID'])
    .drop('TeamID', axis=1)
    .rename(columns={'Seed': 'ASeed'})
)
test = (
    test.merge(Seeds, how='left', left_on=['Season', 'BTeamID'], right_on=['Season', 'TeamID'])
    .drop('TeamID', axis=1)
    .rename(columns={'Seed': 'BSeed'})
)

# Strip every non-digit character and convert what is left to float.
# regex=True must be explicit: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the seed text
# unchanged and make the float conversion fail.
for seed_column in ('ASeed', 'BSeed'):
    test[seed_column] = (
        test[seed_column].str.replace(r'[^0-9]', '', regex=True).astype('float')
    )

test['Seed'] = test['ASeed'] - test['BSeed']
test = test.drop(['ASeed', 'BSeed'], axis=1)
test.head()
# Run to view results
# One-hot encode the training and test rows together so both frames end up
# with the same dummy columns, then split them apart again by position.
n_train_rows = len(X)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and keeps the same row order (training rows first, then test rows).
temp = pd.concat([X, test], ignore_index=False, sort=False)
temp = pd.get_dummies(temp, dtype=bool)

X = temp.iloc[:n_train_rows]
test = temp.iloc[n_train_rows:]
# Drop the scratch name so the combined frame can be released.
del temp

# The test rows have no outcome, so the score/target columns are removed.
test = test.drop(['AScore', 'BScore', 'HomeWin'], axis=1)
# Run to view results
# Preview the first five encoded test rows.
test.head(n=5)
# Run to view results
# Preview the first five encoded training rows.
X.head(n=5)
# Run to view results
# Binary-classification settings with log-loss as the metric.
# NOTE(review): nothing in the visible code reads this dict; the LightGBM
# imports at the top of the file are commented out.
lgbm_parameters = dict(
    objective='binary',
    metric='binary_logloss',
)
# Run to view results
# For each season present in `test`, fit one CatBoost model per CV fold and
# average the fold predictions for that season's test rows.
kf = KFold(n_splits=10, shuffle=True)
# Taken from the splitter so the average stays correct if n_splits changes.
n_folds = kf.get_n_splits()

# Predictions are written back by row position, so they line up with the
# rows of `test` even if seasons are interleaved. (test_pred is an ndarray.)
test_pred = np.zeros(len(test))

for year in test['Season'].unique():
    # Training rows: every other season, plus this season's rows flagged
    # TypeCompetition_Season (its tournament rows are left out).
    #X_year = X[(X['Season'] >= year-5)&(X['Season'] <= year+5)]
    keep_rows = (X['Season'] != year) | X['TypeCompetition_Season'].eq(True)
    X_year = X[keep_rows]
    y_year = X_year['HomeWin']
    X_year = X_year.drop(['AScore', 'BScore', 'HomeWin'], axis=1)

    year_rows = (test['Season'] == year).to_numpy()
    test_year = test[year_rows]

    cat_test_pred = np.zeros(len(test_year))
    logloss = []
    for trn_idx, val_idx in kf.split(X_year, y_year):
        x_train_idx = X_year.iloc[trn_idx]
        y_train_idx = y_year.iloc[trn_idx]
        x_valid_idx = X_year.iloc[val_idx]
        y_valid_idx = y_year.iloc[val_idx]

        cat_model = CatBoostClassifier()
        # cat_features are column positions 0-2 of the feature frame.
        # NOTE(review): presumably Season, ATeamID, BTeamID - confirm order.
        cat_model.fit(
            x_train_idx,
            y_train_idx,
            eval_set=(x_valid_idx, y_valid_idx),
            verbose=False,
            cat_features=[0, 1, 2],
        )
        cat_test_pred += cat_model.predict_proba(test_year)[:, 1] / n_folds
        logloss.append(
            log_loss(y_valid_idx, cat_model.predict_proba(x_valid_idx)[:, 1])
        )

    test_pred[year_rows] = cat_test_pred
    print('Year_Predict:',year,'Log_Loss:',np.mean(logloss))
# Run to view results
# Overwrite the Pred column with the model output and write the CSV
# without the index column.
submission['Pred'] = test_pred
submission.to_csv('submission.csv', index=False)
# Run to view results
# Horizontal bar chart of feature importances for `cat_model` (the model
# left over from the most recent fit), labelled with the columns of the
# matching training fold.
plt.rcParams["figure.figsize"] = (5, 10)
features = cat_model.get_feature_importance(verbose=True)
feature_names = x_train_idx.columns
_ = sns.barplot(y=feature_names, x=features, orient='h', palette='rocket')
# Run to view results
# Histogram of the submitted probabilities.
predicted_probabilities = submission['Pred']
_ = sns.histplot(predicted_probabilities)
# Run to view results
# Preview the first five rows of the submission.
submission.head(n=5)
# Run to view results