Kaggle NCAAM

import numpy as np import pandas as pd import random from sklearn.preprocessing import LabelEncoder # from lightgbm import LGBMRegressor # from lightgbm import LGBMClassifier import seaborn as sns from catboost import CatBoostClassifier import matplotlib.pyplot as plt from sklearn.model_selection import KFold from tqdm import tqdm from sklearn.metrics import log_loss,precision_score,auc,roc_auc_score import warnings warnings.filterwarnings('ignore')

# Load the KenPom ratings table and attach a numeric TeamID to each row by
# matching the lower-cased team name against the team-spellings table.
Teams = pd.read_csv('/work/NCAAMData/MTeamSpellings.csv', encoding='cp1251')
Kenpom = pd.read_csv('/work/External Data/Mkenpom2021.csv')
Kenpom['Team'] = Kenpom['Team'].str.lower()
Kenpom = Kenpom.merge(
    Teams,
    how='left',
    left_on=['Team'],
    right_on=['TeamNameSpelling'],
)
# Rows whose name matched no spelling have no TeamID and cannot be joined to
# games later, so they are discarded before the ID is cast to int.
Kenpom = Kenpom.dropna(subset=['TeamID'])
Kenpom['TeamID'] = Kenpom['TeamID'].astype(int)
# The name columns were only needed for the join.
Kenpom = Kenpom.drop(columns=['TeamNameSpelling', 'Team'])
Kenpom.head()

Run the app to see the outputs

Press the run button in the top right corner

# Load the 538 ratings for men's teams; the team name column is discarded
# because later joins use Season/TeamID only.
men538 = pd.read_csv('/work/External Data/538ratingsMen.csv')
men538 = men538.drop(columns=['TeamName'])
men538.head()

Run the app to see the outputs

Press the run button in the top right corner

# --- Tournament games -------------------------------------------------------
MNCAATourneyDetailedResults = pd.read_csv('/work/NCAAMData/MNCAATourneyDetailedResults.csv')
MNCAATourneyCompactResults = pd.read_csv('/work/NCAAMData/MNCAATourneyCompactResults.csv')
# Right join on every compact column: each compact row is kept, and rows that
# have no detailed counterpart get NaN in the detailed-only columns.
TourneyResults = MNCAATourneyDetailedResults.merge(
    MNCAATourneyCompactResults,
    on=list(MNCAATourneyCompactResults.columns),
    how='right',
)
TourneyResults['TypeCompetition'] = 'Tourney'

# --- Regular-season games ---------------------------------------------------
MRegularSeasonDetailedResults = pd.read_csv('/work/NCAAMData/MRegularSeasonDetailedResults.csv')
MRegularSeasonCompactResults = pd.read_csv('/work/NCAAMData/MRegularSeasonCompactResults.csv')
SeasonResults = MRegularSeasonDetailedResults.merge(
    MRegularSeasonCompactResults,
    on=list(MRegularSeasonCompactResults.columns),
    how='right',
)
SeasonResults['TypeCompetition'] = 'Season'

# Combine both competitions into one frame (outer join on all shared columns).
X = TourneyResults.merge(
    SeasonResults,
    on=list(SeasonResults.columns),
    how='outer',
)

# Drop the day number, overtime count and the per-team box-score columns
# (same 13 stats for the winner "W" and the loser "L").
_box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
              'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
_unused_cols = (
    ['DayNum', 'NumOT']
    + ['W' + stat for stat in _box_stats]
    + ['L' + stat for stat in _box_stats]
)
X = X.drop(columns=_unused_cols)
X.head()

Run the app to see the outputs

Press the run button in the top right corner

# Re-express every game as "team A vs team B" instead of "winner vs loser".
#   * WLoc == 'H': the winner played at home  -> winner becomes team A.
#   * WLoc == 'A': the winner played away     -> loser becomes team A.
#   * WLoc == 'N': neutral site               -> sides are assigned at random.
_winner_is_a = {
    'WTeamID': 'ATeamID', 'WScore': 'AScore',
    'LTeamID': 'BTeamID', 'LScore': 'BScore',
}
_winner_is_b = {
    'WTeamID': 'BTeamID', 'WScore': 'BScore',
    'LTeamID': 'ATeamID', 'LScore': 'AScore',
}

A = X[X['WLoc'] == 'A'].rename(columns=_winner_is_b).drop('WLoc', axis=1)
H = X[X['WLoc'] == 'H'].rename(columns=_winner_is_a).drop('WLoc', axis=1)
N = X[X['WLoc'] == 'N'].rename(columns=_winner_is_a).drop('WLoc', axis=1)

# One coin flip per neutral-site game, drawn in row order (the same sequence
# of random.randint calls as a row-by-row loop). Flipped rows get their team
# IDs and scores swapped, so the winner is not always team A.
_flip = [random.randint(0, 1) == 1 for _ in range(len(N))]
if any(_flip):
    # .to_numpy() strips the column labels so the assignment is positional
    # (otherwise pandas would re-align by column name and undo the swap).
    N.loc[_flip, ['ATeamID', 'BTeamID']] = N.loc[_flip, ['BTeamID', 'ATeamID']].to_numpy()
    N.loc[_flip, ['AScore', 'BScore']] = N.loc[_flip, ['BScore', 'AScore']].to_numpy()

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (original row labels kept, column order taken from H).
X = pd.concat([H, A, N], ignore_index=False, sort=False)
X.head()

Run the app to see the outputs

Press the run button in the top right corner

# Disabled joins against an `NCAA` table, kept for reference:
#X = X.merge(NCAA, how='left', left_on=['Season','ATeamID'], right_on=['Season','TeamID']).drop(['TeamID'], axis=1)
#X = X.merge(NCAA, how='left', left_on=['Season','BTeamID'], right_on=['Season','TeamID']).drop(['TeamID'], axis=1)

# Attach the KenPom columns for team A, then for team B. The second merge
# finds the first merge's columns already present, so pandas suffixes them:
# "_x" = team A, "_y" = team B.
for _team_col in ('ATeamID', 'BTeamID'):
    X = X.merge(
        Kenpom,
        how='left',
        left_on=['Season', _team_col],
        right_on=['Year', 'TeamID'],
    )
    X = X.drop(columns=['TeamID', 'Year'])

# Same pattern for the 538 rating (538rating_x = team A, 538rating_y = team B).
for _team_col in ('ATeamID', 'BTeamID'):
    X = X.merge(
        men538,
        how='left',
        left_on=['Season', _team_col],
        right_on=['Season', 'TeamID'],
    )
    X = X.drop(columns=['TeamID'])

# Keep only games where both teams have a KenPom rank and a seed.
X = X.dropna(subset=['Rank_x', 'Rank_y', 'Seed_x', 'Seed_y'])
X

Run the app to see the outputs

Press the run button in the top right corner

# Disabled conference feature, kept for reference:
#X['Conference'] = X['Conference_x'] + X['Conference_y']

# Each model feature is the difference "team A minus team B" of the
# corresponding per-team column ("_x" = team A, "_y" = team B).
_train_diff_cols = [
    'Rank', 'Wins', 'Losses', 'Seed', 'Pyth',
    'AdjustO', 'AdjustO Rank',
    'AdjustD', 'AdjustD Rank',
    'AdjustT', 'AdjustT Rank',
    'Luck', 'Luck Rank',
    'SOS Pyth', 'SOS Pyth Rank',
    'SOS OppO', 'SOS OppO Rank',
    'SOS OppD', 'SOS OppD Rank',
    'NCSOS Pyth', 'NCSOS Pyth Rank',
    '538rating',
]
for _name in _train_diff_cols:
    X[_name] = X[_name + '_x'] - X[_name + '_y']

# The per-team inputs are no longer needed once the differences exist;
# Conference_x / Conference_y are dropped too since they are not used.
_train_per_team = _train_diff_cols + ['Conference']
X = X.drop(
    columns=[_name + _side for _side in ('_x', '_y') for _name in _train_per_team]
)

# Target label: 1 when team A outscored team B, otherwise 0.
X['HomeWin'] = (X['AScore'] > X['BScore']).astype(int)
X.head()

Run the app to see the outputs

Press the run button in the top right corner

# The sample submission is read twice: `submission` is kept untouched for the
# final output, `test` is turned into the feature frame to predict on.
submission = pd.read_csv('/work/NCAAMData/MSampleSubmissionStage2.csv')
test = pd.read_csv('/work/NCAAMData/MSampleSubmissionStage2.csv')

# IDs are split on "_" into three integer fields: season, team A, team B.
_id_parts = test['ID'].str.split('_', expand=True)
test['Season'] = _id_parts[0].astype('int64')
test['ATeamID'] = _id_parts[1].astype('int64')
test['BTeamID'] = _id_parts[2].astype('int64')

test = test.drop(columns=['Pred', 'ID'])
# Every row to predict is tagged as a tournament game.
test['TypeCompetition'] = 'Tourney'
test.head()

Run the app to see the outputs

Press the run button in the top right corner

# Attach the KenPom columns for team A, then team B; after the second merge
# the duplicated columns carry the suffixes "_x" (team A) and "_y" (team B).
for _team_col in ('ATeamID', 'BTeamID'):
    test = test.merge(
        Kenpom,
        how='left',
        left_on=['Season', _team_col],
        right_on=['Year', 'TeamID'],
    )
    test = test.drop(columns=['TeamID', 'Year'])

# Same for the 538 rating.
for _team_col in ('ATeamID', 'BTeamID'):
    test = test.merge(
        men538,
        how='left',
        left_on=['Season', _team_col],
        right_on=['Season', 'TeamID'],
    )
    test = test.drop(columns=['TeamID'])

test.head()

Run the app to see the outputs

Press the run button in the top right corner

# Disabled features, kept for reference (the seed difference is computed
# separately from the tournament seeds file):
#test['Conference'] = test['Conference_x'] + test['Conference_y']
#test['Seed'] = test['Seed_x'] - test['Seed_y']

# Each model feature is the difference "team A minus team B" of the
# corresponding per-team column ("_x" = team A, "_y" = team B).
_test_diff_cols = [
    'Rank', 'Wins', 'Losses', 'Pyth',
    'AdjustO', 'AdjustO Rank',
    'AdjustD', 'AdjustD Rank',
    'AdjustT', 'AdjustT Rank',
    'Luck', 'Luck Rank',
    'SOS Pyth', 'SOS Pyth Rank',
    'SOS OppO', 'SOS OppO Rank',
    'SOS OppD', 'SOS OppD Rank',
    'NCSOS Pyth', 'NCSOS Pyth Rank',
    '538rating',
]
for _name in _test_diff_cols:
    test[_name] = test[_name + '_x'] - test[_name + '_y']

# Drop every per-team input column, including the Seed and Conference pairs
# that are not turned into features here. Building the names from one list
# keeps 'NCSOS Pyth_y' intact (the literal was previously split across lines).
_test_per_team = _test_diff_cols + ['Seed', 'Conference']
test = test.drop(
    columns=[_name + _side for _side in ('_x', '_y') for _name in _test_per_team]
)

# ADD SEED
# Join the tournament seed of team A and of team B onto the test rows.
Seeds = pd.read_csv('/work/NCAAMData/MNCAATourneySeeds.csv')
test = (
    test.merge(Seeds, how='left', left_on=['Season', 'ATeamID'], right_on=['Season', 'TeamID'])
    .drop('TeamID', axis=1)
    .rename(columns={'Seed': 'ASeed'})
)
test = (
    test.merge(Seeds, how='left', left_on=['Season', 'BTeamID'], right_on=['Season', 'TeamID'])
    .drop('TeamID', axis=1)
    .rename(columns={'Seed': 'BSeed'})
)

# SEED TO FLOAT
# Strip every non-digit character so only the numeric part of the seed is
# left. regex=True must be explicit: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would leave the
# seed untouched and make the float conversion fail.
for _seed_col in ('ASeed', 'BSeed'):
    test[_seed_col] = (
        test[_seed_col].str.replace(r'[^0-9]', '', regex=True).astype('float')
    )

# Same "team A minus team B" convention as the other features.
test['Seed'] = test['ASeed'] - test['BSeed']
test = test.drop(['ASeed', 'BSeed'], axis=1)
test.head()

Run the app to see the outputs

Press the run button in the top right corner

# One-hot encode the training and test frames together so both end up with
# exactly the same dummy columns, then split them apart again by position.
_n_train = len(X)  # split point, captured before X is rebound below

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
temp = pd.concat([X, test], ignore_index=False, sort=False)
temp = pd.get_dummies(temp, dtype=bool)

X = temp[:_n_train]
test = temp[_n_train:]
# Rebinding `temp` drops this name's reference to the combined frame.
temp = pd.DataFrame

# These columns exist only for training rows (they are NaN for test rows
# after the concat) and must not be fed to the model at prediction time.
test = test.drop(['AScore', 'BScore', 'HomeWin'], axis=1)

# Preview the first rows of the encoded test frame.
test.head()

Run the app to see the outputs

Press the run button in the top right corner

# Preview the first rows of the encoded training frame.
X.head()

Run the app to see the outputs

Press the run button in the top right corner

# LightGBM settings for binary classification scored by log-loss.
# NOTE(review): nothing in the visible code reads this dict (the LightGBM
# imports are commented out) - confirm whether it is still needed.
lgbm_parameters = dict(
    objective='binary',
    metric='binary_logloss',
)

# Train one K-fold CatBoost ensemble per season present in `test` and average
# the fold models' predicted probabilities for that season's test rows.
kf = KFold(n_splits=10, shuffle=True)
_n_folds = kf.get_n_splits()  # used for averaging so it always matches kf

# One slot per test row; filled in by row position so the predictions stay
# aligned with `test` (and therefore with the submission file) regardless of
# the order in which seasons appear.
test_pred = np.zeros(len(test))

for year in test['Season'].unique():
    #X_year = X[(X['Season'] >= year-5)&(X['Season'] <= year+5)]
    # Training rows: every game from other seasons, plus only the
    # regular-season games of the season being predicted (its tournament
    # games are excluded).
    _same_year = X['Season'] == year
    X_year = X[(X['TypeCompetition_Season'] & _same_year) | ~_same_year]
    y_year = X_year['HomeWin']
    X_year = X_year.drop(['AScore', 'BScore', 'HomeWin'], axis=1)

    _year_rows = (test['Season'] == year).to_numpy()
    test_year = test[_year_rows]

    cat_test_pred = np.zeros(len(test_year))
    logloss = []
    for trn_idx, val_idx in kf.split(X_year, y_year):
        x_train_idx = X_year.iloc[trn_idx]
        y_train_idx = y_year.iloc[trn_idx]
        x_valid_idx = X_year.iloc[val_idx]
        y_valid_idx = y_year.iloc[val_idx]

        cat_model = CatBoostClassifier()
        # cat_features are positional: the first three columns are treated as
        # categorical.
        # NOTE(review): presumably Season, ATeamID, BTeamID - this depends on
        # the column order of X; confirm.
        cat_model.fit(
            x_train_idx,
            y_train_idx,
            eval_set=(x_valid_idx, y_valid_idx),
            verbose=False,
            cat_features=[0, 1, 2],
        )

        # Each fold contributes an equal share of the final probability.
        cat_test_pred += cat_model.predict_proba(test_year)[:, 1] / _n_folds
        logloss.append(log_loss(y_valid_idx, cat_model.predict_proba(x_valid_idx)[:, 1]))

    test_pred[_year_rows] = cat_test_pred
    print('Year_Predict:', year, 'Log_Loss:', np.mean(logloss))

Run the app to see the outputs

Press the run button in the top right corner

# Fill the submission's prediction column and write it out without the index.
submission['Pred'] = test_pred
submission.to_csv('submission.csv', index=False)

# Horizontal bar chart of feature importances taken from `cat_model`, i.e.
# the model left over from the last fold of the last season trained.
plt.rcParams["figure.figsize"] = (5, 10)
features = cat_model.get_feature_importance(verbose=True)
sns.barplot(
    x=features,
    y=x_train_idx.columns,
    orient='h',
    palette='rocket',
);

Run the app to see the outputs

Press the run button in the top right corner

# Histogram of the predicted probabilities; the result is bound to `_` so the
# returned object is not echoed as cell output.
_ = sns.histplot(submission['Pred'])

Run the app to see the outputs

Press the run button in the top right corner

# Preview the first rows of the submission frame.
submission.head()

Run the app to see the outputs

Press the run button in the top right corner