import pandas as pd
cbb = pd.read_csv('cbb_refined_2.csv')
cbb.head()
cbb.SUCCESSFUL.mean()
# Features to remove because they serve identification purposes only:
# TEAM, CONF, YEAR, G
cbb_cleaned = cbb.drop(['TEAM','CONF','YEAR','G'], axis = 1)
cbb_cleaned.info()
# SEED, Power_Rating, W (Wins), and WAB (Wins Above Bubble) are not driven by a team's
# game-to-game play; they are determined by the NCAA selection process, so they must be removed.
# There are NO MISSING VALUES.
cbb_cleaned_2 = cbb_cleaned.drop(['SEED','Power_Rating','W','WAB'], axis = 1)
cbb_cleaned_2.info()
cbb_cleaned_2.describe().transpose()
cbb_cleaned_2['SUCCESSFUL'].hist(xrot=45.0)
# The target is heavily imbalanced, which will affect the value of different evaluation metrics
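# A minimal check to quantify the imbalance noted above (assumes SUCCESSFUL is a 0/1 indicator)
cbb_cleaned_2['SUCCESSFUL'].value_counts(normalize=True)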
# Next, use a pairplot to compare the more general metrics before the focused ones
import seaborn as sns
sns.pairplot(cbb_cleaned_2[['Offensive_Efficiency','Defensive_Efficiency','SUCCESSFUL']], hue = 'SUCCESSFUL')
# Observations from the pairplot above:
# Offensive_Efficiency (the offense's ability to produce points) initially appears to play
# a larger role than Defensive_Efficiency in the success of the team (the TARGET)
cbb_cleaned_2.info()
# The following pairplot examines the relationship between more specific offensive features,
# as well as overall offensive efficiency, and the success of the team
sns.pairplot(cbb_cleaned_2[['Offensive_Efficiency','Defensive_Efficiency','EFG_O','EFG_D','Turnover_Rate','OffensiveRebound_Rate','FreeThrowRate','2P_O','3P_O','Tempo','SUCCESSFUL']], hue = 'SUCCESSFUL')
# Drop Offensive_Efficiency, Defensive_Efficiency, and EFG_O due to multicollinearity
cbb_cleansing_done = cbb_cleaned_2.drop(['Offensive_Efficiency','Defensive_Efficiency','EFG_O'], axis = 1)
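# A quick sanity check of the multicollinearity claim above: pairwise correlations
# among the overall efficiency metrics and their shot-level components (a sketch;
# run against cbb_cleaned_2 before the drop)
cbb_cleaned_2[['Offensive_Efficiency','Defensive_Efficiency','EFG_O','EFG_D','2P_O','3P_O']].corr()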
# Trim the data to remove the outliers visible in the pairplot above
cbb_trimmed_tempo = cbb_cleansing_done[cbb_cleansing_done['Tempo']<75]
cbb_trimmed_tempo_turnover = cbb_trimmed_tempo[cbb_trimmed_tempo['Turnover_Rate']<25]
cbb_trimmed_tempo_turnover_3PO = cbb_trimmed_tempo_turnover[cbb_trimmed_tempo_turnover['3P_O']>27]
cbb_final = cbb_trimmed_tempo_turnover_3PO
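# A small bookkeeping check: report how many rows the outlier trimming removed
print(len(cbb_cleansing_done) - len(cbb_final), 'rows trimmed of', len(cbb_cleansing_done))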
cbb_final.info()
cbb_final.corr()['SUCCESSFUL'].sort_values(ascending=False)
# Negative correlation values belong to features where lower is better for the team;
# not yet sure how this will impact the model
# Top 3 features correlated with team success:
# 2-point percentage on offense (2P_O)
# 3-point percentage on offense (3P_O)
# Offensive rebound rate (OffensiveRebound_Rate)
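# To rank features by strength of association regardless of sign, sort by the
# absolute correlation (one simple way to handle the negative values noted above)
cbb_final.corr()['SUCCESSFUL'].abs().sort_values(ascending=False)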
X = cbb_final.drop('SUCCESSFUL', axis=1)
y = cbb_final['SUCCESSFUL']
X.head()
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
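# Given the imbalanced target, a stratified split keeps the positive-class
# proportion consistent across train and test; a sketch with hypothetical
# variable names, not used in the models below:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y)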
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train,y_train)
y_pred_log = logmodel.predict(X_test)
print(confusion_matrix(y_test,y_pred_log))
print(classification_report(y_test,y_pred_log))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
log_roc_auc = roc_auc_score(y_test, logmodel.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = roc_curve(y_test, logmodel.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % log_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)
y_pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test,y_pred_rfc))
print(classification_report(y_test,y_pred_rfc))
# Use predicted probabilities rather than hard labels for a threshold-independent AUC
print('ROC AUC: ', roc_auc_score(y_test, rfc.predict_proba(X_test)[:,1]))
# Decision Tree
from sklearn import tree
dt_model = tree.DecisionTreeClassifier(min_samples_leaf=5, max_depth=3)
dt_model.fit(X_train,y_train)
y_pred_dt = dt_model.predict(X_test)
print(confusion_matrix(y_test,y_pred_dt))
print(classification_report(y_test,y_pred_dt))
print('ROC AUC: ', roc_auc_score(y_test, dt_model.predict_proba(X_test)[:,1]))
# K-Nearest Neighbors (features rescaled to [0, 1] since KNN is distance-based)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_ = scaler.fit_transform(X)
X_rescaled = pd.DataFrame(X_, columns=X.columns)
X_train_rescaled, X_test_rescaled, y_train_rescaled, y_test_rescaled = train_test_split(X_rescaled, y, test_size=0.3, random_state=1)
knn = KNeighborsClassifier(n_neighbors=4, metric='euclidean')
knn.fit(X_train_rescaled, y_train_rescaled)
y_pred_knn = knn.predict(X_test_rescaled)
print(classification_report(y_test_rescaled,y_pred_knn))
print('ROC AUC: ', roc_auc_score(y_test_rescaled, knn.predict_proba(X_test_rescaled)[:,1]))
from sklearn.model_selection import cross_val_score
max_K = 100
cv_scores = []
for K in range(1, max_K):
    knn = KNeighborsClassifier(n_neighbors=K)
    # KNN is distance-based, so cross-validate on the rescaled training data
    scores = cross_val_score(knn, X_train_rescaled, y_train_rescaled.values.ravel(), cv=5, scoring="accuracy")
    cv_scores.append(scores.mean())
sns.lineplot(x=list(range(1, max_K)), y=cv_scores)
print('Max accuracy: ', max(cv_scores), 'Optimal k: ', cv_scores.index(max(cv_scores)) + 1)
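# Refit KNN at the k selected above and evaluate on the held-out rescaled split
# (a minimal follow-up; variable names are illustrative)
best_k = cv_scores.index(max(cv_scores)) + 1
knn_best = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
knn_best.fit(X_train_rescaled, y_train_rescaled)
print(classification_report(y_test_rescaled, knn_best.predict(X_test_rescaled)))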
# Multi-Layer Perceptron
from sklearn.neural_network import MLPClassifier
MLPC_model = MLPClassifier(hidden_layer_sizes=(45,), activation='logistic', solver='adam', random_state=1)
MLPC_model.fit(X_train, y_train)
y_pred_mlpc = MLPC_model.predict(X_test)
print(confusion_matrix(y_test,y_pred_mlpc))
print(classification_report(y_test,y_pred_mlpc))
print('ROC AUC: ', roc_auc_score(y_test, MLPC_model.predict_proba(X_test)[:,1]))
min_hidden_layer_size = 10
max_hidden_layer_size = 100
cv_scores = []
for s in range(min_hidden_layer_size, max_hidden_layer_size, 5):
    MLPC_model = MLPClassifier(hidden_layer_sizes=(s,), activation='relu', solver='adam', random_state=1)
    scores = cross_val_score(MLPC_model, X_train, y_train.values.ravel(), cv=5, scoring="roc_auc")
    cv_scores.append(scores.mean())
# The loop steps by 5 from min_hidden_layer_size, so map the best index back to a layer size
best_size = min_hidden_layer_size + 5 * cv_scores.index(max(cv_scores))
print('Max value ROC AUC: ', max(cv_scores), ' Optimal hidden layer size: ', best_size)
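# Refit the MLP at the hidden layer size selected above and evaluate on the
# test split (a sketch; reuses best_size computed above)
MLPC_best = MLPClassifier(hidden_layer_sizes=(best_size,), activation='relu', solver='adam', random_state=1)
MLPC_best.fit(X_train, y_train)
print('ROC AUC: ', roc_auc_score(y_test, MLPC_best.predict_proba(X_test)[:,1]))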
#Feature Importance Evaluation
!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(logmodel, random_state=1, scoring='roc_auc').fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
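# If eli5 is unavailable, scikit-learn's own permutation importance gives a
# comparable ranking (a sketch using sklearn.inspection.permutation_importance)
from sklearn.inspection import permutation_importance
result = permutation_importance(logmodel, X_test, y_test, scoring='roc_auc', n_repeats=30, random_state=1)
for name, imp in sorted(zip(X_test.columns, result.importances_mean), key=lambda t: -t[1]):
    print(f'{name}: {imp:.4f}')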
cbb_final['FreeThrowRate'].hist(xrot=45.0)
freethrow_bins = [20, 30, 40, 50, 60]
freethrowsummary = cbb_final.groupby(pd.cut(cbb_final['FreeThrowRate'], bins=freethrow_bins))['SUCCESSFUL'].agg(['mean', 'size'])
freethrowsummary.reset_index(inplace=True)
fig2, axs2 = plt.subplots(2, figsize=(8,5))
fig2.suptitle('Success by Freethrow Rate')
axs2[0].bar(freethrowsummary.FreeThrowRate.astype('str'), freethrowsummary['mean'])
axs2[1].bar(freethrowsummary.FreeThrowRate.astype('str'), freethrowsummary['size'])
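# Label the panels so the two bar charts read clearly: success rate on top,
# bin population below (label text is illustrative)
axs2[0].set_ylabel('Success rate')
axs2[1].set_ylabel('Team count')
axs2[1].set_xlabel('FreeThrowRate bin')
plt.show()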
# Teams that reach a free throw rate of at least 40 have a much better chance of reaching the Tournament
cbb_final['2P_O'].hist(xrot=45.0)
twopoint_bins = [40, 45, 50, 55, 60]
twopointsummary = cbb_final.groupby(pd.cut(cbb_final['2P_O'], bins=twopoint_bins))['SUCCESSFUL'].agg(['mean', 'size'])
twopointsummary.reset_index(inplace=True)
twopointsummary
fig2, axs2 = plt.subplots(2, figsize=(8,5))
fig2.suptitle('Success by 2 Point Percentage')
axs2[0].bar(twopointsummary['2P_O'].astype('str'), twopointsummary['mean'])
axs2[1].bar(twopointsummary['2P_O'].astype('str'), twopointsummary['size'])
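# Same panel labeling for the 2P_O figure (label text is illustrative)
axs2[0].set_ylabel('Success rate')
axs2[1].set_ylabel('Team count')
axs2[1].set_xlabel('2P_O bin')
plt.show()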