import pandas as pd
cbb = pd.read_csv('cbb_refined_2.csv')
cbb.head()
cbb.SUCCESSFUL.mean()
# Features to remove because they serve identification purposes only:
# TEAM, CONF, YEAR, G
cbb_cleaned = cbb.drop(['TEAM','CONF','YEAR','G'], axis = 1)
cbb_cleaned.info()
# SEED, Power_Rating, W (Wins), and WAB (Wins Above Bubble) are not driven by a team's
# game-to-game play; they are determined by the NCAA selection process, so they must be removed.
# There are NO MISSING VALUES.
cbb_cleaned_2 = cbb_cleaned.drop(['SEED','Power_Rating','W','WAB'], axis = 1)
cbb_cleaned_2.info()
cbb_cleaned_2.describe().transpose()
cbb_cleaned_2['SUCCESSFUL'].hist(xrot=45.0)
# The target is heavily imbalanced, which will affect the value of different evaluation metrics
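# A minimal check to quantify the imbalance noted above (assumes SUCCESSFUL is a 0/1 indicator)
cbb_cleaned_2['SUCCESSFUL'].value_counts(normalize=True)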
# Next, use a pairplot to compare the more general metrics before the focused ones
import seaborn as sns
sns.pairplot(cbb_cleaned_2[['Offensive_Efficiency','Defensive_Efficiency','SUCCESSFUL']], hue = 'SUCCESSFUL')
# Observations from the pairplot above:
# Offensive_Efficiency (the offense's ability to produce points) initially appears to play
# a larger role than Defensive_Efficiency in the success of the team (the TARGET)
cbb_cleaned_2.info()
# The following pairplot examines the relationship between more specific offensive features,
# as well as overall offensive efficiency, and the success of the team
sns.pairplot(cbb_cleaned_2[['Offensive_Efficiency','Defensive_Efficiency','EFG_O','EFG_D','Turnover_Rate','OffensiveRebound_Rate','FreeThrowRate','2P_O','3P_O','Tempo','SUCCESSFUL']], hue = 'SUCCESSFUL')
# Drop Offensive_Efficiency, Defensive_Efficiency, and EFG_O due to multicollinearity
cbb_cleansing_done = cbb_cleaned_2.drop(['Offensive_Efficiency','Defensive_Efficiency','EFG_O'], axis = 1)
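# A quick sanity check of the multicollinearity claim above: pairwise correlations
# among the overall efficiency metrics and their shot-level components (a sketch;
# run against cbb_cleaned_2 before the drop)
cbb_cleaned_2[['Offensive_Efficiency','Defensive_Efficiency','EFG_O','EFG_D','2P_O','3P_O']].corr()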
# Trim the data to remove the outliers visible in the pairplot above
cbb_trimmed_tempo = cbb_cleansing_done[cbb_cleansing_done['Tempo']<75]
cbb_trimmed_tempo_turnover = cbb_trimmed_tempo[cbb_trimmed_tempo['Turnover_Rate']<25]
cbb_trimmed_tempo_turnover_3PO = cbb_trimmed_tempo_turnover[cbb_trimmed_tempo_turnover['3P_O']>27]
cbb_final = cbb_trimmed_tempo_turnover_3PO
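# A small bookkeeping check: report how many rows the outlier trimming removed
print(len(cbb_cleansing_done) - len(cbb_final), 'rows trimmed of', len(cbb_cleansing_done))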
cbb_final.info()
cbb_final.corr()['SUCCESSFUL'].sort_values(ascending=False)
# Negative correlation values belong to features where lower is better for the team;
# not yet sure how this will impact the model
# Top 3 features correlated with team success:
# 2-point percentage on offense (2P_O)
# 3-point percentage on offense (3P_O)
# Offensive rebound rate (OffensiveRebound_Rate)
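# To rank features by strength of association regardless of sign, sort by the
# absolute correlation (one simple way to handle the negative values noted above)
cbb_final.corr()['SUCCESSFUL'].abs().sort_values(ascending=False)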
X = cbb_final.drop('SUCCESSFUL', axis=1)
y = cbb_final['SUCCESSFUL']
X.head()
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
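# Given the imbalanced target, a stratified split keeps the positive-class
# proportion consistent across train and test; a sketch with hypothetical
# variable names, not used in the models below:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y)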
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train,y_train)
y_pred_log = logmodel.predict(X_test)
print(confusion_matrix(y_test,y_pred_log))
print(classification_report(y_test,y_pred_log))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
log_roc_auc = roc_auc_score(y_test, logmodel.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = roc_curve(y_test, logmodel.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % log_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)
y_pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test,y_pred_rfc))
print(classification_report(y_test,y_pred_rfc))
# Use predicted probabilities rather than hard labels for a threshold-independent AUC
print('ROC AUC: ', roc_auc_score(y_test, rfc.predict_proba(X_test)[:,1]))
# Decision Tree
from sklearn import tree
dt_model = tree.DecisionTreeClassifier(min_samples_leaf=5, max_depth=3)
dt_model.fit(X_train,y_train)
y_pred_dt = dt_model.predict(X_test)
print(confusion_matrix(y_test,y_pred_dt))
print(classification_report(y_test,y_pred_dt))
print('ROC AUC: ', roc_auc_score(y_test, dt_model.predict_proba(X_test)[:,1]))
# K-Nearest Neighbors (features rescaled to [0, 1] since KNN is distance-based)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_ = scaler.fit_transform(X)
X_rescaled = pd.DataFrame(X_, columns=X.columns)
X_train_rescaled, X_test_rescaled, y_train_rescaled, y_test_rescaled = train_test_split(X_rescaled, y, test_size=0.3, random_state=1)
knn = KNeighborsClassifier(n_neighbors=4, metric='euclidean')
knn.fit(X_train_rescaled, y_train_rescaled)
y_pred_knn = knn.predict(X_test_rescaled)
print(classification_report(y_test_rescaled,y_pred_knn))
print('ROC AUC: ', roc_auc_score(y_test_rescaled, knn.predict_proba(X_test_rescaled)[:,1]))
from sklearn.model_selection import cross_val_score
max_K = 100
cv_scores = []
for K in range(1, max_K):
    knn = KNeighborsClassifier(n_neighbors=K)
    # KNN is distance-based, so cross-validate on the rescaled training data
    scores = cross_val_score(knn, X_train_rescaled, y_train_rescaled.values.ravel(), cv=5, scoring="accuracy")
    cv_scores.append(scores.mean())
sns.lineplot(x=list(range(1, max_K)), y=cv_scores)
print('Max accuracy: ', max(cv_scores), 'Optimal k: ', cv_scores.index(max(cv_scores)) + 1)
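# Refit KNN at the k selected above and evaluate on the held-out rescaled split
# (a minimal follow-up; variable names are illustrative)
best_k = cv_scores.index(max(cv_scores)) + 1
knn_best = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
knn_best.fit(X_train_rescaled, y_train_rescaled)
print(classification_report(y_test_rescaled, knn_best.predict(X_test_rescaled)))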
# Multi-Layer Perceptron
from sklearn.neural_network import MLPClassifier
MLPC_model = MLPClassifier(hidden_layer_sizes=(45,), activation='logistic', solver='adam', random_state=1)
MLPC_model.fit(X_train, y_train)
y_pred_mlpc = MLPC_model.predict(X_test)
print(confusion_matrix(y_test,y_pred_mlpc))
print(classification_report(y_test,y_pred_mlpc))
print('ROC AUC: ', roc_auc_score(y_test, MLPC_model.predict_proba(X_test)[:,1]))
min_hidden_layer_size = 10
max_hidden_layer_size = 100
cv_scores = []
for s in range(min_hidden_layer_size, max_hidden_layer_size, 5):
    MLPC_model = MLPClassifier(hidden_layer_sizes=(s,), activation='relu', solver='adam', random_state=1)
    scores = cross_val_score(MLPC_model, X_train, y_train.values.ravel(), cv=5, scoring="roc_auc")
    cv_scores.append(scores.mean())
# The loop steps by 5 from min_hidden_layer_size, so map the best index back to a layer size
best_size = min_hidden_layer_size + 5 * cv_scores.index(max(cv_scores))
print('Max value ROC AUC: ', max(cv_scores), ' Optimal hidden layer size: ', best_size)
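# Refit the MLP at the hidden layer size selected above and evaluate on the
# test split (a sketch; reuses best_size computed above)
MLPC_best = MLPClassifier(hidden_layer_sizes=(best_size,), activation='relu', solver='adam', random_state=1)
MLPC_best.fit(X_train, y_train)
print('ROC AUC: ', roc_auc_score(y_test, MLPC_best.predict_proba(X_test)[:,1]))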
#Feature Importance Evaluation
!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(logmodel, random_state=1, scoring='roc_auc').fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
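# If eli5 is unavailable, scikit-learn's own permutation importance gives a
# comparable ranking (a sketch using sklearn.inspection.permutation_importance)
from sklearn.inspection import permutation_importance
result = permutation_importance(logmodel, X_test, y_test, scoring='roc_auc', n_repeats=30, random_state=1)
for name, imp in sorted(zip(X_test.columns, result.importances_mean), key=lambda t: -t[1]):
    print(f'{name}: {imp:.4f}')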
cbb_final['FreeThrowRate'].hist(xrot=45.0)
freethrow_bins = [20, 30, 40, 50, 60]
freethrowsummary = cbb_final.groupby(pd.cut(cbb_final['FreeThrowRate'], bins=freethrow_bins))['SUCCESSFUL'].agg(['mean', 'size'])
freethrowsummary.reset_index(inplace=True)
fig2, axs2 = plt.subplots(2, figsize=(8,5))
fig2.suptitle('Success by Freethrow Rate')
axs2[0].bar(freethrowsummary.FreeThrowRate.astype('str'), freethrowsummary['mean'])
axs2[1].bar(freethrowsummary.FreeThrowRate.astype('str'), freethrowsummary['size'])
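# Label the panels so the two bar charts read clearly: success rate on top,
# bin population below (label text is illustrative)
axs2[0].set_ylabel('Success rate')
axs2[1].set_ylabel('Team count')
axs2[1].set_xlabel('FreeThrowRate bin')
plt.show()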
# Teams that reach a free throw rate of at least 40 have a much better chance of reaching the Tournament
cbb_final['2P_O'].hist(xrot=45.0)
twopoint_bins = [40, 45, 50, 55, 60]
twopointsummary = cbb_final.groupby(pd.cut(cbb_final['2P_O'], bins=twopoint_bins))['SUCCESSFUL'].agg(['mean', 'size'])
twopointsummary.reset_index(inplace=True)
twopointsummary
fig2, axs2 = plt.subplots(2, figsize=(8,5))
fig2.suptitle('Success by 2 Point Percentage')
axs2[0].bar(twopointsummary['2P_O'].astype('str'), twopointsummary['mean'])
axs2[1].bar(twopointsummary['2P_O'].astype('str'), twopointsummary['size'])
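# Same panel labeling for the 2P_O figure (label text is illustrative)
axs2[0].set_ylabel('Success rate')
axs2[1].set_ylabel('Team count')
axs2[1].set_xlabel('2P_O bin')
plt.show()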