#Imports
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (balanced_accuracy_score, roc_auc_score, roc_curve,
                             plot_roc_curve, plot_confusion_matrix,
                             classification_report, confusion_matrix,
                             r2_score, mean_squared_error as MSE)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
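#Note: plot_roc_curve and plot_confusion_matrix were removed in scikit-learn 1.2; on newer
#releases the equivalent calls are RocCurveDisplay.from_estimator and
#ConfusionMatrixDisplay.from_estimator. This notebook assumes a pre-1.2 release.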
#Load data
df = pd.read_csv("data.csv", encoding = "ISO-8859-1")
df.sample(100)
#Check all data
df.describe(include='all')
#Check data shape
df.shape
Model 1
#Select columns for model 1
df1 = df[['good','dist','seas','week','iced']]
df1.shape
#Check missing data
for i in df1.columns:
    print("---- %s ---" % i)
    print(round((df1[i].isna().sum()/len(df1[i])), 3))
#Output: all five columns (good, dist, seas, week, iced) report 0.0 missing values
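#The per-column loop above can also be expressed as a single vectorised call;
#a minimal equivalent sketch (not used downstream):
df1.isna().mean().round(3)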
#Separate into train and test sets
Train, Test = train_test_split(df1, stratify = df1.good,
                               test_size = 0.3,
                               random_state = 345)
display(len(Train),
        round(len(Train)/len(df1), 3),
        len(Test),
        round(len(Test)/len(df1), 3))
#Check trainset
display(Train.groupby('good').size())
round(Train.groupby('good').size()/len(Train), 3)
#Check testset
display(Test.groupby('good').size())
round(Test.groupby('good').size()/len(Test), 3)
#Upsample the minority class in the training set
np.random.seed(345) # to create reproducible results
maj_class = np.where(Train.good == 'Y')[0]
min_class = np.where(Train.good == 'N')[0]
resample = np.random.choice(min_class, size = len(maj_class), replace = True)
TrainDS = pd.concat([Train.iloc[maj_class], Train.iloc[resample]])
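#The block above resamples the minority ('N') rows with replacement until they match the
#majority ('Y') count. A roughly equivalent pandas-only sketch (TrainDS_alt is a
#hypothetical name and is not used downstream):
TrainDS_alt = pd.concat([Train[Train.good == 'Y'],
                         Train[Train.good == 'N'].sample(n = len(maj_class), replace = True, random_state = 345)])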
#Check trainset
display(TrainDS.groupby('good').size())
round(TrainDS.groupby('good').size()/len(TrainDS), 3)
#Scaling for trainset
scaler = RobustScaler()
sca_raw_data_train = TrainDS.drop(columns = 'good').select_dtypes(exclude = ['object', 'category'])
scaler = scaler.fit(sca_raw_data_train)
scaled_data_train = pd.DataFrame(scaler.transform(sca_raw_data_train))
scaled_data_train.columns = sca_raw_data_train.columns
scaled_data_train.index = sca_raw_data_train.index
TrainDS_Sca = pd.concat([TrainDS.drop(sca_raw_data_train.columns, axis = 1), scaled_data_train], axis = 1)
TrainDS_Sca.head()
#Scaling for testset
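#Note: the test set is transformed with the scaler fitted on the training data only,
#so no information from the test set leaks into the scaling parameters.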
sca_raw_data_test = Test.drop(columns = 'good').select_dtypes(exclude = ['object', 'category'])
scaled_data_test = pd.DataFrame(scaler.transform(sca_raw_data_test))
scaled_data_test.columns = sca_raw_data_test.columns
scaled_data_test.index = sca_raw_data_test.index
Test_Sca = pd.concat([Test.drop(sca_raw_data_test.columns, axis = 1), scaled_data_test], axis = 1)
Test_Sca.head()
#Set up data and labels
X_train = TrainDS_Sca.drop(columns = 'good')
y_train = TrainDS_Sca.good
X_test = Test_Sca.drop(columns = 'good')
y_test = Test_Sca.good
pos_label = "Y"
#Set up scoring metric
scoring_metric = 'balanced_accuracy'
#Set up classifiers and tuning parameters
names = ['Logistic Regression']
classifiers = [LogisticRegression(random_state = 345, solver = 'liblinear')]
param_grids = [{'C': [0.01, 0.25, 0.5, 1, 2]}]
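#As an aside, the manual scale-then-fit steps could be folded into a single Pipeline so that
#RobustScaler is refit inside every CV fold; a minimal sketch under that assumption
#(pipe_sketch and search_sketch are hypothetical names and are not used below):
from sklearn.pipeline import Pipeline
pipe_sketch = Pipeline([('scale', RobustScaler()),
                        ('clf', LogisticRegression(random_state = 345, solver = 'liblinear'))])
search_sketch = GridSearchCV(pipe_sketch, {'clf__C': [0.01, 0.25, 0.5, 1, 2]},
                             cv = 5, scoring = scoring_metric, n_jobs = -1)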
#Create empty lists for storing outcomes
models = []
preds = []
probs = []
BAs = []
AUCs = []
FPRs = []
TPRs = []
timings = []
#Train classifiers and generate test predictions/probabilities
for i, eachClassifier in enumerate(classifiers):
    print('Now working on model ', i + 1, ' of ', len(classifiers), ': ', names[i], sep = '')
    #define cross-validation/parameter tuning settings
    search = GridSearchCV(eachClassifier,
                          param_grids[i],
                          cv = 5,
                          scoring = scoring_metric,
                          n_jobs = -1)
    model = search.fit(X_train, y_train)
    pred = search.predict(X_test)
    #keep the positive-class probability itself for ROC/AUC
    #(thresholding it to 0/1 first would discard ranking information)
    prob = search.predict_proba(X_test)[:, 1]
    models.append(model)
    preds.append(pred)
    probs.append(prob)
    BAs.append(balanced_accuracy_score(y_test, pred))
    AUCs.append(roc_auc_score(y_test, prob))
    FPR, TPR, _ = roc_curve(y_test, prob, pos_label = pos_label)
    FPRs.append(FPR)
    TPRs.append(TPR)
    timings.append(model.refit_time_)
print('Finished!')
Now working on model 1 of 1: Logistic Regression
Finished!
results = pd.DataFrame({'Classifier': names,
                        'Balanced Accuracy': BAs,
                        'AUC': AUCs,
                        'TPR': TPRs,
                        'FPR': FPRs,
                        'Refit Time': timings}).sort_values('AUC', ascending = False)
display(round(results[['Classifier', 'Refit Time', 'Balanced Accuracy', 'AUC']], 3))
index = results.index[0]
models[index].best_estimator_
#Check model AUC
def Custom_ROC_Plot(results, X_test, y_test, title, figwidth = 8, figheight = 8):
    fig, ax = plt.subplots(figsize = (figwidth, figheight))
    ax.plot(ax.get_xlim(), ax.get_ylim(), ls = '--', c = 'k')
    ax.set(title = title)
    for i in results.index:
        plot_roc_curve(models[i],
                       X_test,
                       y_test,
                       color = cm.Set1(i),
                       label = results.loc[i, 'Classifier'] + ': {:.3f}'.format(results.loc[i, 'AUC']),
                       ax = ax)
    return fig, ax
fig, ax = Custom_ROC_Plot(results, X_test, y_test, title = 'Test AUC Comparison')
#Check model confusion matrix
plot_confusion_matrix(models[index],
                      X_test,
                      y_test,
                      cmap = plt.cm.Blues,
                      values_format = 'd')
print(classification_report(y_test, preds[index], digits = 3))
              precision    recall  f1-score   support

           N      0.311     0.723     0.435       719
           Y      0.915     0.650     0.760      3287

    accuracy                          0.663      4006
   macro avg      0.613     0.687     0.598      4006
weighted avg      0.807     0.663     0.702      4006
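#Balanced accuracy is the macro-average of the two recalls ((0.723 + 0.650) / 2 ≈ 0.687),
#which is why it differs from the plain accuracy of 0.663 on this imbalanced test set
#(3287 'Y' vs. 719 'N').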
Model 2
#Select columns for model 2 (the model 1 features plus NumKicks)
df2 = df[['good','dist','seas','week','iced','NumKicks']]
df2.shape
#Check missing data
for i in df2.columns:
    print("---- %s ---" % i)
    print(round((df2[i].isna().sum()/len(df2[i])), 3))
#Output: all six columns (good, dist, seas, week, iced, NumKicks) report 0.0 missing values
#Separate into train and test sets
Train, Test = train_test_split(df2, stratify = df2.good,
                               test_size = 0.3,
                               random_state = 345)
display(len(Train),
        round(len(Train)/len(df2), 3),
        len(Test),
        round(len(Test)/len(df2), 3))
#Check trainset
display(Train.groupby('good').size())
round(Train.groupby('good').size()/len(Train), 3)
#Check testset
display(Test.groupby('good').size())
round(Test.groupby('good').size()/len(Test), 3)
#Upsample the minority class in the training set
np.random.seed(345) # to create reproducible results
maj_class = np.where(Train.good == 'Y')[0]
min_class = np.where(Train.good == 'N')[0]
resample = np.random.choice(min_class, size = len(maj_class), replace = True)
TrainDS = pd.concat([Train.iloc[maj_class], Train.iloc[resample]])
#Check trainset
display(TrainDS.groupby('good').size())
round(TrainDS.groupby('good').size()/len(TrainDS), 3)
#Scaling for trainset
scaler = RobustScaler()
sca_raw_data_train = TrainDS.drop(columns = 'good').select_dtypes(exclude = ['object', 'category'])
scaler = scaler.fit(sca_raw_data_train)
scaled_data_train = pd.DataFrame(scaler.transform(sca_raw_data_train))
scaled_data_train.columns = sca_raw_data_train.columns
scaled_data_train.index = sca_raw_data_train.index
TrainDS_Sca = pd.concat([TrainDS.drop(sca_raw_data_train.columns, axis = 1), scaled_data_train], axis = 1)
TrainDS_Sca.head()
#Scaling for testset
sca_raw_data_test = Test.drop(columns = 'good').select_dtypes(exclude = ['object', 'category'])
scaled_data_test = pd.DataFrame(scaler.transform(sca_raw_data_test))
scaled_data_test.columns = sca_raw_data_test.columns
scaled_data_test.index = sca_raw_data_test.index
Test_Sca = pd.concat([Test.drop(sca_raw_data_test.columns, axis = 1), scaled_data_test], axis = 1)
Test_Sca.head()
#Set up data and labels
X_train = TrainDS_Sca.drop(columns = 'good')
y_train = TrainDS_Sca.good
X_test = Test_Sca.drop(columns = 'good')
y_test = Test_Sca.good
pos_label = "Y"
#Set up scoring metric
scoring_metric = 'balanced_accuracy'
#Set up classifiers and tuning parameters
names = ['Logistic Regression']
classifiers = [LogisticRegression(random_state = 345, solver = 'liblinear')]
param_grids = [{'C': [0.01, 0.25, 0.5, 1, 2]}]
#Create empty lists for storing outcomes
models = []
preds = []
probs = []
BAs = []
AUCs = []
FPRs = []
TPRs = []
timings = []
#Train classifiers and generate test predictions/probabilities
for i, eachClassifier in enumerate(classifiers):
    print('Now working on model ', i + 1, ' of ', len(classifiers), ': ', names[i], sep = '')
    #define cross-validation/parameter tuning settings
    search = GridSearchCV(eachClassifier,
                          param_grids[i],
                          cv = 5,
                          scoring = scoring_metric,
                          n_jobs = -1)
    model = search.fit(X_train, y_train)
    pred = search.predict(X_test)
    #keep the positive-class probability itself for ROC/AUC
    #(thresholding it to 0/1 first would discard ranking information)
    prob = search.predict_proba(X_test)[:, 1]
    models.append(model)
    preds.append(pred)
    probs.append(prob)
    BAs.append(balanced_accuracy_score(y_test, pred))
    AUCs.append(roc_auc_score(y_test, prob))
    FPR, TPR, _ = roc_curve(y_test, prob, pos_label = pos_label)
    FPRs.append(FPR)
    TPRs.append(TPR)
    timings.append(model.refit_time_)
print('Finished!')
Now working on model 1 of 1: Logistic Regression
Finished!
results = pd.DataFrame({'Classifier': names,
                        'Balanced Accuracy': BAs,
                        'AUC': AUCs,
                        'TPR': TPRs,
                        'FPR': FPRs,
                        'Refit Time': timings}).sort_values('AUC', ascending = False)
display(round(results[['Classifier', 'Refit Time', 'Balanced Accuracy', 'AUC']], 3))
index = results.index[0]
models[index].best_estimator_
#Check model AUC (reusing Custom_ROC_Plot as defined for model 1)
fig, ax = Custom_ROC_Plot(results, X_test, y_test, title = 'Test AUC Comparison')
#Check model confusion matrix
plot_confusion_matrix(models[index],
                      X_test,
                      y_test,
                      cmap = plt.cm.Blues,
                      values_format = 'd')
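#For parity with the model 1 evaluation, the text report can be printed as well:
print(classification_report(y_test, preds[index], digits = 3))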