import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
sklearn.__version__
pd.set_option('display.max_columns', 100)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
# Load the UCI bank-marketing dataset (semicolon-delimited CSV).
path = "sample_data/bank-additional-full.csv"
df = pd.read_csv(path,sep=";")
# First look: sample rows, dimensions, and per-column dtypes / null counts.
df.head()
df.shape
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 41188 non-null int64
1 job 41188 non-null object
2 marital 41188 non-null object
3 education 41188 non-null object
4 default 41188 non-null object
5 housing 41188 non-null object
6 loan 41188 non-null object
7 contact 41188 non-null object
8 month 41188 non-null object
9 day_of_week 41188 non-null object
10 duration 41188 non-null int64
11 campaign 41188 non-null int64
12 pdays 41188 non-null int64
13 previous 41188 non-null int64
14 poutcome 41188 non-null object
15 emp.var.rate 41188 non-null float64
16 cons.price.idx 41188 non-null float64
17 cons.conf.idx 41188 non-null float64
18 euribor3m 41188 non-null float64
19 nr.employed 41188 non-null float64
20 y 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usage: 6.6+ MB
# No NaN values are present, but the categorical columns use the literal
# string "unknown" as a missing-value marker (counted per column below).
df.isnull().sum()
# Distinct values of the target variable.
df['y'].unique()
# How many clients subscribed to a term deposit?
df.y.value_counts()
sns.set(font_scale=1.5)
countplt=sns.countplot(x='y', data=df, palette ='Set1')
plt.show()
# Count duplicated rows.
df.duplicated().sum()
# Drop the duplicated rows.
df = df.drop_duplicates()
# Target distribution after de-duplication.
df.y.value_counts()
# Count the "unknown" marker in each categorical column.
print("# Missing value 'job' variable: {0}".format(len(df.loc[df['job'] == "unknown"])))
print("# Missing value 'marital' variable: {0}".format(len(df.loc[df['marital'] == "unknown"])))
print("# Missing value 'education' variable: {0}".format(len(df.loc[df['education'] == "unknown"])))
print("# Missing value 'default' variable: {0}".format(len(df.loc[df['default'] == "unknown"])))
print("# Missing value 'housing' variable: {0}".format(len(df.loc[df['housing'] == "unknown"])))
print("# Missing value 'loan' variable: {0}".format(len(df.loc[df['loan'] == "unknown"])))
print("# Missing value 'contact' variable: {0}".format(len(df.loc[df['contact'] == "unknown"])))
print("# Missing value 'month' variable: {0}".format(len(df.loc[df['month'] == "unknown"])))
print("# Missing value 'day_of_week' variable: {0}".format(len(df.loc[df['day_of_week'] == "unknown"])))
print("# Missing value 'poutcome' variable: {0}".format(len(df.loc[df['poutcome'] == "unknown"])))
# Missing value 'job' variable: 330
# Missing value 'marital' variable: 80
# Missing value 'education' variable: 1730
# Missing value 'default' variable: 8596
# Missing value 'housing' variable: 990
# Missing value 'loan' variable: 990
# Missing value 'contact' variable: 0
# Missing value 'month' variable: 0
# Missing value 'day_of_week' variable: 0
# Missing value 'poutcome' variable: 0
# Inspect the rows whose marital status is unknown before removing them.
df[df['marital'] == "unknown"]
# Drop rows with unknown marital status (only 80 rows, per the counts above).
df.drop(df[df['marital'] == "unknown"].index, inplace=True)
df.shape
# How many clients have credit in default?
df[df['default'] == "yes"].shape
df.education.value_counts()
df.job.value_counts()
df.describe()
# Boxplot of all numeric columns to eyeball outliers.
plt.figure(figsize=(20,10))
df.boxplot()
plt.title("Boxplot of the dataframe", fontsize = 15)
print()
# Closer look at the duration variable.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['duration'])
plt.show()
# Closer look at the age variable.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['age'])
plt.show()
sns.boxplot(x = 'y', y = 'age', data = df)
# Closer look at the campaign variable.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['campaign'])
plt.show()
sns.boxplot(x = 'y', y = 'campaign', data = df)
# Quartiles and inter-quartile ranges for duration (_d), age (_a), campaign (_c).
Q1_d = df['duration'].quantile(.25)
Q3_d = df['duration'].quantile(.75)
Q1_a = df['age'].quantile(.25)
Q3_a = df['age'].quantile(.75)
Q1_c = df['campaign'].quantile(.25)
Q3_c = df['campaign'].quantile(.75)
IQR_d = Q3_d - Q1_d
IQR_a = Q3_a - Q1_a
IQR_c = Q3_c - Q1_c
print(IQR_d)
print(IQR_a)
print(IQR_c)
217.0
15.0
2.0
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers.
lower_d = Q1_d - 1.5 * IQR_d
upper_d = Q3_d + 1.5 * IQR_d
lower_a = Q1_a - 1.5 * IQR_a
upper_a = Q3_a + 1.5 * IQR_a
# BUG FIX: the campaign fences were previously built from IQR_d (duration's
# IQR), giving the nonsensical bounds -324.5 / 328.5; each variable's fence
# must use its own IQR.
lower_c = Q1_c - 1.5 * IQR_c
upper_c = Q3_c + 1.5 * IQR_c
print(lower_d,upper_d)
print(lower_a,upper_a)
print(lower_c,upper_c)
-223.5 644.5
9.5 69.5
-324.5 328.5
# Build a new dataframe with rows outside the Tukey fences removed.
# BUG FIX: the original re-assigned df_out from df for every single
# condition, so only the final filter (campaign <= upper_c) actually
# applied.  All six bounds are combined into one mask so every filter
# takes effect.
outlier_mask = (
    df['duration'].between(lower_d, upper_d)
    & df['age'].between(lower_a, upper_a)
    & df['campaign'].between(lower_c, upper_c)
)
df_out = df[outlier_mask]
df_out.describe()
# Pairwise Pearson correlation of the numeric columns.
corr_matrix = df.corr()
print(corr_matrix)
age duration campaign pdays previous \
age 1.000000 -0.001414 0.004117 -0.035040 0.024821
duration -0.001414 1.000000 -0.071767 -0.047526 0.020336
campaign 0.004117 -0.071767 1.000000 0.052477 -0.079105
pdays -0.035040 -0.047526 0.052477 1.000000 -0.587662
previous 0.024821 0.020336 -0.079105 -0.587662 1.000000
emp.var.rate -0.000642 -0.027923 0.150454 0.271068 -0.420431
cons.price.idx 0.000665 0.005598 0.127120 0.078883 -0.202701
cons.conf.idx 0.129279 -0.008223 -0.013610 -0.091437 -0.051025
euribor3m 0.010417 -0.032824 0.134818 0.296796 -0.454390
nr.employed -0.018144 -0.044734 0.143812 0.372446 -0.501320
emp.var.rate cons.price.idx cons.conf.idx euribor3m \
age -0.000642 0.000665 0.129279 0.010417
duration -0.027923 0.005598 -0.008223 -0.032824
campaign 0.150454 0.127120 -0.013610 0.134818
pdays 0.271068 0.078883 -0.091437 0.296796
previous -0.420431 -0.202701 -0.051025 -0.454390
emp.var.rate 1.000000 0.775100 0.196273 0.972238
cons.price.idx 0.775100 1.000000 0.059165 0.687925
cons.conf.idx 0.196273 0.059165 1.000000 0.277851
euribor3m 0.972238 0.687925 0.277851 1.000000
nr.employed 0.906926 0.521625 0.100622 0.945138
nr.employed
age -0.018144
duration -0.044734
campaign 0.143812
pdays 0.372446
previous -0.501320
emp.var.rate 0.906926
cons.price.idx 0.521625
cons.conf.idx 0.100622
euribor3m 0.945138
nr.employed 1.000000
# Correlation heat map of all numeric features.
plt.figure(figsize=(16,10))
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1,fmt='.2f')
plt.title("Correlation Heatmap", fontsize = 15)
plt.show()
# Heat map restricted to highly correlated variables (threshold 0.9).
plt.figure(figsize=(16,10))
sns.heatmap(corr_matrix[corr_matrix > 0.9], annot=True)
plt.title("Correlation Heatmap", fontsize = 15)
plt.show()
# Encode the target: yes -> 1, no -> 0.
df['y'] = (df['y']=='yes').astype(int)
df.y.value_counts()
# FIX: sns.distplot is deprecated (see the FutureWarning emitted below in the
# original run); histplot with kde=True and stat="density" is the supported
# equivalent.
sns.histplot(df['age'], color = 'green', kde=True, stat="density")
plt.title('Customer Age Distribution', fontsize = 18)
plt.xlabel('Age', fontsize = 10)
plt.ylabel('count')
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
# Age distribution by target class.
plt.figure(figsize=(11,7))
df[df['y']==1]['age'].hist(alpha = 0.5, color = 'red', bins= 50, label='y=1')
df[df['y']==0]['age'].hist(alpha = 0.5, color = 'blue', bins= 50, label='y=0')
plt.legend()
plt.xlabel('age')
# Campaign-contact count by target class.
plt.figure(figsize=(11,7))
df[df['y']==1]['campaign'].hist(alpha = 0.5, color = 'red', bins= 40, label='y=1')
df[df['y']==0]['campaign'].hist(alpha =0.5, color = 'blue', bins= 40, label='y=0')
plt.legend()
plt.xlabel('campaign')
# Call duration by target class.
plt.figure(figsize=(15,7))
df[df['y']==1]['duration'].hist(alpha = 0.5, color = 'red', bins= 50, label='y=1')
df[df['y']==0]['duration'].hist(alpha = 0.5, color = 'blue', bins= 50, label='y=0')
plt.legend()
plt.xlabel('duration')
# Joint distribution of age vs campaign, and per-marital-status trends.
sns.jointplot(x='age', y='campaign', data=df, color = 'green', alpha=0.2)
plt.figure(figsize=(11,7))
sns.lmplot(y='campaign',x='age',hue = 'y', data=df,col='marital',palette='Set1')
plt.figure(figsize=(7,7))
# FIX: sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# supported replacement.
sns.histplot(df['emp.var.rate'], kde=True, stat="density")
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
# Categorical columns used for the count plots and, later, one-hot encoding.
cat_list = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
# Count plot of every categorical feature, split by target class.
for column in cat_list:
    plt.figure(figsize=(21,9))
    sns.countplot(x = column, data = df, hue = 'y', palette = 'Set1')
    plt.title('Barplot of '+column)
    plt.show()
# Same plots without the target split.
for column in cat_list:
    plt.figure(figsize=(21,9))
    sns.countplot(x = column, data = df, palette = 'Set1')
    plt.title('Barplot of '+column)
    plt.show()
# One-hot encode the categorical features and drop any remaining NaN rows.
cat_list = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
new_df = pd.get_dummies(df, columns = cat_list)
data = new_df.dropna()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 41096 entries, 0 to 41187
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 41096 non-null int64
1 duration 41096 non-null int64
2 campaign 41096 non-null int64
3 pdays 41096 non-null int64
4 previous 41096 non-null int64
5 emp.var.rate 41096 non-null float64
6 cons.price.idx 41096 non-null float64
7 cons.conf.idx 41096 non-null float64
8 euribor3m 41096 non-null float64
9 nr.employed 41096 non-null float64
10 y 41096 non-null int64
11 job_admin. 41096 non-null uint8
12 job_blue-collar 41096 non-null uint8
13 job_entrepreneur 41096 non-null uint8
14 job_housemaid 41096 non-null uint8
15 job_management 41096 non-null uint8
16 job_retired 41096 non-null uint8
17 job_self-employed 41096 non-null uint8
18 job_services 41096 non-null uint8
19 job_student 41096 non-null uint8
20 job_technician 41096 non-null uint8
21 job_unemployed 41096 non-null uint8
22 job_unknown 41096 non-null uint8
23 marital_divorced 41096 non-null uint8
24 marital_married 41096 non-null uint8
25 marital_single 41096 non-null uint8
26 education_basic.4y 41096 non-null uint8
27 education_basic.6y 41096 non-null uint8
28 education_basic.9y 41096 non-null uint8
29 education_high.school 41096 non-null uint8
30 education_illiterate 41096 non-null uint8
31 education_professional.course 41096 non-null uint8
32 education_university.degree 41096 non-null uint8
33 education_unknown 41096 non-null uint8
34 default_no 41096 non-null uint8
35 default_unknown 41096 non-null uint8
36 default_yes 41096 non-null uint8
37 housing_no 41096 non-null uint8
38 housing_unknown 41096 non-null uint8
39 housing_yes 41096 non-null uint8
40 loan_no 41096 non-null uint8
41 loan_unknown 41096 non-null uint8
42 loan_yes 41096 non-null uint8
43 contact_cellular 41096 non-null uint8
44 contact_telephone 41096 non-null uint8
45 month_apr 41096 non-null uint8
46 month_aug 41096 non-null uint8
47 month_dec 41096 non-null uint8
48 month_jul 41096 non-null uint8
49 month_jun 41096 non-null uint8
50 month_mar 41096 non-null uint8
51 month_may 41096 non-null uint8
52 month_nov 41096 non-null uint8
53 month_oct 41096 non-null uint8
54 month_sep 41096 non-null uint8
55 day_of_week_fri 41096 non-null uint8
56 day_of_week_mon 41096 non-null uint8
57 day_of_week_thu 41096 non-null uint8
58 day_of_week_tue 41096 non-null uint8
59 day_of_week_wed 41096 non-null uint8
60 poutcome_failure 41096 non-null uint8
61 poutcome_nonexistent 41096 non-null uint8
62 poutcome_success 41096 non-null uint8
dtypes: float64(5), int64(6), uint8(52)
memory usage: 5.8 MB
# Peek at the fully encoded dataset.
data.head()
data.shape
# Duration split by target class (sym='' hides outlier markers).
df[['duration', 'y']].boxplot(by=['y'], sym ='', figsize = [6, 6])
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
return array(a, dtype, copy=False, order=order)
# Split the dataset into train/test (70/30) with a fixed seed for reproducibility.
X = data.drop('y', axis=1)
y = data.y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 555, test_size= 0.30)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
# Standardize the features.  The scaler is fitted on the training split only,
# so no test-set statistics leak into the transformation.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
def model_perf_v1(model,X_train, X_val, y_train, y_val):
    """Print, plot, and return a fitted classifier's performance metrics.

    Prints accuracy on the training and validation splits, draws a
    confusion-matrix heatmap, then prints recall, specificity, precision,
    balanced accuracy and F1 score computed on the validation split.

    Parameters
    ----------
    model : fitted scikit-learn style classifier (must support .predict)
    X_train, X_val : feature matrices for the train / validation splits
    y_train, y_val : corresponding binary label vectors (1 = subscribed)

    Returns
    -------
    tuple
        (accuracy, recall, specificity, precision, f1_score,
        balanced_accuracy) — note that F1 precedes balanced accuracy in the
        return order even though it is printed after it; callers must
        unpack in this order.
    """
    pred_dt = model.predict(X_val)
    print("Accuracy on training set:")
    pred = model.predict(X_train)
    print(metrics.accuracy_score(y_true = y_train, y_pred = pred))
    #
    print("Accuracy on testing set:")
    accuracy = (metrics.accuracy_score(y_true = y_val, y_pred = pred_dt))
    print(accuracy)
    # Confusion matrix (rows = expected labels, columns = predicted labels).
    confusion_matrix_ = pd.crosstab(index=y_val, columns=pred_dt.ravel(), rownames=['Expected'], colnames=['Predicted'])
    # Visualization of the confusion matrix.
    sns.heatmap(confusion_matrix_, annot=True, square=False, fmt='', cbar=False)
    plt.title("Confusion Matrix", fontsize = 15)
    plt.show()
    #
    print("Recall:")
    recall = (metrics.recall_score(y_val,pred_dt))
    print(recall)
    # Specificity = true-negative rate = TN / (TN + FP).
    print("Specificity:")
    tn, fp, fn, tp = confusion_matrix(y_val,pred_dt).ravel()
    spec = tn/(tn+fp)
    Specificity = (spec)
    print(Specificity)
    # #
    print("Precision:")
    Precision = (metrics.precision_score(y_val,pred_dt))
    print(Precision)
    # #
    print("Balanced Accuracy:")
    Balanced_Accuracy = (metrics.balanced_accuracy_score(y_val,pred_dt))
    print(Balanced_Accuracy)
    # #
    print("F1 score:")
    F1_score = (metrics.f1_score(y_val,pred_dt))
    print(F1_score)
    return accuracy,recall,Specificity,Precision,F1_score,Balanced_Accuracy
def model_perf_to_lst(model,X_val, y_val):
    """Evaluate *model* on a validation split and return the metrics as a list.

    The list layout is [model repr, accuracy, recall, specificity,
    precision, balanced accuracy, F1 score], matching the columns of the
    ``best_cl_normal`` summary DataFrame built by the caller.
    """
    predictions = model.predict(X_val)
    # Specificity (true-negative rate) is derived from the confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_val, predictions).ravel()
    return [
        str(model),
        metrics.accuracy_score(y_true=y_val, y_pred=predictions),
        metrics.recall_score(y_val, predictions),
        tn / (tn + fp),
        metrics.precision_score(y_val, predictions),
        metrics.balanced_accuracy_score(y_val, predictions),
        metrics.f1_score(y_val, predictions),
    ]
# Summary table collecting each baseline classifier's test-set performance.
best_cl_normal = pd.DataFrame(columns = ['Model','Accuracy','Recall', 'Specificity', 'Precision', 'Balanced Accuracy', 'F1 score'])
# Baseline random forest with default hyper-parameters.
rf = RandomForestClassifier()
# Fitting the model on the (scaled) training split.
rf.fit(X_train,y_train)
accuracy_rf, recall_rf, Specificity_rf, Precision_rf, F1_score_rf, Balanced_Accuracy_rf = model_perf_v1(rf,X_train,X_test,y_train,y_test)
rf_perf = model_perf_to_lst(rf, X_test, y_test)
Accuracy on training set:
0.9999652379462579
Accuracy on testing set:
0.9147538324276097
Recall:
0.4593406593406593
Specificity:
0.971452024808464
Precision:
0.6670212765957447
Balanced Accuracy:
0.7153963420745617
F1 score:
0.5440347071583513
# Append the random-forest row, then fit a baseline decision tree.
best_cl_normal.loc[len(best_cl_normal)] = rf_perf
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
accuracy_dt,recall_dt,Specificity_dt,Precision_dt,F1_score_dt,Balanced_Accuracy_dt = model_perf_v1(dt,X_train,X_test,y_train,y_test)
dt_perf = model_perf_to_lst(dt, X_test, y_test)
Accuracy on training set:
1.0
Accuracy on testing set:
0.8952875334577014
Recall:
0.5472527472527473
Specificity:
0.9386172929587742
Precision:
0.526056338028169
Balanced Accuracy:
0.7429350201057607
F1 score:
0.5364452423698385
# Append the decision-tree row, then fit logistic regression
# (max_iter raised so lbfgs converges on the scaled features).
best_cl_normal.loc[len(best_cl_normal)] = dt_perf
log = LogisticRegression(max_iter=2500)
log.fit(X_train, y_train)
accuracy_log, recall_log, Specificity_log, Precision_log, F1_score_log, Balanced_Accuracy_log = model_perf_v1(log,X_train,X_test,y_train,y_test)
log_perf = model_perf_to_lst(log, X_test, y_test)
Accuracy on training set:
0.9109396183126499
Accuracy on testing set:
0.911996106740206
Recall:
0.43736263736263736
Specificity:
0.9710871944545786
Precision:
0.6531728665207878
Balanced Accuracy:
0.704224915908608
F1 score:
0.5239139973672663
# Append the logistic-regression row, then fit a 3-nearest-neighbours model.
best_cl_normal.loc[len(best_cl_normal)] = log_perf
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
accuracy_knn, recall_knn, Specificity_knn, Precision_knn, F1_score_knn, Balanced_Accuracy_knn = model_perf_v1(knn,X_train,X_test,y_train,y_test)
knn_perf = model_perf_to_lst(knn, X_test, y_test)
Accuracy on training set:
0.9324225675252894
Accuracy on testing set:
0.8947197664044123
Recall:
0.326007326007326
Specificity:
0.9655235315578257
Precision:
0.5407047387606319
Balanced Accuracy:
0.6457654287825758
F1 score:
0.40676416819012795
# Append the KNN row, then fit Gaussian naive Bayes.
best_cl_normal.loc[len(best_cl_normal)] = knn_perf
gnb = GaussianNB()
gnb.fit(X_train, y_train)
accuracy_gnb, recall_gnb, Specificity_gnb, Precision_gnb, F1_score_gnb, Balanced_Accuracy_gnb = model_perf_v1(gnb,X_train,X_test,y_train,y_test)
gnb_perf = model_perf_to_lst(gnb, X_test, y_test)
Accuracy on training set:
0.7343136232488615
Accuracy on testing set:
0.7329061562170492
Recall:
0.819047619047619
Specificity:
0.722181685516235
Precision:
0.26849183477425553
Balanced Accuracy:
0.770614652281927
F1 score:
0.40441309459215047
# Append the naive-Bayes row, then fit gradient boosting.
best_cl_normal.loc[len(best_cl_normal)] = gnb_perf
grb = GradientBoostingClassifier()
grb.fit(X_train, y_train)
accuracy_grb, recall_grb, Specificity_grb, Precision_grb, F1_score_grb, Balanced_Accuracy_grb = model_perf_v1(grb,X_train,X_test,y_train,y_test)
grb_perf = model_perf_to_lst(grb, X_test, y_test)
Accuracy on training set:
0.9220634755101331
Accuracy on testing set:
0.9181604347473437
Recall:
0.5523809523809524
Specificity:
0.9636993797883984
Precision:
0.6545138888888888
Balanced Accuracy:
0.7580401660846754
F1 score:
0.5991259435836314
# Append the gradient-boosting row and display the full baseline comparison.
best_cl_normal.loc[len(best_cl_normal)] = grb_perf
best_cl_normal
# Helper that grid-searches hyper-parameters for five classifier pipelines.
def grid_search_helper(X=None, y=None):
    """Run 3-fold GridSearchCV over five single-step classifier pipelines.

    Pipelines (in index order): DecisionTree, LogisticRegression,
    KNeighbors, GaussianNB, GradientBoosting — each paired with its own
    parameter grid.

    Parameters
    ----------
    X, y : array-like, optional
        Training features / labels.  Default to the module-level
        ``X_train`` / ``y_train``.  The original read those globals
        implicitly; the optional parameters remove the hidden dependency
        while keeping existing zero-argument calls working unchanged.

    Returns
    -------
    dict
        Maps pipeline index (0-4, in the order above) to the best fitted
        estimator found by its grid search.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    pipeline1 = Pipeline((('clf', DecisionTreeClassifier()),))
    pipeline2 = Pipeline((('clf', LogisticRegression()),))
    pipeline3 = Pipeline((('clf', KNeighborsClassifier()),))
    pipeline4 = Pipeline((('clf', GaussianNB()),))
    pipeline5 = Pipeline((('clf', GradientBoostingClassifier()),))
    parameters1 = {
        'clf__min_samples_split': [5, 10, 20, 30, 40, 50],
        'clf__max_depth': list(range(1,15))
    }
    parameters2 = {
        'clf__C': np.logspace(0, 4, 10)
    }
    parameters3 = {
        'clf__n_neighbors': [1,3,5,7,9,11,15]
    }
    parameters4 = {
        'clf__var_smoothing': np.logspace(0,-9)
    }
    parameters5 = {
        'clf__n_estimators':[1, 2, 5, 10, 20, 50],
        'clf__learning_rate':[0.1, 0.3, 0.5, 0.7, 1]
    }
    pars = [parameters1, parameters2, parameters3, parameters4, parameters5]
    pips = [pipeline1, pipeline2, pipeline3, pipeline4, pipeline5]
    print("starting Gridsearch")
    dict_best_params = {}
    for i in range(len(pars)):
        print(pars[i])
        print(pips[i])
        gs = GridSearchCV(pips[i], pars[i], cv= 3, n_jobs=-1)
        gs.fit(X, y)
        print("finished Gridsearch\n")
        dict_best_params[i] = gs.best_estimator_
    return dict_best_params
best_params_dict = grid_search_helper()
starting Gridsearch
{'clf__min_samples_split': [5, 10, 20, 30, 40, 50], 'clf__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]}
Pipeline(memory=None,
steps=(('clf',
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort='deprecated', random_state=None,
splitter='best')),),
verbose=False)
finished Gridsearch
{'clf__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
3.59381366e+03, 1.00000000e+04])}
Pipeline(memory=None,
steps=(('clf',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None,
penalty='l2', random_state=None,
solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)),),
verbose=False)
finished Gridsearch
{'clf__n_neighbors': [1, 3, 5, 7, 9, 11, 15]}
Pipeline(memory=None,
steps=(('clf',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform')),),
verbose=False)
finished Gridsearch
{'clf__var_smoothing': array([1.00000000e+00, 6.55128557e-01, 4.29193426e-01, 2.81176870e-01,
1.84206997e-01, 1.20679264e-01, 7.90604321e-02, 5.17947468e-02,
3.39322177e-02, 2.22299648e-02, 1.45634848e-02, 9.54095476e-03,
6.25055193e-03, 4.09491506e-03, 2.68269580e-03, 1.75751062e-03,
1.15139540e-03, 7.54312006e-04, 4.94171336e-04, 3.23745754e-04,
2.12095089e-04, 1.38949549e-04, 9.10298178e-05, 5.96362332e-05,
3.90693994e-05, 2.55954792e-05, 1.67683294e-05, 1.09854114e-05,
7.19685673e-06, 4.71486636e-06, 3.08884360e-06, 2.02358965e-06,
1.32571137e-06, 8.68511374e-07, 5.68986603e-07, 3.72759372e-07,
2.44205309e-07, 1.59985872e-07, 1.04811313e-07, 6.86648845e-08,
4.49843267e-08, 2.94705170e-08, 1.93069773e-08, 1.26485522e-08,
8.28642773e-09, 5.42867544e-09, 3.55648031e-09, 2.32995181e-09,
1.52641797e-09, 1.00000000e-09])}
Pipeline(memory=None,
steps=(('clf', GaussianNB(priors=None, var_smoothing=1e-09)),),
verbose=False)
finished Gridsearch
{'clf__n_estimators': [1, 2, 5, 10, 20, 50], 'clf__learning_rate': [0.1, 0.3, 0.5, 0.7, 1]}
Pipeline(memory=None,
steps=(('clf',
GradientBoostingClassifier(ccp_alpha=0.0,
criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance',
max_depth=3, max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_iter_no_change=None,
presort='deprecated',
random_state=None, subsample=1.0,
tol=0.0001, validation_fraction=0.1,
verbose=0, warm_start=False)),),
verbose=False)
finished Gridsearch
# Evaluate the grid-searched decision tree (best_params_dict[0]) on the test split.
accuracy_dt_h,recall_dt_h,Specificity_dt_h,Precision_dt_h,F1_score_dt_h,Balanced_Accuracy_dt_h = model_perf_v1(best_params_dict[0],X_train,X_test,y_train,y_test)
Accuracy on training set:
0.9174401223624292
Accuracy on testing set:
0.91540270905994
Recall:
0.5025641025641026
Specificity:
0.9668004377964247
Precision:
0.6533333333333333
Balanced Accuracy:
0.7346822701802636
F1 score:
0.5681159420289855
# Evaluate the grid-searched logistic regression (best_params_dict[1]) on the test split.
accuracy_log_h, recall_log_h, Specificity_log_h, Precision_log_h, F1_score_log_h, Balanced_Accuracy_log_h = model_perf_v1(best_params_dict[1],X_train,X_test,y_train,y_test)
Accuracy on training set:
0.910696283936455
Accuracy on testing set:
0.9120772163192473
Recall:
0.4380952380952381
Specificity:
0.9710871944545786
Precision:
0.653551912568306
Balanced Accuracy:
0.7045912162749084
F1 score:
0.524561403508772
# Evaluate the grid-searched KNN model (best_params_dict[2]) on the test split.
accuracy_knn_h, recall_knn_h, Specificity_knn_h, Precision_knn_h, F1_score_knn_h, Balanced_Accuracy_knn_h = model_perf_v1(best_params_dict[2],X_train,X_test,y_train,y_test)
Accuracy on training set:
0.9051343553377134
Accuracy on testing set:
0.9003974369373023
Recall:
0.2608058608058608
Specificity:
0.980025538124772
Precision:
0.6191304347826087
Balanced Accuracy:
0.6204156994653164
F1 score:
0.3670103092783505
# Evaluate the grid-searched Gaussian NB model (best_params_dict[3]) on the test split.
accuracy_gnb_h, recall_gnb_h, Specificity_gnb_h, Precision_gnb_h, F1_score_gnb_h, Balanced_Accuracy_gnb_h = model_perf_v1(best_params_dict[3],X_train,X_test,y_train,y_test)
Accuracy on training set:
0.8837209302325582
Accuracy on testing set:
0.8831210966015086
Recall:
0.4197802197802198
Specificity:
0.9408062750820868
Precision:
0.46890343698854337
Balanced Accuracy:
0.6802932474311533
F1 score:
0.4429841515268651
# Evaluate the grid-searched gradient boosting model (best_params_dict[4]) on the test split.
accuracy_grb_h, recall_grb_h, Specificity_grb_h, Precision_grb_h, F1_score_grb_h, Balanced_Accuracy_grb_h = model_perf_v1(best_params_dict[4],X_train,X_test,y_train,y_test)
Accuracy on training set:
0.920742517467932
Accuracy on testing set:
0.916538243166518
Recall:
0.515018315018315
Specificity:
0.9665268150310106
Precision:
0.6570093457943925
Balanced Accuracy:
0.7407725650246628
F1 score:
0.5774127310061601
# 3-fold cross-validated grid search for the random forest, run outside
# grid_search_helper with its own parameter grid.
k=3
params = dict(
    min_samples_split = [5, 10, 20, 30, 40, 50],
    max_depth = list(range(1,15)),
    n_estimators = [1, 2, 5, 10, 20, 50]
)
params
rf_1 = RandomForestClassifier()
rf_gs = GridSearchCV(estimator=rf_1, param_grid=params, cv=k, n_jobs=-1 )
# Fitting the grid search over the random-forest parameter grid.
rf_gs.fit(X_train, y_train)
best_estimator_rf = rf_gs.best_estimator_
accuracy_rf_h, recall_rf_h, Specificity_rf_h, Precision_rf_h, F1_score_rf_h, Balanced_Accuracy_rf_h = model_perf_v1(best_estimator_rf,X_train,X_test,y_train,y_test)
Accuracy on training set:
0.9344735286960754
Accuracy on testing set:
0.9139427366371968
Recall:
0.3912087912087912
Specificity:
0.979022254651587
Precision:
0.6989528795811518
Balanced Accuracy:
0.6851155229301891
F1 score:
0.5016439643024895
# Collect the tuned (_h) models' metrics into one comparison table.
# NOTE: metric values are placed positionally; the 'f1-score' column receives
# F1_score_*_h and 'balanced' receives Balanced_Accuracy_*_h, matching the
# return order of model_perf_v1.
column_labels = ['classifier','accuracy','recall','specificity','precision','f1-score','balanced']
# Random forest best-model performance.
df1 = pd.DataFrame([['RandomForest_h',accuracy_rf_h, recall_rf_h, Specificity_rf_h, Precision_rf_h, F1_score_rf_h, Balanced_Accuracy_rf_h]],columns =column_labels )
# Decision tree best-model performance.
df2= pd.DataFrame([['DecisionTreeT_h',accuracy_dt_h, recall_dt_h, Specificity_dt_h, Precision_dt_h, F1_score_dt_h, Balanced_Accuracy_dt_h]],columns =column_labels )
# Logistic regression best-model performance.
df3 = pd.DataFrame([['LogisticRegression_h', accuracy_log_h, recall_log_h, Specificity_log_h, Precision_log_h, F1_score_log_h, Balanced_Accuracy_log_h]],columns =column_labels )
# Gaussian naive Bayes best-model performance.
df4 = pd.DataFrame([['GuassianNB_h',accuracy_gnb_h, recall_gnb_h, Specificity_gnb_h, Precision_gnb_h, F1_score_gnb_h, Balanced_Accuracy_gnb_h]],columns =column_labels )
# KNN best-model performance.
df5 = pd.DataFrame([['KNN_h',accuracy_knn_h, recall_knn_h, Specificity_knn_h, Precision_knn_h, F1_score_knn_h, Balanced_Accuracy_knn_h]],columns =column_labels )
# Gradient boosting best-model performance.
df6 = pd.DataFrame([['GradientBoosting_h',accuracy_grb_h, recall_grb_h, Specificity_grb_h, Precision_grb_h, F1_score_grb_h, Balanced_Accuracy_grb_h]],columns =column_labels )
combined_data_h = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)
combined_data_h
# Plotly is used for the interactive PCA loadings plot below.
import plotly.express as ex
# Target series (already 0/1 int-encoded), kept for the PCA visualisations.
Y = df['y']
def myplot(score,coeff,labels=None):
    """Draw a PCA biplot: scores scattered on PC1/PC2 plus loading arrows.

    Parameters
    ----------
    score : array of PC scores; column 0 = PC1, column 1 = PC2.
    coeff : (n_features, 2) array of PC loadings, one arrow per feature.
    labels : optional feature names for the arrows; defaults to Var1..VarN.

    Colours come from the module-level target ``y``.
    BUG FIX: the original colour dict was keyed by the strings '1'/'0',
    but ``y`` holds integers after the yes/no -> 1/0 encoding, so every
    lookup raised KeyError.  Colours are now chosen by comparing the
    label's string form.
    """
    xs = score[:,0]
    ys = score[:,1]
    n = coeff.shape[0]
    # Rescale scores to [-1, 1]-ish so they share axes with the loadings.
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    # pink = subscribed (y == 1), blue = not subscribed; accepts int or str labels.
    point_colors = ['pink' if str(label) == '1' else 'blue' for label in y]
    plt.scatter(xs * scalex, ys * scaley, c=point_colors)
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        if labels is None:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')
    plt.xlim(-1,1)
    plt.ylim(-1,1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
def PCA_transformation(dataset):
    """Project *dataset*'s features onto the first two principal components.

    Standardizes the features, fits a 2-component PCA, prints the leading
    components joined with the target, then prints and returns the PCA
    loadings.

    Parameters
    ----------
    dataset : DataFrame containing the feature columns plus a 'y' column.

    Returns
    -------
    (loadingdf, x) : loadings DataFrame (columns PC1, PC2, variable) and
        the standardized feature matrix.
    """
    x=dataset.drop(['y'], axis=1)            # features only
    x=StandardScaler().fit_transform(x)      # PCA requires standardized inputs
    pca=PCA(n_components=2)                  # first 2 leading principal components
    PC=pca.fit_transform(x)
    principalDF=pd.DataFrame(data=PC,columns=['pc1','pc2'])
    print("First 2 leading principal components")
    # BUG FIX: the original concatenated against the global ``data``, whose
    # index still had gaps from the dropped rows; pd.concat aligned on that
    # index and filled 'y' with NaN (visible as float output).  Use the
    # function's own argument and align by position instead.
    finalDf = pd.concat([principalDF, dataset[['y']].reset_index(drop=True)], axis = 1)
    print(finalDf.head())
    fx = dataset.drop(['y'], axis=1)
    # Loadings = eigenvector components scaled by the explained std-dev.
    PCloadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    components=fx.columns.tolist()
    loadingdf=pd.DataFrame(PCloadings,columns=('PC1','PC2'))
    loadingdf["variable"]=components
    print("PCA Loadings")
    print(loadingdf)
    return loadingdf,x
def cumluative_varienceGraph(n_components,x):
    """Plot cumulative explained variance over the first *n_components* PCs.

    Parameters
    ----------
    n_components : number of principal components to fit.
    x : standardized feature matrix (as returned by PCA_transformation).

    Returns
    -------
    DataFrame with 'Cumulative Variance Ratio' and 'Explained Variance
    Ratio' columns.  FIX: the original built this frame and then silently
    discarded it; returning it is backward-compatible because existing
    callers ignore the (previously None) return value.
    """
    pca_test = PCA(n_components=n_components)
    pca_test.fit(x)
    sns.set(style='whitegrid')
    plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    # Reference line at 46 components (marker chosen by the author —
    # presumably where cumulative variance flattens; confirm on the plot).
    plt.axvline(linewidth=4, color='r', linestyle = '--', x=46, ymin=0, ymax=1)
    # FIX: ``display(plt.show())`` displayed None and relied on the
    # IPython-only ``display`` name; plt.show() alone is sufficient.
    plt.show()
    evr = pca_test.explained_variance_ratio_
    cvr = np.cumsum(pca_test.explained_variance_ratio_)
    pca_df = pd.DataFrame()
    pca_df['Cumulative Variance Ratio'] = cvr
    pca_df['Explained Variance Ratio'] = evr
    return pca_df
def plotPCA(loadingdf):
    """Render an interactive plotly scatter of the PCA loadings.

    Each feature is plotted at (PC1, PC2) with its name as a text label;
    two RoyalBlue reference lines mark the axes.
    """
    fig = ex.scatter(
        x=loadingdf['PC1'],
        y=loadingdf['PC2'],
        text=loadingdf['variable'],
    )
    fig.update_layout(
        height=600,width=500,
        title_text='loadings plot')
    fig.update_traces(textposition='bottom center')
    # Vertical reference line at PC1 = 0.
    fig.add_shape(type="line",
        x0=-0, y0=-0.5,x1=-0,y1=2.5,
        line=dict(color="RoyalBlue",width=3)
    )
    # Horizontal reference line at PC2 = 0.
    fig.add_shape(type="line",
        x0=-1, y0=0,x1=1,y1=0,
        line=dict(color="RoyalBlue",width=3)
    )
    fig.show()
# Compute PCA loadings / standardized features, then plot cumulative variance
# for all 62 components.
loadingsDF,x_full = PCA_transformation(data)
cumluative_varienceGraph(62,x_full)
First 2 leading principal components
pc1 pc2 y
0 -2.138045 2.336784 0.0
1 -2.573533 2.560814 0.0
2 -1.726158 0.633776 0.0
3 -1.926238 1.036604 0.0
4 -1.903958 1.191214 0.0
PCA Loadings
PC1 PC2 variable
0 -0.033195 0.354352 age
1 0.043055 0.027895 duration
2 -0.175063 -0.041416 campaign
3 -0.481831 -0.244205 pdays
4 0.686791 0.302667 previous
.. ... ... ...
57 -0.015194 0.009378 day_of_week_tue
58 -0.025483 0.000716 day_of_week_wed
59 0.547140 0.200260 poutcome_failure
60 -0.722817 -0.299174 poutcome_nonexistent
61 0.455405 0.232916 poutcome_success
[62 rows x 3 columns]
# plotPCA generated newplot.png, which is displayed from disk here because the
# notebook-to-HTML export did not render the interactive plotly figure.
plotPCA(loadingsDF)
from IPython.display import Image
Image('newplot.png')
# Reduce the standardized features to 43 principal components
# (count presumably chosen from the cumulative-variance plot above — confirm).
from sklearn.decomposition import PCA
pca = PCA(n_components=43)
X_pca = pca.fit_transform(x_full)
# Wrap the transformed array in a DataFrame for inspection.
X_pca = pd.DataFrame(X_pca)
print(X_pca.head())
0 1 2 3 4 5 6 \
0 -2.138024 2.333661 -0.178764 1.232175 -0.565714 -1.957714 -0.430562
1 -2.573363 2.562840 0.477053 -0.011948 -0.822362 -0.549637 -0.748131
2 -1.725983 0.631196 1.443510 -0.568783 -0.713549 -0.335646 1.864104
3 -1.926228 1.031268 1.275879 0.243650 -0.832721 -1.750789 -0.343200
4 -1.903882 1.185243 1.552207 -0.222040 1.237438 -1.099605 0.211662
7 8 9 10 11 12 13 \
0 -0.354217 -0.298676 -0.040870 0.935283 -1.115147 0.058043 1.329108
1 -0.610015 0.757035 0.201822 3.275354 1.683457 -0.437876 -1.773976
2 -0.214848 -0.438932 -1.363563 2.604932 0.573183 -1.815302 -1.300672
3 -0.684802 -0.142264 0.121030 -0.438680 0.154452 -1.042602 -0.640914
4 -1.132211 -2.452797 1.872789 3.233771 0.422096 -1.415202 -0.977421
14 15 16 17 18 19 20 \
0 1.316705 -0.929502 0.346183 0.755764 2.395659 -1.099119 1.448558
1 -0.179287 -1.755962 -0.715037 0.119448 0.805998 1.242205 0.355523
2 0.354092 -1.425833 -0.128269 0.301787 1.125733 1.196142 0.089947
3 0.638126 -1.502631 -0.642433 -0.499252 1.280330 -0.153854 -1.349885
4 0.154783 -1.420064 -0.215410 0.349358 1.130065 1.372814 0.093610
21 22 23 24 25 26 27 \
0 -4.679535 -0.473311 -2.412075 -0.219518 -0.069511 0.933063 0.680618
1 -0.393300 0.497622 0.385291 0.110805 -0.350256 -0.152197 -0.329891
2 -0.966826 0.216064 0.358272 0.081927 -0.589658 -0.063251 -0.068180
3 -0.640381 -2.105181 1.085176 1.915219 0.780520 -1.245636 1.065065
4 -0.719223 0.243767 0.360673 0.147902 -0.713916 -0.119369 -0.133523
28 29 30 31 32 33 34 \
0 0.645854 -0.750203 0.146156 -0.103656 -0.426967 -0.018876 1.675217
1 -0.066906 -0.298971 -0.443508 0.229026 0.048583 0.112445 -0.002381
2 0.069069 -0.335224 -0.348311 0.355253 0.120469 -0.240086 -0.246059
3 1.310081 0.066720 -0.200815 0.106048 0.177156 -0.309706 0.091712
4 0.135728 -0.330433 -0.469265 0.416804 0.192251 -0.341322 -0.340192
35 36 37 38 39 40 41 \
0 0.087926 -0.531726 1.876080 -0.061449 0.095078 0.092252 0.095594
1 0.158067 -0.749383 0.303571 -0.464235 0.470304 0.721799 0.051214
2 0.155251 -0.637814 -0.044062 -0.468789 0.471991 0.934550 0.061912
3 -0.887240 -0.889648 0.959366 0.573654 0.438335 0.175863 0.508800
4 0.148241 -0.520848 0.296141 -0.529967 0.462967 0.748439 0.042194
42
0 0.684089
1 0.866502
2 -0.157105
3 0.968411
4 1.066174
# 80/20 split of the 43-component PCA features; Y is the target defined earlier.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, Y, test_size=0.20, random_state=2)
#Initialize the logistic regression model
logpca = LogisticRegression(max_iter=2500)
# Train the model
logpca.fit(X_train_pca, y_train_pca)
# model_perf_v1 (defined earlier in the file) prints and returns the metrics.
accuracy_log_pca,recall_log_pca,Specificity_log_pca,Precision_log_pca,F1_score_log_pca,Balanced_Accuracy_log_pca = model_perf_v1(logpca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
Accuracy on training set:
0.9100559678793041
Accuracy on testing set:
0.9093673965936739
Recall:
0.3940724478594951
Specificity:
0.9735941989328225
Precision:
0.6503623188405797
Balanced Accuracy:
0.6838333233961588
F1 score:
0.49077238550922764
# Drop variables not used in this reduced PCA feature set.
dff = df.drop(['default','month','day_of_week','campaign','poutcome','housing','contact'], axis=1)
# Encode the target as string labels '1'/'0' in one pass; the original pair
# of chained .replace(..., inplace=True) calls operated on a column view and
# triggers pandas chained-assignment warnings.
# NOTE(review): labels deliberately stay strings ('1'/'0'), matching the
# original behavior — downstream code may rely on that dtype.
dff['y'] = dff['y'].replace({'yes': '1', 'no': '0'})
y = dff['y']  # target variable (the earlier dead `y = df['y']` was removed)
dff = pd.get_dummies(dff, columns=['education', 'marital', 'job', 'loan'])
# PCA_transformation / cumluative_varienceGraph are defined earlier in the file.
loadingsDF, x_defined_components = PCA_transformation(dff)
cumluative_varienceGraph(35, x_defined_components)
First 2 leading principal components
pc1 pc2 y
0 -1.821741 2.619953 0.0
1 -1.487264 0.564340 0.0
2 -1.424279 -0.157760 0.0
3 -1.489013 0.390836 0.0
4 -1.464163 0.371313 0.0
PCA Loadings
PC1 PC2 variable
0 -0.058638 0.599318 age
1 0.044505 0.023868 duration
2 -0.448712 -0.083116 pdays
3 0.596350 0.107419 previous
4 -0.948292 -0.145303 emp.var.rate
5 -0.709212 -0.081345 cons.price.idx
6 -0.190438 0.011115 cons.conf.idx
7 -0.954212 -0.141824 euribor3m
8 -0.917032 -0.150804 nr.employed
9 -0.083587 0.461013 education_basic.4y
10 -0.065577 0.190013 education_basic.6y
11 -0.075936 0.236097 education_basic.9y
12 0.049192 -0.233243 education_high.school
13 0.000207 0.029297 education_illiterate
14 -0.032173 -0.026405 education_professional.course
15 0.119777 -0.377989 education_university.degree
16 0.012463 0.062940 education_unknown
17 -0.011166 -0.043244 marital_divorced
18 -0.204207 0.666216 marital_married
19 0.229694 -0.693449 marital_single
20 0.097670 -0.420789 job_admin.
21 -0.151475 0.443327 job_blue-collar
22 -0.029553 0.048791 job_entrepreneur
23 -0.059981 0.147195 job_housemaid
24 0.018021 0.009053 job_management
25 0.110915 0.372997 job_retired
26 -0.005183 -0.018690 job_self-employed
27 -0.023372 -0.085553 job_services
28 0.228766 -0.206625 job_student
29 -0.047399 -0.163806 job_technician
30 0.020344 0.019163 job_unemployed
31 -0.021061 0.054833 job_unknown
32 -0.001070 0.047842 loan_no
33 -0.006240 -0.002724 loan_unknown
34 0.003801 -0.049597 loan_yes
# Apply PCA: project the reduced feature matrix onto the first
# 29 principal components and wrap the result in a DataFrame.
from sklearn.decomposition import PCA
pca = PCA(n_components=29)
X_pca1 = pd.DataFrame(pca.fit_transform(x_defined_components))
print(X_pca1.head())
0 1 2 3 4 5 6 \
0 -1.822041 2.620139 0.248845 -1.152239 -0.432808 0.911235 2.080483
1 -1.487255 0.565214 -0.577864 -0.277660 -0.770206 3.311466 -1.290902
2 -1.424276 -0.157315 -0.956728 0.194343 -0.738681 2.971460 -1.594139
3 -1.488700 0.393532 -0.276738 -0.486458 -1.053578 -0.327105 -0.631790
4 -1.464135 0.370704 2.386862 1.963042 -0.428551 3.373218 -1.285200
7 8 9 10 11 12 13 \
0 0.283355 0.541003 -3.944060 -2.277833 -0.083390 1.972649 -1.338934
1 0.570610 0.199479 0.325447 -0.449042 0.319432 -0.726282 0.077655
2 0.843497 0.132507 0.186384 -0.647994 0.253652 -0.576761 0.060789
3 0.780065 -0.694965 -0.532012 1.284619 -2.913083 -0.305744 1.179061
4 0.624549 0.201324 0.371043 -0.416257 0.305803 -0.609206 -0.221397
14 15 16 17 18 19 20 \
0 -0.737365 1.672090 -1.421867 0.306011 0.690351 -1.426623 1.295551
1 0.161335 -0.001481 -0.114454 0.244162 -0.601994 -0.001202 0.739461
2 0.174422 0.053596 -0.146063 0.119468 -0.320701 -0.048202 0.710232
3 -0.204346 0.597329 1.592784 -0.034242 -0.674804 -1.166393 0.461975
4 1.001019 0.044358 0.048931 -0.068875 -0.276215 -0.117497 0.821672
21 22 23 24 25 26 27 \
0 -1.282089 0.156637 -0.087436 0.176007 0.500557 0.395101 -0.018440
1 0.115928 -0.788632 0.622612 0.139541 1.081307 0.334802 0.907746
2 0.389754 -0.856857 0.775943 0.155672 -0.256300 0.354367 0.215320
3 -0.274844 0.517141 -0.015656 0.593967 0.694204 0.634360 -1.545910
4 0.147001 -0.848068 0.592129 0.115595 1.011838 0.349642 0.870761
28
0 0.019883
1 -0.190355
2 -0.010968
3 0.135362
4 -0.150155
# 80/20 split of the 29-component PCA features against the same target Y.
X_train_pca1, X_test_pca1, y_train_pca1, y_test_pca1 = train_test_split(X_pca1, Y, test_size=0.20, random_state=2)
#Initialize the logistic regression model with pca model with 29 components
from sklearn.linear_model import LogisticRegression
logpca1 = LogisticRegression(max_iter=2500)
# Train the model
logpca1.fit(X_train_pca1, y_train_pca1)
# Print and collect the performance metrics via the shared helper.
accuracy_log_pca1,recall_log_pca1,Specificity_log_pca1,Precision_log_pca1,F1_score_log_pca1,Balanced_Accuracy_log_pca1 = model_perf_v1(logpca1,X_train_pca1,X_test_pca1,y_train_pca1,y_test_pca1)
Accuracy on training set:
0.9093563693880035
Accuracy on testing set:
0.9094890510948905
Recall:
0.38748627881448955
Specificity:
0.9745519222875907
Precision:
0.6549165120593692
Balanced Accuracy:
0.6810191005510401
F1 score:
0.486896551724138
# Apply PCA
# NOTE(review): this repeats the earlier 43-component projection and split so
# the following model cells can rerun independently of the earlier cells.
from sklearn.decomposition import PCA
pca = PCA(n_components=43)
X_pca = pca.fit_transform(x_full)
# Get the transformed dataset
X_pca = pd.DataFrame(X_pca)
print(X_pca.head())
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, Y, test_size=0.20,
random_state=2)
0 1 2 3 4 5 6 \
0 -2.138024 2.333661 -0.178764 1.232175 -0.565714 -1.957714 -0.430562
1 -2.573363 2.562840 0.477053 -0.011948 -0.822362 -0.549637 -0.748131
2 -1.725983 0.631196 1.443510 -0.568783 -0.713549 -0.335646 1.864104
3 -1.926228 1.031268 1.275879 0.243650 -0.832721 -1.750789 -0.343200
4 -1.903882 1.185243 1.552207 -0.222040 1.237438 -1.099605 0.211662
7 8 9 10 11 12 13 \
0 -0.354217 -0.298676 -0.040870 0.935283 -1.115147 0.058043 1.329108
1 -0.610015 0.757035 0.201822 3.275354 1.683457 -0.437876 -1.773976
2 -0.214848 -0.438932 -1.363563 2.604932 0.573183 -1.815302 -1.300672
3 -0.684802 -0.142264 0.121030 -0.438680 0.154452 -1.042602 -0.640914
4 -1.132211 -2.452797 1.872789 3.233771 0.422096 -1.415202 -0.977421
14 15 16 17 18 19 20 \
0 1.316705 -0.929502 0.346183 0.755764 2.395659 -1.099119 1.448558
1 -0.179287 -1.755962 -0.715037 0.119448 0.805998 1.242205 0.355523
2 0.354092 -1.425833 -0.128269 0.301787 1.125733 1.196142 0.089947
3 0.638126 -1.502631 -0.642433 -0.499252 1.280330 -0.153854 -1.349885
4 0.154783 -1.420064 -0.215410 0.349358 1.130065 1.372814 0.093610
21 22 23 24 25 26 27 \
0 -4.679535 -0.473311 -2.412075 -0.219518 -0.069511 0.933063 0.680618
1 -0.393300 0.497622 0.385291 0.110805 -0.350256 -0.152197 -0.329891
2 -0.966826 0.216064 0.358272 0.081927 -0.589658 -0.063251 -0.068180
3 -0.640381 -2.105181 1.085176 1.915219 0.780520 -1.245636 1.065065
4 -0.719223 0.243767 0.360673 0.147902 -0.713916 -0.119369 -0.133523
28 29 30 31 32 33 34 \
0 0.645854 -0.750203 0.146156 -0.103656 -0.426967 -0.018876 1.675217
1 -0.066906 -0.298971 -0.443508 0.229026 0.048583 0.112445 -0.002381
2 0.069069 -0.335224 -0.348311 0.355253 0.120469 -0.240086 -0.246059
3 1.310081 0.066720 -0.200815 0.106048 0.177156 -0.309706 0.091712
4 0.135728 -0.330433 -0.469265 0.416804 0.192251 -0.341322 -0.340192
35 36 37 38 39 40 41 \
0 0.087926 -0.531726 1.876080 -0.061449 0.095078 0.092252 0.095594
1 0.158067 -0.749383 0.303571 -0.464235 0.470304 0.721799 0.051214
2 0.155251 -0.637814 -0.044062 -0.468789 0.471991 0.934550 0.061912
3 -0.887240 -0.889648 0.959366 0.573654 0.438335 0.175863 0.508800
4 0.148241 -0.520848 0.296141 -0.529967 0.462967 0.748439 0.042194
42
0 0.684089
1 0.866502
2 -0.157105
3 0.968411
4 1.066174
# Decision tree on the 43-component PCA features (default hyperparameters).
dt_pca = DecisionTreeClassifier()
dt_pca.fit(X_train_pca, y_train_pca)
accuracy_dt_pca,recall_dt_pca,Specificity_dt_pca,Precision_dt_pca,F1_score_dt_pca,Balanced_Accuracy_dt_pca = model_perf_v1(dt_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
Accuracy on training set:
1.0
Accuracy on testing set:
0.8762773722627737
Recall:
0.4544456641053787
Specificity:
0.9288548365029415
Precision:
0.44325481798715205
Balanced Accuracy:
0.6916502503041602
F1 score:
0.44878048780487806
# Random forest on the 43-component PCA features (default hyperparameters).
rf_pca = RandomForestClassifier()
# fitting the model
rf_pca.fit(X_train_pca,y_train_pca)
accuracy_rf_pca,recall_rf_pca,Specificity_rf_pca,Precision_rf_pca,F1_score_rf_pca,Balanced_Accuracy_rf_pca = model_perf_v1(rf_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
Accuracy on training set:
1.0
Accuracy on testing set:
0.9038929440389294
Recall:
0.38309549945115257
Specificity:
0.968805582158982
Precision:
0.6048526863084922
Balanced Accuracy:
0.6759505408050673
F1 score:
0.4690860215053763
# K-nearest neighbors (k=3) on the 43-component PCA features.
knn_pca = KNeighborsClassifier(n_neighbors=3)
# Train the model
knn_pca.fit(X_train_pca, y_train_pca)
accuracy_knn_pca,recall_knn_pca,Specificity_knn_pca,Precision_knn_pca,F1_score_knn_pca,Balanced_Accuracy_knn_pca = model_perf_v1(knn_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
Accuracy on training set:
0.9302834894756054
Accuracy on testing set:
0.8918491484184915
Recall:
0.31174533479692645
Specificity:
0.9641537830072513
Precision:
0.5201465201465202
Balanced Accuracy:
0.6379495589020889
F1 score:
0.38984214138641043
# Gaussian naive Bayes on the 43-component PCA features.
gnb_pca = GaussianNB()
gnb_pca.fit(X_train_pca, y_train_pca)
accuracy_gnb_pca,recall_gnb_pca,Specificity_gnb_pca,Precision_gnb_pca,F1_score_gnb_pca,Balanced_Accuracy_gnb_pca = model_perf_v1(gnb_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
Accuracy on training set:
0.8587419394086871
Accuracy on testing set:
0.8586374695863747
Recall:
0.45334796926454446
Specificity:
0.9091530989191408
Precision:
0.38347260909935005
Balanced Accuracy:
0.6812505340918427
F1 score:
0.4154929577464789
# Gradient boosting on the 43-component PCA features (default hyperparameters).
grb_pca = GradientBoostingClassifier()
grb_pca.fit(X_train_pca, y_train_pca)
accuracy_grb_pca, recall_grb_pca, Specificity_grb_pca, Precision_grb_pca, F1_score_grb_pca, Balanced_Accuracy_grb_pca = model_perf_v1(grb_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
Accuracy on training set:
0.9157744251125441
Accuracy on testing set:
0.9075425790754258
Recall:
0.433589462129528
Specificity:
0.9666165002052264
Precision:
0.6181533646322379
Balanced Accuracy:
0.7001029811673771
F1 score:
0.5096774193548387
# Collect every PCA-based model's metrics into one comparison table.
column_labels = ['classifier','accuracy','recall','specificity','precision','f1-score','balanced']
# Random forest classifier best model performance
df1 = pd.DataFrame([['RandomForest_pca',accuracy_rf_pca, recall_rf_pca, Specificity_rf_pca, Precision_rf_pca, F1_score_rf_pca, Balanced_Accuracy_rf_pca]],columns =column_labels )
# decision tree classifier best model
df2= pd.DataFrame([['DecisionTree_pca',accuracy_dt_pca, recall_dt_pca, Specificity_dt_pca, Precision_dt_pca, F1_score_dt_pca, Balanced_Accuracy_dt_pca]],columns =column_labels )
# logistic regression best model
df3 = pd.DataFrame([['LogisticRegression_pca', accuracy_log_pca, recall_log_pca, Specificity_log_pca, Precision_log_pca, F1_score_log_pca, Balanced_Accuracy_log_pca]],columns =column_labels )
# Gaussian naive Bayes best model performance using PCA
df4 = pd.DataFrame([['GuassianNB_pca',accuracy_gnb_pca, recall_gnb_pca, Specificity_gnb_pca, Precision_gnb_pca, F1_score_gnb_pca, Balanced_Accuracy_gnb_pca]],columns =column_labels )
# KNN best model performance
df5 = pd.DataFrame([['KNN_pca',accuracy_knn_pca, recall_knn_pca, Specificity_knn_pca, Precision_knn_pca, F1_score_knn_pca, Balanced_Accuracy_knn_pca]],columns =column_labels )
# Gradient Boosting classifier best model performance
df6 = pd.DataFrame([['GradientBoosting_pca',accuracy_grb_pca,recall_grb_pca,Specificity_grb_pca,Precision_grb_pca,F1_score_grb_pca,Balanced_Accuracy_grb_pca]],columns =column_labels )
# Stack the one-row frames; each row keeps index 0.
combined_data_ppca = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)
combined_data_ppca
def feature_importance(model):
    """Fit *model* on the module-level X, y and print every feature's
    importance rounded to two decimals.

    Relies on the globals X (feature DataFrame) and y (target) defined
    earlier; *model* must expose feature_importances_ after fitting
    (tree-based estimators).
    """
    # fit the model
    model.fit(X, y)
    feature_list = list(X.columns)
    importances = model.feature_importances_
    # Plain loop instead of a side-effect list comprehension, and the local
    # no longer shadows the function's own name as in the original.
    for feature, importance in zip(feature_list, importances):
        print('variable:{:25} Importance:{}'.format(feature, round(importance, 2)))
# Rank features by random-forest importance on the full feature set.
rf_fi = RandomForestClassifier()
feature_importance(rf_fi)
variable:age Importance:0.08
variable:duration Importance:0.28
variable:campaign Importance:0.04
variable:pdays Importance:0.03
variable:previous Importance:0.01
variable:emp.var.rate Importance:0.02
variable:cons.price.idx Importance:0.02
variable:cons.conf.idx Importance:0.02
variable:euribor3m Importance:0.09
variable:nr.employed Importance:0.05
variable:job_admin. Importance:0.01
variable:job_blue-collar Importance:0.01
variable:job_entrepreneur Importance:0.0
variable:job_housemaid Importance:0.0
variable:job_management Importance:0.01
variable:job_retired Importance:0.01
variable:job_self-employed Importance:0.0
variable:job_services Importance:0.01
variable:job_student Importance:0.0
variable:job_technician Importance:0.01
variable:job_unemployed Importance:0.0
variable:job_unknown Importance:0.0
variable:marital_divorced Importance:0.01
variable:marital_married Importance:0.01
variable:marital_single Importance:0.01
variable:education_basic.4y Importance:0.01
variable:education_basic.6y Importance:0.0
variable:education_basic.9y Importance:0.01
variable:education_high.school Importance:0.01
variable:education_illiterate Importance:0.0
variable:education_professional.course Importance:0.01
variable:education_university.degree Importance:0.01
variable:education_unknown Importance:0.0
variable:default_no Importance:0.01
variable:default_unknown Importance:0.01
variable:default_yes Importance:0.0
variable:housing_no Importance:0.01
variable:housing_unknown Importance:0.0
variable:housing_yes Importance:0.01
variable:loan_no Importance:0.01
variable:loan_unknown Importance:0.0
variable:loan_yes Importance:0.01
variable:contact_cellular Importance:0.01
variable:contact_telephone Importance:0.01
variable:month_apr Importance:0.0
variable:month_aug Importance:0.0
variable:month_dec Importance:0.0
variable:month_jul Importance:0.0
variable:month_jun Importance:0.0
variable:month_mar Importance:0.0
variable:month_may Importance:0.0
variable:month_nov Importance:0.0
variable:month_oct Importance:0.01
variable:month_sep Importance:0.0
variable:day_of_week_fri Importance:0.01
variable:day_of_week_mon Importance:0.01
variable:day_of_week_thu Importance:0.01
variable:day_of_week_tue Importance:0.01
variable:day_of_week_wed Importance:0.01
variable:poutcome_failure Importance:0.01
variable:poutcome_nonexistent Importance:0.01
variable:poutcome_success Importance:0.02
# Drop the features the random forest ranked as least important (~0.0).
X_dropped_rf = X.drop(['job_entrepreneur','job_housemaid','job_self-employed','job_student','job_unemployed','job_unknown','education_basic.6y','education_illiterate','education_unknown','default_yes','housing_unknown','loan_unknown','month_apr','month_aug','month_dec','month_jul','month_jun','month_mar','month_nov','month_sep'],axis=1)
print(X_dropped_rf.shape)
(41096, 42)
# splitting the reduced data into train and test (70/30)
X_train_r, X_test_r, y_train, y_test = train_test_split(X_dropped_rf, y, test_size=0.30, random_state=2020)
rf_drop= RandomForestClassifier()
# fitting the random forest model on the reduced feature set
rf_drop.fit(X_train_r, y_train)
accuracy_rf_fi, recall_rf_fi, Specificity_rf_fi, Precision_rf_fi, F1_score_rf_fi, Balanced_Accuracy_rf_Fi = model_perf_v1(rf_drop,X_train_r,X_test_r,y_train,y_test)
Accuracy on training set:
0.9999304758925157
Accuracy on testing set:
0.9098872576851326
Recall:
0.45964125560538116
Specificity:
0.9646983895914839
Precision:
0.613160518444666
Balanced Accuracy:
0.7121698225984325
F1 score:
0.5254164886800512
# Rank features by decision-tree importance on the full feature set.
dt_fi = DecisionTreeClassifier()
feature_importance(dt_fi)
variable:age Importance:0.09
variable:duration Importance:0.34
variable:campaign Importance:0.04
variable:pdays Importance:0.02
variable:previous Importance:0.01
variable:emp.var.rate Importance:0.0
variable:cons.price.idx Importance:0.01
variable:cons.conf.idx Importance:0.02
variable:euribor3m Importance:0.08
variable:nr.employed Importance:0.15
variable:job_admin. Importance:0.01
variable:job_blue-collar Importance:0.01
variable:job_entrepreneur Importance:0.0
variable:job_housemaid Importance:0.0
variable:job_management Importance:0.01
variable:job_retired Importance:0.0
variable:job_self-employed Importance:0.0
variable:job_services Importance:0.01
variable:job_student Importance:0.0
variable:job_technician Importance:0.01
variable:job_unemployed Importance:0.0
variable:job_unknown Importance:0.0
variable:marital_divorced Importance:0.0
variable:marital_married Importance:0.01
variable:marital_single Importance:0.01
variable:education_basic.4y Importance:0.0
variable:education_basic.6y Importance:0.01
variable:education_basic.9y Importance:0.01
variable:education_high.school Importance:0.01
variable:education_illiterate Importance:0.0
variable:education_professional.course Importance:0.01
variable:education_university.degree Importance:0.01
variable:education_unknown Importance:0.0
variable:default_no Importance:0.0
variable:default_unknown Importance:0.0
variable:default_yes Importance:0.0
variable:housing_no Importance:0.01
variable:housing_unknown Importance:0.0
variable:housing_yes Importance:0.01
variable:loan_no Importance:0.01
variable:loan_unknown Importance:0.0
variable:loan_yes Importance:0.01
variable:contact_cellular Importance:0.01
variable:contact_telephone Importance:0.0
variable:month_apr Importance:0.0
variable:month_aug Importance:0.0
variable:month_dec Importance:0.0
variable:month_jul Importance:0.0
variable:month_jun Importance:0.0
variable:month_mar Importance:0.0
variable:month_may Importance:0.0
variable:month_nov Importance:0.0
variable:month_oct Importance:0.01
variable:month_sep Importance:0.0
variable:day_of_week_fri Importance:0.01
variable:day_of_week_mon Importance:0.01
variable:day_of_week_thu Importance:0.01
variable:day_of_week_tue Importance:0.01
variable:day_of_week_wed Importance:0.01
variable:poutcome_failure Importance:0.0
variable:poutcome_nonexistent Importance:0.0
variable:poutcome_success Importance:0.01
# Drop the features the decision tree ranked as least important (~0.0).
X_dropped_dt = X.drop(['emp.var.rate','job_entrepreneur','job_housemaid','job_retired','job_self-employed','job_services','job_student','job_unemployed','job_unknown','marital_divorced','education_illiterate','education_unknown','default_yes','default_unknown','default_no','housing_unknown','loan_no','loan_unknown','contact_cellular','month_apr','month_aug','month_dec','month_jun','month_jul','month_mar','month_may','month_nov','month_sep','poutcome_failure','poutcome_nonexistent'],axis=1)
print(X_dropped_dt.shape)
(41096, 32)
# splitting the reduced data into train and test (70/30)
X_train_dt, X_test_dt, y_train, y_test = train_test_split(X_dropped_dt, y, test_size=0.30, random_state=2020)
dt_drop = DecisionTreeClassifier()
# fitting the decision tree model on the reduced feature set
dt_drop.fit(X_train_dt, y_train)
accuracy_dt_fi, recall_dt_fi, Specificity_dt_fi, Precision_dt_fi, F1_score_dt_fi, Balanced_Accuracy_dt_fi = model_perf_v1(dt_drop,X_train_dt,X_test_dt,y_train,y_test)
Accuracy on training set:
1.0
Accuracy on testing set:
0.8850677264984995
Recall:
0.5089686098654709
Specificity:
0.9308525156946593
Precision:
0.472588480222068
Balanced Accuracy:
0.719910562780065
F1 score:
0.490104354084203
# Comparison table for the untuned (default-hyperparameter) models.
# NOTE(review): accuracy_rf, accuracy_dt, etc. come from cells earlier in the
# file (outside this view); GNB uses the PCA-based metrics here.
# Random forest classifier best model performance (regular/no tuning)
df1 = pd.DataFrame([['RF',accuracy_rf, recall_rf, Specificity_rf, Precision_rf, F1_score_rf, Balanced_Accuracy_rf]],columns =column_labels )
# decision tree classifier best model (regular/no tuning)
df2= pd.DataFrame([['DT',accuracy_dt, recall_dt, Specificity_dt, Precision_dt, F1_score_dt, Balanced_Accuracy_dt]],columns =column_labels )
# logistic regression best model (regular/no tuning)
df3 = pd.DataFrame([['Logistic', accuracy_log, recall_log, Specificity_log, Precision_log, F1_score_log, Balanced_Accuracy_log]],columns =column_labels )
# Gaussian naive Bayes best model performance using PCA
df4 = pd.DataFrame([['GNB',accuracy_gnb_pca, recall_gnb_pca, Specificity_gnb_pca, Precision_gnb_pca, F1_score_gnb_pca, Balanced_Accuracy_gnb_pca]],columns =column_labels )
# KNN best model performance (regular/no tuning)
df5 = pd.DataFrame([['KNN',accuracy_knn, recall_knn, Specificity_knn, Precision_knn, F1_score_knn, Balanced_Accuracy_knn]],columns =column_labels )
# Gradient Boosting classifier best model performance (regular/no tuning)
df6 = pd.DataFrame([['GradientBoosting',accuracy_grb,recall_grb,Specificity_grb,Precision_grb,F1_score_grb,Balanced_Accuracy_grb]],columns =column_labels )
combined_data_2 = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)
combined_data_2
def prec_auc(model, X_test, y_test):
    """Plot the precision-recall curve for *model* on the test set and print
    the area under that curve.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba.
    X_test : test features.
    y_test : binary test labels; positive class assumed to be 1 (possibly
        as the string '1' — converted via float below).
    """
    probs = model.predict_proba(X_test)
    # retrieve just the probabilities for the positive class
    pos_probs = probs[:, 1]
    # calculate precision-recall curve for the model
    precision, recall, thresholds = precision_recall_curve(y_test, pos_probs)
    auc_score_p = auc(recall, precision)
    # The no-skill baseline of a PR curve is the positive-class prevalence,
    # not 0.5 (the original drew a constant 0.5 line).
    no_skill = float(np.asarray(y_test, dtype=float).mean())
    plt.plot([0, 1], [no_skill, no_skill], linestyle='--')
    plt.plot(recall, precision, marker='.', label = model)
    print('\n')
    print('model : {} precision AUC: {:.3f}'.format(model, auc_score_p))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # show the plot
    plt.show()
# default rf model
prec_auc(rf,X_test,y_test)
# random forest model found by hyperparameter search (defined earlier)
prec_auc(best_estimator_rf, X_test, y_test)
model : RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False) precision AUC: 0.106
model : RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=14, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=20,
min_weight_fraction_leaf=0.0, n_estimators=50,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False) precision AUC: 0.105
# default decision tree model
prec_auc(dt,X_test,y_test)
# decision tree pipeline found by hyperparameter tuning (best_params_dict[0])
prec_auc(best_params_dict[0], X_test, y_test)
model : DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best') precision AUC: 0.156
model : Pipeline(memory=None,
steps=[('clf',
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=5,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=30,
min_weight_fraction_leaf=0.0,
presort='deprecated', random_state=None,
splitter='best'))],
verbose=False) precision AUC: 0.110
# default logistic regression model
prec_auc(log,X_test,y_test)
# logistic regression pipeline found by hyperparameter tuning (best_params_dict[1])
prec_auc(best_params_dict[1], X_test,y_test)
model : LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=2500,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False) precision AUC: 0.103
model : Pipeline(memory=None,
steps=[('clf',
LogisticRegression(C=7.742636826811269, class_weight=None,
dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs',
tol=0.0001, verbose=0, warm_start=False))],
verbose=False) precision AUC: 0.103
# default gradient boosting model
prec_auc(grb,X_test,y_test)
# gradient boosting pipeline found by hyperparameter tuning (best_params_dict[4])
prec_auc(best_params_dict[4], X_test,y_test)
model : GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='deprecated',
random_state=None, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False) precision AUC: 0.105
model : Pipeline(memory=None,
steps=[('clf',
GradientBoostingClassifier(ccp_alpha=0.0,
criterion='friedman_mse', init=None,
learning_rate=0.3, loss='deviance',
max_depth=3, max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=20,
n_iter_no_change=None,
presort='deprecated',
random_state=None, subsample=1.0,
tol=0.0001, validation_fraction=0.1,
verbose=0, warm_start=False))],
verbose=False) precision AUC: 0.105
# default knn
prec_auc(knn,X_test,y_test)
# knn pipeline found by hyperparameter tuning (best_params_dict[2])
prec_auc(best_params_dict[2], X_test,y_test)
model : KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform') precision AUC: 0.110
model : Pipeline(memory=None,
steps=[('clf',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=15, p=2,
weights='uniform'))],
verbose=False) precision AUC: 0.105
# default Gaussian naive Bayes
prec_auc(gnb,X_test,y_test)
# Gaussian naive Bayes pipeline found by hyperparameter tuning (best_params_dict[3])
prec_auc(best_params_dict[3], X_test,y_test)
model : GaussianNB(priors=None, var_smoothing=1e-09) precision AUC: 0.136
model : Pipeline(memory=None,
steps=[('clf', GaussianNB(priors=None, var_smoothing=1.0))],
verbose=False) precision AUC: 0.108
from imblearn.over_sampling import RandomOverSampler
/usr/local/lib/python3.7/dist-packages/sklearn/externals/six.py:31: FutureWarning:
The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning:
The sklearn.neighbors.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.
# before oversampling: show the class imbalance in the training labels
print("counts of label '1': {}".format(sum(y_train == 1)))
print("counts of label '0': {} \n".format(sum(y_train == 0)))
counts of label '1': 3289
counts of label '0': 25478
# Randomly oversample the minority class until both classes have equal counts.
upsample = RandomOverSampler(sampling_strategy='minority')
X_train_up, y_train_up = upsample.fit_resample(X_train, y_train)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning:
Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.
# Inspect the resampled training set size.
X_train_up.shape
y_train_up.shape
# after oversampling the minority class: both labels now have equal counts
print("counts of label '1': {}".format(sum(y_train_up == 1)))
print("counts of label '0': {}".format(sum(y_train_up == 0)))
counts of label '1': 25478
counts of label '0': 25478
# Random forest trained on the oversampled (balanced) training data,
# evaluated on the original (unbalanced) test set.
rf_b = RandomForestClassifier()
# fitting the model
rf_b.fit(X_train_up,y_train_up)
accuracy_rf_up, recall_rf_up, Specificity_rf_up, Precision_rf_up, F1_score_rf_up, Balanced_Accuracy_rf_up = model_perf_v1(rf_b,X_train_up, X_test, y_train_up, y_test)
Accuracy on training set:
1.0
Accuracy on testing set:
0.8777678643847838
Recall:
0.01868460388639761
Specificity:
0.982349194795742
Precision:
0.1141552511415525
Balanced Accuracy:
0.5005168993410698
F1 score:
0.03211303789338472
# Decision tree trained on the oversampled training data.
dt_b = DecisionTreeClassifier()
dt_b.fit(X_train_up, y_train_up)
accuracy_dt_up,recall_dt_up,Specificity_dt_up,Precision_dt_up,F1_score_dt_up,Balanced_Accuracy_dt_up = model_perf_v1(dt_b,X_train_up, X_test,y_train_up,y_test)
Accuracy on training set:
1.0
Accuracy on testing set:
0.7979560386081597
Recall:
0.11659192825112108
Specificity:
0.8809025566372487
Precision:
0.10648464163822526
Balanced Accuracy:
0.4987472424441849
F1 score:
0.1113093114520157
# Logistic regression trained on the oversampled training data.
log_b = LogisticRegression(max_iter=2500)
# fitting the model
log_b.fit(X_train_up, y_train_up)
accuracy_log_up, recall_log_up, Specificity_log_up, Precision_log_up, F1_score_log_up, Balanced_Accuracy_log_up = model_perf_v1(log_b,X_train_up,X_test,y_train_up,y_test)
Accuracy on training set:
0.5289661668890808
Accuracy on testing set:
0.5065293211128234
Recall:
0.5007473841554559
Specificity:
0.5072331907924665
Precision:
0.11008872822872165
Balanced Accuracy:
0.5039902874739612
F1 score:
0.1804956896551724
# Gaussian naive Bayes trained on the oversampled training data.
gnb_b = GaussianNB()
# fitting the model
gnb_b.fit(X_train_up, y_train_up)
accuracy_gnb_up, recall_gnb_up, Specificity_gnb_up, Precision_gnb_up, F1_score_gnb_up, Balanced_Accuracy_gnb_up = model_perf_v1(gnb_b,X_train_up,X_test,y_train_up,y_test)
Accuracy on training set:
0.5002943716147265
Accuracy on testing set:
0.10909238381052802
Recall:
1.0
Specificity:
0.0006368847238649804
Precision:
0.10858626846291186
Balanced Accuracy:
0.5003184423619325
F1 score:
0.19590043923865302
# KNN (k=3) trained on the oversampled training data.
knn_b = KNeighborsClassifier(n_neighbors=3)
# fitting the model
knn_b.fit(X_train_up, y_train_up)
accuracy_knn_up, recall_knn_up, Specificity_knn_up, Precision_knn_up, F1_score_knn_up, Balanced_Accuracy_knn_up = model_perf_v1(knn_b,X_train_up,X_test,y_train_up,y_test)
Accuracy on training set:
0.9433040270036894
Accuracy on testing set:
0.7257685132614162
Recall:
0.22944693572496264
Specificity:
0.786188699845328
Precision:
0.11554384644335718
Balanced Accuracy:
0.5078178177851453
F1 score:
0.1536921151439299
# Gradient boosting trained on the oversampled training data.
grb_b = GradientBoostingClassifier()
grb_b.fit(X_train_up, y_train_up)
accuracy_grb_up, recall_grb_up, Specificity_grb_up, Precision_grb_up, F1_score_grb_up, Balanced_Accuracy_grb_up = model_perf_v1(grb_b,X_train_up,X_test,y_train_up,y_test)
Accuracy on training set:
0.6042664259361017
Accuracy on testing set:
0.5420553167329062
Recall:
0.4170403587443946
Specificity:
0.5572741333818579
Precision:
0.10287610619469026
Balanced Accuracy:
0.48715724606312627
F1 score:
0.1650399290150843
# Comparison table for the models trained on balanced (oversampled) data.
column_labels = ['classifier','accuracy','recall','specificity','precision','f1-score','balanced']
df_1 = pd.DataFrame([['RandomForest_b',accuracy_rf_up,recall_rf_up,Specificity_rf_up,Precision_rf_up,F1_score_rf_up,Balanced_Accuracy_rf_up]],columns =column_labels )
df_2= pd.DataFrame([['DecisionTree_b',accuracy_dt_up,recall_dt_up,Specificity_dt_up,Precision_dt_up,F1_score_dt_up,Balanced_Accuracy_dt_up]],columns =column_labels )
df_3 = pd.DataFrame([['LogisticRegression_b',accuracy_log_up,recall_log_up,Specificity_log_up,Precision_log_up,F1_score_log_up,Balanced_Accuracy_log_up]],columns =column_labels )
df_4 = pd.DataFrame([['GuassianNB_b',accuracy_gnb_up,recall_gnb_up,Specificity_gnb_up,Precision_gnb_up,F1_score_gnb_up,Balanced_Accuracy_gnb_up]],columns =column_labels )
df_5 = pd.DataFrame([['KNN_b',accuracy_knn_up,recall_knn_up,Specificity_knn_up,Precision_knn_up,F1_score_knn_up,Balanced_Accuracy_knn_up]],columns =column_labels )
df_6 = pd.DataFrame([['GradientBoosting_b',accuracy_grb_up,recall_grb_up,Specificity_grb_up,Precision_grb_up,F1_score_grb_up,Balanced_Accuracy_grb_up]],columns =column_labels )
combined_data_b= pd.concat([df_1, df_2, df_3, df_4, df_5, df_6])
combined_data_b
def accuracy_graph(ac1, ac2, ac3, ac4, ac5, ac6, ac7, ac8, ac9, ac10, ac11, ac12):
    """Bar-plot test accuracy of each classifier on unbalanced vs. balanced data.

    Arguments come in (unbalanced, balanced) pairs, one pair per model, in the
    order RF, DT, LogReg, GNB, KNN, GRB — matching the call site below.
    Accuracies are fractions in [0, 1] and are rescaled to percent for display.
    """
    # BUG FIX: the original body ignored every parameter and read module-level
    # globals instead; it now uses the arguments it is given (same values at
    # this call site, so the plot is unchanged).
    Balanced_data = [float(a) * 100 for a in (ac2, ac4, ac6, ac8, ac10, ac12)]
    Unbalanced_data = [float(a) * 100 for a in (ac1, ac3, ac5, ac7, ac9, ac11)]
    index = ['RF', 'DT', 'Log', 'GNB', 'KNN', 'GRB']
    acc_pd = pd.DataFrame({'Balanced data': Balanced_data,
                           'Unbalanced data': Unbalanced_data}, index=index)
    ax = acc_pd.plot(kind='bar', ylim=(0, 100), xlabel='Classifiers',
                     ylabel='Performance measure', legend=True, figsize=(15, 10))
    ax.set_title('Accuracy Score of all classification model')

accuracy_graph(accuracy_rf, accuracy_rf_up, accuracy_dt, accuracy_dt_up,
               accuracy_log, accuracy_log_up, accuracy_gnb_pca, accuracy_gnb_up,
               accuracy_knn, accuracy_knn_up, accuracy_grb, accuracy_grb_up)
# Revert to the unscaled feature matrix: random forest, decision tree and
# logistic regression do not require standardised inputs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=555)
# Empty results table, filled below with one row per class-weighted model.
metric_cols = ['Model', 'Accuracy', 'Recall', 'Specificity', 'Precision',
               'Balanced Accuracy', 'F1 score']
best_cl_normal_bal = pd.DataFrame(columns=metric_cols)
# Random forest using class_weight='balanced': classes are re-weighted
# inversely to their frequency instead of resampling the data.
rf_bal = RandomForestClassifier(class_weight='balanced').fit(X_train, y_train)
rf_perf_bal = model_perf_to_lst(rf_bal, X_test, y_test)
# Append this model's metrics as the next row of the summary table.
best_cl_normal_bal.loc[len(best_cl_normal_bal)] = rf_perf_bal
# Decision tree with balanced class weights.
dt_bal = DecisionTreeClassifier(class_weight='balanced').fit(X_train, y_train)
dt_perf_bal = model_perf_to_lst(dt_bal, X_test, y_test)
# Append this model's metrics as the next row of the summary table.
best_cl_normal_bal.loc[len(best_cl_normal_bal)] = dt_perf_bal
# Logistic regression with balanced class weights; max_iter raised so the
# solver converges on this dataset.
log_bal = LogisticRegression(class_weight='balanced', max_iter=2500).fit(X_train, y_train)
log_perf_bal = model_perf_to_lst(log_bal, X_test, y_test)
# Append this model's metrics, then display the completed comparison table.
best_cl_normal_bal.loc[len(best_cl_normal_bal)] = log_perf_bal
best_cl_normal_bal
#%%shell
#jupyter nbconvert --to html Group6_DSC540_MarketingAnalytics_ProjectMilestone.ipynb