import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
sklearn.__version__
pd.set_option('display.max_columns', 100)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
path = "sample_data/bank-additional-full.csv"
df = pd.read_csv(path,sep=";")
df.head()
df.shape
df.info()
# Check whether any column contains null (NaN) values.
df.isnull().sum()
# Finding the unique values of y
df['y'].unique()
# How many clients have subscribed to a term deposit?
df.y.value_counts()
sns.set(font_scale=1.5)
countplt=sns.countplot(x='y', data=df, palette ='Set1')
plt.show()
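# Illustrative: the class proportions; the imbalance visible here motivates
# the oversampling and class-weight experiments later in the notebook.
print(df.y.value_counts(normalize=True))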
# checking for duplicate values
df.duplicated().sum()
# Removing Duplicate Values
df = df.drop_duplicates()
# How many clients have subscribed to a term deposit (after removing duplicates)?
df.y.value_counts()
print("# Missing value 'job' variable: {0}".format(len(df.loc[df['job'] == "unknown"])))
print("# Missing value 'marital' variable: {0}".format(len(df.loc[df['marital'] == "unknown"])))
print("# Missing value 'education' variable: {0}".format(len(df.loc[df['education'] == "unknown"])))
print("# Missing value 'default' variable: {0}".format(len(df.loc[df['default'] == "unknown"])))
print("# Missing value 'housing' variable: {0}".format(len(df.loc[df['housing'] == "unknown"])))
print("# Missing value 'loan' variable: {0}".format(len(df.loc[df['loan'] == "unknown"])))
print("# Missing value 'contact' variable: {0}".format(len(df.loc[df['contact'] == "unknown"])))
print("# Missing value 'month' variable: {0}".format(len(df.loc[df['month'] == "unknown"])))
print("# Missing value 'day_of_week' variable: {0}".format(len(df.loc[df['day_of_week'] == "unknown"])))
print("# Missing value 'poutcome' variable: {0}".format(len(df.loc[df['poutcome'] == "unknown"])))
df[df['marital'] == "unknown"]
# Dropping rows with unknown marital status from our dataframe
df.drop(df[df['marital'] == "unknown"].index, inplace=True)
df.shape
df[df['default'] == "yes"].shape
df.education.value_counts()
df.job.value_counts()
df.describe()
# checking for outliers in dataset
plt.figure(figsize=(20,10))
df.boxplot()
plt.title("Boxplot of the dataframe", fontsize = 15)
plt.show()
# looking inside duration variable boxplot
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['duration'])
plt.show()
# looking inside age variable boxplot
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['age'])
plt.show()
sns.boxplot(x = 'y', y = 'age', data = df)
# looking inside campaign variable boxplot
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['campaign'])
plt.show()
sns.boxplot(x = 'y', y = 'campaign', data = df)
Q1_d = df['duration'].quantile(.25)
Q3_d = df['duration'].quantile(.75)
Q1_a = df['age'].quantile(.25)
Q3_a = df['age'].quantile(.75)
Q1_c = df['campaign'].quantile(.25)
Q3_c = df['campaign'].quantile(.75)
IQR_d = Q3_d - Q1_d
IQR_a = Q3_a - Q1_a
IQR_c = Q3_c - Q1_c
print(IQR_d)
print(IQR_a)
print(IQR_c)
lower_d = Q1_d - 1.5 * IQR_d
upper_d = Q3_d + 1.5 * IQR_d
lower_a = Q1_a - 1.5 * IQR_a
upper_a = Q3_a + 1.5 * IQR_a
lower_c = Q1_c - 1.5 * IQR_c
upper_c = Q3_c + 1.5 * IQR_c
print(lower_d,upper_d)
print(lower_a,upper_a)
print(lower_c,upper_c)
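# A small reusable helper (illustrative sketch) packaging the repeated
# Q1/Q3/IQR arithmetic above; `series` and `k` are hypothetical names.
def iqr_bounds(series, k=1.5):
    q1, q3 = series.quantile(.25), series.quantile(.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr
# e.g. iqr_bounds(df['duration']) reproduces (lower_d, upper_d)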
# New dataframe with outliers outside the assigned intervals removed.
# The filters are chained on df_out so all three conditions apply together
# (reassigning from df each time would keep only the last filter).
df_out = df[(df['duration'] >= lower_d) & (df['duration'] <= upper_d)]
df_out = df_out[(df_out['age'] >= lower_a) & (df_out['age'] <= upper_a)]
df_out = df_out[(df_out['campaign'] >= lower_c) & (df_out['campaign'] <= upper_c)]
df_out.describe()
# Calculating correlation
corr_matrix = df.select_dtypes(include=np.number).corr()  # restrict to numeric columns
print(corr_matrix)
# Creating correlation heat map.
plt.figure(figsize=(16,10))
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1,fmt='.2f')
plt.title("Correlation Heatmap", fontsize = 15)
plt.show()
# plotting the heatmap of only highly correlated variables with threshold value 0.9
plt.figure(figsize=(16,10))
sns.heatmap(corr_matrix[corr_matrix > 0.9], annot=True)
plt.title("Correlation Heatmap", fontsize = 15)
plt.show()
# changing yes to 1 and no to 0
df['y'] = (df['y']=='yes').astype(int)
df.y.value_counts()
sns.histplot(df['age'], color='green', kde=True)  # distplot is deprecated in recent seaborn
plt.title('Customer Age Distribution', fontsize = 18)
plt.xlabel('Age', fontsize = 10)
plt.ylabel('count')
plt.show()
plt.figure(figsize=(11,7))
df[df['y']==1]['age'].hist(alpha = 0.5, color = 'red', bins= 50, label='y=1')
df[df['y']==0]['age'].hist(alpha = 0.5, color = 'blue', bins= 50, label='y=0')
plt.legend()
plt.xlabel('age')
plt.figure(figsize=(11,7))
df[df['y']==1]['campaign'].hist(alpha = 0.5, color = 'red', bins= 40, label='y=1')
df[df['y']==0]['campaign'].hist(alpha =0.5, color = 'blue', bins= 40, label='y=0')
plt.legend()
plt.xlabel('campaign')
plt.figure(figsize=(15,7))
df[df['y']==1]['duration'].hist(alpha = 0.5, color = 'red', bins= 50, label='y=1')
df[df['y']==0]['duration'].hist(alpha = 0.5, color = 'blue', bins= 50, label='y=0')
plt.legend()
plt.xlabel('duration')
sns.jointplot(x='age', y='campaign', data=df, color = 'green', alpha=0.2)
plt.figure(figsize=(11,7))
sns.lmplot(y='campaign',x='age',hue = 'y', data=df,col='marital',palette='Set1')
plt.figure(figsize=(7,7))
sns.histplot(df['emp.var.rate'], kde=True)  # distplot is deprecated in recent seaborn
cat_list = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
for column in cat_list:
plt.figure(figsize=(21,9))
sns.countplot(x = column, data = df, hue = 'y', palette = 'Set1')
plt.title('Barplot of '+column)
plt.show()
for column in cat_list:
plt.figure(figsize=(21,9))
sns.countplot(x = column, data = df, palette = 'Set1')
plt.title('Barplot of '+column)
plt.show()
# Create a list containing the names of the categorical columns ('job', 'marital', etc.). Call this list cat_list.
cat_list = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
new_df = pd.get_dummies(df, columns = cat_list)
data = new_df.dropna()
data.info()
data.head()
data.shape
df[['duration', 'y']].boxplot(by=['y'], sym ='', figsize = [6, 6])
# split the datasets into training and test data
X = data.drop('y', axis=1)
y = data.y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 555, test_size= 0.30)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
#Standardization of the data
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
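# Sanity check (illustrative): after standardization each training feature
# should have mean ~0 and standard deviation ~1.
print(np.round(X_train.mean(axis=0), 2))
print(np.round(X_train.std(axis=0), 2))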
def model_perf_v1(model,X_train, X_val, y_train, y_val):
pred_dt = model.predict(X_val)
print("Accuracy on training set:")
pred = model.predict(X_train)
print(metrics.accuracy_score(y_true = y_train, y_pred = pred))
#
print("Accuracy on testing set:")
accuracy = (metrics.accuracy_score(y_true = y_val, y_pred = pred_dt))
print(accuracy)
#confusion matrix
confusion_matrix_ = pd.crosstab(index=y_val, columns=pred_dt.ravel(), rownames=['Expected'], colnames=['Predicted'])
#visualization
sns.heatmap(confusion_matrix_, annot=True, square=False, fmt='', cbar=False)
plt.title("Confusion Matrix", fontsize = 15)
plt.show()
#
print("Recall:")
recall = (metrics.recall_score(y_val,pred_dt))
#recall_no = (metrics.recall_score(y_val,pred_dt))
print(recall)
#print(recall_no)
# #
print("Specificity:")
tn, fp, fn, tp = confusion_matrix(y_val,pred_dt).ravel()
spec = tn/(tn+fp)
Specificity = (spec)
print(Specificity)
# #
print("Precision:")
Precision = (metrics.precision_score(y_val,pred_dt))
print(Precision)
# #
print("Balanced Accuracy:")
Balanced_Accuracy = (metrics.balanced_accuracy_score(y_val,pred_dt))
print(Balanced_Accuracy)
# #
print("F1 score:")
F1_score = (metrics.f1_score(y_val,pred_dt))
print(F1_score)
#classification_report
# print(metrics.classification_report(y_test, pred_dt))
return accuracy,recall,Specificity,Precision,F1_score,Balanced_Accuracy
def model_perf_to_lst(model,X_val, y_val):
lst = [str(model)]
pred_dt = model.predict(X_val)
#print("Accuracy on testing set:")
lst.append(metrics.accuracy_score(y_true = y_val, y_pred = pred_dt))
#print("Recall:")
lst.append(metrics.recall_score(y_val,pred_dt))
#
#print("Specificity:")
tn, fp, fn, tp = confusion_matrix(y_val,pred_dt).ravel()
spec = tn/(tn+fp)
lst.append(spec)
#
#print("Precision:")
lst.append(metrics.precision_score(y_val,pred_dt))
#
#print("Balanced Accuracy:")
lst.append(metrics.balanced_accuracy_score(y_val,pred_dt))
#
#print("F1 score:")
lst.append(metrics.f1_score(y_val,pred_dt))
return lst
best_cl_normal = pd.DataFrame(columns = ['Model','Accuracy','Recall', 'Specificity', 'Precision', 'Balanced Accuracy', 'F1 score'])
rf = RandomForestClassifier()
# fitting the model
rf.fit(X_train,y_train)
accuracy_rf, recall_rf, Specificity_rf, Precision_rf, F1_score_rf, Balanced_Accuracy_rf = model_perf_v1(rf,X_train,X_test,y_train,y_test)
rf_perf = model_perf_to_lst(rf, X_test, y_test)
best_cl_normal.loc[len(best_cl_normal)] = rf_perf
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
accuracy_dt,recall_dt,Specificity_dt,Precision_dt,F1_score_dt,Balanced_Accuracy_dt = model_perf_v1(dt,X_train,X_test,y_train,y_test)
dt_perf = model_perf_to_lst(dt, X_test, y_test)
best_cl_normal.loc[len(best_cl_normal)] = dt_perf
log = LogisticRegression(max_iter=2500)
# fitting the model
log.fit(X_train, y_train)
accuracy_log, recall_log, Specificity_log, Precision_log, F1_score_log, Balanced_Accuracy_log = model_perf_v1(log,X_train,X_test,y_train,y_test)
log_perf = model_perf_to_lst(log, X_test, y_test)
best_cl_normal.loc[len(best_cl_normal)] = log_perf
knn = KNeighborsClassifier(n_neighbors=3)
# fitting the model
knn.fit(X_train, y_train)
accuracy_knn, recall_knn, Specificity_knn, Precision_knn, F1_score_knn, Balanced_Accuracy_knn = model_perf_v1(knn,X_train,X_test,y_train,y_test)
knn_perf = model_perf_to_lst(knn, X_test, y_test)
best_cl_normal.loc[len(best_cl_normal)] = knn_perf
gnb = GaussianNB()
gnb.fit(X_train, y_train)
accuracy_gnb, recall_gnb, Specificity_gnb, Precision_gnb, F1_score_gnb, Balanced_Accuracy_gnb = model_perf_v1(gnb,X_train,X_test,y_train,y_test)
gnb_perf = model_perf_to_lst(gnb, X_test, y_test)
best_cl_normal.loc[len(best_cl_normal)] = gnb_perf
grb = GradientBoostingClassifier()
grb.fit(X_train, y_train)
accuracy_grb, recall_grb, Specificity_grb, Precision_grb, F1_score_grb, Balanced_Accuracy_grb = model_perf_v1(grb,X_train,X_test,y_train,y_test)
grb_perf = model_perf_to_lst(grb, X_test, y_test)
best_cl_normal.loc[len(best_cl_normal)] = grb_perf
best_cl_normal
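# cross_val_score and StratifiedKFold were imported above but not used; a
# minimal sketch of how the same models could be scored with stratified
# 5-fold cross-validation instead of a single train/test split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=555)
cv_scores = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=skf)
print("RF 5-fold CV accuracy: {:.3f} +/- {:.3f}".format(cv_scores.mean(), cv_scores.std()))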
# Helper function for grid search
def grid_search_helper():
pipeline1 = Pipeline((
('clf', DecisionTreeClassifier()),
))
pipeline2 = Pipeline((
('clf', LogisticRegression()),
))
pipeline3 = Pipeline((
('clf', KNeighborsClassifier()),
))
pipeline4 = Pipeline((
('clf', GaussianNB()),
))
pipeline5 = Pipeline((
('clf', GradientBoostingClassifier()),
))
parameters1 = {
'clf__min_samples_split': [5, 10, 20, 30, 40, 50],
'clf__max_depth': list(range(1,15))
}
parameters2 = {
'clf__C': np.logspace(0, 4, 10)
}
parameters3 = {
'clf__n_neighbors': [1,3,5,7,9,11,15]
}
parameters4 = {
'clf__var_smoothing': np.logspace(0,-9)
}
parameters5 ={
'clf__n_estimators':[1, 2, 5, 10, 20, 50],
'clf__learning_rate':[0.1, 0.3, 0.5, 0.7, 1]
}
pars = [parameters1, parameters2, parameters3, parameters4, parameters5]
pips = [pipeline1, pipeline2, pipeline3, pipeline4, pipeline5]
print("starting Gridsearch")
dict_best_params ={}
for i in range(len(pars)):
print(pars[i])
print(pips[i])
gs = GridSearchCV(pips[i], pars[i], cv= 3, n_jobs=-1)
gs.fit(X_train, y_train)
print("finished Gridsearch\n")
#print(gs.best_estimator_)
dict_best_params[i]= gs.best_estimator_
return dict_best_params
best_params_dict = grid_search_helper()
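# Illustrative: inspect the winning classifier from each grid search; each
# tuned estimator sits in the 'clf' step of the refitted best pipeline.
for i, est in best_params_dict.items():
    print(i, est.named_steps['clf'])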
accuracy_dt_h,recall_dt_h,Specificity_dt_h,Precision_dt_h,F1_score_dt_h,Balanced_Accuracy_dt_h = model_perf_v1(best_params_dict[0],X_train,X_test,y_train,y_test)
accuracy_log_h, recall_log_h, Specificity_log_h, Precision_log_h, F1_score_log_h, Balanced_Accuracy_log_h = model_perf_v1(best_params_dict[1],X_train,X_test,y_train,y_test)
accuracy_knn_h, recall_knn_h, Specificity_knn_h, Precision_knn_h, F1_score_knn_h, Balanced_Accuracy_knn_h = model_perf_v1(best_params_dict[2],X_train,X_test,y_train,y_test)
accuracy_gnb_h, recall_gnb_h, Specificity_gnb_h, Precision_gnb_h, F1_score_gnb_h, Balanced_Accuracy_gnb_h = model_perf_v1(best_params_dict[3],X_train,X_test,y_train,y_test)
accuracy_grb_h, recall_grb_h, Specificity_grb_h, Precision_grb_h, F1_score_grb_h, Balanced_Accuracy_grb_h = model_perf_v1(best_params_dict[4],X_train,X_test,y_train,y_test)
# 3-fold cross-validation for the random forest grid search
k=3
params = dict(
min_samples_split = [5, 10, 20, 30, 40, 50],
max_depth = list(range(1,15)),
n_estimators = [1, 2, 5, 10, 20, 50]
)
params
rf_1 = RandomForestClassifier()
rf_gs = GridSearchCV(estimator=rf_1, param_grid=params, cv=k, n_jobs=-1 )
# fitting the random forest model
rf_gs.fit(X_train, y_train)
best_estimator_rf = rf_gs.best_estimator_
accuracy_rf_h, recall_rf_h, Specificity_rf_h, Precision_rf_h, F1_score_rf_h, Balanced_Accuracy_rf_h = model_perf_v1(best_estimator_rf,X_train,X_test,y_train,y_test)
column_labels = ['classifier','accuracy','recall','specificity','precision','f1-score','balanced']
# Random forest classifier best model performance
df1 = pd.DataFrame([['RandomForest_h',accuracy_rf_h, recall_rf_h, Specificity_rf_h, Precision_rf_h, F1_score_rf_h, Balanced_Accuracy_rf_h]],columns =column_labels )
# decision tree classifier best model
df2= pd.DataFrame([['DecisionTree_h',accuracy_dt_h, recall_dt_h, Specificity_dt_h, Precision_dt_h, F1_score_dt_h, Balanced_Accuracy_dt_h]],columns =column_labels )
# logistic regression best model
df3 = pd.DataFrame([['LogisticRegression_h', accuracy_log_h, recall_log_h, Specificity_log_h, Precision_log_h, F1_score_log_h, Balanced_Accuracy_log_h]],columns =column_labels )
# Gaussian naive Bayes best model performance
df4 = pd.DataFrame([['GaussianNB_h',accuracy_gnb_h, recall_gnb_h, Specificity_gnb_h, Precision_gnb_h, F1_score_gnb_h, Balanced_Accuracy_gnb_h]],columns =column_labels )
# KNN best model performance
df5 = pd.DataFrame([['KNN_h',accuracy_knn_h, recall_knn_h, Specificity_knn_h, Precision_knn_h, F1_score_knn_h, Balanced_Accuracy_knn_h]],columns =column_labels )
# Gradient Boosting classifier best model performance
df6 = pd.DataFrame([['GradientBoosting_h',accuracy_grb_h, recall_grb_h, Specificity_grb_h, Precision_grb_h, F1_score_grb_h, Balanced_Accuracy_grb_h]],columns =column_labels )
combined_data_h = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)
combined_data_h
import plotly.express as ex
Y = df['y']
# print(dff)
def myplot(score, coeff, labels=None):
    xs = score[:, 0]
    ys = score[:, 1]
    n = coeff.shape[0]
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    colors = {1: 'pink', 0: 'blue'}  # y is encoded as 0/1 integers above, so the keys must be ints
    plt.scatter(xs * scalex, ys * scaley, c=y.map(colors))
    for i in range(n):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        if labels is None:
            plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, "Var" + str(i + 1), color='g', ha='center', va='center')
        else:
            plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i], color='g', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
def PCA_transformation(dataset):
    x = dataset.drop(['y'], axis=1)  # drop the target, keeping only the features
    x = StandardScaler().fit_transform(x)  # standardize the variables
    pca = PCA(n_components=2)  # first 2 leading principal components
    PC = pca.fit_transform(x)
    principalDF = pd.DataFrame(data=PC, columns=['pc1', 'pc2'])
    print("First 2 leading principal components")
    # use the passed-in dataset (not the global) and realign indices, since rows were dropped earlier
    finalDf = pd.concat([principalDF, dataset[['y']].reset_index(drop=True)], axis=1)
    print(finalDf.head())
    # myplot(PC[:,0:2],np.transpose(pca.components_[0:2, :]))
    # plt.show()
    fx = dataset.drop(['y'], axis=1)  # feature names for the loadings table
    PCloadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    components = fx.columns.tolist()
    loadingdf = pd.DataFrame(PCloadings, columns=('PC1', 'PC2'))
    loadingdf["variable"] = components
    print("PCA Loadings")
    print(loadingdf)
    return loadingdf, x
def cumulative_variance_graph(n_components, x):
    pca_test = PCA(n_components=n_components)
    pca_test.fit(x)
    sns.set(style='whitegrid')
    plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.axvline(linewidth=4, color='r', linestyle='--', x=46, ymin=0, ymax=1)
    plt.show()
    evr = pca_test.explained_variance_ratio_
    cvr = np.cumsum(pca_test.explained_variance_ratio_)
    pca_df = pd.DataFrame()
    pca_df['Cumulative Variance Ratio'] = cvr
    pca_df['Explained Variance Ratio'] = evr
    return pca_df  # return the table so callers can inspect the ratios
def plotPCA(loadingdf):
fig=ex.scatter(x=loadingdf['PC1'],y=loadingdf['PC2'],text=loadingdf['variable'],)
fig.update_layout(
height=600,width=500,
title_text='loadings plot')
fig.update_traces(textposition='bottom center')
fig.add_shape(type="line",
x0=-0, y0=-0.5,x1=-0,y1=2.5,
line=dict(color="RoyalBlue",width=3)
)
fig.add_shape(type="line",
x0=-1, y0=0,x1=1,y1=0,
line=dict(color="RoyalBlue",width=3)
)
fig.show()
loadingsDF,x_full = PCA_transformation(data)
cumulative_variance_graph(62, x_full)
# plotPCA generates the interactive loadings plot and saves it as newplot.png, which we display below.
# Converting this Colab notebook to HTML did not render the plotly figure, so we save it as an image and display it manually.
plotPCA(loadingsDF)
from IPython.display import Image
Image('newplot.png')
# Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=43)
X_pca = pca.fit_transform(x_full)
# Get the transformed dataset
X_pca = pd.DataFrame(X_pca)
print(X_pca.head())
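# Illustrative check: total variance retained by the 43 components kept above.
print("Variance retained: {:.2%}".format(pca.explained_variance_ratio_.sum()))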
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, Y, test_size=0.20, random_state=2)
#Initialize the logistic regression model
logpca = LogisticRegression(max_iter=2500)
# Train the model
logpca.fit(X_train_pca, y_train_pca)
accuracy_log_pca,recall_log_pca,Specificity_log_pca,Precision_log_pca,F1_score_log_pca,Balanced_Accuracy_log_pca = model_perf_v1(logpca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
# Dropping variables not being used in this PCA run
y= df['y']
dff = df.drop(['default','month','day_of_week','campaign','poutcome','housing','contact'], axis=1)  # drop the listed columns
# y was already recoded to 0/1 integers earlier, so no yes/no replacement is needed here
y = dff['y']  # assign the target variable
dff = pd.get_dummies(dff, columns = ['education', 'marital','job','loan'])
loadingsDF,x_defined_components = PCA_transformation(dff)
cumulative_variance_graph(35, x_defined_components)
# Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=29)
X_pca1 = pca.fit_transform(x_defined_components)
# Get the transformed dataset
X_pca1 = pd.DataFrame(X_pca1)
print(X_pca1.head())
X_train_pca1, X_test_pca1, y_train_pca1, y_test_pca1 = train_test_split(X_pca1, Y, test_size=0.20, random_state=2)
# Initialize the logistic regression model on the 29-component PCA features
from sklearn.linear_model import LogisticRegression
logpca1 = LogisticRegression(max_iter=2500)
# Train the model
logpca1.fit(X_train_pca1, y_train_pca1)
accuracy_log_pca1,recall_log_pca1,Specificity_log_pca1,Precision_log_pca1,F1_score_log_pca1,Balanced_Accuracy_log_pca1 = model_perf_v1(logpca1,X_train_pca1,X_test_pca1,y_train_pca1,y_test_pca1)
# Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=43)
X_pca = pca.fit_transform(x_full)
# Get the transformed dataset
X_pca = pd.DataFrame(X_pca)
print(X_pca.head())
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, Y, test_size=0.20,
random_state=2)
dt_pca = DecisionTreeClassifier()
dt_pca.fit(X_train_pca, y_train_pca)
accuracy_dt_pca,recall_dt_pca,Specificity_dt_pca,Precision_dt_pca,F1_score_dt_pca,Balanced_Accuracy_dt_pca = model_perf_v1(dt_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
rf_pca = RandomForestClassifier()
# fitting the model
rf_pca.fit(X_train_pca,y_train_pca)
accuracy_rf_pca,recall_rf_pca,Specificity_rf_pca,Precision_rf_pca,F1_score_rf_pca,Balanced_Accuracy_rf_pca = model_perf_v1(rf_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
knn_pca = KNeighborsClassifier(n_neighbors=3)
# Train the model
knn_pca.fit(X_train_pca, y_train_pca)
accuracy_knn_pca,recall_knn_pca,Specificity_knn_pca,Precision_knn_pca,F1_score_knn_pca,Balanced_Accuracy_knn_pca = model_perf_v1(knn_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
gnb_pca = GaussianNB()
gnb_pca.fit(X_train_pca, y_train_pca)
accuracy_gnb_pca,recall_gnb_pca,Specificity_gnb_pca,Precision_gnb_pca,F1_score_gnb_pca,Balanced_Accuracy_gnb_pca = model_perf_v1(gnb_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
grb_pca = GradientBoostingClassifier()
grb_pca.fit(X_train_pca, y_train_pca)
accuracy_grb_pca, recall_grb_pca, Specificity_grb_pca, Precision_grb_pca, F1_score_grb_pca, Balanced_Accuracy_grb_pca = model_perf_v1(grb_pca,X_train_pca,X_test_pca,y_train_pca,y_test_pca)
column_labels = ['classifier','accuracy','recall','specificity','precision','f1-score','balanced']
# Random forest classifier best model performance
df1 = pd.DataFrame([['RandomForest_pca',accuracy_rf_pca, recall_rf_pca, Specificity_rf_pca, Precision_rf_pca, F1_score_rf_pca, Balanced_Accuracy_rf_pca]],columns =column_labels )
# decision tree classifier best model
df2= pd.DataFrame([['DecisionTree_pca',accuracy_dt_pca, recall_dt_pca, Specificity_dt_pca, Precision_dt_pca, F1_score_dt_pca, Balanced_Accuracy_dt_pca]],columns =column_labels )
# logistic regression best model
df3 = pd.DataFrame([['LogisticRegression_pca', accuracy_log_pca, recall_log_pca, Specificity_log_pca, Precision_log_pca, F1_score_log_pca, Balanced_Accuracy_log_pca]],columns =column_labels )
# Gaussian naive Bayes best model performance using PCA
df4 = pd.DataFrame([['GaussianNB_pca',accuracy_gnb_pca, recall_gnb_pca, Specificity_gnb_pca, Precision_gnb_pca, F1_score_gnb_pca, Balanced_Accuracy_gnb_pca]],columns =column_labels )
# KNN best model performance
df5 = pd.DataFrame([['KNN_pca',accuracy_knn_pca, recall_knn_pca, Specificity_knn_pca, Precision_knn_pca, F1_score_knn_pca, Balanced_Accuracy_knn_pca]],columns =column_labels )
# Gradient Boosting classifier best model performance
df6 = pd.DataFrame([['GradientBoosting_pca',accuracy_grb_pca,recall_grb_pca,Specificity_grb_pca,Precision_grb_pca,F1_score_grb_pca,Balanced_Accuracy_grb_pca]],columns =column_labels )
combined_data_ppca = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)
combined_data_ppca
def feature_importance(model):
    # fit the model on the full feature matrix
    model.fit(X, y)
    feature_list = list(X.columns)
    # pair each feature with its rounded importance score
    importance = list(model.feature_importances_)
    feature_importance = [(feature, round(imp, 2)) for feature, imp in zip(feature_list, importance)]
    for pair in feature_importance:
        print('variable:{:25} Importance:{}'.format(*pair))
rf_fi = RandomForestClassifier()
feature_importance(rf_fi)
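# A hedged sketch: the same importances sorted in descending order, which
# makes the cut-off used when dropping features below easier to see.
sorted_importances = sorted(zip(X.columns, rf_fi.feature_importances_),
                            key=lambda pair: pair[1], reverse=True)
for name, imp in sorted_importances[:15]:
    print('variable:{:25} Importance:{:.3f}'.format(name, imp))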
# dropping less important features
X_dropped_rf = X.drop(['job_entrepreneur','job_housemaid','job_self-employed','job_student','job_unemployed','job_unknown','education_basic.6y','education_illiterate','education_unknown','default_yes','housing_unknown','loan_unknown','month_apr','month_aug','month_dec','month_jul','month_jun','month_mar','month_nov','month_sep'],axis=1)
print(X_dropped_rf.shape)
# splitting the data into train and test
X_train_r, X_test_r, y_train, y_test = train_test_split(X_dropped_rf, y, test_size=0.30, random_state=2020)
rf_drop= RandomForestClassifier()
# fitting the random forest model
rf_drop.fit(X_train_r, y_train)
accuracy_rf_fi, recall_rf_fi, Specificity_rf_fi, Precision_rf_fi, F1_score_rf_fi, Balanced_Accuracy_rf_Fi = model_perf_v1(rf_drop,X_train_r,X_test_r,y_train,y_test)
dt_fi = DecisionTreeClassifier()
feature_importance(dt_fi)
# dropping less important features
X_dropped_dt = X.drop(['emp.var.rate','job_entrepreneur','job_housemaid','job_retired','job_self-employed','job_services','job_student','job_unemployed','job_unknown','marital_divorced','education_illiterate','education_unknown','default_yes','default_unknown','default_no','housing_unknown','loan_no','loan_unknown','contact_cellular','month_apr','month_aug','month_dec','month_jun','month_jul','month_mar','month_may','month_nov','month_sep','poutcome_failure','poutcome_nonexistent'],axis=1)
print(X_dropped_dt.shape)
# splitting the data into train and test
X_train_dt, X_test_dt, y_train, y_test = train_test_split(X_dropped_dt, y, test_size=0.30, random_state=2020)
dt_drop = DecisionTreeClassifier()
# fitting the model
dt_drop.fit(X_train_dt, y_train)
accuracy_dt_fi, recall_dt_fi, Specificity_dt_fi, Precision_dt_fi, F1_score_dt_fi, Balanced_Accuracy_dt_fi = model_perf_v1(dt_drop,X_train_dt,X_test_dt,y_train,y_test)
# Random forest classifier best model performance (default, no tuning)
df1 = pd.DataFrame([['RF',accuracy_rf, recall_rf, Specificity_rf, Precision_rf, F1_score_rf, Balanced_Accuracy_rf]],columns =column_labels )
# decision tree classifier best model (default, no tuning)
df2= pd.DataFrame([['DT',accuracy_dt, recall_dt, Specificity_dt, Precision_dt, F1_score_dt, Balanced_Accuracy_dt]],columns =column_labels )
# logistic regression best model (default, no tuning)
df3 = pd.DataFrame([['Logistic', accuracy_log, recall_log, Specificity_log, Precision_log, F1_score_log, Balanced_Accuracy_log]],columns =column_labels )
# Gaussian naive Bayes best model performance (using the PCA variant)
df4 = pd.DataFrame([['GNB',accuracy_gnb_pca, recall_gnb_pca, Specificity_gnb_pca, Precision_gnb_pca, F1_score_gnb_pca, Balanced_Accuracy_gnb_pca]],columns =column_labels )
# KNN best model performance (default, no tuning)
df5 = pd.DataFrame([['KNN',accuracy_knn, recall_knn, Specificity_knn, Precision_knn, F1_score_knn, Balanced_Accuracy_knn]],columns =column_labels )
# Gradient Boosting classifier best model performance (default, no tuning)
df6 = pd.DataFrame([['GradientBoosting',accuracy_grb,recall_grb,Specificity_grb,Precision_grb,F1_score_grb,Balanced_Accuracy_grb]],columns =column_labels )
combined_data_2 = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)
combined_data_2
def prec_auc(model, X_test, y_test):
    probs = model.predict_proba(X_test)
    # retrieve just the probabilities for the positive class
    pos_probs = probs[:, 1]
    # calculate the precision-recall curve for the model
    precision, recall, thresholds = precision_recall_curve(y_test, pos_probs)
    auc_score_p = auc(recall, precision)
    # the no-skill baseline of a precision-recall plot is the positive-class
    # prevalence, not 0.5 as on a ROC plot
    no_skill = y_test.sum() / len(y_test)
    plt.plot([0, 1], [no_skill, no_skill], linestyle='--')
    plt.plot(recall, precision, marker='.', label=model)
    print('\n')
    print('model : {} precision AUC: {:.3f}'.format(model, auc_score_p))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # show the legend
    #plt.legend()
    # show the plot
    plt.show()
# default rf model
prec_auc(rf,X_test,y_test)
# random forest model using hyperparameter tuning
prec_auc(best_estimator_rf, X_test, y_test)
# default decision tree model
prec_auc(dt,X_test,y_test)
# decision tree model using hyperparameter tuning
prec_auc(best_params_dict[0], X_test, y_test)
# default logistic regression model
prec_auc(log,X_test,y_test)
# logistic regression model using hyperparameter tuning
prec_auc(best_params_dict[1], X_test,y_test)
# default gradient boosting model
prec_auc(grb,X_test,y_test)
# gradient boosting model using hyperparameter tuning
prec_auc(best_params_dict[4], X_test,y_test)
# default knn
prec_auc(knn,X_test,y_test)
# knn model using hyperparameter tuning
prec_auc(best_params_dict[2], X_test,y_test)
# default Gaussian NB
prec_auc(gnb,X_test,y_test)
# Gaussian naive Bayes model using hyperparameter tuning
prec_auc(best_params_dict[3], X_test,y_test)
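# roc_curve and roc_auc_score were imported above but never used; a minimal
# sketch (assuming the default fitted models and the scaled X_test) that
# compares their ROC curves alongside the precision-recall plots above.
plt.figure(figsize=(8, 6))
for name, model in [('RF', rf), ('Logistic', log), ('GradientBoosting', grb)]:
    pos_probs = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, pos_probs)
    plt.plot(fpr, tpr, label='{} (AUC = {:.3f})'.format(name, roc_auc_score(y_test, pos_probs)))
plt.plot([0, 1], [0, 1], linestyle='--', label='no skill')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()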
from imblearn.over_sampling import RandomOverSampler
# before oversampling
print("counts of label '1': {}".format(sum(y_train == 1)))
print("counts of label '0': {} \n".format(sum(y_train == 0)))
upsample = RandomOverSampler(sampling_strategy='minority')
X_train_up, y_train_up = upsample.fit_resample(X_train, y_train)
X_train_up.shape
y_train_up.shape
# after oversampling the minority class
print("counts of label '1': {}".format(sum(y_train_up == 1)))
print("counts of label '0': {}".format(sum(y_train_up == 0)))
rf_b = RandomForestClassifier()
# fitting the model
rf_b.fit(X_train_up,y_train_up)
accuracy_rf_up, recall_rf_up, Specificity_rf_up, Precision_rf_up, F1_score_rf_up, Balanced_Accuracy_rf_up = model_perf_v1(rf_b,X_train_up, X_test, y_train_up, y_test)
dt_b = DecisionTreeClassifier()
dt_b.fit(X_train_up, y_train_up)
accuracy_dt_up,recall_dt_up,Specificity_dt_up,Precision_dt_up,F1_score_dt_up,Balanced_Accuracy_dt_up = model_perf_v1(dt_b,X_train_up, X_test,y_train_up,y_test)
log_b = LogisticRegression(max_iter=2500)
# fitting the model
log_b.fit(X_train_up, y_train_up)
accuracy_log_up, recall_log_up, Specificity_log_up, Precision_log_up, F1_score_log_up, Balanced_Accuracy_log_up = model_perf_v1(log_b,X_train_up,X_test,y_train_up,y_test)
gnb_b = GaussianNB()
# fitting the model
gnb_b.fit(X_train_up, y_train_up)
accuracy_gnb_up, recall_gnb_up, Specificity_gnb_up, Precision_gnb_up, F1_score_gnb_up, Balanced_Accuracy_gnb_up = model_perf_v1(gnb_b,X_train_up,X_test,y_train_up,y_test)
knn_b = KNeighborsClassifier(n_neighbors=3)
# fitting the model
knn_b.fit(X_train_up, y_train_up)
accuracy_knn_up, recall_knn_up, Specificity_knn_up, Precision_knn_up, F1_score_knn_up, Balanced_Accuracy_knn_up = model_perf_v1(knn_b,X_train_up,X_test,y_train_up,y_test)
grb_b = GradientBoostingClassifier()
grb_b.fit(X_train_up, y_train_up)
accuracy_grb_up, recall_grb_up, Specificity_grb_up, Precision_grb_up, F1_score_grb_up, Balanced_Accuracy_grb_up = model_perf_v1(grb_b,X_train_up,X_test,y_train_up,y_test)
column_labels = ['classifier','accuracy','recall','specificity','precision','f1-score','balanced']
df_1 = pd.DataFrame([['RandomForest_b',accuracy_rf_up,recall_rf_up,Specificity_rf_up,Precision_rf_up,F1_score_rf_up,Balanced_Accuracy_rf_up]],columns =column_labels )
df_2= pd.DataFrame([['DecisionTree_b',accuracy_dt_up,recall_dt_up,Specificity_dt_up,Precision_dt_up,F1_score_dt_up,Balanced_Accuracy_dt_up]],columns =column_labels )
df_3 = pd.DataFrame([['LogisticRegression_b',accuracy_log_up,recall_log_up,Specificity_log_up,Precision_log_up,F1_score_log_up,Balanced_Accuracy_log_up]],columns =column_labels )
df_4 = pd.DataFrame([['GaussianNB_b',accuracy_gnb_up,recall_gnb_up,Specificity_gnb_up,Precision_gnb_up,F1_score_gnb_up,Balanced_Accuracy_gnb_up]],columns =column_labels )
df_5 = pd.DataFrame([['KNN_b',accuracy_knn_up,recall_knn_up,Specificity_knn_up,Precision_knn_up,F1_score_knn_up,Balanced_Accuracy_knn_up]],columns =column_labels )
df_6 = pd.DataFrame([['GradientBoosting_b',accuracy_grb_up,recall_grb_up,Specificity_grb_up,Precision_grb_up,F1_score_grb_up,Balanced_Accuracy_grb_up]],columns =column_labels )
combined_data_b= pd.concat([df_1, df_2, df_3, df_4, df_5, df_6])
combined_data_b
def accuracy_graph(ac_rf, ac_rf_up, ac_dt, ac_dt_up, ac_log, ac_log_up,
                   ac_gnb, ac_gnb_up, ac_knn, ac_knn_up, ac_grb, ac_grb_up):
    # use the passed-in accuracies (the original read globals and ignored its 12 arguments)
    balanced_data = [float(a) * 100 for a in (ac_rf_up, ac_dt_up, ac_log_up, ac_gnb_up, ac_knn_up, ac_grb_up)]
    unbalanced_data = [float(a) * 100 for a in (ac_rf, ac_dt, ac_log, ac_gnb, ac_knn, ac_grb)]
    index = ['RF', 'DT', 'Log', 'GNB', 'KNN', 'GRB']
    acc_pd = pd.DataFrame({'Balanced data': balanced_data, 'Unbalanced data': unbalanced_data}, index=index)
    ax = acc_pd.plot(kind='bar', ylim=(0, 100), xlabel='Classifiers', ylabel='Performance measure', legend=True, figsize=(15, 10))
    ax.set_title('Accuracy scores of all classification models')
accuracy_graph(accuracy_rf, accuracy_rf_up, accuracy_dt, accuracy_dt_up, accuracy_log, accuracy_log_up, accuracy_gnb_pca, accuracy_gnb_up, accuracy_knn, accuracy_knn_up, accuracy_grb,accuracy_grb_up)
# Reverting to unscaled features, since random forest, decision tree, and logistic regression do not require feature scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 555, test_size= 0.30)
best_cl_normal_bal = pd.DataFrame(columns = ['Model','Accuracy','Recall', 'Specificity', 'Precision', 'Balanced Accuracy', 'F1 score'])
#random forest with weights
rf_bal = RandomForestClassifier(class_weight='balanced')
# fitting the model
rf_bal.fit(X_train,y_train)
rf_perf_bal = model_perf_to_lst(rf_bal, X_test, y_test)
best_cl_normal_bal.loc[len(best_cl_normal_bal)] = rf_perf_bal
#Decision tree with weights
dt_bal = DecisionTreeClassifier(class_weight='balanced')
dt_bal.fit(X_train, y_train)
dt_perf_bal = model_perf_to_lst(dt_bal, X_test, y_test)
best_cl_normal_bal.loc[len(best_cl_normal_bal)] = dt_perf_bal
#logistic regression
log_bal = LogisticRegression(max_iter=2500, class_weight='balanced')
# fitting the model
log_bal.fit(X_train, y_train)
log_perf_bal = model_perf_to_lst(log_bal, X_test, y_test)
best_cl_normal_bal.loc[len(best_cl_normal_bal)] = log_perf_bal
best_cl_normal_bal
#%%shell
#jupyter nbconvert --to html Group6_DSC540_MarketingAnalytics_ProjectMilestone.ipynb