The Best Classifier - Python IBM project

Classification with Python

We load a dataset using the Pandas library, apply the following classification algorithms, and use accuracy evaluation methods to find the best one for this specific dataset.

Let's first load the required libraries:

import itertools import numpy as np import matplotlib.pyplot as plt from matplotlib.ticker import NullFormatter import pandas as pd import numpy as np import matplotlib.ticker as ticker from sklearn import preprocessing %matplotlib inline

Downloading the data set

!wget -O loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv

Loading the data from CSV file

# Read the training data from the downloaded CSV and preview the first rows.
df = pd.read_csv('loan_train.csv')
df.head()

# Report the DataFrame's dimensions as a (rows, columns) tuple.
df.shape

Convert to date-time object

# Parse both date columns into datetime values in a single pass.
df[['due_date', 'effective_date']] = df[['due_date', 'effective_date']].apply(pd.to_datetime)
df.head()

Data visualization and pre-processing

# Count how many loans fall into each loan_status category.
df['loan_status'].value_counts()

260 people have paid off the loan on time while 86 have gone into collection

pip install seaborn

!conda install -c anaconda seaborn -y

import seaborn as sns

# Ten evenly spaced bin edges spanning the observed principal amounts.
bins = np.linspace(df['Principal'].min(), df['Principal'].max(), 10)

# One histogram panel per gender, with bars coloured by loan status.
g = sns.FacetGrid(
    df,
    col="Gender",
    hue="loan_status",
    palette="Set1",
    col_wrap=2,
)
g.map(plt.hist, 'Principal', bins=bins, ec="k")

# Put the legend on the last panel only.
g.axes[-1].legend()
plt.show()

# Ten evenly spaced bin edges spanning the observed ages.
bins = np.linspace(df['age'].min(), df['age'].max(), 10)

# One histogram panel per gender, with bars coloured by loan status.
g = sns.FacetGrid(
    df,
    col="Gender",
    hue="loan_status",
    palette="Set1",
    col_wrap=2,
)
g.map(plt.hist, 'age', bins=bins, ec="k")

# Put the legend on the last panel only.
g.axes[-1].legend()
plt.show()

Pre-processing: Feature selection/extraction

Let's check on which day of the week people get the loan

# Derive the weekday number of the loan's effective date.
df['dayofweek'] = df['effective_date'].dt.dayofweek

# Ten evenly spaced bin edges spanning the observed weekday numbers.
bins = np.linspace(df['dayofweek'].min(), df['dayofweek'].max(), 10)

# One histogram panel per gender, with bars coloured by loan status.
g = sns.FacetGrid(
    df,
    col="Gender",
    hue="loan_status",
    palette="Set1",
    col_wrap=2,
)
g.map(plt.hist, 'dayofweek', bins=bins, ec="k")

# Put the legend on the last panel only.
g.axes[-1].legend()
plt.show()

We see that people who get the loan at the end of the week don't pay it off, so let's use feature binarization with a threshold: loans taken after day 3 (that is, on day 4 or later) are flagged as weekend loans.

# Binarize the weekday: 1 when dayofweek is greater than 3, otherwise 0.
df['weekend'] = (df['dayofweek'] > 3).astype(int)
df.head()

Convert Categorical features to numerical values

Let's look at gender

# Share of each loan_status outcome within each Gender group
# (normalize=True turns the counts into proportions).
df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)

Let's convert male to 0 and female to 1:

# Encode Gender numerically: male -> 0, female -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Gender']: the in-place form operates on a
# temporary column object, which pandas warns about and which no longer
# updates df once copy-on-write is in effect.
df['Gender'] = df['Gender'].replace(to_replace=['male', 'female'], value=[0, 1])
df.head()

One Hot Encoding

How about education?

# Share of each loan_status outcome within each education level
# (normalize=True turns the counts into proportions).
df.groupby(['education'])['loan_status'].value_counts(normalize=True)

Feature before One Hot Encoding

# Preview the selected columns before education is one-hot encoded.
df[['Principal','terms','age','Gender','education']].head()

Use the one-hot encoding technique to convert categorical variables to binary variables and append them to the feature DataFrame

# Assemble the model features: the numeric columns plus one indicator column
# per education level, then discard the 'Master or Above' indicator.
Feature = pd.concat(
    [
        df[['Principal', 'terms', 'age', 'Gender', 'weekend']],
        pd.get_dummies(df['education']),
    ],
    axis=1,
)
Feature = Feature.drop(columns=['Master or Above'])
Feature.head()

Feature selection

Defining the features of x

# The feature table is the model input; show its first five rows.
X = Feature
X.iloc[:5]

What are our labels?

# The target is the loan_status column as a NumPy array; show the first five.
y = df['loan_status'].to_numpy()
y[:5]

Normalize Data

# Standardize the features with StandardScaler, fitting and transforming
# in one call.
X = preprocessing.StandardScaler().fit_transform(X)
X[:5]

Classification

Now I will use the training set to build an accurate model, then use the test set to report the accuracy of the model. The following algorithms will be used:

- K Nearest Neighbor (KNN)
- Decision Tree
- Support Vector Machine
- Logistic Regression

K Nearest Neighbor(KNN)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Fit a K-nearest-neighbours classifier with K = 6 on the training split.
k = 6
neighK6 = KNeighborsClassifier(n_neighbors=k)
neighK6.fit(X_train, y_train)

# Predict the held-out rows and compare train and test accuracy.
yhat = neighK6.predict(X_test)
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neighK6.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# Sweep K from 1 to Ks - 1, recording test accuracy and its standard error.
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)

for n in range(1, Ks):
    # NOTE: `neigh` and `yhat` are rebound on every pass, so after the loop
    # they refer to the model and predictions for the last K tried (Ks - 1).
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
    # Standard error of the per-sample correctness indicator.
    std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

plt.plot(range(1, Ks), mean_acc, 'g')
plt.fill_between(range(1, Ks), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
# The shaded band is one standard error wide on each side, so label it as such
# (the legend previously claimed 3x while the band drawn was 1x).
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

# argmax is zero-based while K starts at 1, hence the +1.
print( "Best accuracy:", mean_acc.max(), "k=", mean_acc.argmax()+1)

Decision Tree

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Separate 70/30 split for the decision tree; random_state fixes the shuffle.
X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

# Modelling: entropy split criterion, tree depth capped at 6.
Tree = DecisionTreeClassifier(criterion="entropy", max_depth=6)
Tree

# Train the decision tree on its training split.
Tree.fit(X_trainset,y_trainset)

from six import StringIO

import six import sys sys.modules['sklearn.externals.six'] = six

!pip install pydotplus

predTree = Tree.predict(X_testset) print (predTree [0:5]) print (y_testset [0:5]) from sklearn import metrics import matplotlib.pyplot as plt print("Accuracy: ", metrics.accuracy_score(y_testset, predTree)) !conda install -c conda-forge pydotplus -y !conda install -c conda-forge python-graphviz -y from sklearn.externals.six import StringIO import pydotplus import matplotlib.image as mpimg from sklearn import tree %matplotlib inline dot_data = StringIO() filename = "loan.png" featureNames = df.columns[0:8] targetNames = df['loan_status'].unique().tolist() out=tree.export_graphviz(Tree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_trainset), filled=True, special_characters=True,rotate=False) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) graph.write_png(filename) img = mpimg.imread(filename) plt.figure(figsize=(100, 200)) plt.imshow(img,interpolation='nearest')

Support Vector Machine

import itertools

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

# --- Support Vector Machine -------------------------------------------------
# The model is trained on the X_train / y_train split created for KNN.
# The earlier numeric coercion of df['education'] and the integer cast of
# df['loan_status'] are removed: both columns hold category text, so the
# coercion filtered every row out of df, and neither step fed the models.
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

yhat = clf.predict(X_test)


def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts; rows are labelled as true classes and
            columns as predicted classes.
        classes: Tick labels, one per row/column of ``cm``.
        normalize: When true, each row is divided by its row sum before
            plotting and cells are printed with two decimals.
        title: Title text of the plot.
        cmap: Colormap passed to ``plt.imshow``.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


cnf_matrix = confusion_matrix(y_test, yhat, labels=['PAIDOFF','COLLECTION'])
np.set_printoptions(precision=2)

print (classification_report(y_test, yhat))

plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['PAIDOFF','COLLECTION'],normalize= False, title='Confusion matrix')

# Print both scores: as bare expressions only the last one in a cell is shown.
print("SVM weighted F1-score:", f1_score(y_test, yhat, average='weighted'))
print("SVM Jaccard score:", jaccard_score(y_test, yhat, pos_label = "PAIDOFF"))

# --- Logistic Regression ----------------------------------------------------
# X is already standardized and X_train / y_train already hold the
# test_size=0.2, random_state=4 split, so neither step is repeated here.
LogR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
LogR

# Predicted class labels and per-class probabilities for the held-out rows.
yhat = LogR.predict(X_test)
yhat_prob = LogR.predict_proba(X_test)
yhat_prob

from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss

# Print both metrics: as bare expressions only the last one in a cell is
# displayed, so the Jaccard score used to be computed and then dropped.
print("Logistic Regression Jaccard score:", jaccard_score(y_test, yhat, pos_label = "PAIDOFF"))
print("Logistic Regression log loss:", log_loss(y_test, yhat_prob))

Model Evaluation using Test set

from sklearn.metrics import jaccard_score from sklearn.metrics import f1_score from sklearn.metrics import log_loss

!wget -O loan_test.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv

Load Test set for evaluation

# Read the evaluation data from the downloaded CSV and preview the first rows.
test_df = pd.read_csv('loan_test.csv')
test_df.head()

# Build the evaluation matrix from the held-out test file. Previously this
# section re-scaled the training matrix X and scored every model against the
# training labels y, so no model was ever evaluated on loan_test.csv.
# NOTE(review): assumes loan_test.csv has the same columns and category
# values as loan_train.csv; confirm against the file.

# Repeat the training-time feature engineering without mutating test_df, so
# the cell can be re-run safely.
test_effective_date = pd.to_datetime(test_df['effective_date'])
test_Feature = test_df[['Principal', 'terms', 'age']].copy()
test_Feature['Gender'] = test_df['Gender'].replace(to_replace=['male', 'female'], value=[0, 1])
test_Feature['weekend'] = (test_effective_date.dt.dayofweek > 3).astype(int)
test_Feature = pd.concat([test_Feature, pd.get_dummies(test_df['education'])], axis=1)

# Match the training feature table exactly: same columns in the same order
# (this also drops 'Master or Above'); an education level absent from the
# test file becomes an all-zero indicator column.
test_Feature = test_Feature.reindex(columns=Feature.columns, fill_value=0)

# Scale with statistics learned from the training features, which is the
# scaling the models were fitted on.
test_X = preprocessing.StandardScaler().fit(Feature).transform(test_Feature)
Y = test_df['loan_status'].values

# KNN: use the model trained with K=6 (neighK6). `neigh` is whatever model the
# K sweep left behind (its last K), not the K=6 model.
yhatKNN = neighK6.predict(test_X)
KNNJaccard = jaccard_score(Y, yhatKNN, pos_label = "PAIDOFF")
KNNF1 = f1_score(Y, yhatKNN, average='weighted')
print("Avg F1-score: %.2f" % KNNF1 )
print("KNN Jaccard Score: %.2f" % KNNJaccard)

# Decision Tree
yhatDEC = Tree.predict(test_X)
DTJaccard = jaccard_score(Y, yhatDEC, pos_label = "PAIDOFF")
DTF1 = f1_score(Y, yhatDEC, average='weighted')
print("Avg F1-score: %.2f" % DTF1 )
print("Decision Tree Jaccard Score: %.2f" % DTJaccard)

# Support Vector Machine
yhatSVM = clf.predict(test_X)
SVMJaccard = jaccard_score(Y, yhatSVM, pos_label = "PAIDOFF")
SVMF1 = f1_score(Y, yhatSVM, average='weighted')
print("Avg F1-score: %.2f" % SVMF1)
print("SVM Jaccard score: %.2f" % SVMJaccard)

# Logistic Regression (log loss needs the predicted probabilities)
yhatLOG = LogR.predict(test_X)
yhatLOGproba = LogR.predict_proba(test_X)
LogRJaccard = jaccard_score(Y, yhatLOG, pos_label = "PAIDOFF")
LogRF1 = f1_score(Y, yhatLOG, average='weighted')
Logloss = log_loss(Y, yhatLOGproba)
print("LogLoss: : %.2f" % Logloss)
print("Avg F1-score: %.4f" % LogRF1)
print("LOG Jaccard score: %.4f" % LogRJaccard)

Classification with Python