# Naive Bayes Classifier with 3 different datasets
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
# Use a dark theme for all figures in this script.
plt.style.use('dark_background')
# Linearly separable dataset: one 2-D point cloud per file, one file per class.
class1 = pd.read_csv('class1.txt',header = None, names = ['col1', 'col2'])
class2 = pd.read_csv('class2.txt',header = None,names = ['col1', 'col2'])
# Build the label vector: 0 for every class1 row, then 1 for every class2 row.
classes = [0]*class1.shape[0]
classes.extend([1]*class2.shape[0])
classes=np.array(classes)
#classes = pd.DataFrame(np.array(classes))
# Stack the two classes row-wise; note the original row indices are kept (duplicates).
df = pd.concat([class1, class2], axis=0)
# 80/20 train/test split with a fixed seed for reproducibility.
X_train,X_test,Y_train, Y_test= train_test_split(df,classes, test_size=0.2, random_state=52)
def class_data(X, y, cls):
    """Return the rows of X whose label in y equals cls.

    Parameters
    ----------
    X : pd.DataFrame
        Feature rows, positionally aligned with y.
    y : array-like
        Class label for each row of X.
    cls : int
        Class whose rows to select.

    Returns
    -------
    pd.DataFrame
        The matching rows. Unlike the previous row-by-row loop, an empty
        selection still keeps X's column names.
    """
    # Boolean-mask selection is vectorized (one C-level pass) and positional,
    # so it is safe even with the duplicate indices left behind by pd.concat.
    mask = np.asarray(y) == cls
    return X[mask]
def plotting(X_train, X_test, y_train, y_test):
    """Scatter-plot the train and test points of both classes on one figure."""
    # One (data, labels, tag) triple per split; the outer loop walks the
    # two classes so the draw order matches: train-0, test-0, train-1, test-1.
    splits = [(X_train, y_train, 'Train'), (X_test, y_test, 'Test')]
    for cls in (0, 1):
        for data, labels, tag in splits:
            subset = class_data(data, labels, cls)
            plt.scatter(subset['col1'], subset['col2'], label=tag + 'Class-' + str(cls))
    plt.legend()
    plt.show()
# Visualize the train/test scatter for the linearly separable dataset.
plotting(X_train,X_test,Y_train, Y_test)
def priors(y_train):
    """P(c) - Prior class probability for each class: count(c) / total.

    Returns a dict mapping each distinct label in y_train to its
    empirical frequency.
    """
    # np.unique counts every class in a single vectorized pass.
    labels, counts = np.unique(y_train, return_counts=True)
    total = y_train.shape[0]
    return {label: count / total for label, count in zip(labels, counts)}
def mean_cov_forClass(data, cls):
    """Return [mean vector, covariance matrix] for one class's data.

    Parameters
    ----------
    data : pd.DataFrame
        Feature rows belonging to a single class.
    cls : int
        Class label; unused, kept for interface compatibility with callers.

    Returns
    -------
    list
        [per-column mean (pd.Series), sample covariance (pd.DataFrame)].
    """
    # data.mean() is explicit per-column reduction; the previous
    # np.mean(DataFrame) relied on delegation whose axis=None semantics
    # changed in pandas 2.0 (scalar over both axes). Unused `cols` local removed.
    return [data.mean(), data.cov()]
def likelihood(x, mean, cov_matrix, no_cls):
    """Multivariate normal density P(x | class).

    Parameters
    ----------
    x : array-like, shape (d,)
        Feature vector.
    mean : array-like, shape (d,)
        Class mean vector.
    cov_matrix : array-like, shape (d, d)
        Class covariance matrix.
    no_cls : int
        Unused; kept for interface compatibility with existing callers.

    Returns
    -------
    float
        Gaussian density evaluated at x.
    """
    x = np.asarray(x, dtype=float)
    mean = np.asarray(mean, dtype=float)
    cov_matrix = np.asarray(cov_matrix, dtype=float)
    d = cov_matrix.shape[0]
    # Generalized normalization: the original hard-coded (2*pi)**1, which is
    # only correct for d == 2 features; (2*pi)**(d/2) handles any dimension
    # and is numerically identical in the 2-D case.
    norm = 1.0 / (((2 * np.pi) ** (d / 2)) * (np.linalg.det(cov_matrix) ** 0.5))
    diff = x - mean
    exponent = -0.5 * diff @ np.linalg.inv(cov_matrix) @ diff
    return norm * np.exp(exponent)
#print(likelihood([13.316 2.3246], mean, cov_matrix,no_cls))
def class_info(X, y, classes):
    """Fit per-class Gaussian parameters from training data.

    Maps each class id in range(classes) to [mean, covariance] estimated
    from that class's rows of X, for use by the posterior computation.
    """
    return {cls: mean_cov_forClass(class_data(X, y, cls), cls)
            for cls in range(classes)}
def predict(X, no_cls, prior_prob, class_in):
    """Assign each row of X the class with the highest posterior.

    The evidence term P(x) is omitted: it is identical for every class,
    so it cannot change the argmax.
    """
    labels = []
    for row in np.array(X):
        # posterior[c] is proportional to P(row | c) * P(c); values are
        # rounded to 9 decimals as in the original implementation.
        posterior = {}
        for cls in range(no_cls):
            # Per-class mean/covariance fitted on training data by class_info.
            mu, cov = class_in[cls]
            posterior[cls] = round(likelihood(row, mu, cov, no_cls) * prior_prob[cls], 9)
        # Highest posterior wins; ties break toward the larger class id,
        # matching a max over (value, key) pairs.
        best = max(posterior.items(), key=lambda kv: (kv[1], kv[0]))[0]
        labels.append(best)
    return labels
# --- Linearly separable dataset: fit on the training split, evaluate both splits ---
class_in = class_info(X_train,Y_train,2)
print('For Linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TRAIN ' + '\033[0m' + 'data is :')
# Class priors are estimated from the TRAINING labels only.
prior_prob = priors(Y_train)
predicted = predict(X_train,2,prior_prob,class_in)
print(confusion_matrix(Y_train, predicted))
print(accuracy_score(Y_train, predicted))
print('For Linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TEST ' + '\033[0m' + 'data is :')
# BUG FIX: the original recomputed the priors from Y_test here, leaking test
# labels into the prediction; priors must come from the training split.
predicted = predict(X_test,2,prior_prob,class_in)
print(confusion_matrix(Y_test, predicted))
print(accuracy_score(Y_test, predicted))
#NLS data
# Non-linearly separable dataset: same two-file layout as the first dataset.
nls1 = pd.read_csv('NLS/class1.txt',header = None, names = ['col1', 'col2'])
nls2 = pd.read_csv('NLS/class2.txt',header = None,names = ['col1', 'col2'])
# Labels: 0 for every nls1 row, then 1 for every nls2 row.
nls_class = [0]*nls1.shape[0]
nls_class.extend([1]*nls2.shape[0])
nls_class=np.array(nls_class)
# Stack the two classes row-wise (original indices kept).
df_nls = pd.concat([nls1, nls2], axis=0)
# Same 80/20 split and seed as the first dataset.
nls_train,nls_test,y_train, y_test= train_test_split(df_nls,nls_class, test_size=0.2, random_state=52)
# Visualize the train/test scatter for the NLS dataset.
plotting(nls_train,nls_test,y_train, y_test)
# --- Non-linearly separable dataset: fit on train, evaluate both splits ---
class_in = class_info(nls_train,y_train,2)
# Priors estimated from the training labels.
nls_prior_prob = priors(y_train)
print('For non-linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TRAIN ' + '\033[0m' + 'data is :')
nls_predicted = predict(nls_train,2,nls_prior_prob,class_in)
print(confusion_matrix(y_train, nls_predicted))
print(accuracy_score(y_train, nls_predicted))
print('For non-linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TEST ' + '\033[0m' + 'data is :')
#nls_prior_prob = priors(y_train)
# Note: the training-split priors are (correctly) reused for the test split.
nls_predicted = predict(nls_test,2,nls_prior_prob,class_in)
print(confusion_matrix(y_test, nls_predicted))
print(accuracy_score(y_test, nls_predicted))
print()
# --- Real-world dataset: three classes, whitespace-delimited files ---
real1 = pd.read_csv('Real/class1.txt',delim_whitespace=True,header = None, names = ['col1', 'col2'])
real2 = pd.read_csv('Real/class2.txt',delim_whitespace=True,header = None, names = ['col1', 'col2'])
real3 = pd.read_csv('Real/class3.txt',delim_whitespace=True,header = None, names = ['col1', 'col2'])
# BUG FIX: the legend labels were copy-pasted from the two-class plots
# ('TestClass-0', 'TrainClass-1', 'TestClass-1'), which mislabels three
# raw classes; label each class correctly and actually render the legend.
plt.scatter(real1['col1'],real1['col2'],label ='Class-0')
plt.scatter(real2['col1'],real2['col2'],label ='Class-1')
plt.scatter(real3['col1'],real3['col2'],label ='Class-2')
plt.legend()
plt.show()
# Labels: 0/1/2 for the three raw classes, in file order.
real_class = [0]*real1.shape[0]
real_class.extend([1]*real2.shape[0])
real_class.extend([2]*real3.shape[0])
real_class=np.array(real_class)
# Stack all three classes row-wise (original indices kept).
df_real = pd.concat([real1, real2, real3], axis=0)
# 80/20 split; note the seed differs (42) from the earlier datasets (52).
real_train,real_test,y_train_, y_test_= train_test_split(df_real,real_class, test_size=0.2, random_state=42)
# Training-split priors and per-class mean/covariance for the 3 classes.
real_prior_prob = priors(y_train_)
#print(real_prior_prob)
class_in = class_info(real_train,y_train_,3)
# --- Real dataset: evaluate the fitted 3-class model on both splits ---
# BUG FIX: the banner messages said 'non-linearly seprable data' — copy-pasted
# from the previous section; this section evaluates the Real dataset.
print('For Real data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TRAIN ' + '\033[0m' + 'data is :')
real_predicted = predict(real_train,3,real_prior_prob,class_in)
print(confusion_matrix(y_train_, real_predicted, labels=[0, 1, 2]))
print(accuracy_score(y_train_, real_predicted))
print('For Real data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TEST ' + '\033[0m' + 'data is :')
# The priors are already the training-split priors; no need to recompute them.
real_predicted = predict(real_test,3,real_prior_prob,class_in)
print(confusion_matrix(y_test_,real_predicted, labels=[0, 1, 2]))
print(accuracy_score(y_test_, real_predicted))
print()