# Naive Bayes Classifier with 3 different datasets
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
plt.style.use('dark_background')
# Load the two linearly separable classes; each file holds two unnamed
# feature columns which we name col1/col2.
class1 = pd.read_csv('class1.txt',header = None, names = ['col1', 'col2'])
class2 = pd.read_csv('class2.txt',header = None,names = ['col1', 'col2'])
# Build the label vector: 0 for every row of class1, 1 for every row of class2.
classes = [0]*class1.shape[0]
classes.extend([1]*class2.shape[0])
classes=np.array(classes)
#classes = pd.DataFrame(np.array(classes))
# Stack the two class DataFrames row-wise so rows align with `classes`.
df = pd.concat([class1, class2], axis=0)
# 80/20 train/test split; fixed random_state for reproducibility.
X_train,X_test,Y_train, Y_test= train_test_split(df,classes, test_size=0.2, random_state=52)
def class_data(X, y, cls):
    """Return the rows of X whose label in y equals cls.

    Parameters
    ----------
    X : pd.DataFrame
        Feature data; rows correspond positionally to entries of y.
    y : array-like
        Class label for each row of X.
    cls : int
        Class label to select.

    Returns
    -------
    pd.DataFrame
        Subset of X belonging to class cls.  Unlike the original
        row-by-row loop, columns are preserved even when no row matches.
    """
    # Vectorized boolean-mask selection replaces the O(n) Python loop of
    # per-row X.iloc[i] calls; iloc keeps the label-to-row pairing
    # positional regardless of X's index values.
    mask = np.asarray(y) == cls
    return X.iloc[mask]
def plotting(X_train, X_test, y_train, y_test):
    """Scatter-plot train and test points of both classes on one figure."""
    # (subset DataFrame, legend label) pairs, drawn in the same order as
    # the original four scatter calls.
    panels = [
        (class_data(X_train, y_train, 0), 'TrainClass-0'),
        (class_data(X_test, y_test, 0), 'TestClass-0'),
        (class_data(X_train, y_train, 1), 'TrainClass-1'),
        (class_data(X_test, y_test, 1), 'TestClass-1'),
    ]
    for subset, tag in panels:
        plt.scatter(subset['col1'], subset['col2'], label=tag)
    plt.legend()
    plt.show()
# Visualize the linearly separable dataset before fitting.
plotting(X_train,X_test,Y_train, Y_test)
def priors(y_train):
    """P(c) - prior class probability: count of each class / total.

    Parameters
    ----------
    y_train : np.ndarray
        Training labels.

    Returns
    -------
    dict
        Maps each class label to its relative frequency in y_train.
    """
    # One vectorized pass counts every class at once, replacing the
    # Python-level sum(y_train == outcome) per class.
    labels, counts = np.unique(y_train, return_counts=True)
    return dict(zip(labels, counts / y_train.shape[0]))
def mean_cov_forClass(data, cls):
    """Per-column mean vector and covariance matrix of one class's data.

    Parameters
    ----------
    data : pd.DataFrame
        Feature rows belonging to a single class.
    cls : int
        Class label; unused, kept for interface compatibility.

    Returns
    -------
    list
        [mean (pd.Series), covariance (pd.DataFrame)].
    """
    # data.mean() is the idiomatic pandas call (np.mean on a DataFrame
    # delegates to it); the unused `cols` local was removed.
    return [data.mean(), data.cov()]
def likelihood(x, mean, cov_matrix, no_cls):
    """Gaussian class-conditional density P(x | class).

    Parameters
    ----------
    x : array-like
        Feature vector.
    mean : array-like
        Class mean vector (same length as x).
    cov_matrix : array-like
        Class covariance matrix (d x d).
    no_cls : int
        Unused; kept for interface compatibility with existing callers.

    Returns
    -------
    float
        Multivariate normal density of x under N(mean, cov_matrix).
    """
    cov_matrix = np.array(cov_matrix)
    diff = np.asarray(x) - np.asarray(mean)
    d = cov_matrix.shape[0]
    # General d-dimensional normalizer (2*pi)**(d/2) * sqrt(det(cov));
    # the original hard-coded (2*pi)**1, which is only valid for d == 2
    # (all current callers pass 2-D data, so behavior there is unchanged).
    norm_const = 1.0 / (((2 * np.pi) ** (d / 2)) * (np.linalg.det(cov_matrix) ** 0.5))
    exponent = -0.5 * diff @ np.linalg.inv(cov_matrix) @ diff
    return norm_const * np.exp(exponent)
#print(likelihood([13.316   2.3246], mean, cov_matrix,no_cls))
def class_info(X, y, classes):
    """Collect per-class Gaussian parameters from the data.

    Returns a dictionary keyed by class label (0 .. classes-1) whose
    value is [mean, covariance] estimated from that class's rows, for
    later use when computing posterior probabilities.
    """
    return {
        label: list(mean_cov_forClass(class_data(X, y, label), label))
        for label in range(classes)
    }
def predict(X, no_cls, prior_prob, class_in):
    """Assign a MAP class label to every row of X.

    Parameters
    ----------
    X : pd.DataFrame
        Feature rows to classify.
    no_cls : int
        Number of classes (labels 0 .. no_cls-1).
    prior_prob : dict
        Class label -> prior probability P(c).
    class_in : dict
        Class label -> [mean, covariance], as built by class_info().

    Returns
    -------
    list
        Predicted class label for each row.

    Notes
    -----
    The evidence P(x) is omitted: it is identical for every class and
    therefore does not affect the argmax.
    """
    label = []
    for row in np.array(X):
        # Unnormalized posterior P(c) * P(x | c) for each class.
        posterior = {}
        for cls in range(no_cls):
            # Mean/cov estimated from training data, cached in class_in.
            mu, cov = class_in[cls]
            posterior[cls] = likelihood(row, mu, cov, no_cls) * prior_prob[cls]
        # BUG FIX: the original rounded posteriors to 9 decimals, which
        # collapses small densities to 0.0; the subsequent
        # max(zip(values, keys)) then broke those ties toward the largest
        # class label. Compare the raw values instead.
        label.append(max(posterior, key=posterior.get))
    return label
# --- Linearly separable dataset: fit on the training split, evaluate both splits ---
class_in = class_info(X_train,Y_train,2)
print('For Linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TRAIN ' + '\033[0m' +  'data is :')
# Priors are estimated from the TRAINING labels only.
prior_prob = priors(Y_train)
predicted = predict(X_train,2,prior_prob,class_in)
print(confusion_matrix(Y_train, predicted))
print(accuracy_score(Y_train, predicted))
print('For Linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TEST ' + '\033[0m' +  'data is :')
# BUG FIX: the original recomputed the priors from Y_test here; test
# labels are not available at prediction time, so the training priors
# are reused (matching the NLS section below).
predicted = predict(X_test,2,prior_prob,class_in)
print(confusion_matrix(Y_test, predicted))
print(accuracy_score(Y_test, predicted))
#NLS data
# Non-linearly separable dataset: same load/split/fit/evaluate pipeline
# as the linearly separable case above.
nls1 = pd.read_csv('NLS/class1.txt',header = None, names = ['col1', 'col2'])
nls2 = pd.read_csv('NLS/class2.txt',header = None,names = ['col1', 'col2'])
# Label 0 for rows from class1.txt, 1 for rows from class2.txt.
nls_class = [0]*nls1.shape[0]
nls_class.extend([1]*nls2.shape[0])
nls_class=np.array(nls_class)
df_nls = pd.concat([nls1, nls2], axis=0)
nls_train,nls_test,y_train, y_test= train_test_split(df_nls,nls_class, test_size=0.2, random_state=52)
plotting(nls_train,nls_test,y_train, y_test)
# Gaussian parameters and priors are estimated from the training split only.
class_in = class_info(nls_train,y_train,2)
nls_prior_prob = priors(y_train)
print('For non-linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TRAIN ' + '\033[0m' +  'data is :')
nls_predicted = predict(nls_train,2,nls_prior_prob,class_in)
print(confusion_matrix(y_train, nls_predicted))
print(accuracy_score(y_train, nls_predicted))
print('For non-linearly seprable data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TEST ' + '\033[0m' +  'data is :')
#nls_prior_prob = priors(y_train)
# Training priors are (correctly) reused for the test split.
nls_predicted = predict(nls_test,2,nls_prior_prob,class_in)
print(confusion_matrix(y_test, nls_predicted))
print(accuracy_score(y_test, nls_predicted))
print()
# --- Real dataset: three classes, whitespace-delimited files ---
real1 = pd.read_csv('Real/class1.txt',delim_whitespace=True,header = None, names = ['col1', 'col2'])
real2 = pd.read_csv('Real/class2.txt',delim_whitespace=True,header = None, names = ['col1', 'col2'])
real3 = pd.read_csv('Real/class3.txt',delim_whitespace=True,header = None, names = ['col1', 'col2'])
# BUG FIX: the labels were copy-pasted from the two-class plots
# ('TestClass-0', 'TrainClass-1', ...) and no legend was drawn.
plt.scatter(real1['col1'],real1['col2'],label ='Class-0')
plt.scatter(real2['col1'],real2['col2'],label ='Class-1')
plt.scatter(real3['col1'],real3['col2'],label ='Class-2')
plt.legend()
plt.show()
# Labels 0, 1, 2 for the three source files, in row order of df_real.
real_class = [0]*real1.shape[0]
real_class.extend([1]*real2.shape[0])
real_class.extend([2]*real3.shape[0])
real_class=np.array(real_class)
df_real = pd.concat([real1, real2, real3], axis=0)
real_train,real_test,y_train_, y_test_= train_test_split(df_real,real_class, test_size=0.2, random_state=42)
# Priors and Gaussian parameters are estimated from the training split only
# (the redundant second priors() call before the test evaluation was removed).
real_prior_prob = priors(y_train_)
#print(real_prior_prob)
class_in = class_info(real_train,y_train_,3)
# BUG FIX: the messages said "non-linearly seprable" for the Real dataset.
print('For Real data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TRAIN ' + '\033[0m' +  'data is :')
real_predicted = predict(real_train,3,real_prior_prob,class_in)
print(confusion_matrix(y_train_, real_predicted, labels=[0, 1, 2]))
print(accuracy_score(y_train_, real_predicted))
print('For Real data, confusion matrix and Accuracy Score for '+ '\033[1m' + 'TEST ' + '\033[0m' +  'data is :')
real_predicted = predict(real_test,3,real_prior_prob,class_in)
print(confusion_matrix(y_test_,real_predicted, labels=[0, 1, 2]))
print(accuracy_score(y_test_, real_predicted))
print()