import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
# Load the iris dataset and assemble features + label into one DataFrame.
iris = datasets.load_iris()
# np.c_ stacks its arguments column-wise, so the target ends up as the last
# column next to the feature columns (hence the extra 'target' column name).
iris_df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
columns= iris['feature_names'] + ['target'])
# NOTE: the bare expressions below (head/describe) only show output in an
# interactive session such as a notebook; in a plain script their results
# are discarded.
iris_df.head()
iris_df.describe()
# Features are every column except the last; the label is the last column.
x= iris_df.iloc[:, :-1]
y= iris_df.iloc[:, -1]
x.head()
y.head()
# Split the data into train and test sets: 20% is held out for testing and
# random_state=0 makes the shuffled split reproducible.
x_train, x_test, y_train, y_test= train_test_split(x, y,
test_size= 0.2,
shuffle= True, # shuffle the rows before splitting
random_state= 0)
# Convert to plain NumPy arrays so the code further down can index rows by
# integer position.
x_train= np.asarray(x_train)
y_train= np.asarray(y_train)
x_test= np.asarray(x_test)
y_test= np.asarray(y_test)
print(f'training set size: {x_train.shape[0]} samples \ntest set size: {x_test.shape[0]} samples')
# NOTE: per the scikit-learn documentation, Normalizer rescales each sample
# (row) independently to unit norm (L2 by default); it is not a per-feature
# scaler.
scaler= Normalizer().fit(x_train) # fitted on the training set only
normalized_x_train= scaler.transform(x_train) # normalized training features
normalized_x_test= scaler.transform(x_test) # normalized test features (same scaler object)
# Print the first five training rows before and after normalization.
print('x train before Normalization')
print(x_train[0:5])
print('\nx train after Normalization')
print(normalized_x_train[0:5])
## Before
# View the relationships between variables; color code by species type
# The mapping keys are floats (0.0, 1.0, 2.0) to match the values stored in
# the 'target' column.
di= {0.0: 'Setosa', 1.0: 'Versicolor', 2.0:'Virginica'} # target value -> species name
before= sns.pairplot(iris_df.replace({'target': di}), hue= 'target')
before.fig.suptitle('Pair Plot of the dataset Before normalization', y=1.08)
## After
# This frame is built from the normalized *training* rows and their labels
# only, so the second pair plot shows the training portion, not all samples.
iris_df_2= pd.DataFrame(data= np.c_[normalized_x_train, y_train],
columns= iris['feature_names'] + ['target'])
di= {0.0: 'Setosa', 1.0: 'Versicolor', 2.0: 'Virginica'} # same mapping as above
after= sns.pairplot(iris_df_2.replace({'target':di}), hue= 'target')
after.fig.suptitle('Pair Plot of the dataset After normalization', y=1.08)
def distance_ecu(x_train, x_test_point):
    """
    Compute the Euclidean distance from one test point to every training point.

    Input:
        - x_train: 2-D array-like of shape (n_samples, n_features) holding the
          training data.
        - x_test_point: 1-D array-like of length n_features holding the test
          point.
    Output:
        - distances: DataFrame with a single float column 'dist'; row i holds
          the distance between the test point and x_train[i]. The default
          integer index therefore identifies the training row position.
    Raises:
        - ValueError: if x_train is not 2-D or its number of columns does not
          match the length of x_test_point.
    """
    train = np.asarray(x_train, dtype=float)
    point = np.asarray(x_test_point, dtype=float)

    # An empty training set (e.g. []) has no distances to report.
    if train.ndim == 1 and train.size == 0:
        return pd.DataFrame({'dist': np.empty(0, dtype=float)})

    # Fail loudly on a feature-count mismatch instead of silently ignoring
    # trailing features of the test point or broadcasting by accident.
    if train.ndim != 2 or point.ndim != 1 or train.shape[1] != point.shape[0]:
        raise ValueError(
            f'x_train must have shape (n_samples, n_features) and x_test_point '
            f'shape (n_features,); got {train.shape} and {point.shape}'
        )

    # One vectorised pass: row-wise L2 norm of the differences.
    dist = np.linalg.norm(train - point, axis=1)
    return pd.DataFrame({'dist': dist})
def nearest_neighbors(distance_point, K):
    """
    Select the K rows with the smallest distance.

    Input:
        - distance_point: DataFrame with a 'dist' column holding the distances
          between the test point and each point in the training data.
        - K: the number of neighbors to keep (must be >= 1).
    Output:
        - df_nearest: the K rows of distance_point with the smallest 'dist',
          ordered from nearest to farthest. The original index labels are
          preserved. If fewer than K rows exist, all rows are returned.
    Raises:
        - ValueError: if K is smaller than 1.
    """
    # A negative K would silently drop rows from the end via slicing and
    # K == 0 yields no neighbours to vote with, so reject both up front.
    if K < 1:
        raise ValueError(f'K must be at least 1, got {K!r}')

    # Stable sort: rows with equal distance keep their original relative
    # order, which makes tie-breaking deterministic.
    df_nearest = distance_point.sort_values(by=['dist'], axis=0, kind='mergesort')
    return df_nearest.iloc[:K]
def voting(df_nearest, y_train):
    """
    Predict a label by majority vote among the nearest neighbors.

    Input:
        - df_nearest: DataFrame of the K nearest neighbors; its index holds
          the row positions of those neighbors in the training set.
        - y_train: the labels of the training dataset (NumPy array, list or
          pandas Series); labels are looked up by position.
    Output:
        - y_pred: the most frequent label among the neighbors. On a tie the
          label encountered first in df_nearest's row order wins.
    Raises:
        - IndexError: if df_nearest contains no rows.
    """
    # Work on a plain array so the lookup is positional even when y_train is
    # a pandas Series carrying a non-default (e.g. shuffled) index.
    labels = np.asarray(y_train)
    neighbor_labels = labels[np.asarray(df_nearest.index)]

    counter_vote = Counter(neighbor_labels)
    # most_common(1) returns only the winner instead of sorting every label.
    y_pred = counter_vote.most_common(1)[0][0]
    return y_pred
def KNN_from_scratch(x_train, y_train, x_test, K):
    """
    Classify every test point with a K-nearest-neighbors majority vote.

    Input:
        - x_train: the full training dataset
        - y_train: the labels of the training dataset
        - x_test: the full test dataset (iterated point by point)
        - K: the number of neighbors
    Output:
        - y_pred: list with one predicted label per test point, in the same
          order as x_test.
    """
    def classify(point):
        # Step 1: distances to all training points; Step 2: keep the K
        # closest; Step 3: majority vote over their labels.
        ranked = nearest_neighbors(distance_ecu(x_train, point), K)
        return voting(ranked, y_train)

    return [classify(point) for point in x_test]
# --- Compare the from-scratch KNN with scikit-learn's on the held-out test set.
K = 3
y_pred_scratch = KNN_from_scratch(normalized_x_train, y_train, normalized_x_test, K)
print(y_pred_scratch)

knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(normalized_x_train, y_train)
y_pred_sklearn = knn.predict(normalized_x_test)
print(y_pred_sklearn)

# True when both implementations predict the same label for every test point.
print(np.array_equal(y_pred_sklearn, y_pred_scratch))
print(f'The accuracy of our implementation is {accuracy_score(y_test, y_pred_scratch)}')
print(f'The accuracy of sklearn implementation is {accuracy_score(y_test, y_pred_sklearn)}')

# --- Pick K by cross-validation on the training set only.
n_splits = 4  ## Choose the number of splits
kf = KFold(n_splits=n_splits)  ## Build the K-fold splitter
accuracy_k = []  ## Mean validation accuracy for each candidate K
k_values = list(range(1, 30, 2))  ## Odd candidate values 1, 3, ..., 29
for k in k_values:  ## Loop over the K values
    accuracy_fold = 0
    for normalized_x_train_fold_idx, normalized_x_valid_fold_idx in kf.split(normalized_x_train):  ## Loop over the splits
        # Slice the training data into this fold's train / validation parts.
        normalized_x_train_fold = normalized_x_train[normalized_x_train_fold_idx]
        y_train_fold = y_train[normalized_x_train_fold_idx]
        normalized_x_test_fold = normalized_x_train[normalized_x_valid_fold_idx]
        y_valid_fold = y_train[normalized_x_valid_fold_idx]
        y_pred_fold = KNN_from_scratch(normalized_x_train_fold, y_train_fold, normalized_x_test_fold, k)
        # accuracy_score expects (y_true, y_pred), same order as used above.
        accuracy_fold += accuracy_score(y_valid_fold, y_pred_fold)  ## Accumulate the accuracy
    accuracy_fold = accuracy_fold / n_splits  ## Average over the folds
    accuracy_k.append(accuracy_fold)

# Each tuple pairs a mean accuracy with the K value that produced it.
print(f'The accuracy for each K value was {list ( zip (accuracy_k, k_values))}')
print(f'Best accuracy was {np.max(accuracy_k)}, which corresponds to a value of K= {k_values[np.argmax(accuracy_k)]}')