# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the sonar data set; it contains patterns for both rocks and mines
df = pd.read_csv('sonar.all-data.csv')
# there are 208 patterns total
df.shape
# note that each column value only ranges from 0 to 1
df.describe().T
df.head()
# To correlate the features with the label, the label must be numeric:
# Rock ('R') maps to 0 and Mine ('M') maps to 1
df['Label_num'] = df['Label'].map({'R':0,'M':1})
# Compute the correlation matrix once, over the numeric columns only.
# The raw 'Label' column holds strings, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently skipping them.
corr_matrix = df.select_dtypes(include='number').corr()
# heatmap of correlation
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix,cmap='mako');
# Correlation of each feature with the label. The label's own entry is
# dropped by name rather than by position after sorting.
label_corr = corr_matrix['Label_num'].drop('Label_num')
# 5 features with highest correlation with our label
# we are interested in their absolute correlation, so we take .abs()
label_corr.abs().sort_values(ascending=False).head(5)
# Barplot of (signed) correlations between features and label
corr_label = label_corr.sort_values(ascending=False)
plt.figure(figsize=(10,8))
sns.barplot(x=corr_label.index,y=corr_label)
plt.xticks(rotation=90);
from sklearn.model_selection import train_test_split
# The feature matrix is every column except the two label columns
# (the raw 'Label' and its numeric copy 'Label_num')
X = df.drop(columns=['Label_num', 'Label'])
# The target is the original 'Label' column
y = df['Label']
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Two-step pipeline: scale the features, then classify with k-nearest neighbours
scaler = StandardScaler()
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn)])
# Candidate neighbour counts: k = 1 .. 39
k_vals = list(range(1, 40))
# The 'knn__' prefix routes the parameter to the pipeline step named 'knn'
param_grid = dict(knn__n_neighbors=k_vals)
# Cross-validated search over k, scored by accuracy
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
)
# Fit the search on the training data only
grid.fit(X_train, y_train)
# Parameters of the best performing estimator
grid.best_estimator_.get_params()
# grid.cv_results_ is a dictionary; this entry holds the mean test score per candidate
grid.cv_results_['mean_test_score']
# Visualization of the mean cross-validated accuracy for each candidate k
score_per_k = grid.cv_results_['mean_test_score']
# Mark the k the grid search actually selected instead of a hard-coded value,
# so the marker stays correct if the data, split or grid changes
best_k = grid.best_params_['knn__n_neighbors']
plt.figure(figsize=(8,6))
plt.plot(k_vals,score_per_k)
plt.scatter(k_vals,score_per_k)
# The vertical marker spans the observed score range rather than fixed limits
plt.vlines(x=best_k,ymin=score_per_k.min(),ymax=score_per_k.max(),colors='red')
plt.xlabel('n_neighbors (k)')
plt.ylabel('mean cross-validated accuracy');
from sklearn.metrics import ConfusionMatrixDisplay,classification_report,confusion_matrix
# Confusion matrix on the held-out test set, straight from the fitted search.
# plot_confusion_matrix was removed in scikit-learn 1.2, so importing it fails;
# ConfusionMatrixDisplay.from_estimator is its replacement
# (requires scikit-learn >= 1.0).
ConfusionMatrixDisplay.from_estimator(grid,X_test,y_test);
# Predictions of the best estimator found by the grid search
grid_pred = grid.predict(X_test)
print(classification_report(y_test,grid_pred))
# The same plot built by hand:
# first create a confusion matrix
cm = confusion_matrix(y_test,grid_pred)
# then use confusion matrix and grid.classes_ to create the plot
ConfusionMatrixDisplay(cm,display_labels=grid.classes_).plot();