# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the sonar dataset: each row is one sonar return pattern, labelled as
# either rock ('R') or mine ('M').
df = pd.read_csv('sonar.all-data.csv')
# There are 208 patterns in total (bare expression: displays only in a notebook).
df.shape
# Summary statistics, transposed so each row describes one frequency column.
# Note that each column value only ranges from 0 to 1.
df.describe().T
# --- notebook output transcript (df.describe().T), preserved as comments ---
# countfloat64
# 208.0 - 208.0
# meanfloat64
# 0.006507211538461538 - 0.7021548076923078
# Freq_51
# 208
# 0.01606875
# Freq_52
# 208
# 0.01342019231
# Freq_53
# 208
# 0.01070913462
# Freq_54
# 208
# 0.01094086538
# Freq_55
# 208
# 0.009290384615
# Freq_56
# 208
# 0.008221634615
# Freq_57
# 208
# 0.007820192308
# Freq_58
# 208
# 0.007949038462
# Freq_59
# 208
# 0.007941346154
# Freq_60
# 208
# 0.006507211538
df.head()
# --- notebook output transcript (df.head(), first two columns), preserved as comments ---
# Freq_1float64
# Freq_2float64
# 0
# 0.02
# 0.0371
# 1
# 0.0453
# 0.0523
# 2
# 0.0262
# 0.0582
# 3
# 0.01
# 0.0171
# 4
# 0.0762
# 0.0666
# In order to see the correlation, we first map the label to numerical values:
# 'R' (rock) -> 0, 'M' (mine) -> 1.
df['Label_num'] = df['Label'].map({'R':0,'M':1})
# Correlations must be computed on numeric columns only: calling df.corr()
# while the string 'Label' column is still present raises a TypeError on
# pandas >= 2.0 (numeric_only now defaults to False). Compute the matrix
# once here instead of three separate df.corr() calls.
corr_matrix = df.drop('Label', axis=1).corr()
# Heatmap of the full feature/label correlation matrix.
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, cmap='mako');
# 5 features with the highest correlation with our label.
# We are interested in their absolute correlation, so we use np.abs();
# slice [1:6] skips Label_num's perfect self-correlation at position 0.
np.abs(corr_matrix['Label_num']).sort_values(ascending=False)[1:6]
# Barplot of correlations between each feature and the label
# ([1:] again drops the self-correlation).
corr_label = corr_matrix['Label_num'].sort_values(ascending=False)[1:]
plt.figure(figsize=(10,8))
sns.barplot(x=corr_label.index, y=corr_label)
plt.xticks(rotation=90);
from sklearn.model_selection import train_test_split
# We need to drop both Label and Label_num columns to separate out features;
# the string Label column is kept as the target so the classifier predicts
# 'R'/'M' directly.
X = df.drop(['Label_num','Label'],axis=1)
y = df['Label']
# 10% hold-out (21 of 208 rows); random_state pins the split for reproducibility.
# NOTE(review): no stratify=y here, so the R/M balance of the small test set
# may drift from the full data — consider stratifying; confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes to try: k = 1 .. 39.
k_vals = list(range(1,40))
param_grid = {'knn__n_neighbors':k_vals}

# Scale features, then classify — built as a single pipeline so the scaler
# is re-fit on each CV training fold (no leakage into the validation fold).
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Exhaustive cross-validated search over k, scored by accuracy.
grid = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy')
grid.fit(X_train,y_train)

# Inspect the parameters of the best performing estimator
# (bare expression: displays only in a notebook).
grid.best_estimator_.get_params()
# grid.cv_results_ is a dictionary; 'mean_test_score' holds one mean CV
# accuracy per candidate k, in the same order as k_vals.
score_per_k = grid.cv_results_['mean_test_score']
# Visualization of the CV accuracy as a function of k.
plt.figure(figsize=(8,6))
plt.plot(k_vals,score_per_k)
plt.scatter(k_vals,score_per_k)
# Mark the k actually selected by the grid search rather than a hard-coded
# x=1, and span the observed score range rather than fixed y-limits, so the
# marker stays correct if the data or the grid changes.
best_k = grid.best_params_['knn__n_neighbors']
plt.vlines(x=best_k, ymin=score_per_k.min(), ymax=score_per_k.max(), colors='red')
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is the supported replacement and
# produces the same plot directly from the fitted estimator and test data.
ConfusionMatrixDisplay.from_estimator(grid, X_test, y_test);
# --- notebook output transcript (sklearn deprecation warning), preserved as comments ---
# /shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
#   warnings.warn(msg, category=FutureWarning)
# Predict labels for the held-out test set with the best estimator found by
# the grid search (GridSearchCV.predict delegates to best_estimator_).
grid_pred = grid.predict(X_test)
# Per-class precision/recall/F1 plus overall accuracy on the test set.
print(classification_report(y_test,grid_pred))
# --- notebook output transcript (classification_report), preserved as comments ---
#               precision    recall  f1-score   support
#            M       0.92      0.92      0.92        13
#            R       0.88      0.88      0.88         8
#     accuracy                           0.90        21
#    macro avg       0.90      0.90      0.90        21
# weighted avg       0.90      0.90      0.90        21
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
# Build the raw confusion matrix from the held-out predictions, then render
# it with the class names learned by the grid-search estimator.
conf_mat = confusion_matrix(y_test, grid_pred)
display = ConfusionMatrixDisplay(confusion_matrix=conf_mat,
                                 display_labels=grid.classes_)
display.plot();