import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
# Load scikit-learn's bundled breast-cancer dataset.
breast_cancer = load_breast_cancer()
breast_cancer.feature_names
breast_cancer.target

# Feature matrix: keep only the four size-related measurements used below.
selected_features = ['mean area', 'mean perimeter', 'worst area', 'worst radius']
X = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
)[selected_features]

# Target: one-hot encode the class labels, then keep only the 'malignant'
# indicator column so that the positive class is "malignant".
labels = pd.Categorical.from_codes(breast_cancer.target, breast_cancer.target_names)
y = pd.get_dummies(labels).drop(columns=['benign'])
y.head()

# Share of records labelled malignant (class balance check).
y['malignant'].mean()

# Features and target side by side for exploratory analysis; X and y share
# the same default index, so an index join lines the rows up one-to-one.
bc_data = X.join(y)
bc_data.head()
bc_data.describe()

# Pairwise scatter plots of the selected features, coloured by class.
sns.pairplot(data=bc_data, hue='malignant')
# MinMax scaling maps each feature into [0, 1] over the data the scaler is
# fitted on.
# Formula: X_rescaled = (Xi - Xmin) / (Xmax - Xmin)
from sklearn.preprocessing import MinMaxScaler

# Split BEFORE fitting the scaler: fitting on the full data set would let the
# min/max of the held-out rows leak into preprocessing and make the test
# metrics optimistic. With the same random_state and the same number of rows
# this selects the same train/test rows as splitting the scaled frame would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn Xmin/Xmax from the training rows only, then apply that same
# transformation to the test rows.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Whole data set on the train-fitted scale, kept for inspection. Rows outside
# the training range may fall slightly outside [0, 1].
X_ = scaler.transform(X)
X_rescaled = pd.DataFrame(X_, columns=X.columns)
X_rescaled.describe()
from sklearn.metrics import roc_auc_score

# y_train / y_test are single-column DataFrames; scikit-learn expects 1-D
# label arrays (an (n, 1) target triggers a DataConversionWarning on fit).
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# Baseline model: 5 nearest neighbours with Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train_1d)

# Hard class predictions drive the confusion matrix and per-class report.
y_pred = knn.predict(X_test)
print(confusion_matrix(y_test_1d, y_pred))
print(classification_report(y_test_1d, y_pred))

# ROC AUC is a ranking metric, so it needs continuous scores rather than hard
# 0/1 predictions. Column 1 of predict_proba is the probability of the larger
# class label, i.e. the positive ("malignant") class.
y_score = knn.predict_proba(X_test)[:, 1]
print('ROC AUC: ', roc_auc_score(y_test_1d, y_score))
from sklearn.model_selection import cross_val_score

# Sweep the number of neighbours and record the mean 5-fold cross-validated
# accuracy on the training set for each K.
# NOTE: range(1, max_K) excludes max_K itself, so K runs from 1 to max_K - 1.
max_K = 100
k_values = list(range(1, max_K))

# The 1-D label array does not change between iterations; build it once.
y_train_1d = y_train.values.ravel()

cv_scores = []
for K in k_values:
    knn = KNeighborsClassifier(n_neighbors=K)
    scores = cross_val_score(knn, X_train, y_train_1d, cv=5, scoring="accuracy")
    cv_scores.append(scores.mean())
cv_scores[:4]

# Mean CV accuracy as a function of K.
sns.lineplot(x=k_values, y=cv_scores)