# NOTE(review): this file is a linear export of a Jupyter notebook; bare
# expressions such as `sklearn.__version__` or `df.head()` only display
# output inside a notebook — as a plain script they are no-ops.
import sklearn
sklearn.__version__
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Show up to 100 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 100)
# Your package imports here
from sklearn.metrics import confusion_matrix
# NOTE(review): seaborn is imported twice (as `sns` above and `sn` here);
# the `sn` alias appears unused in this file — consider removing it.
import seaborn as sn
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
# Load the dataset from UCI
# Raw WDBC file has no header row: column 0 = sample ID, column 1 =
# diagnosis ("M"/"B"), columns 2-31 = 30 real-valued features.
df = pd.read_csv(
'https://archive.ics.uci.edu/ml/'
'machine-learning-databases'
'/breast-cancer-wisconsin/wdbc.data',
header=None)
df.head()
# Local copy with named feature columns and an integer `target` column.
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; prefer a relative path or a configurable location.
fx = pd.read_csv('C:/Users/Umaima/DSC540/WisconsinBreastCancer.csv')
X = fx.drop('target', axis=1)
y = fx.target
X.head()
y.head()
# Code here
# Rows of the UCI frame where column 1 (diagnosis) is "M" (malignant).
pple_cancer = df[df[1] == "M"]
pple_cancer.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 567
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 212 non-null int64
1 1 212 non-null object
2 2 212 non-null float64
3 3 212 non-null float64
4 4 212 non-null float64
5 5 212 non-null float64
6 6 212 non-null float64
7 7 212 non-null float64
8 8 212 non-null float64
9 9 212 non-null float64
10 10 212 non-null float64
11 11 212 non-null float64
12 12 212 non-null float64
13 13 212 non-null float64
14 14 212 non-null float64
15 15 212 non-null float64
16 16 212 non-null float64
17 17 212 non-null float64
18 18 212 non-null float64
19 19 212 non-null float64
20 20 212 non-null float64
21 21 212 non-null float64
22 22 212 non-null float64
23 23 212 non-null float64
24 24 212 non-null float64
25 25 212 non-null float64
26 26 212 non-null float64
27 27 212 non-null float64
28 28 212 non-null float64
29 29 212 non-null float64
30 30 212 non-null float64
31 31 212 non-null float64
dtypes: float64(30), int64(1), object(1)
memory usage: 54.7+ KB
# DataFrame.info() prints its report itself and returns None,
# so this call also prints a trailing "None".
print(fx.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 569 non-null float64
1 mean texture 569 non-null float64
2 mean perimeter 569 non-null float64
3 mean area 569 non-null float64
4 mean smoothness 569 non-null float64
5 mean compactness 569 non-null float64
6 mean concavity 569 non-null float64
7 mean concave points 569 non-null float64
8 mean symmetry 569 non-null float64
9 mean fractal dimension 569 non-null float64
10 radius error 569 non-null float64
11 texture error 569 non-null float64
12 perimeter error 569 non-null float64
13 area error 569 non-null float64
14 smoothness error 569 non-null float64
15 compactness error 569 non-null float64
16 concavity error 569 non-null float64
17 concave points error 569 non-null float64
18 symmetry error 569 non-null float64
19 fractal dimension error 569 non-null float64
20 worst radius 569 non-null float64
21 worst texture 569 non-null float64
22 worst perimeter 569 non-null float64
23 worst area 569 non-null float64
24 worst smoothness 569 non-null float64
25 worst compactness 569 non-null float64
26 worst concavity 569 non-null float64
27 worst concave points 569 non-null float64
28 worst symmetry 569 non-null float64
29 worst fractal dimension 569 non-null float64
30 target 569 non-null int64
dtypes: float64(30), int64(1)
memory usage: 137.9 KB
None
# Per-column count of missing values (expected to be all zeros here).
fx.isnull().sum()
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 569 non-null float64
1 mean texture 569 non-null float64
2 mean perimeter 569 non-null float64
3 mean area 569 non-null float64
4 mean smoothness 569 non-null float64
5 mean compactness 569 non-null float64
6 mean concavity 569 non-null float64
7 mean concave points 569 non-null float64
8 mean symmetry 569 non-null float64
9 mean fractal dimension 569 non-null float64
10 radius error 569 non-null float64
11 texture error 569 non-null float64
12 perimeter error 569 non-null float64
13 area error 569 non-null float64
14 smoothness error 569 non-null float64
15 compactness error 569 non-null float64
16 concavity error 569 non-null float64
17 concave points error 569 non-null float64
18 symmetry error 569 non-null float64
19 fractal dimension error 569 non-null float64
20 worst radius 569 non-null float64
21 worst texture 569 non-null float64
22 worst perimeter 569 non-null float64
23 worst area 569 non-null float64
24 worst smoothness 569 non-null float64
25 worst compactness 569 non-null float64
26 worst concavity 569 non-null float64
27 worst concave points 569 non-null float64
28 worst symmetry 569 non-null float64
29 worst fractal dimension 569 non-null float64
dtypes: float64(30)
memory usage: 133.5 KB
fx.describe().round(2)

def _corr_heatmap(frame, title):
    """Draw an annotated, rounded correlation heatmap for *frame*."""
    f, ax = plt.subplots(figsize=(12, 7))
    corr = frame.corr()
    sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="coolwarm",
                fmt='.2f', linewidths=.05)
    f.suptitle(title, fontsize=14)

# Columns 0-9 hold the per-feature means.
mean_df = fx[fx.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]]
_corr_heatmap(mean_df, 'Mean Correlation Heatmap')

# Columns 10-19 hold the per-feature standard errors.
error_df = fx[fx.columns[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]]
_corr_heatmap(error_df, 'Errors Correlation Heatmap')

# Columns 20-29 hold the per-feature "worst" (largest) values.
# BUG FIX: the original index list started at 10 (an error column)
# instead of 20, so the "Worse" heatmap mixed in a non-worst feature.
worse_df = fx[fx.columns[[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]]
_corr_heatmap(worse_df, 'Worse Correlation Heatmap')

# Heatmap over all 30 features at once.
_corr_heatmap(X, 'Overall Correlation Heatmap')

# split the datasets into training and test data
# Stratify on y so train/test class ratios match the full dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 55, test_size= 0.25)
# Code here
# Inspect the hold-out feature matrix (25% of 569 rows = 143, 30 columns).
pd.DataFrame(X_test).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 276 to 112
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 143 non-null float64
1 mean texture 143 non-null float64
2 mean perimeter 143 non-null float64
3 mean area 143 non-null float64
4 mean smoothness 143 non-null float64
5 mean compactness 143 non-null float64
6 mean concavity 143 non-null float64
7 mean concave points 143 non-null float64
8 mean symmetry 143 non-null float64
9 mean fractal dimension 143 non-null float64
10 radius error 143 non-null float64
11 texture error 143 non-null float64
12 perimeter error 143 non-null float64
13 area error 143 non-null float64
14 smoothness error 143 non-null float64
15 compactness error 143 non-null float64
16 concavity error 143 non-null float64
17 concave points error 143 non-null float64
18 symmetry error 143 non-null float64
19 fractal dimension error 143 non-null float64
20 worst radius 143 non-null float64
21 worst texture 143 non-null float64
22 worst perimeter 143 non-null float64
23 worst area 143 non-null float64
24 worst smoothness 143 non-null float64
25 worst compactness 143 non-null float64
26 worst concavity 143 non-null float64
27 worst concave points 143 non-null float64
28 worst symmetry 143 non-null float64
29 worst fractal dimension 143 non-null float64
dtypes: float64(30)
memory usage: 34.6 KB
# Inspect the training feature matrix (75% of 569 rows = 426, 30 columns).
pd.DataFrame(X_train).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 193 to 239
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 426 non-null float64
1 mean texture 426 non-null float64
2 mean perimeter 426 non-null float64
3 mean area 426 non-null float64
4 mean smoothness 426 non-null float64
5 mean compactness 426 non-null float64
6 mean concavity 426 non-null float64
7 mean concave points 426 non-null float64
8 mean symmetry 426 non-null float64
9 mean fractal dimension 426 non-null float64
10 radius error 426 non-null float64
11 texture error 426 non-null float64
12 perimeter error 426 non-null float64
13 area error 426 non-null float64
14 smoothness error 426 non-null float64
15 compactness error 426 non-null float64
16 concavity error 426 non-null float64
17 concave points error 426 non-null float64
18 symmetry error 426 non-null float64
19 fractal dimension error 426 non-null float64
20 worst radius 426 non-null float64
21 worst texture 426 non-null float64
22 worst perimeter 426 non-null float64
23 worst area 426 non-null float64
24 worst smoothness 426 non-null float64
25 worst compactness 426 non-null float64
26 worst concavity 426 non-null float64
27 worst concave points 426 non-null float64
28 worst symmetry 426 non-null float64
29 worst fractal dimension 426 non-null float64
dtypes: float64(30)
memory usage: 103.2 KB
# Inspect the training labels (one int64 `target` column, 426 rows).
pd.DataFrame(y_train).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 193 to 239
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 target 426 non-null int64
dtypes: int64(1)
memory usage: 6.7 KB
# L2-regularised logistic regression baseline (C=0.1, lbfgs solver).
clf = LogisticRegression(C=0.1, penalty='l2', solver='lbfgs', max_iter=10000)
clf.fit(X_train, y_train)
pred = clf.predict(X_train)
train_acc = metrics.accuracy_score(y_true=y_train, y_pred=pred) * 100
print('The training accuracy is ' + str(train_acc))
The training accuracy is 94.13145539906104
# Evaluate the fitted logistic regression on the hold-out split.
pred = clf.predict(X_test)
test_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100
print('The testing accuracy is ' + str(test_acc))
The testing accuracy is 98.6013986013986
# Confusion matrix (rows = actual class, columns = predicted class).
pd.DataFrame(metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred))
precision recall f1-score support
0 0.99 0.99 0.99 90
1 0.98 0.98 0.98 53
accuracy 0.99 143
macro avg 0.99 0.99 0.99 143
weighted avg 0.99 0.99 0.99 143
confusion_matrix(y_test, pred)
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier: memorises the training set, so its
# training accuracy is trivially 100%.
neighbors = KNeighborsClassifier(n_neighbors=1)
neighbors.fit(X_train, y_train)
X_train
pred = neighbors.predict(X_train)
knn_train_acc = metrics.accuracy_score(y_true=y_train, y_pred=pred) * 100
print('The training accuracy is ' + str(knn_train_acc))
The training accuracy is 100.0
# Hold-out accuracy of the 1-NN classifier.
pred = neighbors.predict(X_test)
knn_test_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100
print('The testing accuracy is ' + str(knn_test_acc))
The testing accuracy is 94.4055944055944
# Confusion matrix for 1-NN (rows = actual, columns = predicted).
pd.DataFrame(metrics.confusion_matrix(y_test, pred)) #confusion matrix
print(metrics.classification_report(y_test, pred))
precision recall f1-score support
0 0.96 0.96 0.96 90
1 0.92 0.92 0.92 53
accuracy 0.94 143
macro avg 0.94 0.94 0.94 143
weighted avg 0.94 0.94 0.94 143
# Gaussian Naive Bayes baseline on the raw (unscaled) features.
model = GaussianNB()
model.fit(X_train, y_train)
# Predict the hold-out labels.
y_pred = model.predict(X_test)
y_pred
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
accuracy_pct = acc * 100
print("The accuracy is " + str(accuracy_pct) + "%")
The accuracy is 95.8041958041958%
# Training-set accuracy of the Gaussian NB model.
pred = model.predict(X_train)
print('The training accuracy is '+str((metrics.accuracy_score(y_true = y_train, y_pred = pred)*100)))
The training accuracy is 93.66197183098592
# Hold-out accuracy of the Gaussian NB model (same number as `acc` above).
pred = model.predict(X_test)
print('The testing accuracy is '+str((metrics.accuracy_score(y_true = y_test, y_pred = pred)*100)))
The testing accuracy is 95.8041958041958
# Confusion matrix for Gaussian NB (rows = actual, columns = predicted).
pd.DataFrame(metrics.confusion_matrix(y_test, pred)) #confusion matrix
print(metrics.classification_report(y_test, pred))
precision recall f1-score support
0 0.95 0.99 0.97 90
1 0.98 0.91 0.94 53
accuracy 0.96 143
macro avg 0.96 0.95 0.95 143
weighted avg 0.96 0.96 0.96 143
from sklearn.linear_model import LogisticRegression
def PerformLogisticRegression(C_list, X_train, y_train, X_test=None, y_test=None):
    '''Sweep the regularisation strength C for L2 logistic regression.

    Parameters
    ----------
    C_list : list of float
        Inverse-regularisation strengths to try, one model per value.
    X_train, y_train
        Training features and labels.
    X_test, y_test : optional
        Evaluation features and labels.  When omitted they fall back to
        the module-level hold-out split, which the original version read
        implicitly as globals; the parameters make that dependency
        explicit while staying backward-compatible.

    Returns
    -------
    (accuracy_train, accuracy_test)
        Two lists of accuracies in percent, one entry per C value.
    '''
    # Backward-compatible fallback to the module-level test split.
    if X_test is None:
        X_test = globals()['X_test']
    if y_test is None:
        y_test = globals()['y_test']
    accuracy_train = []
    accuracy_test = []
    for C in C_list:
        clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=C)
        clf.fit(X_train, y_train)
        # Accuracy on both splits, expressed in percent.
        accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100)
        accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)) * 100)
    return accuracy_train, accuracy_test
# Sweep C across roughly eight orders of magnitude.
C_list = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 10, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
accuracy_train,accuracy_test = PerformLogisticRegression(C_list,X_train,y_train)
accuracy_train
accuracy_test
import matplotlib.pyplot as plt
def plotValidationCurve(accuracy_train, accuracy_test, listt, title):
    '''Plot train and validation accuracy against the values in listt.'''
    fig, ax = plt.subplots(dpi=150)
    ax.plot(accuracy_train, marker='o', color='blue', label='train')
    ax.plot(accuracy_test, marker='o', color='orange', label='val')
    ax.set_title(title)
    # One tick per swept hyper-parameter value.
    ax.set_xticks(np.arange(len(listt)))
    ax.set_xticklabels(listt, rotation=45)
    ax.set_xlabel('C')
    ax.set_ylabel('Accuracy')
    ax.legend()
    plt.show()
# Typo fix in the title: "Logestic" -> "Logistic".
plotValidationCurve(accuracy_train, accuracy_test, C_list, "Validation Curve for Logistic Regression")
# Sweep the number of neighbours K for KNN on the raw features.
K_list = [1,3,5,7,9,15,19,25,29,35,39,45,49,99]
accuracy_train = []
accuracy_test = []
# (Dead `predTrain`/`predTest` list initialisations from the original removed;
# the loop rebinds them before use.)
for K in K_list:
    clf = KNeighborsClassifier(n_neighbors=K)
    clf.fit(X_train, y_train)
    # Accuracy on both splits, in percent, one entry per K.
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)) * 100)
accuracy_train
accuracy_test
# Validation curve for the raw-feature KNN sweep.
f, ax = plt.subplots(dpi=150)
ax.plot(accuracy_train, marker='o', color='blue', label='train')
ax.plot(accuracy_test, marker='o', color='orange', label='val')
ax.set_title('Validation Curve')
ax.set_xticks(np.arange(len(K_list)))
ax.set_xticklabels(K_list, rotation=45)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()
# Standardise features (zero mean, unit variance); the scaler is fit on the
# training split only, so no test-set statistics leak into training.
ss = preprocessing.StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)
# Repeat the C sweep on the standardised features.
# (Dead `predTrain`/`predTest` list initialisations removed.)
accuracy_train = []
accuracy_test = []
for C in C_list:
    clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=C)
    clf.fit(X_train_ss, y_train)
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train_ss)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)) * 100)
# Typo fix in the title: "Logestic" -> "Logistic".
plotValidationCurve(accuracy_train, accuracy_test, C_list, "Validation Curve with Normalization for Logistic Regression")
# Quick shape sanity checks on the scaled training matrix.
X_train_ss.shape
X_train_ss[0:426,:].shape[0]
X_train_ss.shape
X_train_ss.shape[0]
y_train
accuracy_train = []
accuracy_val = []
clf = LogisticRegression(max_iter = 1000, solver='lbfgs', penalty='l2', C=0.5, verbose=0)
# NOTE(review): the model is fit once on the FULL scaled training set and the
# loop only varies how many training points are *scored*, so this produces a
# scoring curve over growing prefixes — not a classic learning curve, which
# would refit on each prefix.  The fit (and the hold-out evaluation, which is
# identical every iteration) is hoisted out of the loop; the original refit
# the same deterministic model on every one of the ~425 iterations.
clf.fit(X_train_ss, y_train)
pred_val = clf.predict(X_test_ss)
val_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred_val)
num_points = list(range(1, X_train_ss.shape[0], 1))
for n in num_points:
    # Accuracy on the first n training points only.
    pred_trn = clf.predict(X_train_ss[0:n, :])
    accuracy_train.append(metrics.accuracy_score(y_true=y_train[0:n], y_pred=pred_trn))
    accuracy_val.append(val_acc)
f, ax = plt.subplots(dpi=150)
ax.plot(num_points, accuracy_train, marker='o', color='blue', label='Train')
ax.plot(num_points, accuracy_val, marker='o', color='orange', label='Val')
ax.set_xlabel('Number of points in training')
ax.set_ylabel('Accuracy')
ax.set_title("Learning Curve")
ax.legend()
ax.grid()
# Code here
# Re-fit the scaler (same result as the earlier scaling of the same split)
# and sweep K for KNN on standardised features.
ss = preprocessing.StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)
K_list = [1,3,5,7,9,15,19,25,29,35,39,45,49,99]
# (Dead `predTrain`/`predTest` list initialisations removed.)
accuracy_train = []
accuracy_test = []
for K in K_list:
    clf = KNeighborsClassifier(n_neighbors=K)
    clf.fit(X_train_ss, y_train)
    # Accuracy on both splits, in percent, one entry per K.
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train_ss)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)) * 100)
accuracy_train
accuracy_test
# Code here
# Validation curve for KNN on standardised features.
f, ax = plt.subplots(dpi=150)
ax.plot(accuracy_train, marker='o', color='blue', label='train')
ax.plot(accuracy_test, marker='o', color='orange', label='val')
ax.set_title('Validation Curve after normalizing in KNN')
ax.set_xticks(np.arange(len(K_list)))
ax.set_xticklabels(K_list, rotation=45)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy in %')
ax.legend()
plt.show()
fx
# Refit Gaussian NB after dropping three features — presumably chosen for
# their strong correlation with remaining features (see the heatmaps above);
# TODO confirm the selection rationale.
X = fx.drop(['target','mean radius','mean perimeter','mean concavity'], axis=1)
y = fx.target
X
# split the datasets into training and test data
# Same random_state/stratify as before, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 55, test_size= 0.25)
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,y_pred)
print("The accuracy is "+str(acc*100)+" %")
# Report train and test accuracy of the reduced-feature model.
pred = model.predict(X_train)
print('The training accuracy is '+str((metrics.accuracy_score(y_true = y_train, y_pred = pred)*100)))
pred = model.predict(X_test)
print('The testing accuracy is '+str((metrics.accuracy_score(y_true = y_test, y_pred = pred)*100)))
The accuracy is 95.1048951048951 %
The training accuracy is 94.36619718309859
The testing accuracy is 95.1048951048951