# NOTE(review): this file is a linear export of a Jupyter notebook; bare
# expressions such as `sklearn.__version__` or `df.head()` only display
# output inside a notebook — as a plain script they are no-ops.
import sklearn
sklearn.__version__
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Show up to 100 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 100)
# Your package imports here
from sklearn.metrics import confusion_matrix
# NOTE(review): seaborn is imported twice (as `sns` above and `sn` here);
# the `sn` alias appears unused in this file — consider removing it.
import seaborn as sn
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
# Load the dataset from UCI
# Raw WDBC file has no header row: column 0 = sample ID, column 1 =
# diagnosis ("M"/"B"), columns 2-31 = 30 real-valued features.
df = pd.read_csv(
'https://archive.ics.uci.edu/ml/'
'machine-learning-databases'
'/breast-cancer-wisconsin/wdbc.data',
header=None)
df.head()
# Local copy with named feature columns and an integer `target` column.
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; prefer a relative path or a configurable location.
fx = pd.read_csv('C:/Users/Umaima/DSC540/WisconsinBreastCancer.csv')
X = fx.drop('target', axis=1)
y = fx.target
X.head()
y.head()
# Code here
# Rows of the UCI frame where column 1 (diagnosis) is "M" (malignant).
pple_cancer = df[df[1] == "M"]
pple_cancer.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 567
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 212 non-null int64
1 1 212 non-null object
2 2 212 non-null float64
3 3 212 non-null float64
4 4 212 non-null float64
5 5 212 non-null float64
6 6 212 non-null float64
7 7 212 non-null float64
8 8 212 non-null float64
9 9 212 non-null float64
10 10 212 non-null float64
11 11 212 non-null float64
12 12 212 non-null float64
13 13 212 non-null float64
14 14 212 non-null float64
15 15 212 non-null float64
16 16 212 non-null float64
17 17 212 non-null float64
18 18 212 non-null float64
19 19 212 non-null float64
20 20 212 non-null float64
21 21 212 non-null float64
22 22 212 non-null float64
23 23 212 non-null float64
24 24 212 non-null float64
25 25 212 non-null float64
26 26 212 non-null float64
27 27 212 non-null float64
28 28 212 non-null float64
29 29 212 non-null float64
30 30 212 non-null float64
31 31 212 non-null float64
dtypes: float64(30), int64(1), object(1)
memory usage: 54.7+ KB
# DataFrame.info() prints its report itself and returns None,
# so this call also prints a trailing "None".
print(fx.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 569 non-null float64
1 mean texture 569 non-null float64
2 mean perimeter 569 non-null float64
3 mean area 569 non-null float64
4 mean smoothness 569 non-null float64
5 mean compactness 569 non-null float64
6 mean concavity 569 non-null float64
7 mean concave points 569 non-null float64
8 mean symmetry 569 non-null float64
9 mean fractal dimension 569 non-null float64
10 radius error 569 non-null float64
11 texture error 569 non-null float64
12 perimeter error 569 non-null float64
13 area error 569 non-null float64
14 smoothness error 569 non-null float64
15 compactness error 569 non-null float64
16 concavity error 569 non-null float64
17 concave points error 569 non-null float64
18 symmetry error 569 non-null float64
19 fractal dimension error 569 non-null float64
20 worst radius 569 non-null float64
21 worst texture 569 non-null float64
22 worst perimeter 569 non-null float64
23 worst area 569 non-null float64
24 worst smoothness 569 non-null float64
25 worst compactness 569 non-null float64
26 worst concavity 569 non-null float64
27 worst concave points 569 non-null float64
28 worst symmetry 569 non-null float64
29 worst fractal dimension 569 non-null float64
30 target 569 non-null int64
dtypes: float64(30), int64(1)
memory usage: 137.9 KB
None
# Per-column count of missing values (expected to be all zeros here).
fx.isnull().sum()
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 569 non-null float64
1 mean texture 569 non-null float64
2 mean perimeter 569 non-null float64
3 mean area 569 non-null float64
4 mean smoothness 569 non-null float64
5 mean compactness 569 non-null float64
6 mean concavity 569 non-null float64
7 mean concave points 569 non-null float64
8 mean symmetry 569 non-null float64
9 mean fractal dimension 569 non-null float64
10 radius error 569 non-null float64
11 texture error 569 non-null float64
12 perimeter error 569 non-null float64
13 area error 569 non-null float64
14 smoothness error 569 non-null float64
15 compactness error 569 non-null float64
16 concavity error 569 non-null float64
17 concave points error 569 non-null float64
18 symmetry error 569 non-null float64
19 fractal dimension error 569 non-null float64
20 worst radius 569 non-null float64
21 worst texture 569 non-null float64
22 worst perimeter 569 non-null float64
23 worst area 569 non-null float64
24 worst smoothness 569 non-null float64
25 worst compactness 569 non-null float64
26 worst concavity 569 non-null float64
27 worst concave points 569 non-null float64
28 worst symmetry 569 non-null float64
29 worst fractal dimension 569 non-null float64
dtypes: float64(30)
memory usage: 133.5 KB
fx.describe().round(2)

def _corr_heatmap(frame, title):
    """Draw an annotated, rounded correlation heatmap for *frame*."""
    f, ax = plt.subplots(figsize=(12, 7))
    corr = frame.corr()
    sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="coolwarm",
                fmt='.2f', linewidths=.05)
    f.suptitle(title, fontsize=14)

# Columns 0-9 hold the per-feature means.
mean_df = fx[fx.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]]
_corr_heatmap(mean_df, 'Mean Correlation Heatmap')

# Columns 10-19 hold the per-feature standard errors.
error_df = fx[fx.columns[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]]
_corr_heatmap(error_df, 'Errors Correlation Heatmap')

# Columns 20-29 hold the per-feature "worst" (largest) values.
# BUG FIX: the original index list started at 10 (an error column)
# instead of 20, so the "Worse" heatmap mixed in a non-worst feature.
worse_df = fx[fx.columns[[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]]
_corr_heatmap(worse_df, 'Worse Correlation Heatmap')

# Heatmap over all 30 features at once.
_corr_heatmap(X, 'Overall Correlation Heatmap')

# split the datasets into training and test data
# Stratify on y so train/test class ratios match the full dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 55, test_size= 0.25)
# Code here
# Inspect the hold-out feature matrix (25% of 569 rows = 143, 30 columns).
pd.DataFrame(X_test).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 276 to 112
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 143 non-null float64
1 mean texture 143 non-null float64
2 mean perimeter 143 non-null float64
3 mean area 143 non-null float64
4 mean smoothness 143 non-null float64
5 mean compactness 143 non-null float64
6 mean concavity 143 non-null float64
7 mean concave points 143 non-null float64
8 mean symmetry 143 non-null float64
9 mean fractal dimension 143 non-null float64
10 radius error 143 non-null float64
11 texture error 143 non-null float64
12 perimeter error 143 non-null float64
13 area error 143 non-null float64
14 smoothness error 143 non-null float64
15 compactness error 143 non-null float64
16 concavity error 143 non-null float64
17 concave points error 143 non-null float64
18 symmetry error 143 non-null float64
19 fractal dimension error 143 non-null float64
20 worst radius 143 non-null float64
21 worst texture 143 non-null float64
22 worst perimeter 143 non-null float64
23 worst area 143 non-null float64
24 worst smoothness 143 non-null float64
25 worst compactness 143 non-null float64
26 worst concavity 143 non-null float64
27 worst concave points 143 non-null float64
28 worst symmetry 143 non-null float64
29 worst fractal dimension 143 non-null float64
dtypes: float64(30)
memory usage: 34.6 KB
# Inspect the training feature matrix (75% of 569 rows = 426, 30 columns).
pd.DataFrame(X_train).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 193 to 239
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mean radius 426 non-null float64
1 mean texture 426 non-null float64
2 mean perimeter 426 non-null float64
3 mean area 426 non-null float64
4 mean smoothness 426 non-null float64
5 mean compactness 426 non-null float64
6 mean concavity 426 non-null float64
7 mean concave points 426 non-null float64
8 mean symmetry 426 non-null float64
9 mean fractal dimension 426 non-null float64
10 radius error 426 non-null float64
11 texture error 426 non-null float64
12 perimeter error 426 non-null float64
13 area error 426 non-null float64
14 smoothness error 426 non-null float64
15 compactness error 426 non-null float64
16 concavity error 426 non-null float64
17 concave points error 426 non-null float64
18 symmetry error 426 non-null float64
19 fractal dimension error 426 non-null float64
20 worst radius 426 non-null float64
21 worst texture 426 non-null float64
22 worst perimeter 426 non-null float64
23 worst area 426 non-null float64
24 worst smoothness 426 non-null float64
25 worst compactness 426 non-null float64
26 worst concavity 426 non-null float64
27 worst concave points 426 non-null float64
28 worst symmetry 426 non-null float64
29 worst fractal dimension 426 non-null float64
dtypes: float64(30)
memory usage: 103.2 KB
# Inspect the training labels (one int64 `target` column, 426 rows).
pd.DataFrame(y_train).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 193 to 239
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 target 426 non-null int64
dtypes: int64(1)
memory usage: 6.7 KB
# L2-regularised logistic regression baseline (C=0.1, lbfgs solver).
clf = LogisticRegression(C=0.1, penalty='l2', solver='lbfgs', max_iter=10000)
clf.fit(X_train, y_train)
pred = clf.predict(X_train)
train_acc = metrics.accuracy_score(y_true=y_train, y_pred=pred) * 100
print('The training accuracy is ' + str(train_acc))
The training accuracy is 94.13145539906104
# Evaluate the fitted logistic regression on the hold-out split.
pred = clf.predict(X_test)
test_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100
print('The testing accuracy is ' + str(test_acc))
The testing accuracy is 98.6013986013986
# Confusion matrix (rows = actual class, columns = predicted class).
pd.DataFrame(metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred))
precision recall f1-score support
0 0.99 0.99 0.99 90
1 0.98 0.98 0.98 53
accuracy 0.99 143
macro avg 0.99 0.99 0.99 143
weighted avg 0.99 0.99 0.99 143
confusion_matrix(y_test, pred)
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier: memorises the training set, so its
# training accuracy is trivially 100%.
neighbors = KNeighborsClassifier(n_neighbors=1)
neighbors.fit(X_train, y_train)
X_train
pred = neighbors.predict(X_train)
knn_train_acc = metrics.accuracy_score(y_true=y_train, y_pred=pred) * 100
print('The training accuracy is ' + str(knn_train_acc))
The training accuracy is 100.0
# Hold-out accuracy of the 1-NN classifier.
pred = neighbors.predict(X_test)
knn_test_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100
print('The testing accuracy is ' + str(knn_test_acc))
The testing accuracy is 94.4055944055944
# Confusion matrix for 1-NN (rows = actual, columns = predicted).
pd.DataFrame(metrics.confusion_matrix(y_test, pred)) #confusion matrix
print(metrics.classification_report(y_test, pred))
precision recall f1-score support
0 0.96 0.96 0.96 90
1 0.92 0.92 0.92 53
accuracy 0.94 143
macro avg 0.94 0.94 0.94 143
weighted avg 0.94 0.94 0.94 143
# Gaussian Naive Bayes baseline on the raw (unscaled) features.
model = GaussianNB()
model.fit(X_train, y_train)
# Predict the hold-out labels.
y_pred = model.predict(X_test)
y_pred
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
accuracy_pct = acc * 100
print("The accuracy is " + str(accuracy_pct) + "%")
The accuracy is 95.8041958041958%
# Training-set accuracy of the Gaussian NB model.
pred = model.predict(X_train)
print('The training accuracy is '+str((metrics.accuracy_score(y_true = y_train, y_pred = pred)*100)))
The training accuracy is 93.66197183098592
# Hold-out accuracy of the Gaussian NB model (same number as `acc` above).
pred = model.predict(X_test)
print('The testing accuracy is '+str((metrics.accuracy_score(y_true = y_test, y_pred = pred)*100)))
The testing accuracy is 95.8041958041958
# Confusion matrix for Gaussian NB (rows = actual, columns = predicted).
pd.DataFrame(metrics.confusion_matrix(y_test, pred)) #confusion matrix
print(metrics.classification_report(y_test, pred))
precision recall f1-score support
0 0.95 0.99 0.97 90
1 0.98 0.91 0.94 53
accuracy 0.96 143
macro avg 0.96 0.95 0.95 143
weighted avg 0.96 0.96 0.96 143
from sklearn.linear_model import LogisticRegression
def PerformLogisticRegression(C_list, X_train, y_train, X_test=None, y_test=None):
    '''Sweep the regularisation strength C for L2 logistic regression.

    Parameters
    ----------
    C_list : list of float
        Inverse-regularisation strengths to try, one model per value.
    X_train, y_train
        Training features and labels.
    X_test, y_test : optional
        Evaluation features and labels.  When omitted they fall back to
        the module-level hold-out split, which the original version read
        implicitly as globals; the parameters make that dependency
        explicit while staying backward-compatible.

    Returns
    -------
    (accuracy_train, accuracy_test)
        Two lists of accuracies in percent, one entry per C value.
    '''
    # Backward-compatible fallback to the module-level test split.
    if X_test is None:
        X_test = globals()['X_test']
    if y_test is None:
        y_test = globals()['y_test']
    accuracy_train = []
    accuracy_test = []
    for C in C_list:
        clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=C)
        clf.fit(X_train, y_train)
        # Accuracy on both splits, expressed in percent.
        accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100)
        accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)) * 100)
    return accuracy_train, accuracy_test
# Sweep C across roughly eight orders of magnitude.
C_list = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 10, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
accuracy_train,accuracy_test = PerformLogisticRegression(C_list,X_train,y_train)
accuracy_train
accuracy_test
import matplotlib.pyplot as plt
def plotValidationCurve(accuracy_train, accuracy_test, listt, title):
    '''Plot train and validation accuracy against the values in listt.'''
    fig, ax = plt.subplots(dpi=150)
    ax.plot(accuracy_train, marker='o', color='blue', label='train')
    ax.plot(accuracy_test, marker='o', color='orange', label='val')
    ax.set_title(title)
    # One tick per swept hyper-parameter value.
    ax.set_xticks(np.arange(len(listt)))
    ax.set_xticklabels(listt, rotation=45)
    ax.set_xlabel('C')
    ax.set_ylabel('Accuracy')
    ax.legend()
    plt.show()
# Typo fix in the title: "Logestic" -> "Logistic".
plotValidationCurve(accuracy_train, accuracy_test, C_list, "Validation Curve for Logistic Regression")
# Sweep the number of neighbours K for KNN on the raw features.
K_list = [1,3,5,7,9,15,19,25,29,35,39,45,49,99]
accuracy_train = []
accuracy_test = []
# (Dead `predTrain`/`predTest` list initialisations from the original removed;
# the loop rebinds them before use.)
for K in K_list:
    clf = KNeighborsClassifier(n_neighbors=K)
    clf.fit(X_train, y_train)
    # Accuracy on both splits, in percent, one entry per K.
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)) * 100)
accuracy_train
accuracy_test
# Validation curve for the raw-feature KNN sweep.
f, ax = plt.subplots(dpi=150)
ax.plot(accuracy_train, marker='o', color='blue', label='train')
ax.plot(accuracy_test, marker='o', color='orange', label='val')
ax.set_title('Validation Curve')
ax.set_xticks(np.arange(len(K_list)))
ax.set_xticklabels(K_list, rotation=45)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()
# Standardise features (zero mean, unit variance); the scaler is fit on the
# training split only, so no test-set statistics leak into training.
ss = preprocessing.StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)
# Repeat the C sweep on the standardised features.
# (Dead `predTrain`/`predTest` list initialisations removed.)
accuracy_train = []
accuracy_test = []
for C in C_list:
    clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=C)
    clf.fit(X_train_ss, y_train)
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train_ss)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)) * 100)
# Typo fix in the title: "Logestic" -> "Logistic".
plotValidationCurve(accuracy_train, accuracy_test, C_list, "Validation Curve with Normalization for Logistic Regression")
# Quick shape sanity checks on the scaled training matrix.
X_train_ss.shape
X_train_ss[0:426,:].shape[0]
X_train_ss.shape
X_train_ss.shape[0]
y_train
accuracy_train = []
accuracy_val = []
clf = LogisticRegression(max_iter = 1000, solver='lbfgs', penalty='l2', C=0.5, verbose=0)
# NOTE(review): the model is fit once on the FULL scaled training set and the
# loop only varies how many training points are *scored*, so this produces a
# scoring curve over growing prefixes — not a classic learning curve, which
# would refit on each prefix.  The fit (and the hold-out evaluation, which is
# identical every iteration) is hoisted out of the loop; the original refit
# the same deterministic model on every one of the ~425 iterations.
clf.fit(X_train_ss, y_train)
pred_val = clf.predict(X_test_ss)
val_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred_val)
num_points = list(range(1, X_train_ss.shape[0], 1))
for n in num_points:
    # Accuracy on the first n training points only.
    pred_trn = clf.predict(X_train_ss[0:n, :])
    accuracy_train.append(metrics.accuracy_score(y_true=y_train[0:n], y_pred=pred_trn))
    accuracy_val.append(val_acc)
f, ax = plt.subplots(dpi=150)
ax.plot(num_points, accuracy_train, marker='o', color='blue', label='Train')
ax.plot(num_points, accuracy_val, marker='o', color='orange', label='Val')
ax.set_xlabel('Number of points in training')
ax.set_ylabel('Accuracy')
ax.set_title("Learning Curve")
ax.legend()
ax.grid()
# Code here
# Re-fit the scaler (same result as the earlier scaling of the same split)
# and sweep K for KNN on standardised features.
ss = preprocessing.StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)
K_list = [1,3,5,7,9,15,19,25,29,35,39,45,49,99]
# (Dead `predTrain`/`predTest` list initialisations removed.)
accuracy_train = []
accuracy_test = []
for K in K_list:
    clf = KNeighborsClassifier(n_neighbors=K)
    clf.fit(X_train_ss, y_train)
    # Accuracy on both splits, in percent, one entry per K.
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train_ss)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)) * 100)
accuracy_train
accuracy_test
# Code here
# Validation curve for KNN on standardised features.
f, ax = plt.subplots(dpi=150)
ax.plot(accuracy_train, marker='o', color='blue', label='train')
ax.plot(accuracy_test, marker='o', color='orange', label='val')
ax.set_title('Validation Curve after normalizing in KNN')
ax.set_xticks(np.arange(len(K_list)))
ax.set_xticklabels(K_list, rotation=45)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy in %')
ax.legend()
plt.show()
fx
# Refit Gaussian NB after dropping three features — presumably chosen for
# their strong correlation with remaining features (see the heatmaps above);
# TODO confirm the selection rationale.
X = fx.drop(['target','mean radius','mean perimeter','mean concavity'], axis=1)
y = fx.target
X
# split the datasets into training and test data
# Same random_state/stratify as before, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 55, test_size= 0.25)
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,y_pred)
print("The accuracy is "+str(acc*100)+" %")
# Report train and test accuracy of the reduced-feature model.
pred = model.predict(X_train)
print('The training accuracy is '+str((metrics.accuracy_score(y_true = y_train, y_pred = pred)*100)))
pred = model.predict(X_test)
print('The testing accuracy is '+str((metrics.accuracy_score(y_true = y_test, y_pred = pred)*100)))
The accuracy is 95.1048951048951 %
The training accuracy is 94.36619718309859
The testing accuracy is 95.1048951048951