import sklearn
sklearn.__version__
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', 100)
# Your package imports here
from sklearn.metrics import confusion_matrix
import seaborn as sn
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
# Load the Wisconsin Diagnostic Breast Cancer data straight from the UCI
# repository; the file has no header row (column 1 holds the M/B diagnosis).
UCI_WDBC_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases'
                '/breast-cancer-wisconsin/wdbc.data')
df = pd.read_csv(UCI_WDBC_URL, header=None)
df.head()

# Local copy of the same data with named columns and a numeric 'target' label.
fx = pd.read_csv('C:/Users/Umaima/DSC540/WisconsinBreastCancer.csv')

# Split into feature matrix X and label vector y.
X = fx.drop('target', axis=1)
y = fx['target']
X.head()
y.head()
# Quick exploration: malignant subset, schemas, and missing-value counts.
malignant_mask = df[1] == "M"
pple_cancer = df[malignant_mask]  # rows whose diagnosis column is "M"
pple_cancer.info()
print(fx.info())
fx.isnull().sum()        # missing values per column
X.info()
fx.describe().round(2)   # summary statistics, 2 decimal places
def _plot_corr_heatmap(frame, title):
    """Draw an annotated correlation heatmap for the given DataFrame."""
    fig, axis = plt.subplots(figsize=(12, 7))
    corr = frame.corr()
    sns.heatmap(round(corr, 2), annot=True, ax=axis, cmap="coolwarm",
                fmt='.2f', linewidths=.05)
    fig.suptitle(title, fontsize=14)

# The 30 features come in three groups of 10 columns:
# means (0-9), standard errors (10-19), and "worst" values (20-29).
mean_df = fx[fx.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]]
_plot_corr_heatmap(mean_df, 'Mean Correlation Heatmap')

error_df = fx[fx.columns[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]]
_plot_corr_heatmap(error_df, 'Errors Correlation Heatmap')

# BUG FIX: the original column list started at 10 (an error column) instead
# of 20, so column 20 was skipped and column 10 duplicated from the error
# group. The "worst" group is columns 20-29.
worse_df = fx[fx.columns[[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]]
_plot_corr_heatmap(worse_df, 'Worse Correlation Heatmap')

# Heatmap over all features at once.
_plot_corr_heatmap(X, 'Overall Correlation Heatmap')
# Hold out 25% of the rows for testing, stratifying on the label so the
# malignant/benign ratio is preserved in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=55, test_size=0.25)

# Sanity-check the resulting splits.
pd.DataFrame(X_test).info()
pd.DataFrame(X_train).info()
pd.DataFrame(y_train).info()
# Baseline logistic regression (L2 penalty, fairly strong regularization).
clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=0.1)
clf.fit(X_train, y_train)

train_acc = metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100
print('The training accuracy is ' + str(train_acc))

pred = clf.predict(X_test)
test_acc = metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100
print('The testing accuracy is ' + str(test_acc))

pd.DataFrame(metrics.confusion_matrix(y_test, pred))  # confusion matrix
print(metrics.classification_report(y_test, pred))
confusion_matrix(y_test, pred)
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour baseline; it memorizes the training set, so the
# training accuracy below should be 100%.
neighbors = KNeighborsClassifier(n_neighbors=1)
neighbors.fit(X_train, y_train)
X_train

train_pred = neighbors.predict(X_train)
print('The training accuracy is ' + str(metrics.accuracy_score(y_true=y_train, y_pred=train_pred) * 100))

pred = neighbors.predict(X_test)
print('The testing accuracy is ' + str(metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100))
pd.DataFrame(metrics.confusion_matrix(y_test, pred))  # confusion matrix
print(metrics.classification_report(y_test, pred))
# Gaussian Naive Bayes classifier on the same split.
model = GaussianNB()
model.fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred = model.predict(X_test)
y_pred

from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("The accuracy is " + str(acc * 100) + "%")

# Train vs. test accuracy for comparison with the other models.
pred = model.predict(X_train)
print('The training accuracy is ' + str(metrics.accuracy_score(y_true=y_train, y_pred=pred) * 100))
pred = model.predict(X_test)
print('The testing accuracy is ' + str(metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100))
pd.DataFrame(metrics.confusion_matrix(y_test, pred))  # confusion matrix
print(metrics.classification_report(y_test, pred))
from sklearn.linear_model import LogisticRegression
def PerformLogisticRegression(C_list, X_train, y_train, X_val=None, y_val=None):
    """Fit one logistic-regression model per C value and score each split.

    Parameters
    ----------
    C_list : iterable of float
        Inverse regularization strengths to sweep over.
    X_train, y_train : array-like
        Training features and labels.
    X_val, y_val : array-like, optional
        Evaluation split. Defaults to the module-level ``X_test``/``y_test``.
        (The original read those implicitly from globals; the explicit
        parameters keep that behavior while making the dependency visible.)

    Returns
    -------
    (list, list)
        Training and validation accuracies in percent, one entry per C.
    """
    # Fall back to the notebook's globals for backward compatibility.
    if X_val is None:
        X_val = X_test
    if y_val is None:
        y_val = y_test
    accuracy_train = []
    accuracy_test = []
    for C in C_list:
        clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=C)
        clf.fit(X_train, y_train)
        accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100)
        accuracy_test.append(metrics.accuracy_score(y_true=y_val, y_pred=clf.predict(X_val)) * 100)
    return accuracy_train, accuracy_test
# Sweep C over several orders of magnitude.
C_list = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2,
          10, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
accuracy_train, accuracy_test = PerformLogisticRegression(C_list, X_train, y_train)
accuracy_train
accuracy_test
import matplotlib.pyplot as plt
def plotValidationCurve(accuracy_train, accuracy_test, listt, title):
    '''Plot train vs. validation accuracy against the hyperparameter
    values in ``listt`` (used only as x-tick labels).'''
    fig, axis = plt.subplots(dpi=150)
    axis.plot(accuracy_train, marker='o', color='blue', label='train')
    axis.plot(accuracy_test, color='orange', marker='o', label='val')
    axis.set_title(title)
    axis.set_xticks(np.arange(len(listt)))
    axis.set_xticklabels(listt, rotation=45)
    axis.set_xlabel('C')
    axis.set_ylabel('Accuracy')
    axis.legend()
    plt.show()
plotValidationCurve(accuracy_train, accuracy_test, C_list, "Validation Curve for Logestic Regression")

# Sweep the number of neighbours K for KNN (odd values avoid voting ties).
K_list = [1, 3, 5, 7, 9, 15, 19, 25, 29, 35, 39, 45, 49, 99]
accuracy_train = []
accuracy_test = []
for K in K_list:
    clf = KNeighborsClassifier(n_neighbors=K)
    clf.fit(X_train, y_train)
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)) * 100)
accuracy_train
accuracy_test
# Validation curve for KNN on the raw (unscaled) features.
f, ax = plt.subplots(dpi=150)
ax.plot(accuracy_train, marker='o', color='blue', label='train')
ax.plot(accuracy_test, color='orange', marker='o', label='val')
ax.set_title('Validation Curve')
ax.set_xticks(np.arange(len(K_list)))
ax.set_xticklabels(K_list, rotation=45)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()
# Standardize features (zero mean, unit variance). The scaler is fit on the
# training split only, so no test-set statistics leak into training.
ss = preprocessing.StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

# Repeat the C sweep on the scaled data.
accuracy_train = []
accuracy_test = []
for C in C_list:
    clf = LogisticRegression(max_iter=10000, solver='lbfgs', penalty='l2', C=C)
    clf.fit(X_train_ss, y_train)
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train_ss)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)) * 100)
plotValidationCurve(accuracy_train, accuracy_test, C_list, "Validation Curve with Normalization for Logestic Regression")
# REPL-style inspections of the scaled training data. These bare
# expressions have no effect when run as a script; in a notebook each
# line displays its value (array shape / row count / label Series).
X_train_ss.shape
X_train_ss[0:426,:].shape[0]
X_train_ss.shape
X_train_ss.shape[0]
y_train
# Learning curve: train on the first n rows only, and track accuracy on
# that training subset vs. the full test split.
# BUG FIX: the original called clf.fit(X_train_ss, y_train) inside the
# loop, fitting on the COMPLETE training set at every iteration — so the
# "learning curve" never actually varied the amount of training data.
# Fit on the first n rows instead.
accuracy_train = []
accuracy_val = []
num_points = []
for n in range(1, X_train_ss.shape[0]):
    y_sub = y_train[0:n]
    # LogisticRegression needs at least two classes among the labels;
    # skip prefixes that contain only one class.
    if y_sub.nunique() < 2:
        continue
    clf = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2', C=0.5, verbose=0)
    clf.fit(X_train_ss[0:n, :], y_sub)
    accuracy_train.append(metrics.accuracy_score(y_true=y_sub, y_pred=clf.predict(X_train_ss[0:n, :])))
    accuracy_val.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)))
    num_points.append(n)

f, ax = plt.subplots(dpi=150)
ax.plot(num_points, accuracy_train, marker='o', color='blue', label='Train')
ax.plot(num_points, accuracy_val, marker='o', color='orange', label='Val')
ax.set_xlabel('Number of points in training')
ax.set_ylabel('Accuracy')
ax.set_title("Learning Curve")
ax.legend()
ax.grid()
# KNN sweep on standardized features. Distance-based models are sensitive
# to feature scale, so scaling matters most for this model.
ss = preprocessing.StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

K_list = [1, 3, 5, 7, 9, 15, 19, 25, 29, 35, 39, 45, 49, 99]
accuracy_train = []
accuracy_test = []
for K in K_list:
    clf = KNeighborsClassifier(n_neighbors=K)
    clf.fit(X_train_ss, y_train)
    accuracy_train.append(metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train_ss)) * 100)
    accuracy_test.append(metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test_ss)) * 100)
accuracy_train
accuracy_test
# Validation curve for KNN on the standardized features.
f, ax = plt.subplots(dpi=150)
ax.plot(accuracy_train, marker='o', color='blue', label='train')
ax.plot(accuracy_test, color='orange', marker='o', label='val')
ax.set_title('Validation Curve after normalizing in KNN')
ax.set_xticks(np.arange(len(K_list)))
ax.set_xticklabels(K_list, rotation=45)
ax.set_xlabel('K')
ax.set_ylabel('Accuracy in %')
ax.legend()
plt.show()
fx

# Drop some highly correlated features and refit Gaussian Naive Bayes.
X = fx.drop(['target', 'mean radius', 'mean perimeter', 'mean concavity'], axis=1)
y = fx.target
X

# Re-split with the same seed and stratification as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=55, test_size=0.25)

model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("The accuracy is " + str(acc * 100) + " %")

pred = model.predict(X_train)
print('The training accuracy is ' + str(metrics.accuracy_score(y_true=y_train, y_pred=pred) * 100))
pred = model.predict(X_test)
print('The testing accuracy is ' + str(metrics.accuracy_score(y_true=y_test, y_pred=pred) * 100))