# Import the libraries we need for data handling, plotting, and modeling
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# Load the dataset from data.csv into a DataFrame we can refer to later
cancer_df = pd.read_csv("data.csv")
# Look at the first 15 rows of the data
cancer_df.head(15)
          id diagnosis  ...
0     842302         M  ...
1     842517         M  ...
2   84300903         M  ...
3   84348301         M  ...
4   84358402         M  ...
5     843786         M  ...
6     844359         M  ...
7   84458202         M  ...
8     844981         M  ...
9   84501001         M  ...
(first 10 of the 15 requested rows shown; the remaining feature columns were truncated by the notebook viewer)
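Before cleaning anything, it is worth confirming the file loaded cleanly. A minimal sketch using standard pandas calls (nothing here is specific to this dataset):

# Quick sanity checks on the loaded data
print(cancer_df.shape)                            # (number of rows, number of columns)
print(cancer_df.isna().sum().sum(), "missing values in total")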
# Plot the class counts to see how many benign vs. malignant samples we have
cancer_df['diagnosis'].value_counts().plot(kind="bar")
cancer_df['diagnosis'].value_counts()
# Encode the label numerically: benign (B) -> 0, malignant (M) -> 1
cancer_df['diagnosis'] = cancer_df['diagnosis'].map({'B': 0, 'M': 1})
# Keep only the "*_mean" feature columns plus the diagnosis label
cancer_df = cancer_df.filter(regex='.*_mean|diagnosis', axis=1)
# The id column carries no predictive information, so drop it if present
cancer_df.drop(columns='id', inplace=True, errors='ignore')
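If you prefer the class balance as proportions rather than raw counts, value_counts takes a normalize flag:

# Fraction of samples in each class (sums to 1.0)
cancer_df['diagnosis'].value_counts(normalize=True)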
cancer_df
   diagnosis  radius_mean  ...
0          1        17.99  ...
1          1        20.57  ...
2          1        19.69  ...
3          1        11.42  ...
4          1        20.29  ...
5          1        12.45  ...
6          1        18.25  ...
7          1        13.71  ...
8          1        13.00  ...
9          1        12.46  ...
(first 10 rows shown; the other *_mean columns were truncated by the notebook viewer)
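Because diagnosis is now numeric, a quick way to see which mean features track the label is to correlate every column against it. A sketch:

# Pearson correlation of each feature with the encoded diagnosis
cancer_df.corr()['diagnosis'].sort_values(ascending=False)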
cancer_df.describe()
        diagnosis  radius_mean
count  569.000000   569.000000
mean     0.372583    14.127292
std      0.483918     3.524049
min      0.000000     6.981000
25%      0.000000    11.700000
50%      0.000000    13.370000
75%      1.000000    15.780000
max      1.000000    28.110000
(the other *_mean columns were truncated by the notebook viewer)
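describe() pools both classes together; splitting the summary by class often says more. A sketch using groupby:

# Summary statistics for radius_mean, computed separately for each class
cancer_df.groupby('diagnosis')['radius_mean'].describe()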
## You DO NOT HAVE TO understand this code (it just plots each feature's distribution for malignant vs. benign cases)
df = cancer_df.copy()
df.head()
# The ten "*_mean" feature columns (everything except the diagnosis label)
features_mean = list(df.columns[1:11])
# Split the dataframe into two based on diagnosis
dfM = df[df['diagnosis'] == 1]
dfB = df[df['diagnosis'] == 0]
# Stack a histogram of each feature, malignant vs. benign
plt.rcParams.update({'font.size': 8})
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(8, 10))
axes = axes.ravel()
for idx, ax in enumerate(axes):
    binwidth = (max(df[features_mean[idx]]) - min(df[features_mean[idx]])) / 50
    bins = np.arange(min(df[features_mean[idx]]), max(df[features_mean[idx]]) + binwidth, binwidth)
    ax.hist([dfM[features_mean[idx]], dfB[features_mean[idx]]], bins=bins,
            alpha=0.5, stacked=True, density=True, label=['M', 'B'], color=['r', 'g'])
    ax.legend(loc='upper right')
    ax.set_title(features_mean[idx])
plt.tight_layout()
plt.show()
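seaborn (imported above but otherwise unused until the heatmap below) can draw the same kind of comparison for a single feature in one call; a sketch, assuming a reasonably recent seaborn version:

# Per-class density histogram of one feature; hue splits by diagnosis
sns.histplot(data=df, x='radius_mean', hue='diagnosis', stat='density', common_norm=False)
plt.show()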
# Hold out 30% of the rows for testing; an 80/20 split is also common, adjust based on dataset size
training_data, test_data = train_test_split(cancer_df, test_size=0.3, random_state=2)
training_data.head()
     diagnosis  radius_mean  ...
166          0        10.80  ...
384          0        13.28  ...
558          0        14.59  ...
451          1        19.59  ...
333          0        11.25  ...
(the other *_mean columns were truncated by the notebook viewer)
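One thing the split above does not guarantee is that both halves keep the same benign/malignant ratio. train_test_split supports stratification for exactly this; a sketch (the variable names are just for illustration):

# Stratified split: class proportions are preserved in both halves
train_strat, test_strat = train_test_split(
    cancer_df, test_size=0.3, random_state=2, stratify=cancer_df['diagnosis'])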
test_data.columns
# Use five of the mean features as model inputs; diagnosis is the label
# (selecting these columns already excludes diagnosis, so no separate drop is needed)
feature_cols = ['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concave points_mean']
X_train = training_data[feature_cols]
y_train = training_data['diagnosis']
X_test = test_data[feature_cols]
y_test = test_data['diagnosis']
# Fit a logistic regression model on the training data and score it on the held-out test set
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_predicted_logistic = logistic_model.predict(X_test)
# accuracy_score expects (y_true, y_pred)
accuracy = metrics.accuracy_score(y_test, y_predicted_logistic)
print("Accuracy : %s" % "{0:.3%}".format(accuracy))
Accuracy : 90.643%
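Accuracy alone can hide how the model treats each class, which matters when one class is rarer than the other. scikit-learn's classification report breaks out precision and recall per class; a sketch:

# Per-class precision, recall, and F1 on the test set
print(metrics.classification_report(y_test, y_predicted_logistic,
                                    target_names=['benign', 'malignant']))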
# Line up the model's predictions against the true labels for the test set
pd.DataFrame({'predicted': y_predicted_logistic, 'actual': y_test.values})
   predicted  actual
0          0       0
1          0       0
2          0       0
3          1       1
4          0       0
5          1       1
6          0       0
7          0       0
8          0       0
9          0       0
(first 10 of the test predictions shown)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predicted_logistic)
sns.set(font_scale=3)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='g', ax=ax);  # annot=True to annotate cells, fmt='g' to disable scientific notation
# labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize = 18);
ax.set_ylabel('True labels', fontsize = 18);
ax.set_title('Confusion Matrix', fontsize = 22);
ax.xaxis.set_ticklabels(['benign', 'malignant'], fontsize = 14);
ax.yaxis.set_ticklabels(['benign', 'malignant'], fontsize = 14);
cm
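RandomForestClassifier was imported at the top but never used. A minimal sketch of fitting it on the same features for comparison; the variable names are illustrative and the random_state is an assumption for reproducibility:

# Fit a random forest on the same train/test split and compare accuracy
forest_model = RandomForestClassifier(random_state=2)  # random_state chosen arbitrarily
forest_model.fit(X_train, y_train)
y_predicted_forest = forest_model.predict(X_test)
print("Random forest accuracy : %s" % "{0:.3%}".format(
    metrics.accuracy_score(y_test, y_predicted_forest)))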