# Import the libraries used below: pandas/numpy for data handling, matplotlib/seaborn for plotting, scikit-learn for modelling
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# Read the CSV file into a DataFrame that the rest of the notebook reuses
cancer_df = pd.read_csv(filepath_or_buffer="data.csv")
# Preview the first 15 rows of the table
cancer_df.head(n=15)
idint64
842302 - 84667401
diagnosisobject
M100%
0
842302
M
1
842517
M
2
84300903
M
3
84348301
M
4
84358402
M
5
843786
M
6
844359
M
7
84458202
M
8
844981
M
9
84501001
M
# Class balance: count the samples per diagnosis label and show them as a bar chart.
# The counts are computed once and reused for both the plot and the table.
diagnosis_counts = cancer_df['diagnosis'].value_counts()
diagnosis_counts.plot(kind="bar")
diagnosis_counts
# Encode the labels as integers (M -> 1, B -> 0) rather than the strings '1'/'0',
# so that integer comparisons such as `df['diagnosis'] == 1` actually match.
# The mapping expects the raw 'M'/'B' labels; any other value becomes NaN.
cancer_df['diagnosis'] = cancer_df['diagnosis'].map({'M': 1, 'B': 0})
cancer_df
idint64
8670 - 911320502
diagnosisobject
062.7%
137.3%
0
842302
1
1
842517
1
2
84300903
1
3
84348301
1
4
84358402
1
5
843786
1
6
844359
1
7
84458202
1
8
844981
1
9
84501001
1
# Drop the identifier column from the per-sample table; it carries no predictive signal.
# errors='ignore' keeps the cell re-runnable once 'id' is already gone.
cancer_df.drop(columns='id', inplace=True, errors='ignore')
# Average of every measurement within each diagnosis class.
# The result is stored under its own name: assigning it back to cancer_df would
# replace the per-sample rows with just one row per class and move 'diagnosis'
# into the index, so later cells that read cancer_df['diagnosis'] would raise KeyError.
diagnosis_means = cancer_df.groupby('diagnosis').mean()
diagnosis_means
radius_meanfloat64
texture_meanfloat64
0
12.14652380952381
17.914761904761892
1
17.46283018867925
21.60490566037735
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
cancer_df.describe()
radius_meanfloat64
texture_meanfloat64
count
2.0
2.0
mean
14.80467699910153
19.759833782569622
std
3.7591962915661115
2.609325673148885
min
12.14652380952381
17.914761904761892
25%
13.475600404312669
18.83729784366576
50%
14.80467699910153
19.759833782569622
75%
16.13375359389039
20.682369721473485
max
17.46283018867925
21.60490566037735
## You DO NOT HAVE TO understand this code
# Draws a 5x2 grid of stacked, normalised histograms: one panel per feature,
# with malignant samples in red and benign samples in green.
# NOTE: this cell needs the per-sample table with 'diagnosis' as an ordinary column;
# it raises KeyError: 'diagnosis' if cancer_df holds grouped means instead.
df = cancer_df.copy()
# Candidate features are taken by position (assumes the ten columns to plot sit at
# positions 1..10 -- TODO confirm); the label column is excluded if it falls in that slice.
features_mean = [col for col in df.columns[1:11] if col != 'diagnosis']
# split dataframe into two based on diagnosis
# astype(int) lets the masks work whether labels are stored as 1/0 or as '1'/'0'
labels = df['diagnosis'].astype(int)
dfM = df[labels == 1]
dfB = df[labels == 0]
#Stack the data
plt.rcParams.update({'font.size': 8})
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(8, 10))
axes = axes.ravel()
# zip stops at the shorter sequence, so a short feature list leaves panels empty
# instead of raising IndexError
for ax, feature in zip(axes, features_mean):
    # Compute the feature range once and share the same 50-step bin edges between both classes
    lowest = df[feature].min()
    highest = df[feature].max()
    binwidth = (highest - lowest) / 50
    bins = np.arange(lowest, highest + binwidth, binwidth)
    ax.hist(
        [dfM[feature], dfB[feature]],
        bins=bins,
        alpha=0.5,
        stacked=True,
        density=True,
        label=['M', 'B'],
        color=['r', 'g'],
    )
    ax.legend(loc='upper right')
    ax.set_title(feature)
plt.tight_layout()
plt.show()
Execution Error
KeyError: 'diagnosis'
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# (An 80% training / 20% testing split is a common default, but the ratio may need
# adjusting depending on the size of the dataset.)
split_parts = train_test_split(cancer_df, test_size=0.3, random_state=2)
training_data, test_data = split_parts
training_data.head()
radius_meanfloat64
texture_meanfloat64
0
12.14652380952381
17.914761904761892
# Preview the first rows of the held-out test portion
test_data.head()
radius_meanfloat64
texture_meanfloat64
1
17.46283018867925
21.60490566037735
# Build the model inputs (X) and the target (y) for both splits.
# NOTE(review): assumes these five measurements are the intended predictors and
# 'diagnosis' is the label to predict -- TODO confirm the feature choice.
feature_columns = ['radius_mean', 'perimeter_mean', 'area_mean', 'concavity_mean', 'compactness_mean']
# Several columns must be selected with a list; to_numpy() yields a plain array
# (`.values` is an attribute, so calling it as `.values()` raises TypeError).
X_train = training_data[feature_columns].to_numpy()
# The targets stay pandas Series so they keep the name 'diagnosis',
# which the predicted-vs-actual table selects by that column name.
y_train = training_data['diagnosis']
X_test = test_data[feature_columns].to_numpy()
y_test = test_data['diagnosis']
Execution Error
SyntaxError: positional argument follows keyword argument (1938842362.py, line 1)
# Fit a logistic-regression classifier on the training split
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
# Predict a label for every row of the held-out test split
y_predicted_logistic = logistic_model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order
accuracy = metrics.accuracy_score(y_test, y_predicted_logistic)
# Prints the fraction of correct predictions as a percentage with three decimals
print(f"Accuracy : {accuracy:.3%}")
Execution Error
NameError: name 'logistic_model' is not defined
# Side-by-side table of predicted and actual labels.
# reset_index() renumbers both series from 0 so that their rows line up by position.
predicted_part = pd.Series(y_predicted_logistic).reset_index()
actual_part = pd.Series(y_test).reset_index()
side_by_side = pd.concat([predicted_part, actual_part], axis=1)
# Keep only the two value columns (the unnamed prediction series appears as column 0)
# and give them readable names.
side_by_side[[0, 'diagnosis']].rename(columns={0: 'predicted', 'diagnosis': 'actual'})
Execution Error
NameError: name 'pd' is not defined
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Rows of the matrix are the true labels, columns are the predicted labels
cm = confusion_matrix(y_test, y_predicted_logistic)

# Large annotation font for the cell counts
sns.set(font_scale=3)
fig, ax = plt.subplots(figsize=(10, 10))
# annot=True writes the count into each cell; fmt='g' disables scientific notation
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Title, axis labels and tick labels
ax.set_title('Confusion Matrix', fontsize=22)
ax.set_xlabel('Predicted labels', fontsize=18)
ax.set_ylabel('True labels', fontsize=18)
class_names = ['benign', 'malignant']
ax.xaxis.set_ticklabels(class_names, fontsize=14)
ax.yaxis.set_ticklabels(class_names, fontsize=14)
cm
Execution Error
NameError: name 'y_test' is not defined