# Importing libraries - gets us the packages we need for data handling, visualization, and modeling
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# Loading our data and storing it in a variable so that we can refer to it later
cancer_df = pd.read_csv("data.csv")
# Looking at the first 15 rows of our data
cancer_df.head(15)
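# A quick sanity check before going further (a minimal sketch; assumes data.csv is the
# Wisconsin breast cancer dataset with a 'diagnosis' column)
print(cancer_df.shape)               # (number of rows, number of columns)
print(cancer_df.isna().sum().sum())  # total count of missing values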
# Counting how many benign (B) and malignant (M) diagnoses we have, and plotting the class balance
cancer_df['diagnosis'].value_counts().plot(kind="bar")
cancer_df['diagnosis'].value_counts()
# Encoding the diagnosis as numbers: benign (B) -> 0, malignant (M) -> 1
cancer_df['diagnosis'] = cancer_df['diagnosis'].map({'B': 0, 'M': 1})
# Keeping only the "_mean" feature columns plus the diagnosis label
cancer_df = cancer_df.filter(regex='.*_mean|diagnosis', axis=1)
# Dropping the id column if it is still present (errors='ignore' makes this a no-op otherwise)
cancer_df.drop(columns='id', inplace=True, errors='ignore')
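# A quick check (a minimal sketch) that the encoding worked: any diagnosis value
# other than 'B' or 'M' would have become NaN in the map() above
assert cancer_df['diagnosis'].isna().sum() == 0, "some diagnosis values became NaN"
cancer_df['diagnosis'].value_counts()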
cancer_df
cancer_df.describe()
## You DO NOT HAVE TO understand this code: it just draws one histogram per "_mean" feature, split by diagnosis
df = cancer_df.copy()
df.head()
features_mean = list(df.columns[1:11])
# Split the dataframe in two based on diagnosis
dfM = df[df['diagnosis'] == 1]
dfB = df[df['diagnosis'] == 0]
# Stack the malignant and benign histograms for each mean feature
plt.rcParams.update({'font.size': 8})
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(8, 10))
axes = axes.ravel()
for idx, ax in enumerate(axes):
    feature = features_mean[idx]
    binwidth = (max(df[feature]) - min(df[feature])) / 50
    bins = np.arange(min(df[feature]), max(df[feature]) + binwidth, binwidth)
    ax.hist([dfM[feature], dfB[feature]], bins=bins, alpha=0.5,
            stacked=True, density=True, label=['M', 'B'], color=['r', 'g'])
    ax.legend(loc='upper right')
    ax.set_title(feature)
plt.tight_layout()
plt.show()
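# One way to motivate the feature choice below (a minimal sketch, not part of the original
# analysis): check how strongly each mean feature correlates with the diagnosis label
df.corr()['diagnosis'].drop('diagnosis').sort_values(ascending=False)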
# Holding out 30% of the rows for testing; random_state makes the split reproducible
training_data, test_data = train_test_split(cancer_df, test_size=0.3, random_state=2)
training_data.head()  # a 70/30 or 80/20 split is common, but you may have to adjust based on dataset size
test_data.columns
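# With imbalanced classes, a stratified split keeps the benign/malignant ratio the same in
# both halves. A minimal sketch (not used in the rest of this notebook):
strat_train, strat_test = train_test_split(
    cancer_df, test_size=0.3, random_state=2, stratify=cancer_df['diagnosis'])
print(strat_train['diagnosis'].mean(), strat_test['diagnosis'].mean())  # similar proportions of malignant cases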
# Selecting the five feature columns we will train on, plus the diagnosis label
feature_cols = ['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concave points_mean']
X_train = training_data[feature_cols]
y_train = training_data['diagnosis']
X_test = test_data[feature_cols]
y_test = test_data['diagnosis']
# Fitting a logistic regression model on the training set and predicting on the test set
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_predicted_logistic = logistic_model.predict(X_test)
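# A minimal sketch of inspecting the fitted coefficients (assumes the solver converged;
# pass max_iter=1000 to LogisticRegression if you see a convergence warning)
for name, coef in zip(X_train.columns, logistic_model.coef_[0]):
    print(f"{name}: {coef:.4f}")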
accuracy = metrics.accuracy_score(y_test, y_predicted_logistic)  # accuracy_score expects (y_true, y_pred)
print("Accuracy : {0:.3%}".format(accuracy))
# Side-by-side comparison of predicted vs. actual labels
pd.DataFrame({'predicted': y_predicted_logistic, 'actual': y_test.values})
from sklearn.metrics import confusion_matrix
# Rows of the confusion matrix are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_predicted_logistic)
sns.set(font_scale=3)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='g', ax=ax); # annot=True to annotate cells, fmt='g' to disable scientific notation
# labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize = 18);
ax.set_ylabel('True labels', fontsize = 18);
ax.set_title('Confusion Matrix', fontsize = 22);
ax.xaxis.set_ticklabels(['benign', 'malignant'], fontsize = 14);
ax.yaxis.set_ticklabels(['benign', 'malignant'], fontsize = 14);
cm
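# Reading sensitivity and specificity off the confusion matrix (a minimal sketch; assumes
# the 0 = benign, 1 = malignant encoding above, so sklearn orders the matrix [[tn, fp], [fn, tp]])
tn, fp, fn, tp = cm.ravel()
print("Sensitivity (malignant recall): {0:.3%}".format(tp / (tp + fn)))
print("Specificity (benign recall):    {0:.3%}".format(tn / (tn + fp)))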