# Statement of Purpose
import pandas as pd
import numpy as np
from pycaret import classification
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw dataset from the working directory.
cancer_df = pd.read_csv('data.csv')
cancer_df.head()  # preview; the result is only shown in an interactive session

# Inspect the distinct labels stored in the diagnosis column.
cancer_df.diagnosis.unique()

# Encode the target as an integer flag: 1 where diagnosis is 'M', 0 for any
# other value. A vectorized comparison replaces the row-by-row apply(lambda).
cancer_df['malignant'] = (cancer_df['diagnosis'] == 'M').astype('int64')

# Remove the columns that are not used from here on, including the original
# text label now that 'malignant' carries the target.
cancer_df.drop(columns=['id', 'Unnamed: 32', 'diagnosis'], inplace=True)
cancer_df.head()

# Percentage of rows flagged malignant. The flag is 0/1, so its mean is the
# proportion of ones.
malignant_proportion = cancer_df['malignant'].mean() * 100
print(f"{round(malignant_proportion, 2)}% of the cancers in the dataset are malignant")

# Summary statistics for the remaining columns.
cancer_df.describe()
# Correlation heatmap, adapted from the seaborn gallery example:
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
sns.set_theme(style="white")

# Pairwise correlations between all columns of the prepared frame.
corr = cancer_df.corr()

# Boolean mask that is True on and above the diagonal, so the heatmap only
# draws the lower triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes the heatmap is drawn on.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap, created by seaborn from the two hue values given.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the masked heatmap with square cells on the axes created above.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

# Correlation of every column with the target, strongest positive first.
corr["malignant"].sort_values(ascending=False)
# Configure the PyCaret classification experiment on the prepared frame, with
# 'malignant' as the target column. session_id is the random seed; per the
# original note, the value 8530 was generated on the first run.
classification_setup = classification.setup(
    data=cancer_df,
    target='malignant',
    session_id=8530,
)

# Run PyCaret's model comparison for the experiment configured above.
classification.compare_models()