SI 370 Report - Predicting Breast Cancer

Statement of Purpose

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pycaret import classification

# Read the raw dataset from the CSV file in the working directory.
cancer_df = pd.read_csv(filepath_or_buffer="data.csv")

# Preview the first five rows.
cancer_df.head(n=5)

# List the distinct labels present in the `diagnosis` column.
cancer_df["diagnosis"].unique()

# Encode the target as an integer flag: 1 where diagnosis is 'M', 0 otherwise.
# A vectorized comparison replaces the row-by-row lambda.
cancer_df["malignant"] = (cancer_df["diagnosis"] == "M").astype(int)

# Drop the columns that are not model features: the record id, the original
# text label (now encoded in `malignant`), and 'Unnamed: 32'
# (presumably an empty column created by a trailing comma in the CSV -- confirm).
cancer_df = cancer_df.drop(columns=["id", "Unnamed: 32", "diagnosis"])

cancer_df.head()

# Percentage of rows flagged malignant: sum of the 0/1 flag over the number
# of non-missing entries in the column.
malignant_proportion = (
    cancer_df["malignant"].sum() / cancer_df["malignant"].count() * 100
)
print(
    f"{round(malignant_proportion, 2)}% of the cancers in the dataset are malignant"
)

# Summary statistics for every remaining column.
cancer_df.describe()

# The following template is based on:
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
sns.set_theme(style="white")

# Compute the correlation matrix
corr = cancer_df.corr()

# Generate a mask for the upper triangle, so each pair is drawn only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

# Rank every column by its correlation with the target, strongest positive first.
corr["malignant"].sort_values(ascending=False)

# Initialise the PyCaret classification experiment with `malignant` as the
# target column. session_id is the random seed; this value was generated by
# the first run.
classification_setup = classification.setup(
    data=cancer_df,
    target="malignant",
    session_id=8530,
)

# Run PyCaret's model comparison on the configured experiment.
classification.compare_models()

Statement of Purpose

Statement of Purpose