# Import the required library
import pandas as pd
# Load the data into a DataFrame; the .data file has no header row
dataset = pd.read_csv('breast-cancer-wisconsin.data', header=None)
# Display the data (notebook-style bare expression; no effect as a plain script)
dataset
# Assign the feature names to the dataset columns
dataset.columns=[
'id',
'clump_thickness',
'uniformity_of_cell_size',
'uniformity_of_cell_shape',
'marginal_adhesion',
'single_epithelial_cell_size',
'bare_nuclei',
'bland_chromatin',
'normal_nucleoli',
'mitoses',
'target'
]
# Sanity-check the dataset by displaying the top 5 rows
dataset.head()
# Show dataset info (dtypes, non-null counts)
dataset.info()
# Show descriptive statistics of the dataset
dataset.describe()
import seaborn as sns
import matplotlib.pyplot as plt

# Example distribution plot for feature 1 (the sample id column)
sns.set(font_scale=1.0)
dataset['id'].value_counts().plot(kind='bar', figsize=(7, 6), rot=0)
plt.xlabel("Sample_code_number", labelpad=14)
plt.ylabel("Jumlah", labelpad=14)
plt.title("Distribusi Sample_code_number", y=1.02);

# Show the class distribution of EVERY feature.
# BUG FIX: the original `dataset.value_counts()` counts unique whole rows,
# not per-feature value distributions, so iterate over the columns instead.
for column in dataset.columns:
    print(dataset[column].value_counts())
    print()
import numpy as np

# Drop features that carry no predictive value (the sample id)
columns_to_drop = ['id']
dataset = dataset.drop(columns_to_drop, axis=1)

# The raw file encodes missing values as '?'; replace them with NaN so
# pandas treats them as missing
dataset.replace('?', np.nan, inplace= True)
dataset.info()

# Count null values per column
dataset.isnull().sum()

# List the columns that contain any null value
dataset.loc[:, dataset.isnull().any()].columns

# Convert 'bare_nuclei' to float; it was read as object dtype because of
# the '?' placeholders.  (The original chained .astype(str).astype(float);
# the intermediate str cast is redundant.)
dataset['bare_nuclei'] = dataset['bare_nuclei'].astype(float)

# Show the updated dtype information
dataset.info()
import missingno

# Visualize where the null values occur
missingdata_df = dataset.columns[dataset.isnull().any()].tolist()
missingno.matrix(dataset[missingdata_df])

# If a feature were missing in more than 50% of rows we would drop it:
# dataset.drop(['fitur'], axis="columns", inplace=True)
# Otherwise, impute the missing values with the column median:
median_value = dataset['bare_nuclei'].median()
dataset['bare_nuclei'] = dataset['bare_nuclei'].fillna(median_value)

# Verify the imputation: every per-column null count should now be zero.
# (This replaces the unfilled `<fix_me>` placeholder, which was a syntax error.)
dataset.isnull().sum()
# Flag duplicated rows in the raw dataset
duplicate_rows = dataset.duplicated()
print("All Duplicate Rows:")
dup_mask = dataset.duplicated(keep=False)
dataset[dup_mask]

# Remove duplicates, keeping the result in a new frame "dataClean",
# then confirm no duplicates remain
dataClean = dataset.drop_duplicates()
print("All Duplicate Rows:")
clean_dup_mask = dataClean.duplicated(keep=False)
dataClean[clean_dup_mask]

# Compare row counts before and after deduplication
dataset.info()
dataClean.info()
# Show the class distribution of the target
print(dataClean['target'].value_counts())
sns.set(font_scale=1.0)
dataClean['target'].value_counts().plot(kind='bar', figsize=(7, 6), rot=0)
plt.xlabel("Status Pasien kanker payudara", labelpad=14)
plt.ylabel("Jumlah", labelpad=14)
plt.title("Status Pasien kanker payudara", y=1.02);
# Show the correlation between features as a heatmap
correlation = dataClean.corr()
plt.subplots(figsize = (12,12))
sns.heatmap(correlation.round(2),
annot = True,
vmax = 1,
square = True,
cmap = 'RdYlGn_r')
plt.show()
# Box plots of every feature to look for outliers.
# BUG FIX: the original plotted dataClean['target'].value_counts() (a
# two-element Series), which cannot show feature outliers; plot the
# feature columns of the cleaned frame instead.
dataClean.plot(kind='box', subplots=True, layout=(5, 6), sharex=False,
               figsize=(20, 20),
               title='figure 1: Data distributions of all features')
plt.show()

# Describe and inspect the cleaned data
dataClean.describe()
dataClean.info()
dataClean
from sklearn.model_selection import train_test_split

# Features: everything except the label column; target: the label itself
feature_frame = dataClean.drop("target", axis=1)
X_norm = feature_frame.values
y = dataClean['target']

# Split 70% training / 30% testing with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y, test_size=0.3, random_state=42)
# Import the model classes used below
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# Train Gaussian Naive Bayes on the training split
clean_classifier_nb = GaussianNB()
clean_classifier_nb.fit(X_train, y_train)
# Train a Decision Tree (fixed seed for reproducibility)
clean_classifier_dt = DecisionTreeClassifier(random_state=42)
clean_classifier_dt.fit(X_train, y_train)
# Train a Random Forest with 100 trees
clean_classifier_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clean_classifier_rf.fit(X_train, y_train)
from sklearn.metrics import classification_report
# (duplicate precision_score import removed)
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score, confusion_matrix


def evaluation(Y_test, Y_pred):
    """Print and return weighted evaluation metrics for a prediction.

    Parameters
    ----------
    Y_test : array-like of true labels.
    Y_pred : array-like of predicted labels.

    Returns
    -------
    dict
        Metric name -> value rounded to 3 decimals.
        (The original ended with ``return print(metric_dict)``, which always
        returns None; returning the dict is backward compatible because all
        callers ignore the result.)
    """
    metric_dict = {
        'accuracy': round(accuracy_score(Y_test, Y_pred), 3),
        'recall': round(recall_score(Y_test, Y_pred, average='weighted'), 3),
        'F1 score': round(f1_score(Y_test, Y_pred, average='weighted'), 3),
        'Precision score': round(precision_score(Y_test, Y_pred, average='weighted'), 3),
    }
    print(metric_dict)
    return metric_dict
# Predict on the test split with the Naive Bayes model
y_pred_nb = clean_classifier_nb.predict(X_test)
# Evaluate the Naive Bayes model
print("\nNaive Bayes Model:")
accuracy_nb = round(accuracy_score(y_test, y_pred_nb),3)
print("Accuracy:",accuracy_nb)
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))
evaluation(y_test,y_pred_nb)
# Predict on the test split with the Decision Tree model
y_pred_dt = clean_classifier_dt.predict(X_test)
# Evaluate the Decission Tree model
print("\nDecission Tree Model:")
accuracy_dt = round(accuracy_score(y_test, y_pred_dt),3)
print("Accuracy:",accuracy_dt)
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))
evaluation(y_test, y_pred_dt)
# Predict on the test split with the Random Forest model
y_pred_rf = clean_classifier_rf.predict(X_test)
# Evaluate the Random Forest model
print("\nRandom Forest Model:")
accuracy_rf = round(accuracy_score(y_test, y_pred_rf),3)
print("Accuracy:",accuracy_rf)
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
evaluation(y_test,y_pred_rf)
# Collect the three accuracies (as percentages) for side-by-side comparison
model_comp = pd.DataFrame({
    'Model': ['Naive Bayes', 'Decision Tree', 'Random Forest'],
    'Accuracy': [accuracy_nb * 100, accuracy_dt * 100, accuracy_rf * 100],
})

# Bar chart with the value annotated above each bar
fig, ax = plt.subplots()
bars = plt.bar(model_comp['Model'], model_comp['Accuracy'],
               color=['red', 'green', 'blue'])
plt.xlabel('Model')
plt.ylabel('Accuracy (%)')
plt.title('Clean Data')
plt.xticks(rotation=45, ha='right')  # rotate x labels for readability

# Write each bar's height (rounded) just above its top edge
for rect in bars:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2, height,
             round(height, 2), ha='center', va='bottom')
plt.show()
# Confusion matrix for the Naive Bayes model.
# NOTE(review): the original comment claimed this is the model with the
# highest accuracy, but NB is hard-coded — confirm it is actually the best.
cm = confusion_matrix(y_test, y_pred_nb)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title('Confusion Matrix')
# BUG FIX: sklearn's confusion_matrix puts TRUE labels on rows (heatmap y
# axis) and PREDICTIONS on columns (x axis); the original labels were swapped.
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
# Feature selection: drop the weakest feature AND the target column.
# BUG FIX: the original dropped only 'bare_nuclei', leaving 'target' inside
# X_selected — the model was trained on its own label (data leakage), which
# inflates accuracy.  The template `['fix_me', 'fix_me']` shows two columns
# were intended.
columns_to_drop = ['target', 'bare_nuclei']
X_selected = dataClean.drop(columns_to_drop, axis=1).values
y = dataClean['target']
X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(
    X_selected, y, test_size=0.3, random_state=42)

# Retrain Naive Bayes on the reduced feature set
selected_classifier_nb = GaussianNB()
selected_classifier_nb.fit(X_train_selected, y_train_selected)
y_pred_nb_selected = selected_classifier_nb.predict(X_test_selected)

# Evaluate the optimized model
print("\nNaive Bayes Model:")
accuracy_nb_selected = round(accuracy_score(y_test_selected, y_pred_nb_selected), 3)
print("Accuracy:", accuracy_nb_selected)
print("Classification Report:")
print(classification_report(y_test_selected, y_pred_nb_selected))
evaluation(y_test_selected, y_pred_nb_selected)
# Confusion matrix of the optimized model.
# BUG FIX: compare against y_test_selected (the split used for this model),
# not y_test from the earlier split.
cm = confusion_matrix(y_test_selected, y_pred_nb_selected)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title('Confusion Matrix')
# BUG FIX: rows of the matrix are TRUE labels (heatmap y axis) and columns
# are PREDICTIONS (x axis); the original labels were swapped.
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Accuracy before vs. after feature selection
model_comp = pd.DataFrame({
    'Model': ['Sebelum optimasi', 'setelah Optimasi'],
    'Accuracy': [accuracy_nb * 100, accuracy_nb_selected * 100],
})

# Bar chart with the value annotated above each bar
fig, ax = plt.subplots()
bars = plt.bar(model_comp['Model'], model_comp['Accuracy'], color=['red', 'blue'])
plt.xlabel('Model')
plt.ylabel('Accuracy (%)')
plt.title('Optimal Data')
plt.xticks(rotation=45, ha='right')  # rotate x labels for readability
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval, round(yval, 2),
             ha='center', va='bottom')
plt.show()