import pandas as pd
import numpy as np
# Load the raw dataset.
df_data = pd.read_csv('/work/data.csv')

# Notebook-style previews: the values of the bare expressions below are only
# displayed in an interactive session (`info()` prints on its own).
df_data.head(5)
df_data.isnull().sum()
df_data.info()

# Remove the columns that are not used further on.
df_data = df_data.drop(columns='Unnamed: 32')
df_data.head(5)
df_data = df_data.drop(columns='id')

# Encode the target as integers: "B" -> 1, "M" -> 0.
# The result is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on the selected column: that chained in-place
# form operates on an intermediate object, emits warnings on recent pandas
# and leaves `df_data` unchanged when Copy-on-Write is enabled.
diagnosis_codes = df_data['diagnosis'].map({"B": 1, "M": 0})
if diagnosis_codes.isna().any():
    # `map` yields NaN for anything outside the mapping; fail loudly here
    # rather than letting NaN targets reach the model.
    raise ValueError("'diagnosis' contains values other than 'B' and 'M'")
df_data['diagnosis'] = diagnosis_codes.astype(int)
df_data['diagnosis']

# Work on a copy so the cleaned frame stays untouched by later steps.
df_data_processing = df_data.copy()
import matplotlib.pyplot as plt
# Bar chart of every column's correlation with 'diagnosis', in ascending order.
fig = plt.figure(figsize=(12, 8))
diagnosis_corr = df_data_processing.corr()['diagnosis']
diagnosis_corr.sort_values(ascending=True).plot(kind='bar')
plt.show()
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Separate features and target before any scaling. The target is a class
# label and is kept as plain integers rather than being passed through the
# scaler.
X = df_data_processing.drop('diagnosis', axis=1)
y = df_data_processing['diagnosis'].astype(int).values

# Split first (70 % train / 30 % test, fixed seed for reproducibility) so the
# scaler below never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the min-max scaler on the training features only, then apply the same
# transformation to the test features. Fitting on the full dataset would leak
# the test set's min/max into training and make the reported score optimistic.
# The results are wrapped back into DataFrames so column names are preserved.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# Full frame with the features scaled by the train-fitted scaler, kept under
# its original name for inspection; 'diagnosis' is left unscaled.
df_data_processing_scaled = df_data_processing.copy()
df_data_processing_scaled[X.columns] = scaler.transform(X)
df_data_processing_scaled.head(5)
from sklearn.linear_model import LogisticRegression
# Train a logistic regression with default settings on the training split.
# `fit` hands back the estimator it was called on, so `result` is simply a
# second name for `model`.
model = LogisticRegression().fit(X_train, y_train)
result = model
from sklearn import metrics
# Predict labels for the held-out split and report the accuracy.
prediction_test = model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, prediction_test)
print(test_accuracy)

# Notebook-style inspections of the fitted model; the values of these bare
# expressions are only displayed in an interactive session.
model.predict_log_proba(X_test)
model.coef_
model.feature_names_in_
# Pair each fitted coefficient with the name of the feature it belongs to.
weights = pd.Series(model.coef_[0], index=X.columns.values)
sorted_weights = weights.sort_values(ascending=False)

# Each chart gets its own figure and its own `plt.show()`. Previously both bar
# charts were drawn onto the same axes (so they overlapped) and the Axes
# object itself was passed to `print`, which only emitted its repr.

# Ten largest coefficients.
plt.figure()
sorted_weights.head(10).plot(kind='bar')
plt.show()

# Ten smallest coefficients.
plt.figure()
sorted_weights.tail(10).plot(kind='bar')
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of the test-set predictions, with rows/columns ordered by
# the classes the model was fitted on.
cm = confusion_matrix(y_test, prediction_test, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)

# Create the 11x11 figure together with its axes and hand the axes to the
# display. Without `ax=`, `disp.plot` opens a new default-sized figure and the
# explicitly sized one is left empty.
fig, ax = plt.subplots(figsize=(11, 11))
disp.plot(ax=ax, cmap='gray')
plt.show()