import pandas as pd
import numpy as np
# Load the breast-cancer CSV into a DataFrame and peek at the first five rows.
df_data = pd.read_csv('/work/data.csv')
df_data.head()
id         int64
diagnosis  object
0
842302
M
1
842517
M
2
84300903
M
3
84348301
M
4
84358402
M
# Count missing values per column, then dump the schema/dtype summary.
df_data.isna().sum()
df_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 569 non-null int64
1 diagnosis 569 non-null object
2 radius_mean 569 non-null float64
3 texture_mean 569 non-null float64
4 perimeter_mean 569 non-null float64
5 area_mean 569 non-null float64
6 smoothness_mean 569 non-null float64
7 compactness_mean 569 non-null float64
8 concavity_mean 569 non-null float64
9 concave points_mean 569 non-null float64
10 symmetry_mean 569 non-null float64
11 fractal_dimension_mean 569 non-null float64
12 radius_se 569 non-null float64
13 texture_se 569 non-null float64
14 perimeter_se 569 non-null float64
15 area_se 569 non-null float64
16 smoothness_se 569 non-null float64
17 compactness_se 569 non-null float64
18 concavity_se 569 non-null float64
19 concave points_se 569 non-null float64
20 symmetry_se 569 non-null float64
21 fractal_dimension_se 569 non-null float64
22 radius_worst 569 non-null float64
23 texture_worst 569 non-null float64
24 perimeter_worst 569 non-null float64
25 area_worst 569 non-null float64
26 smoothness_worst 569 non-null float64
27 compactness_worst 569 non-null float64
28 concavity_worst 569 non-null float64
29 concave points_worst 569 non-null float64
30 symmetry_worst 569 non-null float64
31 fractal_dimension_worst 569 non-null float64
32 Unnamed: 32 0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
# 'Unnamed: 32' is entirely NaN (0 non-null in the info() dump) — remove it.
df_data = df_data.drop(columns='Unnamed: 32')
df_data.head()
id         int64
diagnosis  object
0
842302
M
1
842517
M
2
84300903
M
3
84348301
M
4
84358402
M
# Drop the non-predictive 'id' column and binarize the target:
# benign 'B' -> 1, malignant 'M' -> 0.
# FIX: Series.replace(..., inplace=True) on a column selection is chained
# in-place modification — deprecated in pandas 2.x (FutureWarning) and a
# no-op under copy-on-write (pandas 3.0). Replace with one mapped
# assignment; a single dict replace is order-independent here since the
# produced values (1, 0) never collide with the keys ('B', 'M').
df_data = df_data.drop(columns='id')
df_data['diagnosis'] = df_data['diagnosis'].replace({'B': 1, 'M': 0})
df_data['diagnosis']
# Work on a copy so later preprocessing does not mutate the cleaned frame.
df_data_processing = df_data.copy()
import matplotlib.pyplot as plt

# Bar chart of every column's correlation with the (now numeric) target,
# ordered from most negative to most positive.
fig = plt.figure(figsize=(12, 8))
target_corr = df_data_processing.corr()['diagnosis'].sort_values(ascending=True)
target_corr.plot(kind='bar')
plt.show()
from sklearn.preprocessing import MinMaxScaler

# Rescale every column to [0, 1]. The 0/1 target column is included but is
# unchanged by min-max scaling since it already spans exactly [0, 1].
# NOTE(review): the scaler is fit on the full dataset before the
# train/test split — mild data leakage; consider fitting on the training
# portion only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_data_processing)
df_data_processing_scaled = pd.DataFrame(
    scaled_values, columns=df_data_processing.columns
)
df_data_processing_scaled.head()
diagnosis    float64
radius_mean  float64
0
0.0
0.5210374366983765
1
0.0
0.6431444933503716
2
0.0
0.6014955748024045
3
0.0
0.21009039708457572
4
0.0
0.6298925647214728
from sklearn.model_selection import train_test_split

# Features are every scaled column except the target; hold out 30% of the
# rows for testing (fixed seed for reproducibility).
X = df_data_processing_scaled.drop(columns='diagnosis')
y = df_data_processing_scaled['diagnosis'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a logistic-regression classifier and report held-out accuracy.
model = LogisticRegression()
result = model.fit(X_train, y_train)
prediction_test = model.predict(X_test)
print(metrics.accuracy_score(y_test, prediction_test))
0.9766081871345029
# Model introspection: per-class log-probabilities on the test set, the
# learned coefficient matrix, and the feature order it corresponds to.
model.predict_log_proba(X_test)
model.coef_
model.feature_names_in_

# Ten largest coefficients as a bar chart.
# FIX: the plot call was wrapped in print(), which only printed the Axes
# repr ("AxesSubplot(...)") instead of rendering the figure — call plot
# directly and show it.
weights = pd.Series(model.coef_[0], index=X.columns.values)
weights.sort_values(ascending=False)[:10].plot(kind='bar')
plt.show()
AxesSubplot(0.125,0.11;0.775x0.77)
# Ten most negative coefficients as a bar chart.
# FIX: render the plot instead of print()-ing the Axes repr.
weights.sort_values(ascending=False)[-10:].plot(kind='bar')
plt.show()
AxesSubplot(0.125,0.11;0.775x0.77)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the held-out predictions, rendered in grayscale.
# FIX: the original created an 11x11-inch figure but ConfusionMatrixDisplay
# .plot() opens its own new figure when no axes are given, leaving the
# sized figure blank. Create the axes explicitly and pass them via ax=
# so the figsize actually applies.
fig, ax = plt.subplots(figsize=(11, 11))
cm = confusion_matrix(y_test, prediction_test, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap='gray', ax=ax)
plt.show()