!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
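# Hedged aside: requirements.txt itself is not shown in this notebook. An unpinned
# sketch matching the imports below (PyPI names assumed) would contain:
#   pandas, numpy, plotly, matplotlib, seaborn, ydata-profiling, yellowbrick,
#   tqdm, scikit-learn, imbalanced-learn, lightgbm, catboost, ipywidgets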
# Miscellaneous
from __future__ import print_function
from IPython.display import display
# Import required libraries
import pandas as pd
import numpy as np
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio  # needed for the pio.templates calls below
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from ydata_profiling import ProfileReport
from yellowbrick.classifier import ClassPredictionError
from tqdm.notebook import trange, tqdm
# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, f1_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, ShuffleSplit, LearningCurveDisplay
from imblearn.pipeline import Pipeline
## Algorithms for Binary Classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
## Visualization of Decision Tree
from sklearn.tree import plot_tree, export_text
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb"
%run "pandas-missing-extension.ipynb"
# Load the raw CSV once, then cache it as Feather for faster subsequent reads
df_data = pd.read_csv('data.csv', index_col=None, delimiter=',', encoding='utf-8')
df_data.to_feather("data.feather")
df_data = pd.read_feather("data.feather")
df_data.describe()
df_data.info(memory_usage="deep")
memory_usage = df_data.memory_usage(deep=True) / 1024 ** 2
print(f'Memory usage per column (MB):\n{memory_usage.head(7)}')
print('Total memory usage (MB):', memory_usage.sum())
def reduce_memory_usage(df, verbose=True):
    # Downcast each numeric column to the smallest dtype that can hold its value range
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Only range is checked here, not precision: float16 keeps ~3
                # significant decimal digits
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} MB ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
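# Hedged aside: the float16 downcast above trades precision for memory. A quick
# illustrative check of what that costs (sample value chosen arbitrarily):
sample_value = np.float64(0.123456789)
print('float64:', sample_value, '-> float16:', np.float16(sample_value))  # prints ~0.1235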
df_data = reduce_memory_usage(df_data, verbose=True)
df_data.info(memory_usage="deep")
df_data.info()
# Drop the empty 'Unnamed: 32' column and the non-predictive 'id' column
df_data.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)
df_data_processing = df_data.copy()
df_data_processing.diagnosis.value_counts()
# Encode the target: malignant ('M') -> 1, benign ('B') -> 0
df_data_processing.diagnosis = df_data_processing.diagnosis.replace(["M", "B"], [1, 0])
df_data_processing.diagnosis.value_counts()
# Missing-value summary via the custom '.missing' accessor loaded by %run above
df_data_processing.missing.missing_variable_summary()
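# Hedged sketch: the '.missing' accessor is defined in pandas-missing-extension.ipynb,
# whose source is not shown here. A minimal version of that accessor pattern (method
# name assumed from the call above) could look like the following; it is left
# commented out so it does not override the real accessor registered by %run:
# @pd.api.extensions.register_dataframe_accessor("missing")
# class MissingMethods:
#     def __init__(self, pandas_obj):
#         self._obj = pandas_obj
#     def missing_variable_summary(self):
#         # One row per column: count and percentage of missing values
#         na = self._obj.isna()
#         return pd.DataFrame({"n_missing": na.sum(), "pct_missing": na.mean() * 100})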
df_data_processing[df_data_processing.duplicated()]
df_data_processing.duplicated().value_counts()
df_data_processing.nunique()
df_data_processing = reduce_memory_usage(df_data_processing, verbose=True)
df_data_processing.info()
profile = ProfileReport(
df_data_processing, title="Pandas Profiling Report", html={"style": {"primary_color": "#FA0087"}},
minimal=True
)
profile
columns_numeric = ['radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst']
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup
plot_rows=5
plot_cols=6
fig = make_subplots(rows=plot_rows, cols=plot_cols, shared_yaxes=False)
# Add one box trace per numeric column, filling the grid row by row
x = 0
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig.add_trace(go.Box(y=df_data_processing[columns_numeric[x]].values,
                             name=columns_numeric[x]),
                      row=i,
                      col=j)
        x += 1
fig.update_layout(
width=1500,
height=1200, showlegend=False)
fig.show()
columns_categorical = ['diagnosis']
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup: a single 'domain' subplot for the pie chart
plot_rows = 1
plot_cols = 1
fig = make_subplots(rows=plot_rows,
                    cols=plot_cols,
                    shared_yaxes=False,
                    vertical_spacing=0.1,
                    subplot_titles=('1',),
                    specs=[[{'type': 'domain'}]]
                    )
# Add one pie trace per categorical column (here only 'diagnosis')
x = 0
names = {}
count = 1
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig1 = px.pie(df_data, values='area_mean', names=columns_categorical[x])
        fig.add_trace(fig1.data[0], row=i, col=j)
        names[str(count)] = columns_categorical[x]
        x += 1
        count += 1
fig.for_each_annotation(lambda a: a.update(text = names[a.text]))
fig.update_layout(width=600, height=400)
fig.update_annotations(y=1.1)
fig.show()
data_corre = df_data_processing.corr()['diagnosis'].sort_values(ascending=True).reset_index()
fig = go.Figure()
fig.add_trace(
go.Bar(
x=data_corre['index'],
y=data_corre['diagnosis'], opacity=0.8,
marker=dict(cmax=1, cmin=-1, color=data_corre['diagnosis'], showscale=True, colorbar={"title": "Correlation"})))
fig.update_xaxes(tickangle=60, tickfont=dict(size=10), automargin='height')
fig.update_layout(title_text='Correlation of each feature with diagnosis', xaxis_title="Feature", yaxis_title="Correlation")
fig.show()
df_data_processing.corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
quasi_collinearity_mean = ['perimeter_mean', 'area_mean', 'radius_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean']
quasi_collinearity_se = ['perimeter_se', 'area_se', 'radius_se', 'compactness_se', 'concavity_se', 'concave points_se']
quasi_collinearity_worst = ['perimeter_worst', 'area_worst', 'radius_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']
df_data_processing[quasi_collinearity_mean].corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
df_data_processing[quasi_collinearity_se].corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
df_data_processing[quasi_collinearity_worst].corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
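# Hedged cross-check (the 0.9 threshold is chosen here for illustration only):
# list the highly correlated feature pairs that motivate the drops below.
corr_abs = df_data_processing.corr().abs()
upper_mask = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)  # keep each pair once
high_pairs = corr_abs.where(upper_mask).stack().loc[lambda s: s > 0.9]
print(high_pairs.sort_values(ascending=False))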
# Keep one representative per quasi-collinear group (the area_* and concavity_* columns) and drop the rest
df_data_processing.drop(['perimeter_mean', 'radius_mean','radius_worst', 'perimeter_worst', 'radius_se', 'perimeter_se', 'compactness_mean', 'concave points_mean', 'compactness_se', 'concave points_se', 'compactness_worst', 'concave points_worst'], axis=1, inplace=True)
fig = px.imshow(
df_data_processing.corr().round(2),
color_continuous_scale=px.colors.diverging.RdBu[::-1],
text_auto=True,
zmin=-1,
zmax=1,
)
fig.update_yaxes(tickfont=dict(family='Arial', size=10), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=10), automargin='height')
fig.update_layout(title_text='Correlation matrix of the processed dataset', coloraxis_colorbar_title_text='Correlation')
fig.show()
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
fig = px.histogram(df_data, x="area_se", color="diagnosis", opacity=0.8, barmode='overlay')
fig.update_layout(
title=f"Histogram area_se",
)
fig.show()
fig = px.histogram(df_data, x="fractal_dimension_mean", color="diagnosis", opacity=0.8, barmode='overlay')
fig.update_layout(
title=f"Histogram fractal_dimension_mean",
)
fig.show()
hist_data = [df_data_processing["area_se"]]
group_labels = ["distplot"] # name of the dataset
fig = ff.create_distplot(hist_data, group_labels, bin_size=1, colors=["#FA0087"])
fig.update_layout(
title="Distplot variable area_se"
)
fig.show()
g = sns.catplot(
    data=df_data, kind="swarm",
    y="area_se", col='diagnosis',
    s=5,
)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle("Distribution of 'area_se' by diagnosis",
               fontsize=24, fontweight="bold")
plt.show()
g = sns.catplot(
    data=df_data, kind="bar",
    y="area_se", col='diagnosis',
)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle("Barplot of 'area_se' by diagnosis",
               fontsize=24, fontweight="bold")
plt.show()
scaler = StandardScaler()
# Note: the scaler (and the PCA fits below) are fit on the full dataset before the
# train/test split, so some information leaks into the held-out folds.
df_data_scaled = scaler.fit_transform(df_data_processing)
df_data_scaled = pd.DataFrame(df_data_scaled, columns=df_data_processing.columns)
X = df_data_scaled.drop(['diagnosis'], axis=1)
y = df_data_processing['diagnosis']
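# Hedged aside: SMOTE and imblearn's Pipeline are imported above but never used in
# the grid searches below (the classes here are only mildly imbalanced, roughly
# 63% benign / 37% malignant). If oversampling were wanted, it must sit inside the
# CV pipeline so it only touches training folds; a minimal sketch, with
# n_components=5 and LogisticRegression as arbitrary illustrations:
smote_pipe = Pipeline(steps=[("smote", SMOTE(random_state=42)),
                             ("pca", PCA(n_components=5)),
                             ("clf", LogisticRegression(max_iter=10000))])
print(smote_pipe)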
def matrix_confusion_plotly(cm, name):
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
x_matrix = ['False', 'True']
y_matrix = ['True', 'False']
z_text = [['TN', 'FP'],
['FN', 'TP']]
# Display element name and prediction on hover
hover_text = [['True Neg', 'False Pos'],
['False Neg', 'True Pos']]
hover=[]
for _ in range(len(z_text)):
hover.append([i + '<br>' + 'Quantity prediction: ' + str(j) + '<br>' + 'Percentage data in each quadrant: ' + str(round(j/np.sum(cm)*100, 2)) + '%'
for i, j in zip(hover_text[_], cm[_])])
fig = ff.create_annotated_heatmap(cm[::-1], x=x_matrix, y=y_matrix, annotation_text=z_text[::-1], text=hover[::-1], hoverinfo='text', colorscale=px.colors.diverging.RdBu[::-1], showscale=True)
fig.update_layout(title_text=f'<i><b>Confusion matrix for {name}</b></i>')
# add custom xaxis title
fig.add_annotation(dict(font=dict(color="white",size=14),
x=0.5,
y=-0.12,
showarrow=False,
text="Predicted value",
xref="paper",
yref="paper"))
# add custom yaxis title
fig.add_annotation(dict(font=dict(color="white",size=14),
x=-0.08,
y=0.5,
showarrow=False,
text="Real value",
textangle=-90,
xref="paper",
yref="paper"))
fig.update_layout(width=560, height=560)
fig.show()
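# Hedged reference (standard formulas, illustrative only): how the quadrant counts
# shown by matrix_confusion_plotly map to the headline metrics reported later.
def metrics_from_cm(cm):
    # sklearn's confusion_matrix rows are true labels [0, 1], columns predictions [0, 1]
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)  # a.k.a. sensitivity / true positive rate
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / cm.sum()
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}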
import warnings
warnings.filterwarnings("ignore")
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()
# Cross-validation strategy shared by every grid search below
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
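# Hedged sanity check: cumulative explained variance of a full PCA fit, to gauge
# how much variance the 1-5 component range searched below can realistically capture.
pca_check = PCA().fit(X)
print(np.cumsum(pca_check.explained_variance_ratio_).round(3))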
# Grid for classifier
reg_classifier = {
'LogisticRegression': LogisticRegression(),
'RandomForestClassifier': RandomForestClassifier(),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'KNeighborsClassifier': KNeighborsClassifier(),
'LGBMClassifier': LGBMClassifier(),
# 'CatBoostClassifier': CatBoostClassifier(),
'AdaBoostClassifier': AdaBoostClassifier(),
'GradientBoostingClassifier': GradientBoostingClassifier(),
'SupportVectorMachine': SVC(),
'GaussianNaiveBayes': GaussianNB(),
    # The three entries below are placeholders: they are constructed inside the
    # training loop, once the tuned base estimators they depend on are available
    'BaggingClassifier': None,
    'BoostingDecisionTree': None,
    'VotingClassifier': None
}
params = {
'pca':
{
"pca__n_components": np.arange(1, 6, 1),
'pca__random_state': [42]
},
    'LogisticRegression':
    {
        "LogisticRegression__C": np.logspace(-4, 4, 4),
        # Not every penalty/solver pair is valid (e.g. 'l1' with 'lbfgs');
        # invalid combinations fail to fit and are scored as NaN by GridSearchCV
        "LogisticRegression__penalty": ['l1', 'l2'],
        "LogisticRegression__solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'LogisticRegression__max_iter': [10000],
        'LogisticRegression__random_state': [42],
        'LogisticRegression__n_jobs': [-1],
    },
'RandomForestClassifier':
{
"RandomForestClassifier__criterion": ['entropy', 'gini'],
"RandomForestClassifier__n_estimators": [160, 192, 224],
"RandomForestClassifier__max_depth": [14, 16, 18],
'RandomForestClassifier__random_state': [42],
'RandomForestClassifier__n_jobs': [-1],
},
'DecisionTreeClassifier':
{"DecisionTreeClassifier__criterion": ['entropy', 'gini'],
"DecisionTreeClassifier__max_depth": [6, 8, 10],
'DecisionTreeClassifier__random_state': [42],
},
'KNeighborsClassifier':
{
'KNeighborsClassifier__n_neighbors': [5, 6, 7],
'KNeighborsClassifier__weights': ['uniform', 'distance'],
'KNeighborsClassifier__n_jobs': [-1],
'KNeighborsClassifier__p': [1, 2, 3],
},
'LGBMClassifier':
{
'LGBMClassifier__num_leaves': [10, 20],
'LGBMClassifier__min_child_samples': [10, 15],
'LGBMClassifier__max_depth': [4, 8],
'LGBMClassifier__learning_rate':[0.1],
'LGBMClassifier__reg_alpha':[0.01, 0.03],
'LGBMClassifier__n_jobs':[-1],
'LGBMClassifier__random_state': [42]
},
# 'CatBoostClassifier':
# {
# 'CatBoostClassifier__max_depth': [8, 16],
# 'CatBoostClassifier__learning_rate': [0.1],
# 'CatBoostClassifier__iterations': [10, 20]
# },
'AdaBoostClassifier':
{"AdaBoostClassifier__n_estimators": [20, 25, 30],
"AdaBoostClassifier__learning_rate": [0.1],
'AdaBoostClassifier__random_state': [42],
},
'GradientBoostingClassifier':
{"GradientBoostingClassifier__n_estimators": [15, 20, 25],
"GradientBoostingClassifier__learning_rate": [0.1],
'GradientBoostingClassifier__random_state': [42],
"GradientBoostingClassifier__loss":["log_loss"],
"GradientBoostingClassifier__min_samples_split": [0.05, 0.1, 0.15],
"GradientBoostingClassifier__min_samples_leaf": [0.05, 0.1, 0.15],
"GradientBoostingClassifier__max_depth":[6, 8, 10],
"GradientBoostingClassifier__max_features":["log2","sqrt"],
"GradientBoostingClassifier__criterion": ["friedman_mse", "mae"],
"GradientBoostingClassifier__subsample":[0.6, 0.7, 0.8],
},
'SupportVectorMachine':
{
        # 'precomputed' is omitted: it would require passing a kernel matrix as X
        'SupportVectorMachine__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'SupportVectorMachine__random_state': [42],
'SupportVectorMachine__probability': [True],
},
    'GaussianNaiveBayes':
    {},  # GaussianNB is left at its defaults; there is nothing worth tuning here
'BaggingClassifier':
{
'BaggingClassifier__max_samples': [0.5],
'BaggingClassifier__max_features': [1.0],
'BaggingClassifier__n_estimators': [15, 20, 25],
'BaggingClassifier__n_jobs': [-1],
},
'BoostingDecisionTree':
{
'BoostingDecisionTree__n_estimators': [5, 10, 15],
'BoostingDecisionTree__learning_rate': [0.05, 0.075, 0.1],
'BoostingDecisionTree__random_state': [42],
},
'VotingClassifier':
{
'VotingClassifier__voting': ['soft'],
'VotingClassifier__n_jobs': [-1],
},
}
# Grid for LearningCurveDisplay
common_params = {
"X": X,
"y": y,
"train_sizes": np.linspace(0.25, 1.0, 4),
"cv": ShuffleSplit(n_splits=10, test_size=0.3, random_state=0),
"score_type": "both",
"n_jobs": -1,
"line_kw": {"marker": "o"},
"std_display_style": "fill_between",
"score_name": "Accuracy",
"random_state": 42
}
# Dicts and lists to collect the scores of each model
model_best = {}
pca_best = {}
models_names = []
test_scores = []
train_scores = []
###################################### Models deployment ###########################################
for name, reg in tqdm(reg_classifier.items()):
pipe = Pipeline(steps=[("pca", pca), (name, reg)])
grid_reg = GridSearchCV(pipe, {**params.get('pca'), **params.get(name)}, n_jobs=-1, cv=cv, scoring='accuracy').fit(X, y)
    # Report the best estimator, score, and parameters found by the grid search
    print(f'├──GridSearchCV of {name}:')
    print(f'\t├──Best Estimator: {grid_reg.best_estimator_}')
    print(f'\t├──Best accuracy score (CV): {grid_reg.best_score_}')
    print(f'\t├──Best Params: {grid_reg.best_params_} \n\n')
# Best model and pca for comparative Roc Curves
model_best[name] = grid_reg.best_estimator_.named_steps[name]
pca_best[name] = grid_reg.best_estimator_.named_steps['pca'].n_components
# Save the best algorithms for use in the BaggingClassifier, BoostingDecisionTree and VotingClassifier
if name == 'LogisticRegression':
best_params_logistic_regression = grid_reg.best_estimator_.named_steps['LogisticRegression'].get_params()
if name == 'DecisionTreeClassifier':
best_params_decision_tree_classifier = grid_reg.best_estimator_.named_steps['DecisionTreeClassifier'].get_params()
reg_classifier['BaggingClassifier'] = BaggingClassifier(DecisionTreeClassifier(**best_params_decision_tree_classifier))
reg_classifier['BoostingDecisionTree'] = AdaBoostClassifier(DecisionTreeClassifier(**best_params_decision_tree_classifier))
if name == 'SupportVectorMachine':
best_params_support_vector_machine = grid_reg.best_estimator_.named_steps['SupportVectorMachine'].get_params()
if name == 'GaussianNaiveBayes':
best_params_gaussian_naive_bayes = grid_reg.best_estimator_.named_steps['GaussianNaiveBayes'].get_params()
reg_classifier['VotingClassifier'] = VotingClassifier(estimators=[
('gnb', GaussianNB(**best_params_gaussian_naive_bayes)),
('lr', LogisticRegression(**best_params_logistic_regression)),
('svm', SVC(**best_params_support_vector_machine))
])
# Plot the PCA spectrum
pca.fit(X)
mpl.rcParams['text.color'] = 'white'
plt.style.use('dark_background')
fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(8, 8))
ax0.bar(
np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_
)
ax0.set_ylabel("PCA explained variance ratio")
ax0.plot(
np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "w-s", linewidth=2
)
ax0.axvline(
grid_reg.best_estimator_.named_steps["pca"].n_components,
linestyle="--", color ='red',
label="n_components chosen",
)
ax0.legend(prop=dict(size=12))
## For each number of components, find the best classifier results
results = pd.DataFrame(grid_reg.cv_results_)
components_col = "param_pca__n_components"
best_clfs = results.groupby(components_col).apply(
lambda g: g.nlargest(1, "mean_test_score")
)
best_clfs.plot(
x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
ax1.set_ylabel("Classification accuracy (val)")
ax1.set_xlabel("n_components")
ax1.axvline(
grid_reg.best_estimator_.named_steps["pca"].n_components,
linestyle="--", color ='red',
label="n_components chosen",
)
ax0.legend(prop=dict(size=12))
plt.xlim(0, 12)
plt.suptitle(f"Plot the PCA spectrum for {name}",
fontsize=24, fontdict={"weight": "bold"})
plt.tight_layout()
plt.show()
# Analysis best estimator of GridSearchCV
    print('Classification report and diagnostic plots for the best estimator:')
# PCA Best Model
pca_b = PCA(n_components=grid_reg.best_estimator_.named_steps['pca'].n_components, random_state=42)
pca_b.fit(X)
X_pca = pca_b.transform(X)
    # Train/test split on the PCA-transformed features
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=0)
# Training our best model and prediction
model = grid_reg.best_estimator_.named_steps[name].fit(X_train, y_train)
prediction = model.predict(X_test)
# View report with pro style
df = pd.DataFrame(classification_report(y_test,
prediction,
digits=2,
output_dict=True)).T
df['support'] = df.support.apply(int)
display(df.style.background_gradient(cmap='coolwarm',
subset=pd.IndexSlice['0':'1', :'f1-score']))
# Display table results Test_Score and Train_Score for all models
models_names.append(name)
test_scores.append(model.score(X_test, y_test)*100)
train_scores.append(model.score(X_train, y_train)*100)
# Function plot matrix confusion with plotly
cm = confusion_matrix(y_test, prediction, labels=model.classes_)
matrix_confusion_plotly(cm, name)
    # Class prediction error for the current model
## Instantiate the classification model and visualizer
fig, ax = plt.subplots(figsize=(9,7))
visualizer = ClassPredictionError(
grid_reg.best_estimator_.named_steps[name], classes=['Benign', 'Malignant']
)
## Fit the training data to the visualizer
visualizer.fit(X_train, y_train)
## Evaluate the model on the test data
visualizer.score(X_test, y_test)
## Draw visualization
visualizer.show()
# Learning Curve visualization
fig, ax = plt.subplots(figsize=(9,9))
LearningCurveDisplay.from_estimator(grid_reg.best_estimator_.named_steps[name], **common_params, ax=ax)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Training Score", "Test Score"], fontsize=12)
plt.title(f"Learning Curve for {name}", fontsize=24, fontdict={"weight": "bold"})
plt.show()
# # ROC Curve single
# fig, ax = plt.subplots(figsize=(9,9))
# clf = grid_reg.best_estimator_.fit(X_train, y_train)
# RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
# plt.title(f"ROC Curve for {name}",
# fontsize=24, fontdict={"weight": "bold"})
# plt.show()
# Plot a decision tree
if name == 'DecisionTreeClassifier':
mpl.rcParams['text.color'] = 'black'
sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(12, 12), facecolor='k')
        # The tree was fit on PCA components, not the original columns, so label
        # the features accordingly; 'label' is not the parameter for class names
        plot_tree(grid_reg.best_estimator_.named_steps[name],
                  feature_names=[f"PC{i+1}" for i in range(model.n_features_in_)],
                  class_names=['Benign', 'Malignant'],
                  filled=True, fontsize=12, max_depth=2, ax=ax)
plt.title(f"Decision tree for {name}", color='white', fontsize=28, fontweight='bold')
plt.show()
print('\n' * 3, '_' * 85, '\n' * 3)
def RocCurves_plotly(model, n_pca):
    # Create an empty figure and add one ROC trace per model
fig = go.Figure()
fig.add_shape(
type='line', line=dict(dash='dash', color='white'),
x0=0, x1=1, y0=0, y1=1
)
for name, reg in model.items():
# PCA Best Model
pca_b = PCA(n_components=n_pca[name], random_state=42)
pca_b.fit(X)
X_pca = pca_b.transform(X)
        # Split 'X' and 'y' using this model's best number of PCA components
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=5)
# Fit the model
reg.fit(X_train, y_train)
y_score = reg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_score = roc_auc_score(y_test, y_score)
name = f"{name} (AUC={auc_score:.2f})"
fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
fig.update_layout(
title="Roc Curves all best classifier",
xaxis_title='False Positive Rate',
yaxis_title='True Positive Rate',
yaxis=dict(scaleanchor="x", scaleratio=1),
xaxis=dict(constrain='domain'),
)
fig.show()
# Plot ROC curves for all tuned models
RocCurves_plotly(model_best, pca_best)
dict_models_scores = {'Model': models_names, 'Test Score': test_scores, 'Train Score': train_scores}
df = pd.DataFrame(dict_models_scores)
# plot bar comparative scores test and train for best classifier
fig = go.Figure()
fig.add_trace(go.Bar(
y=df['Test Score'],
x=df['Model'],
name='Test Scores',
marker_color='indianred'
))
fig.add_trace(go.Bar(
y=df['Train Score'],
x=df['Model'],
name='Train Scores',
marker_color='lightsalmon'
))
# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45, title="Test vs. train accuracy for each tuned classifier",)
fig.update_yaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.show()
# !pip install -q pipreqsnb
# !pipreqsnb '.' --force
# Edit requirements.txt to add any additional libraries you require