!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
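# Hedged aside: requirements.txt itself is not shown in this notebook. An unpinned
# sketch matching the imports below (PyPI names assumed) would contain:
#   pandas, numpy, plotly, matplotlib, seaborn, ydata-profiling, yellowbrick,
#   tqdm, scikit-learn, imbalanced-learn, lightgbm, catboost, ipywidgets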
# Miscellaneous
from __future__ import print_function
from IPython.display import display
# Import required libraries
import pandas as pd
import numpy as np
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio  # needed for the pio.templates calls below
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from ydata_profiling import ProfileReport
from yellowbrick.classifier import ClassPredictionError
from tqdm.notebook import trange, tqdm
# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, f1_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, ShuffleSplit, LearningCurveDisplay
from imblearn.pipeline import Pipeline
## Algorithms for Binary Classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
## Visualization of Decision Tree
from sklearn.tree import plot_tree, export_text
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb"
%run "pandas-missing-extension.ipynb"
# Load the raw CSV once, then cache it as Feather for faster subsequent reads
df_data = pd.read_csv('data.csv', index_col=None, delimiter=',', encoding='utf-8')
df_data.to_feather("data.feather")
df_data = pd.read_feather("data.feather")
df_data.describe()
df_data.info(memory_usage="deep")
memory_usage = df_data.memory_usage(deep=True) / 1024 ** 2
print(f'Memory usage per column (MB):\n{memory_usage.head(7)}')
print('Total memory usage (MB):', memory_usage.sum())
def reduce_memory_usage(df, verbose=True):
    # Downcast each numeric column to the smallest dtype that can hold its value range
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Only range is checked here, not precision: float16 keeps ~3
                # significant decimal digits
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} MB ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
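# Hedged aside: the float16 downcast above trades precision for memory. A quick
# illustrative check of what that costs (sample value chosen arbitrarily):
sample_value = np.float64(0.123456789)
print('float64:', sample_value, '-> float16:', np.float16(sample_value))  # prints ~0.1235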
df_data = reduce_memory_usage(df_data, verbose=True)
df_data.info(memory_usage="deep")
df_data.info()
# Drop the empty 'Unnamed: 32' column and the non-predictive 'id' column
df_data.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)
df_data_processing = df_data.copy()
df_data_processing.diagnosis.value_counts()
# Encode the target: malignant ('M') -> 1, benign ('B') -> 0
df_data_processing.diagnosis = df_data_processing.diagnosis.replace(["M", "B"], [1, 0])
df_data_processing.diagnosis.value_counts()
# Missing-value summary via the custom '.missing' accessor loaded by %run above
df_data_processing.missing.missing_variable_summary()
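# Hedged sketch: the '.missing' accessor is defined in pandas-missing-extension.ipynb,
# whose source is not shown here. A minimal version of that accessor pattern (method
# name assumed from the call above) could look like the following; it is left
# commented out so it does not override the real accessor registered by %run:
# @pd.api.extensions.register_dataframe_accessor("missing")
# class MissingMethods:
#     def __init__(self, pandas_obj):
#         self._obj = pandas_obj
#     def missing_variable_summary(self):
#         # One row per column: count and percentage of missing values
#         na = self._obj.isna()
#         return pd.DataFrame({"n_missing": na.sum(), "pct_missing": na.mean() * 100})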
df_data_processing[df_data_processing.duplicated()]
df_data_processing.duplicated().value_counts()
df_data_processing.nunique()
df_data_processing = reduce_memory_usage(df_data_processing, verbose=True)
df_data_processing.info()
profile = ProfileReport(
df_data_processing, title="Pandas Profiling Report", html={"style": {"primary_color": "#FA0087"}},
minimal=True
)
profile
columns_numeric = ['radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst']
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup
plot_rows=5
plot_cols=6
fig = make_subplots(rows=plot_rows, cols=plot_cols, shared_yaxes=False)
# Add one box trace per numeric column, filling the grid row by row
x = 0
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig.add_trace(go.Box(y=df_data_processing[columns_numeric[x]].values,
                             name=columns_numeric[x]),
                      row=i,
                      col=j)
        x += 1
fig.update_layout(
width=1500,
height=1200, showlegend=False)
fig.show()
columns_categorical = ['diagnosis']
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 10, 'color': 'white'}
# plotly setup: a single 'domain' subplot for the pie chart
plot_rows = 1
plot_cols = 1
fig = make_subplots(rows=plot_rows,
                    cols=plot_cols,
                    shared_yaxes=False,
                    vertical_spacing=0.1,
                    subplot_titles=('1',),
                    specs=[[{'type': 'domain'}]]
                    )
# Add one pie trace per categorical column (here only 'diagnosis')
x = 0
names = {}
count = 1
for i in range(1, plot_rows + 1):
    for j in range(1, plot_cols + 1):
        fig1 = px.pie(df_data, values='area_mean', names=columns_categorical[x])
        fig.add_trace(fig1.data[0], row=i, col=j)
        names[str(count)] = columns_categorical[x]
        x += 1
        count += 1
fig.for_each_annotation(lambda a: a.update(text = names[a.text]))
fig.update_layout(width=600, height=400)
fig.update_annotations(y=1.1)
fig.show()
data_corre = df_data_processing.corr()['diagnosis'].sort_values(ascending=True).reset_index()
fig = go.Figure()
fig.add_trace(
go.Bar(
x=data_corre['index'],
y=data_corre['diagnosis'], opacity=0.8,
marker=dict(cmax=1, cmin=-1, color=data_corre['diagnosis'], showscale=True, colorbar={"title": "Correlation"})))
fig.update_xaxes(tickangle=60, tickfont=dict(size=10), automargin='height')
fig.update_layout(title_text='Correlation of each feature with diagnosis', xaxis_title="Feature", yaxis_title="Correlation")
fig.show()
df_data_processing.corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
quasi_collinearity_mean = ['perimeter_mean', 'area_mean', 'radius_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean']
quasi_collinearity_se = ['perimeter_se', 'area_se', 'radius_se', 'compactness_se', 'concavity_se', 'concave points_se']
quasi_collinearity_worst = ['perimeter_worst', 'area_worst', 'radius_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']
df_data_processing[quasi_collinearity_mean].corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
df_data_processing[quasi_collinearity_se].corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
df_data_processing[quasi_collinearity_worst].corr().style.background_gradient(cmap="coolwarm", axis=None).format('{:.2f}')
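# Hedged cross-check (the 0.9 threshold is chosen here for illustration only):
# list the highly correlated feature pairs that motivate the drops below.
corr_abs = df_data_processing.corr().abs()
upper_mask = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)  # keep each pair once
high_pairs = corr_abs.where(upper_mask).stack().loc[lambda s: s > 0.9]
print(high_pairs.sort_values(ascending=False))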
# Keep one representative per quasi-collinear group (the area_* and concavity_* columns) and drop the rest
df_data_processing.drop(['perimeter_mean', 'radius_mean','radius_worst', 'perimeter_worst', 'radius_se', 'perimeter_se', 'compactness_mean', 'concave points_mean', 'compactness_se', 'concave points_se', 'compactness_worst', 'concave points_worst'], axis=1, inplace=True)
fig = px.imshow(
df_data_processing.corr().round(2),
color_continuous_scale=px.colors.diverging.RdBu[::-1],
text_auto=True,
zmin=-1,
zmax=1,
)
fig.update_yaxes(tickfont=dict(family='Arial', size=10), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=10), automargin='height')
fig.update_layout(title_text='Correlation matrix of the processed dataset', coloraxis_colorbar_title_text='Correlation')
fig.show()
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
fig = px.histogram(df_data, x="area_se", color="diagnosis", opacity=0.8, barmode='overlay')
fig.update_layout(
title=f"Histogram area_se",
)
fig.show()
fig = px.histogram(df_data, x="fractal_dimension_mean", color="diagnosis", opacity=0.8, barmode='overlay')
fig.update_layout(
title=f"Histogram fractal_dimension_mean",
)
fig.show()
hist_data = [df_data_processing["area_se"]]
group_labels = ["distplot"] # name of the dataset
fig = ff.create_distplot(hist_data, group_labels, bin_size=1, colors=["#FA0087"])
fig.update_layout(
title="Distplot variable area_se"
)
fig.show()
g = sns.catplot(
    data=df_data, kind="swarm",
    y="area_se", col='diagnosis',
    s=5,
)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle("Distribution of 'area_se' by diagnosis",
               fontsize=24, fontweight="bold")
plt.show()
g = sns.catplot(
    data=df_data, kind="bar",
    y="area_se", col='diagnosis',
)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle("Barplot of 'area_se' by diagnosis",
               fontsize=24, fontweight="bold")
plt.show()
scaler = StandardScaler()
# Note: the scaler (and the PCA fits below) are fit on the full dataset before the
# train/test split, so some information leaks into the held-out folds.
df_data_scaled = scaler.fit_transform(df_data_processing)
df_data_scaled = pd.DataFrame(df_data_scaled, columns=df_data_processing.columns)
X = df_data_scaled.drop(['diagnosis'], axis=1)
y = df_data_processing['diagnosis']
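# Hedged aside: SMOTE and imblearn's Pipeline are imported above but never used in
# the grid searches below (the classes here are only mildly imbalanced, roughly
# 63% benign / 37% malignant). If oversampling were wanted, it must sit inside the
# CV pipeline so it only touches training folds; a minimal sketch, with
# n_components=5 and LogisticRegression as arbitrary illustrations:
smote_pipe = Pipeline(steps=[("smote", SMOTE(random_state=42)),
                             ("pca", PCA(n_components=5)),
                             ("clf", LogisticRegression(max_iter=10000))])
print(smote_pipe)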
def matrix_confusion_plotly(cm, name):
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
x_matrix = ['False', 'True']
y_matrix = ['True', 'False']
z_text = [['TN', 'FP'],
['FN', 'TP']]
# Display element name and prediction on hover
hover_text = [['True Neg', 'False Pos'],
['False Neg', 'True Pos']]
hover=[]
for _ in range(len(z_text)):
hover.append([i + '<br>' + 'Quantity prediction: ' + str(j) + '<br>' + 'Percentage data in each quadrant: ' + str(round(j/np.sum(cm)*100, 2)) + '%'
for i, j in zip(hover_text[_], cm[_])])
fig = ff.create_annotated_heatmap(cm[::-1], x=x_matrix, y=y_matrix, annotation_text=z_text[::-1], text=hover[::-1], hoverinfo='text', colorscale=px.colors.diverging.RdBu[::-1], showscale=True)
fig.update_layout(title_text=f'<i><b>Confusion matrix for {name}</b></i>')
# add custom xaxis title
fig.add_annotation(dict(font=dict(color="white",size=14),
x=0.5,
y=-0.12,
showarrow=False,
text="Predicted value",
xref="paper",
yref="paper"))
# add custom yaxis title
fig.add_annotation(dict(font=dict(color="white",size=14),
x=-0.08,
y=0.5,
showarrow=False,
text="Real value",
textangle=-90,
xref="paper",
yref="paper"))
fig.update_layout(width=560, height=560)
fig.show()
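# Hedged reference (standard formulas, illustrative only): how the quadrant counts
# shown by matrix_confusion_plotly map to the headline metrics reported later.
def metrics_from_cm(cm):
    # sklearn's confusion_matrix rows are true labels [0, 1], columns predictions [0, 1]
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)  # a.k.a. sensitivity / true positive rate
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / cm.sum()
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}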
import warnings
warnings.filterwarnings("ignore")
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()
# Cross-validation strategy shared by every grid search below
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
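# Hedged sanity check: cumulative explained variance of a full PCA fit, to gauge
# how much variance the 1-5 component range searched below can realistically capture.
pca_check = PCA().fit(X)
print(np.cumsum(pca_check.explained_variance_ratio_).round(3))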
# Grid for classifier
reg_classifier = {
'LogisticRegression': LogisticRegression(),
'RandomForestClassifier': RandomForestClassifier(),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'KNeighborsClassifier': KNeighborsClassifier(),
'LGBMClassifier': LGBMClassifier(),
# 'CatBoostClassifier': CatBoostClassifier(),
'AdaBoostClassifier': AdaBoostClassifier(),
'GradientBoostingClassifier': GradientBoostingClassifier(),
'SupportVectorMachine': SVC(),
'GaussianNaiveBayes': GaussianNB(),
    # The three entries below are placeholders: they are constructed inside the
    # training loop, once the tuned base estimators they depend on are available
    'BaggingClassifier': None,
    'BoostingDecisionTree': None,
    'VotingClassifier': None
}
params = {
'pca':
{
"pca__n_components": np.arange(1, 6, 1),
'pca__random_state': [42]
},
    'LogisticRegression':
    {
        "LogisticRegression__C": np.logspace(-4, 4, 4),
        # Not every penalty/solver pair is valid (e.g. 'l1' with 'lbfgs');
        # invalid combinations fail to fit and are scored as NaN by GridSearchCV
        "LogisticRegression__penalty": ['l1', 'l2'],
        "LogisticRegression__solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'LogisticRegression__max_iter': [10000],
        'LogisticRegression__random_state': [42],
        'LogisticRegression__n_jobs': [-1],
    },
'RandomForestClassifier':
{
"RandomForestClassifier__criterion": ['entropy', 'gini'],
"RandomForestClassifier__n_estimators": [160, 192, 224],
"RandomForestClassifier__max_depth": [14, 16, 18],
'RandomForestClassifier__random_state': [42],
'RandomForestClassifier__n_jobs': [-1],
},
'DecisionTreeClassifier':
{"DecisionTreeClassifier__criterion": ['entropy', 'gini'],
"DecisionTreeClassifier__max_depth": [6, 8, 10],
'DecisionTreeClassifier__random_state': [42],
},
'KNeighborsClassifier':
{
'KNeighborsClassifier__n_neighbors': [5, 6, 7],
'KNeighborsClassifier__weights': ['uniform', 'distance'],
'KNeighborsClassifier__n_jobs': [-1],
'KNeighborsClassifier__p': [1, 2, 3],
},
'LGBMClassifier':
{
'LGBMClassifier__num_leaves': [10, 20],
'LGBMClassifier__min_child_samples': [10, 15],
'LGBMClassifier__max_depth': [4, 8],
'LGBMClassifier__learning_rate':[0.1],
'LGBMClassifier__reg_alpha':[0.01, 0.03],
'LGBMClassifier__n_jobs':[-1],
'LGBMClassifier__random_state': [42]
},
# 'CatBoostClassifier':
# {
# 'CatBoostClassifier__max_depth': [8, 16],
# 'CatBoostClassifier__learning_rate': [0.1],
# 'CatBoostClassifier__iterations': [10, 20]
# },
'AdaBoostClassifier':
{"AdaBoostClassifier__n_estimators": [20, 25, 30],
"AdaBoostClassifier__learning_rate": [0.1],
'AdaBoostClassifier__random_state': [42],
},
'GradientBoostingClassifier':
{"GradientBoostingClassifier__n_estimators": [15, 20, 25],
"GradientBoostingClassifier__learning_rate": [0.1],
'GradientBoostingClassifier__random_state': [42],
"GradientBoostingClassifier__loss":["log_loss"],
"GradientBoostingClassifier__min_samples_split": [0.05, 0.1, 0.15],
"GradientBoostingClassifier__min_samples_leaf": [0.05, 0.1, 0.15],
"GradientBoostingClassifier__max_depth":[6, 8, 10],
"GradientBoostingClassifier__max_features":["log2","sqrt"],
"GradientBoostingClassifier__criterion": ["friedman_mse", "mae"],
"GradientBoostingClassifier__subsample":[0.6, 0.7, 0.8],
},
'SupportVectorMachine':
{
        # 'precomputed' is omitted: it would require passing a kernel matrix as X
        'SupportVectorMachine__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'SupportVectorMachine__random_state': [42],
'SupportVectorMachine__probability': [True],
},
    'GaussianNaiveBayes':
    {},  # GaussianNB is left at its defaults; there is nothing worth tuning here
'BaggingClassifier':
{
'BaggingClassifier__max_samples': [0.5],
'BaggingClassifier__max_features': [1.0],
'BaggingClassifier__n_estimators': [15, 20, 25],
'BaggingClassifier__n_jobs': [-1],
},
'BoostingDecisionTree':
{
'BoostingDecisionTree__n_estimators': [5, 10, 15],
'BoostingDecisionTree__learning_rate': [0.05, 0.075, 0.1],
'BoostingDecisionTree__random_state': [42],
},
'VotingClassifier':
{
'VotingClassifier__voting': ['soft'],
'VotingClassifier__n_jobs': [-1],
},
}
# Grid for LearningCurveDisplay
common_params = {
"X": X,
"y": y,
"train_sizes": np.linspace(0.25, 1.0, 4),
"cv": ShuffleSplit(n_splits=10, test_size=0.3, random_state=0),
"score_type": "both",
"n_jobs": -1,
"line_kw": {"marker": "o"},
"std_display_style": "fill_between",
"score_name": "Accuracy",
"random_state": 42
}
# Dicts and lists to collect the scores of each model
model_best = {}
pca_best = {}
models_names = []
test_scores = []
train_scores = []
###################################### Models deployment ###########################################
for name, reg in tqdm(reg_classifier.items()):
pipe = Pipeline(steps=[("pca", pca), (name, reg)])
grid_reg = GridSearchCV(pipe, {**params.get('pca'), **params.get(name)}, n_jobs=-1, cv=cv, scoring='accuracy').fit(X, y)
    # Report the best estimator, score, and parameters found by the grid search
    print(f'├──GridSearchCV of {name}:')
    print(f'\t├──Best Estimator: {grid_reg.best_estimator_}')
    print(f'\t├──Best accuracy score (CV): {grid_reg.best_score_}')
    print(f'\t├──Best Params: {grid_reg.best_params_} \n\n')
# Best model and pca for comparative Roc Curves
model_best[name] = grid_reg.best_estimator_.named_steps[name]
pca_best[name] = grid_reg.best_estimator_.named_steps['pca'].n_components
# Save the best algorithms for use in the BaggingClassifier, BoostingDecisionTree and VotingClassifier
if name == 'LogisticRegression':
best_params_logistic_regression = grid_reg.best_estimator_.named_steps['LogisticRegression'].get_params()
if name == 'DecisionTreeClassifier':
best_params_decision_tree_classifier = grid_reg.best_estimator_.named_steps['DecisionTreeClassifier'].get_params()
reg_classifier['BaggingClassifier'] = BaggingClassifier(DecisionTreeClassifier(**best_params_decision_tree_classifier))
reg_classifier['BoostingDecisionTree'] = AdaBoostClassifier(DecisionTreeClassifier(**best_params_decision_tree_classifier))
if name == 'SupportVectorMachine':
best_params_support_vector_machine = grid_reg.best_estimator_.named_steps['SupportVectorMachine'].get_params()
if name == 'GaussianNaiveBayes':
best_params_gaussian_naive_bayes = grid_reg.best_estimator_.named_steps['GaussianNaiveBayes'].get_params()
reg_classifier['VotingClassifier'] = VotingClassifier(estimators=[
('gnb', GaussianNB(**best_params_gaussian_naive_bayes)),
('lr', LogisticRegression(**best_params_logistic_regression)),
('svm', SVC(**best_params_support_vector_machine))
])
# Plot the PCA spectrum
pca.fit(X)
mpl.rcParams['text.color'] = 'white'
plt.style.use('dark_background')
fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(8, 8))
ax0.bar(
np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_
)
ax0.set_ylabel("PCA explained variance ratio")
ax0.plot(
np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "w-s", linewidth=2
)
ax0.axvline(
grid_reg.best_estimator_.named_steps["pca"].n_components,
linestyle="--", color ='red',
label="n_components chosen",
)
ax0.legend(prop=dict(size=12))
## For each number of components, find the best classifier results
results = pd.DataFrame(grid_reg.cv_results_)
components_col = "param_pca__n_components"
best_clfs = results.groupby(components_col).apply(
lambda g: g.nlargest(1, "mean_test_score")
)
best_clfs.plot(
x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
ax1.set_ylabel("Classification accuracy (val)")
ax1.set_xlabel("n_components")
ax1.axvline(
grid_reg.best_estimator_.named_steps["pca"].n_components,
linestyle="--", color ='red',
label="n_components chosen",
)
ax0.legend(prop=dict(size=12))
plt.xlim(0, 12)
plt.suptitle(f"Plot the PCA spectrum for {name}",
fontsize=24, fontdict={"weight": "bold"})
plt.tight_layout()
plt.show()
# Analysis best estimator of GridSearchCV
    print('Classification report and diagnostic plots for the best estimator:')
# PCA Best Model
pca_b = PCA(n_components=grid_reg.best_estimator_.named_steps['pca'].n_components, random_state=42)
pca_b.fit(X)
X_pca = pca_b.transform(X)
    # Train/test split on the PCA-transformed features
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=0)
# Training our best model and prediction
model = grid_reg.best_estimator_.named_steps[name].fit(X_train, y_train)
prediction = model.predict(X_test)
# View report with pro style
df = pd.DataFrame(classification_report(y_test,
prediction,
digits=2,
output_dict=True)).T
df['support'] = df.support.apply(int)
display(df.style.background_gradient(cmap='coolwarm',
subset=pd.IndexSlice['0':'1', :'f1-score']))
# Display table results Test_Score and Train_Score for all models
models_names.append(name)
test_scores.append(model.score(X_test, y_test)*100)
train_scores.append(model.score(X_train, y_train)*100)
# Function plot matrix confusion with plotly
cm = confusion_matrix(y_test, prediction, labels=model.classes_)
matrix_confusion_plotly(cm, name)
    # Class prediction error for the current model
## Instantiate the classification model and visualizer
fig, ax = plt.subplots(figsize=(9,7))
visualizer = ClassPredictionError(
grid_reg.best_estimator_.named_steps[name], classes=['Benign', 'Malignant']
)
## Fit the training data to the visualizer
visualizer.fit(X_train, y_train)
## Evaluate the model on the test data
visualizer.score(X_test, y_test)
## Draw visualization
visualizer.show()
# Learning Curve visualization
fig, ax = plt.subplots(figsize=(9,9))
LearningCurveDisplay.from_estimator(grid_reg.best_estimator_.named_steps[name], **common_params, ax=ax)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Training Score", "Test Score"], fontsize=12)
plt.title(f"Learning Curve for {name}", fontsize=24, fontdict={"weight": "bold"})
plt.show()
# # ROC Curve single
# fig, ax = plt.subplots(figsize=(9,9))
# clf = grid_reg.best_estimator_.fit(X_train, y_train)
# RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
# plt.title(f"ROC Curve for {name}",
# fontsize=24, fontdict={"weight": "bold"})
# plt.show()
# Plot a decision tree
if name == 'DecisionTreeClassifier':
mpl.rcParams['text.color'] = 'black'
sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(12, 12), facecolor='k')
        # The tree was fit on PCA components, not the original columns, so label
        # the features accordingly; 'label' is not the parameter for class names
        plot_tree(grid_reg.best_estimator_.named_steps[name],
                  feature_names=[f"PC{i+1}" for i in range(model.n_features_in_)],
                  class_names=['Benign', 'Malignant'],
                  filled=True, fontsize=12, max_depth=2, ax=ax)
plt.title(f"Decision tree for {name}", color='white', fontsize=28, fontweight='bold')
plt.show()
print('\n' * 3, '_' * 85, '\n' * 3)
def RocCurves_plotly(model, n_pca):
    # Create an empty figure and add one ROC trace per model
fig = go.Figure()
fig.add_shape(
type='line', line=dict(dash='dash', color='white'),
x0=0, x1=1, y0=0, y1=1
)
for name, reg in model.items():
# PCA Best Model
pca_b = PCA(n_components=n_pca[name], random_state=42)
pca_b.fit(X)
X_pca = pca_b.transform(X)
        # Split 'X' and 'y' using this model's best number of PCA components
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=5)
# Fit the model
reg.fit(X_train, y_train)
y_score = reg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_score = roc_auc_score(y_test, y_score)
name = f"{name} (AUC={auc_score:.2f})"
fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
fig.update_layout(
title="Roc Curves all best classifier",
xaxis_title='False Positive Rate',
yaxis_title='True Positive Rate',
yaxis=dict(scaleanchor="x", scaleratio=1),
xaxis=dict(constrain='domain'),
)
fig.show()
# Plot ROC curves for all tuned models
RocCurves_plotly(model_best, pca_best)
dict_models_scores = {'Model': models_names, 'Test Score': test_scores, 'Train Score': train_scores}
df = pd.DataFrame(dict_models_scores)
# plot bar comparative scores test and train for best classifier
fig = go.Figure()
fig.add_trace(go.Bar(
y=df['Test Score'],
x=df['Model'],
name='Test Scores',
marker_color='indianred'
))
fig.add_trace(go.Bar(
y=df['Train Score'],
x=df['Model'],
name='Train Scores',
marker_color='lightsalmon'
))
# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45, title="Test vs. train accuracy for each tuned classifier",)
fig.update_yaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.show()
# !pip install -q pipreqsnb
# !pipreqsnb '.' --force
# Edit requirements.txt to add any additional libraries you require