import pandas as pd
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
df = pd.read_csv("Vins.csv")
df["Classe"] = np.where(df['quality'] >= 7, 1, 0)  # binary target: 1 for good wines (quality >= 7), else 0
df.head()
   fixed acidity  volatile acidity
0            7.4              0.70
1            7.8              0.88
2            7.8              0.76
3           11.2              0.28
4            7.4              0.70
(first two columns of df.head(); both float64)
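Since Classe flags only the wines with quality >= 7, the two classes are almost certainly imbalanced, which matters when reading the accuracy, precision, and recall figures below. A minimal check, using the column created above:

# Class balance: good wines (Classe == 1) are expected to be a small minority
print(df['Classe'].value_counts())
print(df['Classe'].value_counts(normalize=True))  # same counts as proportions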
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Classe','quality'], axis=1),
    df['Classe'], test_size=0.33, random_state=42
)
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(X_train, y_train)
# tree.plot_tree(clf)
clf.score(X_test, y_test)  # first evaluation of the model (accuracy)
y_test_pred=clf.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_matrix=confusion_matrix(y_test, y_test_pred)
conf_matrix
# unpack the confusion matrix values into named variables
vn, fp = conf_matrix[0]  # true negatives, false positives
fn, vp = conf_matrix[1]  # false negatives, true positives
# precision and recall
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
precision=precision_score(y_test, y_test_pred)
recall=recall_score(y_test, y_test_pred)
print("precision:",precision,"\nrecall:",recall)
precision: 0.5081967213114754
recall: 0.4025974025974026
print("precision: ", vp/(vp+fp))
print("recall", vp/(vp+fn))
precision: 0.5081967213114754
recall 0.4025974025974026
specificity = vn/(vn+fp)  # true negative rate
print("specificity:", specificity)
sensitivity = vp/(vp+fn)  # true positive rate (same as recall)
print("sensitivity:", sensitivity)
specificity: 0.9334811529933481
sensitivity: 0.4025974025974026
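As a cross-check on the four counts above, classification_report summarizes precision, recall, and F1 for both classes in one call; a minimal sketch, where the class labels are illustrative:

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 plus support, computed from the same predictions
print(classification_report(y_test, y_test_pred, target_names=['standard', 'good']))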
probas = clf.predict_proba(X_test)
plt.hist(probas)  # distribution of the predicted probabilities for each class
y_score = clf.predict_proba(X_test)
# compute the points of the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,-1])  # false positive rate, true positive rate, set of thresholds
# plot the ROC curve
plt.plot(fpr, tpr)
metrics.auc(fpr, tpr)
The AUC is above 0.5, so the model very likely manages to distinguish the positive class from the negative class. The ideal operating point is roughly at (0.1, 0.6): the place where the curve changes slope is the best trade-off between true positives and true negatives.
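To locate that elbow numerically instead of by eye, a common heuristic is Youden's J statistic, which picks the threshold maximizing tpr - fpr; a minimal sketch reusing the arrays returned by roc_curve above:

# Youden's J: the threshold where the gap between TPR and FPR is largest
best_idx = np.argmax(tpr - fpr)
print("best threshold:", thresholds[best_idx])
print("operating point (fpr, tpr):", fpr[best_idx], tpr[best_idx])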
u_s_list = []
v_s_list = []
probas = clf.predict_proba(X_test)
# probas
score = probas[:,1]
step = 0.1
x_val = np.arange(0, 1, step)
print(x_val)
y_true = list(y_test)
for s in x_val:
    # binarize the scores: predict 1 when the score exceeds the threshold s
    pred_seuil = np.where(score > s, 1, 0)
    # pred_seuil
    vp = 0
    fp = 0
    vn = 0
    fn = 0
    for i in range(len(pred_seuil)):
        if pred_seuil[i] == 1 and y_true[i] == 1:
            vp += 1
        elif pred_seuil[i] == 1 and y_true[i] == 0:
            fp += 1
        elif pred_seuil[i] == 0 and y_true[i] == 1:
            fn += 1
        elif pred_seuil[i] == 0 and y_true[i] == 0:
            vn += 1
    u_s = (vp+fp)/(vp+fn+fp+vn)  # share of samples predicted positive
    v_s = vp/(vp+fn)             # recall at threshold s
    u_s_list += [u_s]
    v_s_list += [v_s]
    print(s)
    print(u_s, v_s)
    print(vp, fp, "\n", fn, vn, "\n")
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
s    u(s)                 v(s)                 vp  fp  fn  vn
0.0  0.2288135593220339   0.7837837837837838   29  25   8  174
0.1  0.12385321100917432  0.48333333333333334  29  25  31  351
0.2  0.10714285714285714  0.3972602739726027   29  25  44  406
0.3  0.1062992125984252   0.38666666666666666  29  25  46  408
0.4  0.1062992125984252   0.38666666666666666  29  25  46  408
0.5  0.10384615384615385  0.38666666666666666  29  25  46  420
0.6  0.10384615384615385  0.38666666666666666  29  25  46  420
0.7  0.10384615384615385  0.38666666666666666  29  25  46  420
0.8  0.10384615384615385  0.38666666666666666  29  25  46  420
0.9  0.10246679316888045  0.37662337662337664  29  25  48  425
plt.plot(v_s_list[::-1])
plt.title("Lift curve (reversed)")
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))
tree.plot_tree(clf)
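At depth 6 the plotted tree is hard to read; export_text prints the same splits as indented text. A minimal sketch, assuming the feature names are the dataframe columns used for training:

from sklearn.tree import export_text

# One line per split, indented by depth
feature_names = list(df.drop(['Classe', 'quality'], axis=1).columns)
print(export_text(clf, feature_names=feature_names))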
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
X = np.array(df.drop(['Classe','quality'], axis=1))
y = np.array(df['Classe'])
kf.get_n_splits(X)
print(kf)
print(kf.get_n_splits(X))
KFold(n_splits=4, random_state=None, shuffle=False)
4
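Note that shuffle=False makes each fold a contiguous block of rows, so if Vins.csv is ordered in any way the folds may not be representative. A minimal variant (not used below, where the unshuffled splitter is kept so the printed AUCs stay reproducible):

# Shuffle the rows before splitting so each fold samples the whole file
kf_shuffled = KFold(n_splits=4, shuffle=True, random_state=42)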
fig = plt.figure()
ax = plt.subplot(111)
i=0
auc_list = []
for train_index, test_index in kf.split(X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # predictive model
    clf = tree.DecisionTreeClassifier(max_depth=6)
    clf = clf.fit(X_train, y_train)
    # tree.plot_tree(clf)
    y_score = clf.predict_proba(X_test)
    # compute the points of the ROC curve
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,-1])  # false positive rate, true positive rate, set of thresholds
    # plot the ROC curve
    plt.plot(fpr, tpr, label=i)
    i = i + 1
    # compute the area under the curve
    print("AUC for fold", i, ":", metrics.auc(fpr, tpr))
    auc_list += [metrics.auc(fpr, tpr)]
ax.legend()
plt.show()
mean_auc = sum(auc_list)/kf.get_n_splits(X)
AUC for fold 1 : 0.6963194444444444
AUC for fold 2 : 0.7907466539846457
AUC for fold 3 : 0.7695091459402554
AUC for fold 4 : 0.7446817082997581
print("précision avec l'AUC moyen", mean_auc)
précision avec l'AUC moyen 0.750314238167276
variance = 0
for val in auc_list:
    variance += (val - mean_auc)**2
variance = variance / kf.get_n_splits(X)  # variance of the fold AUCs (no square root taken)
print("Robustness (variance of the AUCs):", variance)
Robustness (variance of the AUCs): 0.0012375969687803735
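The whole fold loop can also be condensed with cross_val_score, which refits the model on each split and returns one score per fold directly; a minimal sketch using the same splitter and metric:

from sklearn.model_selection import cross_val_score

# One ROC AUC per fold; mean and standard deviation summarize performance and robustness
aucs = cross_val_score(tree.DecisionTreeClassifier(max_depth=6), X, y,
                       cv=kf, scoring='roc_auc')
print(aucs.mean(), aucs.std())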
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(X,y)
# tree.plot_tree(clf)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Classe','quality'], axis=1),
    df['Classe'], test_size=0.33, random_state=42
)
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
fig = plt.figure(figsize=(10,10))
ax = plt.subplot(111)
model_list = [tree.DecisionTreeClassifier(max_depth=6),
              tree.DecisionTreeClassifier(max_depth=4),
              tree.DecisionTreeClassifier(max_depth=3),
              RandomForestClassifier(max_depth=5, random_state=0),
              LinearRegression(),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1),
              Perceptron(max_iter=40, eta0=0.1, random_state=0),
              KNeighborsClassifier(n_neighbors=3),
              KNeighborsClassifier(n_neighbors=6)]
auc_list = []
for m in model_list:
    # predictive model
    model = m.fit(X_train, y_train)
    try:
        y_score = model.predict_proba(X_test)
        # compute the points of the ROC curve
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,-1])  # false positive rate, true positive rate, set of thresholds
    except AttributeError:
        # models without predict_proba (LinearRegression, Perceptron): use the raw predictions as scores
        fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict(X_test))
    # plot the ROC curve
    plt.plot(fpr, tpr, label=m)
    # compute the area under the curve
    print("AUC for", m, ":", metrics.auc(fpr, tpr))
    auc_list += [metrics.auc(fpr, tpr)]
ax.legend()
plt.show()
mean_auc = sum(auc_list)/len(auc_list)  # mean AUC over the ten models
AUC for DecisionTreeClassifier(max_depth=6) : 0.7679615284936793
AUC for DecisionTreeClassifier(max_depth=4) : 0.7820139948743052
AUC for DecisionTreeClassifier(max_depth=3) : 0.8163820658277421
AUC for RandomForestClassifier(max_depth=5, random_state=0) : 0.8817922653842831
AUC for LinearRegression() : 0.8739885391770092
ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in https://scikit-learn.org/stable/modules/preprocessing.html
AUC for MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1, solver='lbfgs') : 0.8923892072450832
AUC for MLPClassifier(alpha=1e-05, random_state=1, solver='lbfgs') : 0.877674431998157
AUC for Perceptron(eta0=0.1, max_iter=40) : 0.6854608805828317
AUC for KNeighborsClassifier(n_neighbors=3) : 0.7553920580528118
AUC for KNeighborsClassifier(n_neighbors=6) : 0.778126529789501
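The ConvergenceWarning above suggests scaling the data for the MLPs; wrapping the classifier in a pipeline with StandardScaler is the usual remedy. A minimal sketch with the same hyperparameters (max_iter raised as the warning recommends):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features before the MLP so lbfgs converges more easily
scaled_mlp = make_pipeline(StandardScaler(),
                           MLPClassifier(solver='lbfgs', alpha=1e-5,
                                         hidden_layer_sizes=(100,),
                                         random_state=1, max_iter=1000))
scaled_mlp.fit(X_train, y_train)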
param_grid = {'max_depth': [3, 5, 10],
              'min_samples_split': [2, 5, 10]}
base_estimator = RandomForestClassifier(random_state=0)
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                         max_resources=30).fit(X, y)
print(sh.best_estimator_)
RandomForestClassifier(max_depth=3, min_samples_split=5, n_estimators=24,
random_state=0)
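Since sh was fit on all of X and y, scoring best_estimator_ on that same data would be optimistic; re-estimating its AUC by cross-validation gives a fairer figure. A minimal sketch:

from sklearn.model_selection import cross_val_score

# Cross-validated AUC of the tuned forest, refitted fold by fold
best_aucs = cross_val_score(sh.best_estimator_, X, y, cv=5, scoring='roc_auc')
print("mean AUC of the tuned model:", best_aucs.mean())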