import pandas as pd
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
df = pd.read_csv("Vins.csv")
df["Classe"] = np.where(df['quality'] >= 7, 1, 0)  # binary target: 1 for good wines (quality >= 7), else 0
df.head()
   fixed acidity  volatile acidity
0            7.4              0.70
1            7.8              0.88
2            7.8              0.76
3           11.2              0.28
4            7.4              0.70
(first two columns of df.head(); both float64)
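Since Classe flags only the wines with quality >= 7, the two classes are almost certainly imbalanced, which matters when reading the accuracy, precision, and recall figures below. A minimal check, using the column created above:

# Class balance: good wines (Classe == 1) are expected to be a small minority
print(df['Classe'].value_counts())
print(df['Classe'].value_counts(normalize=True))  # same counts as proportions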
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Classe','quality'], axis=1),
    df['Classe'], test_size=0.33, random_state=42
)
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(X_train, y_train)
# tree.plot_tree(clf)
clf.score(X_test, y_test)  # first evaluation of the model (accuracy)
y_test_pred=clf.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_matrix=confusion_matrix(y_test, y_test_pred)
conf_matrix
# unpack the confusion matrix values into named variables
vn, fp = conf_matrix[0]  # true negatives, false positives
fn, vp = conf_matrix[1]  # false negatives, true positives
# precision and recall
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
precision=precision_score(y_test, y_test_pred)
recall=recall_score(y_test, y_test_pred)
print("precision:",precision,"\nrecall:",recall)
precision: 0.5081967213114754
recall: 0.4025974025974026
print("precision: ", vp/(vp+fp))
print("recall", vp/(vp+fn))
precision: 0.5081967213114754
recall 0.4025974025974026
specificity = vn/(vn+fp)  # true negative rate
print("specificity:", specificity)
sensitivity = vp/(vp+fn)  # true positive rate (same as recall)
print("sensitivity:", sensitivity)
specificity: 0.9334811529933481
sensitivity: 0.4025974025974026
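As a cross-check on the four counts above, classification_report summarizes precision, recall, and F1 for both classes in one call; a minimal sketch, where the class labels are illustrative:

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 plus support, computed from the same predictions
print(classification_report(y_test, y_test_pred, target_names=['standard', 'good']))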
probas = clf.predict_proba(X_test)
plt.hist(probas)  # distribution of the predicted probabilities for each class
y_score = clf.predict_proba(X_test)
# compute the points of the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,-1])  # false positive rate, true positive rate, set of thresholds
# plot the ROC curve
plt.plot(fpr, tpr)
metrics.auc(fpr, tpr)
The AUC is above 0.5, so the model very likely manages to distinguish the positive class from the negative class. The ideal operating point is roughly at (0.1, 0.6): the place where the curve changes slope is the best trade-off between true positives and true negatives.
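To locate that elbow numerically instead of by eye, a common heuristic is Youden's J statistic, which picks the threshold maximizing tpr - fpr; a minimal sketch reusing the arrays returned by roc_curve above:

# Youden's J: the threshold where the gap between TPR and FPR is largest
best_idx = np.argmax(tpr - fpr)
print("best threshold:", thresholds[best_idx])
print("operating point (fpr, tpr):", fpr[best_idx], tpr[best_idx])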
u_s_list = []
v_s_list = []
probas = clf.predict_proba(X_test)
# probas
score = probas[:,1]
step = 0.1
x_val = np.arange(0, 1, step)
print(x_val)
y_true = list(y_test)
for s in x_val:
    # binarize the scores: predict 1 when the score exceeds the threshold s
    pred_seuil = np.where(score > s, 1, 0)
    # pred_seuil
    vp = 0
    fp = 0
    vn = 0
    fn = 0
    for i in range(len(pred_seuil)):
        if pred_seuil[i] == 1 and y_true[i] == 1:
            vp += 1
        elif pred_seuil[i] == 1 and y_true[i] == 0:
            fp += 1
        elif pred_seuil[i] == 0 and y_true[i] == 1:
            fn += 1
        elif pred_seuil[i] == 0 and y_true[i] == 0:
            vn += 1
    u_s = (vp+fp)/(vp+fn+fp+vn)  # share of samples predicted positive
    v_s = vp/(vp+fn)             # recall at threshold s
    u_s_list += [u_s]
    v_s_list += [v_s]
    print(s)
    print(u_s, v_s)
    print(vp, fp, "\n", fn, vn, "\n")
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
s    u(s)                 v(s)                 vp  fp  fn  vn
0.0  0.2288135593220339   0.7837837837837838   29  25   8  174
0.1  0.12385321100917432  0.48333333333333334  29  25  31  351
0.2  0.10714285714285714  0.3972602739726027   29  25  44  406
0.3  0.1062992125984252   0.38666666666666666  29  25  46  408
0.4  0.1062992125984252   0.38666666666666666  29  25  46  408
0.5  0.10384615384615385  0.38666666666666666  29  25  46  420
0.6  0.10384615384615385  0.38666666666666666  29  25  46  420
0.7  0.10384615384615385  0.38666666666666666  29  25  46  420
0.8  0.10384615384615385  0.38666666666666666  29  25  46  420
0.9  0.10246679316888045  0.37662337662337664  29  25  48  425
plt.plot(v_s_list[::-1])
plt.title("Lift curve (reversed)")
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))
tree.plot_tree(clf)
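At depth 6 the plotted tree is hard to read; export_text prints the same splits as indented text. A minimal sketch, assuming the feature names are the dataframe columns used for training:

from sklearn.tree import export_text

# One line per split, indented by depth
feature_names = list(df.drop(['Classe', 'quality'], axis=1).columns)
print(export_text(clf, feature_names=feature_names))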
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
X = np.array(df.drop(['Classe','quality'], axis=1))
y = np.array(df['Classe'])
kf.get_n_splits(X)
print(kf)
print(kf.get_n_splits(X))
KFold(n_splits=4, random_state=None, shuffle=False)
4
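Note that shuffle=False makes each fold a contiguous block of rows, so if Vins.csv is ordered in any way the folds may not be representative. A minimal variant (not used below, where the unshuffled splitter is kept so the printed AUCs stay reproducible):

# Shuffle the rows before splitting so each fold samples the whole file
kf_shuffled = KFold(n_splits=4, shuffle=True, random_state=42)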
fig = plt.figure()
ax = plt.subplot(111)
i=0
auc_list = []
for train_index, test_index in kf.split(X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # predictive model
    clf = tree.DecisionTreeClassifier(max_depth=6)
    clf = clf.fit(X_train, y_train)
    # tree.plot_tree(clf)
    y_score = clf.predict_proba(X_test)
    # compute the points of the ROC curve
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,-1])  # false positive rate, true positive rate, set of thresholds
    # plot the ROC curve
    plt.plot(fpr, tpr, label=i)
    i = i + 1
    # compute the area under the curve
    print("AUC for fold", i, ":", metrics.auc(fpr, tpr))
    auc_list += [metrics.auc(fpr, tpr)]
ax.legend()
plt.show()
mean_auc = sum(auc_list)/kf.get_n_splits(X)
AUC for fold 1 : 0.6963194444444444
AUC for fold 2 : 0.7907466539846457
AUC for fold 3 : 0.7695091459402554
AUC for fold 4 : 0.7446817082997581
print("précision avec l'AUC moyen", mean_auc)
précision avec l'AUC moyen 0.750314238167276
variance = 0
for val in auc_list:
    variance += (val - mean_auc)**2
variance = variance / kf.get_n_splits(X)  # variance of the fold AUCs (no square root taken)
print("Robustness (variance of the AUCs):", variance)
Robustness (variance of the AUCs): 0.0012375969687803735
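The whole fold loop can also be condensed with cross_val_score, which refits the model on each split and returns one score per fold directly; a minimal sketch using the same splitter and metric:

from sklearn.model_selection import cross_val_score

# One ROC AUC per fold; mean and standard deviation summarize performance and robustness
aucs = cross_val_score(tree.DecisionTreeClassifier(max_depth=6), X, y,
                       cv=kf, scoring='roc_auc')
print(aucs.mean(), aucs.std())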
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(X,y)
# tree.plot_tree(clf)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Classe','quality'], axis=1),
    df['Classe'], test_size=0.33, random_state=42
)
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
fig = plt.figure(figsize=(10,10))
ax = plt.subplot(111)
model_list = [tree.DecisionTreeClassifier(max_depth=6),
              tree.DecisionTreeClassifier(max_depth=4),
              tree.DecisionTreeClassifier(max_depth=3),
              RandomForestClassifier(max_depth=5, random_state=0),
              LinearRegression(),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1),
              Perceptron(max_iter=40, eta0=0.1, random_state=0),
              KNeighborsClassifier(n_neighbors=3),
              KNeighborsClassifier(n_neighbors=6)]
auc_list = []
for m in model_list:
    # predictive model
    model = m.fit(X_train, y_train)
    try:
        y_score = model.predict_proba(X_test)
        # compute the points of the ROC curve
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,-1])  # false positive rate, true positive rate, set of thresholds
    except AttributeError:
        # models without predict_proba (LinearRegression, Perceptron): use the raw predictions as scores
        fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict(X_test))
    # plot the ROC curve
    plt.plot(fpr, tpr, label=m)
    # compute the area under the curve
    print("AUC for", m, ":", metrics.auc(fpr, tpr))
    auc_list += [metrics.auc(fpr, tpr)]
ax.legend()
plt.show()
mean_auc = sum(auc_list)/len(auc_list)  # mean AUC over the ten models
AUC for DecisionTreeClassifier(max_depth=6) : 0.7679615284936793
AUC for DecisionTreeClassifier(max_depth=4) : 0.7820139948743052
AUC for DecisionTreeClassifier(max_depth=3) : 0.8163820658277421
AUC for RandomForestClassifier(max_depth=5, random_state=0) : 0.8817922653842831
AUC for LinearRegression() : 0.8739885391770092
ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in https://scikit-learn.org/stable/modules/preprocessing.html
AUC for MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1, solver='lbfgs') : 0.8923892072450832
AUC for MLPClassifier(alpha=1e-05, random_state=1, solver='lbfgs') : 0.877674431998157
AUC for Perceptron(eta0=0.1, max_iter=40) : 0.6854608805828317
AUC for KNeighborsClassifier(n_neighbors=3) : 0.7553920580528118
AUC for KNeighborsClassifier(n_neighbors=6) : 0.778126529789501
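The ConvergenceWarning above suggests scaling the data for the MLPs; wrapping the classifier in a pipeline with StandardScaler is the usual remedy. A minimal sketch with the same hyperparameters (max_iter raised as the warning recommends):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features before the MLP so lbfgs converges more easily
scaled_mlp = make_pipeline(StandardScaler(),
                           MLPClassifier(solver='lbfgs', alpha=1e-5,
                                         hidden_layer_sizes=(100,),
                                         random_state=1, max_iter=1000))
scaled_mlp.fit(X_train, y_train)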
param_grid = {'max_depth': [3, 5, 10],
              'min_samples_split': [2, 5, 10]}
base_estimator = RandomForestClassifier(random_state=0)
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                         max_resources=30).fit(X, y)
print(sh.best_estimator_)
RandomForestClassifier(max_depth=3, min_samples_split=5, n_estimators=24,
random_state=0)
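Since sh was fit on all of X and y, scoring best_estimator_ on that same data would be optimistic; re-estimating its AUC by cross-validation gives a fairer figure. A minimal sketch:

from sklearn.model_selection import cross_val_score

# Cross-validated AUC of the tuned forest, refitted fold by fold
best_aucs = cross_val_score(sh.best_estimator_, X, y, cv=5, scoring='roc_auc')
print("mean AUC of the tuned model:", best_aucs.mean())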