import pandas as pd
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
# Load the wine dataset and derive a binary target: wines with a
# quality score of 7 or more are the positive class (1), the rest 0.
df = pd.read_csv("Vins.csv")
df["Classe"] = np.where(df["quality"] >= 7, 1, 0)
df.head()
from sklearn.model_selection import train_test_split
# Hold out a third of the rows for evaluation. The raw 'quality' column
# is dropped along with the label so the model cannot trivially
# reconstruct the target from it.
features = df.drop(['Classe', 'quality'], axis=1)
target = df['Classe']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)
from sklearn import tree
# Fit a depth-limited decision tree, then take a first look at its
# accuracy and confusion matrix on the held-out set.
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(X_train, y_train)
# tree.plot_tree(clf)
clf.score(X_test, y_test)  # first evaluation of the model (accuracy)
y_test_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
conf_matrix
# Unpack the confusion matrix entries: row 0 holds [true neg, false pos],
# row 1 holds [false neg, true pos] (French initials: vn, fp, fn, vp).
vn, fp, fn, vp = conf_matrix.ravel()
# precision / recall via sklearn
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("precision:", precision, "\nrecall:", recall)
# Same metrics recomputed by hand from the matrix entries, as a check.
print("precision: ", vp / (vp + fp))
print("recall", vp / (vp + fn))
# Specificity = true-negative rate; sensibility = true-positive rate.
specificity = vn / (vn + fp)
print(specificity)
sensibility = vp / (vp + fn)
print(sensibility)
# Score distribution and ROC curve on the test set.
probas = clf.predict_proba(X_test)
plt.hist(probas)
# FIX: reuse the probabilities computed just above instead of calling
# predict_proba a second time, as the original did.
y_score = probas
# ROC curve points: false positive rate, true positive rate and the set
# of decision thresholds, computed from the positive-class probability
# (last column of predict_proba's output).
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:, -1])
# Plot the ROC curve
plt.plot(fpr, tpr)
metrics.auc(fpr, tpr)
L'AUC est supérieur à 0,5. Il y a donc de grandes chances que le modèle arrive à distinguer les classes positives des classes négatives. Le point idéal correspond (environ) au point (0.1, 0.6). Là où la courbe change de vecteur directeur se trouve le meilleur compromis entre vrais positifs et vrais négatifs.
# Manually sweep the decision threshold s over [0, 1) and record, for
# each s:
#   u(s) = fraction of samples predicted positive (targeting rate)
#   v(s) = recall / true-positive rate
# These are the points of a lift-style curve.
u_s_list = []
v_s_list = []
probas = clf.predict_proba(X_test)
score = probas[:, 1]  # probability of the positive class
step = 0.1
x_val = np.arange(0, 1, step)
print(x_val)
# Materialise the labels once; the original rebuilt list(y_test) on
# every single comparison inside the inner loop.
y_true = np.asarray(y_test)
for s in x_val:
    # BUG FIX: the original computed np.where(score > s, 1, score) and
    # then immediately overwrote it with np.where(score <= s, 0, score),
    # which discards the 1 labels (entries above the threshold kept their
    # raw probability instead of becoming 1). A single np.where produces
    # the intended hard 0/1 predictions at threshold s.
    pred_seuil = np.where(score > s, 1, 0)
    vp = 0
    fp = 0
    vn = 0
    fn = 0
    # BUG FIX: range(0, len(pred_seuil) - 1) skipped the last sample.
    for i in range(len(pred_seuil)):
        if pred_seuil[i] == 1 and y_true[i] == 1:
            vp += 1
        elif pred_seuil[i] == 1 and y_true[i] == 0:
            fp += 1
        elif pred_seuil[i] == 0 and y_true[i] == 1:
            fn += 1
        elif pred_seuil[i] == 0 and y_true[i] == 0:
            vn += 1
    u_s = (vp + fp) / (vp + fn + fp + vn)
    v_s = vp / (vp + fn)
    u_s_list += [u_s]
    v_s_list += [v_s]
    print(s)
    print(u_s, v_s)
    print(vp, fp, "\n", fn, vn, "\n")
plt.plot(v_s_list[::-1])
plt.title("Courbe lift a reverse ")
# Refit the tree on the whole dataset (train + test combined) and
# display its structure.
all_X = pd.concat([X_train, X_test])
all_y = pd.concat([y_train, y_test])
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(all_X, all_y)
tree.plot_tree(clf)
from sklearn.model_selection import KFold

# 4-fold cross-validation: fit one tree per fold and plot its ROC
# curve, then summarise the AUCs by their mean and spread.
kf = KFold(n_splits=4)
X = np.array(df.drop(['Classe', 'quality'], axis=1))
y = np.array(df['Classe'])
kf.get_n_splits(X)
print(kf)
print(kf.get_n_splits(X))
fig = plt.figure()
ax = plt.subplot(111)
i = 0
auc_list = []
for train_index, test_index in kf.split(X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # predictive model for this fold
    clf = tree.DecisionTreeClassifier(max_depth=6)
    clf = clf.fit(X_train, y_train)
    y_score = clf.predict_proba(X_test)
    # ROC curve points: false positive rate, true positive rate, thresholds
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:, -1])
    plt.plot(fpr, tpr, label=i)
    i = i + 1
    # Area under the ROC curve for this fold (computed once, the
    # original called metrics.auc twice per iteration).
    fold_auc = metrics.auc(fpr, tpr)
    print("AUC de ", i, " : ", fold_auc)
    auc_list += [fold_auc]
ax.legend()
plt.show()
mean_auc = sum(auc_list) / kf.get_n_splits(X)
print("précision avec l'AUC moyen", mean_auc)
# BUG FIX: the original printed the *variance* while labelling it the
# standard deviation ("écart type"); take the square root so the
# printed value matches its label.
variance = sum((val - mean_auc) ** 2 for val in auc_list) / kf.get_n_splits(X)
ecart_type = variance ** 0.5
print("Robustesse avec écart type : ", ecart_type)
# Train a final tree on every sample, then make a fresh hold-out split
# (same seed as before) for the model-comparison section below.
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(X, y)
# tree.plot_tree(clf)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Classe', 'quality'], axis=1),
    df['Classe'],
    test_size=0.33,
    random_state=42,
)
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
# Compare several models on the same split: one ROC curve and one AUC
# per model, all drawn on a single figure.
fig = plt.figure(figsize=(10, 10))
ax = plt.subplot(111)
model_list = [
    tree.DecisionTreeClassifier(max_depth=6),
    tree.DecisionTreeClassifier(max_depth=4),
    tree.DecisionTreeClassifier(max_depth=3),
    RandomForestClassifier(max_depth=5, random_state=0),
    LinearRegression(),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1),
    Perceptron(max_iter=40, eta0=0.1, random_state=0),
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=6),
]
i = 0
auc_list = []
for m in model_list:
    model = m.fit(X_train, y_train)
    # BUG FIX: the original bare `except:` silently swallowed *every*
    # error; only models without predict_proba (LinearRegression,
    # Perceptron) should fall back to raw predictions as ROC scores.
    try:
        y_score = model.predict_proba(X_test)
        # positive-class probability -> ROC points
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:, -1])
    except AttributeError:
        # No probability estimates: rank samples by the raw predictions.
        fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict(X_test))
    plt.plot(fpr, tpr, label=m)
    i = i + 1
    # Area under the ROC curve (computed once per model).
    model_auc = metrics.auc(fpr, tpr)
    print("AUC de ", m, " : ", model_auc)
    auc_list += [model_auc]
ax.legend()
plt.show()
# BUG FIX: the original divided by kf.get_n_splits(X) (= 4, the number
# of CV folds) instead of the number of models actually evaluated,
# producing a wrong "mean" AUC.
mean_auc = sum(auc_list) / len(auc_list)
# Successive-halving grid search over tree depth and split size, with
# the forest's number of trees used as the budgeted resource.
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
}
base_estimator = RandomForestClassifier(random_state=0)
sh = HalvingGridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    factor=2,
    resource='n_estimators',
    max_resources=30,
).fit(X, y)
print(sh.best_estimator_)