import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe
df = pd.read_csv('white_wine_quality.csv', sep=";")
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
df['target'] = df['quality'].map({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0,
                                  7: 1, 8: 1, 9: 1, 10: 1})
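# An equivalent, more compact way to build the target column (a sketch of an
# alternative, not the code used for the results below): quality >= 7 counts
# as acceptable.
df['target'] = (df['quality'] >= 7).astype(int)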
# Step 2. NaN values?
if df.isnull().values.any():
    print("NaN values found")
else:
    print("no NaN values found")
no NaN values found
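# If NaNs were ever found, a per-column count would show where they are
# (a small sketch using the same df):
print(df.isnull().sum())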
# Step 3. Quality groupings and countplot
for i in range(3, 11):
    print(f"quality: {i}, count: {len(df[df.quality == i])}")
quality: 3, count: 20
quality: 4, count: 163
quality: 5, count: 1457
quality: 6, count: 2198
quality: 7, count: 880
quality: 8, count: 175
quality: 9, count: 5
quality: 10, count: 0
sns.set_style('whitegrid')
sns.countplot(data=df, x='quality')
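# The same counts can be read directly with value_counts (a sketch of an
# alternative to the loop above):
print(df['quality'].value_counts().sort_index())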
# Step 4. For each feature, determine whether any data instance is an outlier;
# if it is, delete that data instance
print(f"number of data entries: {len(df)}")
number of data entries: 4898
# Step 4 continued - drop any row whose z-score exceeds 5 for a feature
for feature in df:
    f = df[feature].values
    mean = np.mean(f)
    std = np.std(f)
    count = 0
    # iterate over row labels so the drops stay valid after earlier deletions
    for label, value in zip(df.index, f):
        z = (value - mean) / std
        if z > 5:
            count = count + 1
            df = df.drop(label)
    print(f"{count} {feature}'s have been dropped")
2 fixed acidity's have been dropped
9 volatile acidity's have been dropped
8 citric acid's have been dropped
1 residual sugar's have been dropped
56 chlorides's have been dropped
7 free sulfur dioxide's have been dropped
2 total sulfur dioxide's have been dropped
3 density's have been dropped
0 pH's have been dropped
1 sulphates's have been dropped
0 alcohol's have been dropped
0 quality's have been dropped
0 target's have been dropped
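# A vectorized near-equivalent for flagging outliers (a sketch; it assumes scipy
# is available and is not part of the run above). Note the loop recomputes the
# mean and std after each feature's drops, so the two approaches can differ
# slightly; outlier_free is just an illustrative mask name.
from scipy import stats
outlier_free = (np.abs(stats.zscore(df)) <= 5).all(axis=1)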
# Step 5. get data into correct form
#
df_mod = df.drop(['quality', 'target'], axis=1)
X = df_mod.values
y = df['target'].to_numpy()
print(f"X: {len(X[1])} features x {len(X)} entires")
print(f"y: {len(y)} data entries")
X: 11 features x 4809 entries
y: 4809 data entries
# Step 6. Split data into training and test sets with an 80-20 split
from sklearn.model_selection import train_test_split
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2)
print(f"training: {len(X_train)}, testing: {len(X_test)}")
training: 3847, testing: 962
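# train_test_split shuffles differently on every run; passing random_state makes
# the split reproducible and stratify=y keeps the class balance in both sets
# (a sketch with new variable names so the split above is left untouched):
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                           random_state=42, stratify=y)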
# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
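# Scaling can also be bundled with a classifier in a Pipeline so the scaler is
# only ever fit on training data (a sketch of an alternative workflow; pipe is
# illustrative and not used below, and LogisticRegression is imported here only
# to keep the sketch self-contained):
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs'))
# fitting pipe on the unscaled training data would do both steps in one call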
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
percent = round(100 * lr.score(X_test, y_test), 2)
print(f"Logistic Regression accuracy: {percent}%")
Logistic Regression accuracy: 82.22%
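# Accuracy alone can hide the class imbalance (only about a fifth of the wines
# are labeled acceptable); a per-class breakdown gives a fuller picture
# (a small sketch using scikit-learn's classification_report):
from sklearn.metrics import classification_report
print(classification_report(y_test, lr.predict(X_test)))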
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=300, solver='adam')
mlp.fit(X_train, y_train)
percent = round(100 * mlp.score(X_test, y_test), 2)
print(f"ANN accuracy: {percent}%")
ANN accuracy: 84.1%
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
% self.max_iter, ConvergenceWarning)
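# The ConvergenceWarning above means the optimizer hit max_iter before
# converging; raising max_iter, or enabling early_stopping, usually removes it.
# A sketch (mlp2 is illustrative and not the model reported above):
mlp2 = MLPClassifier(max_iter=1000, solver='adam', early_stopping=True)
mlp2.fit(X_train, y_train)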
# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
best_percent = 0
best_k = 1
for k in range(1, 6):
    knni = KNeighborsClassifier(n_neighbors=k)
    # fitting the training data
    knni.fit(X_train, y_train)
    # scored on the training set, which will always favor k = 1
    percent = round(100 * knni.score(X_train, y_train), 2)
    print(f"nearest neighbor {k}, training accuracy: {percent}%")
    if percent > best_percent:
        best_percent = percent
        best_k = k
print(f"kNN with {best_k} nearest neighbor training accuracy: {best_percent}%")
nearest neighbor 1, training accuracy: 100.0%
nearest neighbor 2, training accuracy: 92.93%
nearest neighbor 3, training accuracy: 91.97%
nearest neighbor 4, training accuracy: 89.03%
nearest neighbor 5, training accuracy: 89.24%
kNN with 1 nearest neighbor training accuracy: 100.0%
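# Scoring on the training set always ranks k = 1 first (each point is its own
# nearest neighbor); cross-validation on the training data is a common way to
# compare values of k (a sketch, not the selection used above):
from sklearn.model_selection import cross_val_score
for k in range(1, 6):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5)
    print(f"k = {k}: mean CV accuracy {round(100 * scores.mean(), 2)}%")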
# Step 11. Refit kNN with k = 1; note it is fit and scored on the same test set
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_test, y_test)
percent = round(100 * knn.score(X_test, y_test), 2)
print(f"k: {best_k}, percent accuracy {percent}%")
k: 1, percent accuracy 100.0%
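# Fitting and scoring on the same test set all but guarantees 100% for k = 1;
# a sketch of the held-out check (fit on the training set, score on the test
# set; knn_holdout is illustrative and its result is not reported above):
knn_holdout = KNeighborsClassifier(n_neighbors=1)
knn_holdout.fit(X_train, y_train)
print(f"k = 1 held-out test accuracy: {round(100 * knn_holdout.score(X_test, y_test), 2)}%")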
The LR and ANN models had similar outcomes of roughly 82-84% accuracy; however, the ANN took much longer to train. The kNN model had a much better outcome, with 100% accuracy for both the training and testing sets when one nearest neighbor was used, so it is the model that did best.