import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe
file = 'white_wine_quality.csv'
wine = pd.read_csv(file, sep=";")
wine.head()
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
wine['target'] = wine['quality'].map(
    {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0,
     7: 1, 8: 1, 9: 1, 10: 1})
wine.head()
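# Aside: an equivalent one-line way to build the same binary target; a sketch
# of the vectorized form (quality of 7 or above means acceptable):
wine['target'] = (wine['quality'] >= 7).astype(int)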
# Step 2. NaN values?
if wine.isnull().values.any():
    print("there are NaN values; please clean the data before proceeding.")
else:
    print("there are no NaN values; the data is clean. you can proceed.")
there are no NaN values; the data is clean. you can proceed.
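# Aside: if NaN values did turn up, a per-column count like this sketch
# would show which features need cleaning:
print(wine.isnull().sum())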
# Step 3. Quality groupings and countplot
sns.set_style('whitegrid')
sns.countplot(x='quality', data=wine)
print("quality | number of wines")
print("--------------------------")
for i in range(3, 11):
    print(f" {i} | {len(wine[wine.quality == i])}")
quality | number of wines
--------------------------
3 | 20
4 | 163
5 | 1457
6 | 2198
7 | 880
8 | 175
9 | 5
10 | 0
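# Aside: value_counts produces the same tally in one call (qualities with
# zero wines are simply absent from the result); a sketch:
print(wine['quality'].value_counts().sort_index())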
# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
total = len(wine)
print(f"total data entires: {total}")
total data entires: 4898
# Step 4 continued - z-score filter applied to every feature
for feature in wine:
    f = wine[feature].values
    mean = np.mean(f)
    std = np.std(f)
    # a value more than 5 standard deviations above the mean counts as an outlier;
    # dropping by index label avoids mixing up positions and labels as rows vanish
    z = (f - mean) / std
    outliers = wine.index[z > 5]
    count = len(outliers)
    wine = wine.drop(outliers)
    print(f"{count} | {feature} outlier(s) dropped")
total = len(wine)
print(f"\ntotal data entries: {total}")
2 | fixed acidity outlier(s) dropped
9 | volatile acidity outlier(s) dropped
8 | citric acid outlier(s) dropped
1 | residual sugar outlier(s) dropped
56 | chlorides outlier(s) dropped
7 | free sulfur dioxide outlier(s) dropped
2 | total sulfur dioxide outlier(s) dropped
3 | density outlier(s) dropped
0 | pH outlier(s) dropped
1 | sulphates outlier(s) dropped
0 | alcohol outlier(s) dropped
0 | quality outlier(s) dropped
0 | target outlier(s) dropped
total data entries: 4809
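# Aside: a single-pass vectorized sketch of the same z > 5 filter; thresholds
# here are computed once per column, so counts can differ slightly from the
# sequential feature-by-feature loop above (wine_alt is a sketch name):
z = (wine - wine.mean()) / wine.std(ddof=0)
wine_alt = wine[(z <= 5).all(axis=1)]
print(f"rows kept by the single-pass filter: {len(wine_alt)}")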
# Step 5. Get the data into the correct form
wine_mod = wine.drop(['quality', 'target'], axis=1)
X = wine_mod.values
y = wine['target'].to_numpy()
print(f"X | {X.shape[1]} features x {X.shape[0]} entries")
print(f"y | {len(y)} data entries")
X | 11 features x 4809 entries
y | 4809 data entries
# Step 6. Split the data into training and test sets with an 80-20 split
from sklearn.model_selection import train_test_split
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2)
print(f"training data entries | {len(X_train)}")
print(f"testing data entries | {len(X_test)}")
training data entries | 3847
testing data entries | 962
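# Aside: passing stratify=y (and a fixed random_state) keeps the 0/1 class
# balance the same in both splits and makes the split reproducible; a sketch:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
print(f"class-1 share | train {y_tr.mean():.3f} vs test {y_te.mean():.3f}")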
# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit the scaler on the training data only, then apply it to both sets,
# so no information from the test set leaks into the preprocessing
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
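# Aside: a Pipeline bundles the scaler with a downstream model so the scaler
# is always fit on the training data only; a minimal sketch (it would be fit
# on the raw, unscaled split):
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs'))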
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
# fit on the training set, then score on the held-out test set
lr.fit(X_train, y_train)
percent = round(100 * lr.score(X_test, y_test), 2)
print(f"The percent accuracy of logistic regression on the testing set is {percent}%")
The percent accuracy of logistic regression on the testing set is 80.15%
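# Aside: the classes are imbalanced (most wines are class 0), so accuracy
# alone can flatter a model; a per-class breakdown is a fairer view. A sketch:
from sklearn.metrics import classification_report
print(classification_report(y_test, lr.predict(X_test)))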
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=300, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
percent = round(100 * mlp.score(X_test, y_test), 2)
print(f"The percent accuracy of the ANN on the testing set is {percent}%")
The percent accuracy of the ANN on the testing set is 85.45%
ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
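# Aside: the warning above means adam hit the 300-iteration cap before
# converging; raising max_iter or enabling early stopping addresses it
# (mlp2 is a sketch name):
mlp2 = MLPClassifier(max_iter=1000, solver='adam', early_stopping=True, random_state=1)
mlp2.fit(X_train, y_train)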
# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
best_acc = 0
best_k = 0
print("nearest neighbors | training accuracy")
print("----------------------------------------")
for k in (1, 2, 3, 4, 5):
    knni = KNeighborsClassifier(n_neighbors=k)
    # fit on the training data
    knni.fit(X_train, y_train)
    # note: scored on the training set, so k = 1 is trivially perfect
    # (every point is its own nearest neighbor)
    percent = round(100 * knni.score(X_train, y_train), 2)
    print(f" {k} | {percent}%")
    if percent > best_acc:
        best_acc = percent
        best_k = k
print(f"\nbest value of k | {best_k}")
print(f"training accuracy of {best_k} neighbor(s) | {best_acc}%")
nearest neighbors | training accuracy
----------------------------------------
 1 | 100.0%
 2 | 92.7%
 3 | 91.55%
 4 | 88.77%
 5 | 88.72%
best value of k | 1
training accuracy of 1 neighbor(s) | 100.0%
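# Aside: cross-validation on the training set is a fairer way to choose k
# than training accuracy; a sketch:
from sklearn.model_selection import cross_val_score
for k in (1, 2, 3, 4, 5):
    acc = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5).mean()
    print(f" {k} | {100 * acc:.2f}%")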
# Step 11. Refit kNN with the chosen k and score it
knn = KNeighborsClassifier(n_neighbors=1)
# note: this fits and scores on the same (test) data, so with k = 1 each
# point is its own nearest neighbor and the score is trivially 100%;
# fitting on X_train and scoring on X_test would give an honest test accuracy
knn.fit(X_test, y_test)
percent = round(100 * knn.score(X_test, y_test), 2)
print(f"percent accuracy of {best_k} neighbor(s) | {percent}%")
percent accuracy of 1 neighbor(s) | 100.0%
print("model | time to compile | percent accuracy")
print("--------------------------------------------------")
print("LR | <1s | 80.15%")
print("ANN | 45s | 85.45%")
print("kNN | <1s | 100%")
model | time to compile | percent accuracy
--------------------------------------------------
LR | <1s | 80.15%
ANN | 45s | 85.45%
kNN | <1s | 100%
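# Aside: the fit times above appear to be rough wall-clock observations; a
# fit can be timed directly with perf_counter. A sketch:
import time
t0 = time.perf_counter()
LogisticRegression(solver='lbfgs').fit(X_train, y_train)
print(f"LR fit time: {time.perf_counter() - t0:.3f}s")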