import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe
df = pd.read_csv('white_wine_quality.csv', sep=';')
print(df)
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.0              0.27         0.36            20.7      0.045
1               6.3              0.30         0.34             1.6      0.049
2               8.1              0.28         0.40             6.9      0.050
3               7.2              0.23         0.32             8.5      0.058
4               7.2              0.23         0.32             8.5      0.058
...             ...               ...          ...             ...        ...
4893            6.2              0.21         0.29             1.6      0.039
4894            6.6              0.32         0.36             8.0      0.047
4895            6.5              0.24         0.19             1.2      0.041
4896            5.5              0.29         0.30             1.1      0.022
4897            6.0              0.21         0.38             0.8      0.020

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                    45.0                 170.0  1.00100  3.00       0.45
1                    14.0                 132.0  0.99400  3.30       0.49
2                    30.0                  97.0  0.99510  3.26       0.44
3                    47.0                 186.0  0.99560  3.19       0.40
4                    47.0                 186.0  0.99560  3.19       0.40
...                   ...                   ...      ...   ...        ...
4893                 24.0                  92.0  0.99114  3.27       0.50
4894                 57.0                 168.0  0.99490  3.15       0.46
4895                 30.0                 111.0  0.99254  2.99       0.46
4896                 20.0                 110.0  0.98869  3.34       0.38
4897                 22.0                  98.0  0.98941  3.26       0.32

      alcohol  quality
0         8.8        6
1         9.5        6
2        10.1        6
3         9.9        6
4         9.9        6
...       ...      ...
4893     11.2        6
4894      9.6        5
4895      9.4        6
4896     12.8        7
4897     11.8        6

[4898 rows x 12 columns]
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
df['wine_acceptable'] = df['quality'].map({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1})
df['wine_t/f'] = df['wine_acceptable'].map({0: "false", 1: "true"})
df
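# An equivalent, more compact way to build the same labels (a sketch; assumes
# "acceptable" means a quality score of 7 or higher, as in the map above):
df['wine_acceptable'] = (df['quality'] >= 7).astype(int)
df['wine_t/f'] = df['wine_acceptable'].map({0: "false", 1: "true"})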
# Step 2. Nan values?
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64
 12  wine_acceptable       4898 non-null   int64
 13  wine_t/f              4898 non-null   object
dtypes: float64(11), int64(2), object(1)
memory usage: 535.8+ KB
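# Explicit missing-value check (a sketch): df.info() already shows every column
# with 4898 non-null entries, and isna().sum() confirms there are no NaN values.
print(df.isna().sum())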
# Step 3. Quality groupings and countplot
sns.set_style('whitegrid')
sns.countplot(x='quality', data=df, hue='wine_acceptable')
# rating 6 seems to have the most wines in it
df.groupby('quality').size()
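# Class balance for the binary target (a sketch): counts of acceptable (1)
# versus unacceptable (0) wines.
print(df['wine_acceptable'].value_counts())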
# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
# Example follows for one feature
f = df['fixed acidity'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std   # z-score of this instance for the feature
    if z > 5:                 # treat more than 5 standard deviations above the mean as an outlier
        print(z)
        count = count + 1
        df = df.drop([i])     # drop the row with index label i
print("number of data instances dropped is ", count)
8.705105871848145
5.860769568232091
number of data instances dropped is 2
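# A label-safe way to do the same filtering (a sketch): the loop above mixes
# positional indices (f[i]) with index labels (df.drop([i])), which stop lining
# up once rows have been removed; a boolean mask on the column avoids that.
z = (df['fixed acidity'] - df['fixed acidity'].mean()) / df['fixed acidity'].std(ddof=0)
df = df[z <= 5]   # keep only rows no more than 5 standard deviations above the mean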
# Step 4 example - volatile acidity feature
f = df['volatile acidity'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
6.21758267510449
5.67196082288019
6.267184661670337
7.209622406421397
6.4655926079337185
5.3247469169192705
6.812806513894635
8.152060151172464
5.0271349975241995
number of data instances dropped is 9
f = df['citric acid'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
10.954598384631716
5.5012317050890545
5.4186049372171965
5.5012317050890545
5.5012317050890545
5.5012317050890545
7.4016473661418
5.5012317050890545
number of data instances dropped is 8
f = df['residual sugar'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
11.712597104042679
number of data instances dropped is 1
f = df['chlorides'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
5.768331607642535
5.81403527782101
7.048034372639853
6.910923362104426
6.910923362104426
13.720770218697288
6.408182990141194
6.1339609690703405
8.876181179778875
11.161364688702655
6.362479319962718
5.676924267285585
9.058995860492779
7.0937380428183285
7.0937380428183285
11.664105060665886
5.631220597107109
5.5855169269286336
5.768331607642535
5.539813256750158
8.8304775096004
7.550774744603084
9.56173623245601
7.230849053353754
7.413663734067657
5.5855169269286336
5.219887565500828
6.088257298891865
7.824996765673937
5.082776554965402
5.082776554965402
5.1284802251438775
5.082776554965402
5.5855169269286336
5.082776554965402
5.037072884786926
5.676924267285585
5.72262793746406
5.631220597107109
6.316775649784242
6.316775649784242
5.219887565500828
5.539813256750158
6.773812351568999
6.362479319962718
5.905442618177962
5.859738947999486
7.413663734067657
7.459367404246132
5.951146288356438
5.951146288356438
10.29299495531162
7.59647841478156
5.81403527782101
5.905442618177962
5.859738947999486
number of data instances dropped is 56
f = df['free sulfur dioxide'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
5.631361046022421
5.131230288481984
6.543364192125572
5.454844308066972
6.072652890911042
5.219488657459707
14.927909245009381
number of data instances dropped is 7
f = df['total sulfur dioxide'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
5.374481636966608
7.105858813452604
number of data instances dropped is 2
f = df['density'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
5.433035717143655
5.433035717143655
15.007731123102785
number of data instances dropped is 3
f = df['pH'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
number of data instances dropped is 0
f = df['sulphates'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
5.167319535096534
number of data instances dropped is 1
f = df['alcohol'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range(0, n):
    z = (f[i] - mean) / std
    if z > 5:
        print(z)
        count = count + 1
        df = df.drop([i])
print("number of data instances dropped is ", count)
number of data instances dropped is 0
for i in range(0, 11):
    f = df.columns[i].values
    mean = np.mean(f)
    std = np.std(f)
    n = len(f)
    count = 0
    for i in range(0, n):
        z = (f[i] - mean) / std
        if z > factor:
            print(z)
            count = count + 1
            df = df.drop([i])
    print("number of data instances dropped is ", count)
# attempt to loop through the columns didn't work (a corrected sketch follows below)...
AttributeError: 'str' object has no attribute 'values'
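# Working version of the loop (a sketch): df.columns[i] is just the column name
# (a string), which has no .values attribute. Index into df with the name
# instead, and filter with a boolean mask so row labels stay consistent.
factor = 5                               # the z-score cutoff used in the cells above
for col in df.columns[:11]:              # the 11 numeric feature columns
    z = (df[col] - df[col].mean()) / df[col].std(ddof=0)   # ddof=0 matches np.std
    outliers = z > factor
    df = df[~outliers]
    print(col, ": number of data instances dropped is", outliers.sum())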
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4897
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   fixed acidity         4809 non-null   float64
 1   volatile acidity      4809 non-null   float64
 2   citric acid           4809 non-null   float64
 3   residual sugar        4809 non-null   float64
 4   chlorides             4809 non-null   float64
 5   free sulfur dioxide   4809 non-null   float64
 6   total sulfur dioxide  4809 non-null   float64
 7   density               4809 non-null   float64
 8   pH                    4809 non-null   float64
 9   sulphates             4809 non-null   float64
 10  alcohol               4809 non-null   float64
 11  quality               4809 non-null   int64
 12  wine_acceptable       4809 non-null   int64
 13  wine_t/f              4809 non-null   object
dtypes: float64(11), int64(2), object(1)
memory usage: 563.6+ KB
# Step 5. Get the data into the correct form
y = df['wine_acceptable'].to_numpy()
df_mod = df.drop(['quality', 'wine_acceptable', 'wine_t/f'], axis=1)
X = df_mod.values
# Step 6. Split the data into training and test sets with an 80-20 split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
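# To make the split reproducible and keep the acceptable/unacceptable ratio the
# same in both sets, one could pass random_state and stratify (a sketch;
# random_state=42 is an arbitrary choice):
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)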
# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
train_score = lr.score(X_train, y_train) * 100   # evaluate on the training data
train_score = round(train_score, 2)
print("Percent of accuracy on training data is ", train_score)
test_score = lr.score(X_test, y_test) * 100      # evaluate on the held-out test data
test_score = round(test_score, 2)
print("Percent of accuracy on test data using Logistic Regression is ", test_score)
Percent of accuracy on training data is 79.83
Percent of accuracy on test data using Logistic Regression is 79.83
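# A closer look at the logistic regression predictions (a sketch; most wines are
# labeled unacceptable, so accuracy alone can sit close to the majority-class rate):
from sklearn.metrics import classification_report, confusion_matrix
y_pred = lr.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))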
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=300, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
print("Percent of accuracy on training data is ", 100 * mlp.score(X_train, y_train))
print("Percent of accuracy on test data is ", 100 * mlp.score(X_test, y_test))
Percent of accuracy on training data is 88.19859630881206
Percent of accuracy on test data is 83.36798336798337
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
% self.max_iter, ConvergenceWarning)
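# The ConvergenceWarning above means the 'adam' optimizer hit max_iter before
# converging. A simple remedy (a sketch): allow more iterations and refit.
mlp = MLPClassifier(max_iter=1000, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
print("Percent of accuracy on test data is ", 100 * mlp.score(X_test, y_test))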
# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f"Percent of accuracy on training data using k={k} is {100 * knn.score(X_train, y_train)}")
Percent of accuracy on training data using k=1 is 100.0
Percent of accuracy on training data using k=2 is 93.00753834156485
Percent of accuracy on training data using k=3 is 91.91577852872368
Percent of accuracy on training data using k=4 is 88.9004419027814
Percent of accuracy on training data using k=5 is 89.08240187158825
# k=1 scores 100% on the training data by construction, so to pick the best k
# the models should be compared on the test set (see the sketch below)
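# Comparing k values on the held-out test set instead (a sketch):
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f"Percent of accuracy on test data using k={k} is {100 * knn.score(X_test, y_test)}")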
# Step 11. Conclusions
# kNN posts the highest accuracies, which I believe I also saw in previous
# projects, and this held across all k values; note, though, that those numbers
# are training-set accuracies, so they are not directly comparable to the test
# scores of the other models.
# Logistic regression is the least accurate, at roughly 79.8% on the test set.
# The ANN is a fairly accurate predictor, at about 83.4% on the test set.