import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe
df = pd.read_csv ('white_wine_quality.csv', sep=';')
df
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
def wine (quality) :
if (quality >= 7):
return(1)
else:
return(0)
df['Is the wine acceptable?']=df['quality'].apply (wine)
df
# Step 2. Nan values?
df.info()
#there are no NaN values
#4898 data entries
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
12 Is the wine acceptable? 4898 non-null int64
dtypes: float64(11), int64(2)
memory usage: 497.6 KB
# Step 3. Quality groupings and countplot
sns.countplot (x='quality', data = df)
# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
# Example follows for one feature
# Step 4 example - volatile acidity feature
f = df['volatile acidity'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
6.218817206612536
5.673097063450098
6.26842812871821
7.211035648726053
6.466871817140914
5.325820608710367
6.814148271880645
8.153643168733899
5.028155076076311
number of data instances dropped is 9
# Step 4- citric acid
f = df['citric acid'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
10.97321349925364
5.510519600158539
5.42775151077831
5.510519600158539
5.510519600158539
5.510519600158539
7.414185655903801
5.510519600158539
number of data instances dropped is 8
# Step 4 - residual sugar
f = df['residual sugar'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
5.041619025018026
5.041619025018026
number of data instances dropped is 2
# Step 4 - fixed acidity
f = df['fixed acidity'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
8.741090597247966
5.88590150484045
number of data instances dropped is 2
# Step 4 - chlorides
f = df['chlorides'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
5.768590740404831
5.814290879683098
7.048194640196295
6.911094222361495
6.911094222361495
13.720414974823202
6.408392690300563
6.134191854630964
8.876200211326953
11.161207175240278
6.362692551022296
5.6771904618483
9.05900076844002
7.093894779474561
7.093894779474561
11.66390870730121
5.631490322570032
5.585790183291766
5.768590740404831
5.5400900440135
8.830500072048688
7.550896172257225
9.561702300500952
7.230995197309359
7.413795754422425
5.585790183291766
5.220189069065634
6.0884917153526965
7.825097007926824
5.083088651230834
5.083088651230834
5.128788790509101
5.083088651230834
5.585790183291766
5.083088651230834
5.037388511952567
5.6771904618483
5.722890601126566
5.631490322570032
6.31699241174403
6.31699241174403
5.220189069065634
5.5400900440135
6.773993804526695
6.362692551022296
5.90569115823963
5.859991018961364
7.413795754422425
7.459495893700692
5.9513912975178975
5.9513912975178975
10.292904528953217
7.5965963115354915
5.814290879683098
5.90569115823963
5.859991018961364
number of data instances dropped is 56
# Step 4 - total sulfur dioxide
f = df['total sulfur dioxide'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
5.3708962576263035
7.100776543264627
number of data instances dropped is 2
# Step 4 - free sulfur dioxide
f = df['free sulfur dioxide'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
5.624741684413022
5.125019816324625
6.535999208574216
5.448369260381822
6.065672744491018
5.213206028340224
14.913689350056165
number of data instances dropped is 7
# Step 4 - density
f = df['density'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
5.57414020242395
5.57414020242395
number of data instances dropped is 2
# Step 4 - ph
f = df['pH'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
number of data instances dropped is 0
# Step 4 - sulphates
f = df['sulphates'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
5.003516532032999
5.179016027930873
number of data instances dropped is 2
# Step 4 - alcohol
f = df['alcohol'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
number of data instances dropped is 0
# Step 4 - quality
f = df['quality'].values
factor = 5
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>factor) :
print (z)
count = count + 1
df = df.drop( [i])
print ("number of data instances dropped is ", count)
number of data instances dropped is 0
# Step 5. get data into correct form
#
volatile_acidity = df['volatile acidity'].values
citric_acid = df['citric acid'].values
residual_sugar = df['residual sugar'].values
fixed_acidity = df['fixed acidity'].values
chlorides = df['chlorides'].values
free_sulfur = df['free sulfur dioxide'].values
total_sulfur = df['total sulfur dioxide'].values
density = df['density'].values
pH = df['pH'].values
sulphates= df['sulphates'].values
alcohol = df['alcohol'].values
quality = df['quality'].values
X=np.vstack((volatile_acidity,citric_acid,residual_sugar,fixed_acidity,chlorides,free_sulfur,total_sulfur,density,pH,sulphates,alcohol,quality))
X=np.transpose(X)
print(X)
X.shape
[[ 0.27 0.36 20.7 ... 0.45 8.8 6. ]
[ 0.3 0.34 1.6 ... 0.49 9.5 6. ]
[ 0.28 0.4 6.9 ... 0.44 10.1 6. ]
...
[ 0.24 0.19 1.2 ... 0.46 9.4 6. ]
[ 0.29 0.3 1.1 ... 0.38 12.8 7. ]
[ 0.21 0.38 0.8 ... 0.32 11.8 6. ]]
# Step 6. Split data in test and trial sets with 80-20 split
from sklearn.model_selection import train_test_split
# split the data
#y = df.quality.values
(X_train, X_test, y_train, y_test) = train_test_split (X, y, test_size=.2)
# Print out length of each set
n_train = len(X_train); n_test = len(X_test)
print (n_train, n_test )
3900 976
# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
# Create model
lr = LogisticRegression(solver ='lbfgs' )
#
# Fit with training set
lr.fit(X_train, y_train)
#
# Calculate training score using .score(X_train, y_train) & print out percent accuracy
#
train_score = lr.score (X_train,y_train)*100
print ("The percent accuracy of logistic regression model on training set is ", \
round(train_score,2))
test_score = lr.score (X_test,y_test)*100
print ("The percent accuracy of logistic regression model on test set is", round(test_score,2))
The percent accuracy of logistic regression model on training set is 99.64
The percent accuracy of logistic regression model on test set is 99.27
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
# Create classifier and train
mlp = MLPClassifier (max_iter=300, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)
print ("Percent of accuracy on test data is ", 100 * mlp.score(X_test, y_test))
Percent of accuracy on test data is 99.2827868852459
# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
for k in range (1,6):
knn = KNeighborsClassifier (n_neighbors = k)
knn.fit(X_train,y_train)
# Print out precent accuracy of training set
print (f"Precent of accuracy on training data using k={k} is {100 * knn.score (X_train,y_train)}")
# Conclusion k=2 or 5 is best
knn = KNeighborsClassifier (n_neighbors = 1)
knn.fit(X_train,y_train)
# Print out percent accuracy of test set
print ("Percent of accuracy on test data using kNN with k=1 is ",100 * knn.score(X_test,y_test) )
Precent of accuracy on training data using k=1 is 100.0
Precent of accuracy on training data using k=2 is 95.92307692307692
Precent of accuracy on training data using k=3 is 95.41025641025641
Precent of accuracy on training data using k=4 is 93.71794871794872
Precent of accuracy on training data using k=5 is 93.7948717948718
Percent of accuracy on test data using kNN with k=1 is 90.3688524590164
The percent accuracy of logistic regression model on test set is 99.27
Percent of accuracy on test data for ANN classifier is 99.2827868852459
Percent of accuracy on test data using kNN with k=1 is 90.3688524590164
The algoirthm with the best test set is the ANN classifier is a 99.28 percent accuracy