import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe
df = pd.read_csv ('white_wine_quality.csv', sep=';')
df.head ()
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
df1 = pd.read_csv('white_wine_quality.csv', sep=';') # create temporary data frame to use
def create_acceptance (quality) :
if (quality >= 7) :
return('1')
else :
return ('0')
accept = df1[ 'quality' ].apply ( create_acceptance )
df1['acceptance']= accept
df1.head(15)
# Step 2. Nan values?
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
12 acceptance 4898 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 497.6+ KB
# All features have 4898 values; each is either integor or floating so no strings
# Step 3. Quality groupings and countplot
sns.set_style("whitegrid")
sns.countplot(x='quality', data= df1)
plt.title("How many wines are in each quality class?")
df1.groupby("quality").size()
# What quality rating does the majority of the wines in the dataset have?
# ANSWER: 6
# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
# Example follows for one feature
# Step 4 - volatile acidity feature
f = df1['volatile acidity'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
6.218817206612536
5.673097063450098
6.26842812871821
7.211035648726053
6.466871817140914
5.325820608710367
6.814148271880645
8.153643168733899
5.028155076076311
number of data instances dropped is 9
# Step 4 - citric acid feature
f = df1['citric acid'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
10.97321349925364
5.510519600158539
5.42775151077831
5.510519600158539
5.510519600158539
5.510519600158539
7.414185655903801
5.510519600158539
number of data instances dropped is 8
# Step 4 - residual sugar feature
f = df1['residual sugar'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
5.041619025018026
5.041619025018026
number of data instances dropped is 2
# Step 4 - chlorides feature
f = df1['chlorides'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
5.769807785945281
5.815516932660704
7.049663893977103
6.912536453830836
6.912536453830836
13.723199314428735
6.409735839961193
6.1354809596686595
8.878029762593988
11.163487098365094
6.36402669324577
5.678389492514438
9.060866349455676
7.095373040692525
7.095373040692525
11.666287712234736
5.632680345799017
5.586971199083594
5.769807785945281
5.541262052368172
8.832320615878565
7.552464507846746
9.56366696332532
7.23250048083879
7.415337067700479
5.586971199083594
5.221298025360217
6.089771812953237
7.826719388139279
5.0841705852139505
5.0841705852139505
5.129879731929373
5.0841705852139505
5.586971199083594
5.0841705852139505
5.038461438498528
5.678389492514438
5.724098639229861
5.632680345799017
6.318317546530348
6.318317546530348
5.221298025360217
5.541262052368172
6.77540901368457
6.36402669324577
5.906935226091548
5.861226079376126
7.415337067700479
7.4610462144159015
5.9526443728069705
5.9526443728069705
10.295013310772076
7.598173654562167
5.815516932660704
5.906935226091548
5.861226079376126
number of data instances dropped is 56
# Step 4 - free sulfur dioxide feature
f = df1['free sulfur dioxide'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
5.625009043784972
5.125292256459296
6.5362573030259075
5.448638412964145
6.065935620837037
5.21347757186971
14.913862267015158
number of data instances dropped is 7
# Step 4 - total sulfur dioxide feature
f = df1['total sulfur dioxide'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
5.367799874302915
7.096416745330942
number of data instances dropped is 2
# Step 4 - density feature
f = df1['density'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
5.567488075272593
5.567488075272593
number of data instances dropped is 2
# Step 4 - pH feature
f = df1['pH'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
number of data instances dropped is 0
# Step 4 - sulphates feature
f = df1['sulphates'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
5.000143911376127
5.175455889965817
number of data instances dropped is 2
# Step 4 - alcohol feature
f = df1['alcohol'].values
mean = np.mean(f)
std = np.std(f)
n = len(f)
count = 0
for i in range (0,n):
z = ( f[i] - mean ) / std
if (z>5) :
print (z)
count = count + 1
df1 = df1.drop( [i])
print ("number of data instances dropped is ", count)
number of data instances dropped is 0
# Step 5. get data into correct form
#
y = df1.acceptance.values
X = df1.values
df1.info()
df1.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4810 entries, 0 to 4897
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4810 non-null float64
1 volatile acidity 4810 non-null float64
2 citric acid 4810 non-null float64
3 residual sugar 4810 non-null float64
4 chlorides 4810 non-null float64
5 free sulfur dioxide 4810 non-null float64
6 total sulfur dioxide 4810 non-null float64
7 density 4810 non-null float64
8 pH 4810 non-null float64
9 sulphates 4810 non-null float64
10 alcohol 4810 non-null float64
11 quality 4810 non-null int64
12 acceptance 4810 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 526.1+ KB
# Step 6. Split data in test and trial sets with 80-20 split
from sklearn.model_selection import train_test_split
(X_train,X_test, y_train, y_test )= \
train_test_split(X,y, test_size = 0.20, random_state=0)
n_train = len(X_train); n_test = len(X_test)
print (n_train, n_test )
3848 962
# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression (solver ='lbfgs' )
lr.fit (X_train, y_train )
percent = 100 * round (lr.score(X_train, y_train))
print (f"Logistic Regression predicts the training set {percent}% accurately")
Logistic Regression predicts the training set 100% accurately
pred_log_reg = lr.predict(X_test)
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier (max_iter=300, solver = 'adam', random_state=1)
mlp.fit (X_train, y_train )
percent = 100 * round (mlp.score(X_train, y_train))
print (f"ANN predicts the training set {percent}% accurately")
ANN predicts the training set 100% accurately
pred_ANN = mlp.predict(X_test)
# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
for k in range(1,6) :
knn = KNeighborsClassifier (n_neighbors = k)
knn.fit (X_train, y_train)
percent = 100 * round (knn.score(X_train, y_train))
print (f"kNN with k = {k} predicts the training set {percent}% accurately")
kNN with k = 1 predicts the training set 100% accurately
kNN with k = 2 predicts the training set 100% accurately
kNN with k = 3 predicts the training set 100% accurately
kNN with k = 4 predicts the training set 100% accurately
kNN with k = 5 predicts the training set 100% accurately
pred_kNN = knn.predict (X_test)
# Compare predictions for kNN and ANN
count = 0
for i in range (0, n_test):
if (pred_kNN[i] != pred_ANN[i]):
count = count + 1
print (f"For wine {i} kNN got {pred_kNN[i]} whereas ANN got {pred_ANN[i]}")
print (count)
0
# Compare predictions for Logistic Regression and ANN
count = 0
for i in range (0, n_test):
if (pred_log_reg[i] != pred_ANN[i]):
count = count + 1
print (f"For wine {i} Logistic Regression got {pred_log_reg[i]} whereas ANN got {pred_ANN[i]}")
print (count)
0
# Compare predictions for Logistic Regression and kNN
count = 0
for i in range (0, n_test):
if (pred_log_reg[i] != pred_kNN[i]):
count = count + 1
print (f"For wine {i} Logistic Regression got {pred_log_reg[i]} whereas kNN got {pred_kNN[i]}")
print (count)
0
# The results for all 3 Classifier Algorithms (Logistic Regression, ANN & kNN) are all the same - the accuracy is 100%.