import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - create dataframe
df=pd.read_csv('white_wine_quality.csv',sep=';')
df.head(6)

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
df['acceptable'] = (df['quality'] >= 7).astype(int)
df.head(15)

# Step 2. Nan values?
df.isnull().values.any()
#no NaN values
print("This data set has",len(df.index), "instances")

```
This data set has 4898 instances
```
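
# Quick check (optional, not one of the assignment steps): how balanced are the
# two classes? A classifier that always predicts "not acceptable" already matches
# the majority-class rate, so that rate is the baseline to beat. A minimal sketch:
print(df['acceptable'].value_counts(normalize=True))
# roughly 78% of wines fall below quality 7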

# Step 3. Quality groupings and countplot
df = df.sort_values(by=['quality'])
for i in range(1, 11):
    print("Number of wines of quality", i, "is", len(df[df['quality'] == i]))

```
Number of wines of quality 1 is 0
Number of wines of quality 2 is 0
Number of wines of quality 3 is 20
Number of wines of quality 4 is 163
Number of wines of quality 5 is 1457
Number of wines of quality 6 is 2198
Number of wines of quality 7 is 880
Number of wines of quality 8 is 175
Number of wines of quality 9 is 5
Number of wines of quality 10 is 0
```

sns.countplot(x='quality', data=df)
# most wines have a quality of 6


# Step 4. For each feature, treat any data instance more than 5 standard
# deviations above the mean as an outlier and delete it.
# The same rule is applied to every feature, one pass per feature.

# doing features in the given order; a boolean mask avoids the label/position
# mismatch that df.drop(i) would hit once earlier rows have been removed
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
            'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
            'density', 'pH', 'sulphates', 'alcohol', 'quality']
for col in features:
    mean = df[col].mean()
    std = df[col].std(ddof=0)  # population std, matching np.std
    outliers = df[col] - mean > 5 * std  # one-sided rule: values far above the mean
    print("number of data instances dropped due to outliers in", col, "is", outliers.sum())
    df = df[~outliers]  # keep only the non-outlier rows
print("This data set has",len(df.index), "instances remaining")

```
number of data instances dropped due to outliers in fixed acidity is 2
number of data instances dropped due to outliers in volatile acidity is 9
number of data instances dropped due to outliers in citric acid is 8
number of data instances dropped due to outliers in residual sugar is 1
number of data instances dropped due to outliers in chlorides is 56
number of data instances dropped due to outliers in free sulfur dioxide is 7
number of data instances dropped due to outliers in total sulfur dioxide is 2
number of data instances dropped due to outliers in density is 3
number of data instances dropped due to outliers in pH is 0
number of data instances dropped due to outliers in sulphates is 1
number of data instances dropped due to outliers in alcohol is 0
number of data instances dropped due to outliers in quality is 0
This data set has 4809 instances remaining
```
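
# Aside: the rule above is one-sided (it only drops values far above the mean).
# A two-sided z-score filter is the more common convention; a sketch for a
# single feature, using the same 5-sigma threshold on the absolute deviation:
col = 'fixed acidity'
z = (df[col] - df[col].mean()).abs() / df[col].std(ddof=0)
print("two-sided 5-sigma outliers remaining in", col, ":", (z > 5).sum())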

# Step 5. Get the data into the correct form
#
y = df.acceptable.values
y = y.astype('int')
# Get a NumPy array for each feature, stack them, and take the transpose.
# Remember that np.vstack takes one argument, so put the arrays in ( ) and use .T to transpose.
f=df['fixed acidity'].values
v=df['volatile acidity'].values
c=df['citric acid'].values
r=df['residual sugar'].values
ch=df['chlorides'].values
fr=df['free sulfur dioxide'].values
t=df['total sulfur dioxide'].values
d=df['density'].values
p=df['pH'].values
s=df['sulphates'].values
a=df['alcohol'].values
X = np.vstack((f,v,c,r,ch,fr,t,d,p,s,a)).T
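
# Aside: the same matrix can be built straight from the DataFrame; an equivalent
# sketch (column names as above) to double-check the construction:
feature_cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
                'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
                'density', 'pH', 'sulphates', 'alcohol']
X_alt = df[feature_cols].values
print((X_alt == X).all())  # True: identical to the np.vstack construction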

# Step 6. Split data into training and test sets with an 80-20 split
from sklearn.model_selection import train_test_split
#
# split the data
(X_train,X_test, y_train, y_test )= \
train_test_split(X,y, test_size = 0.20, random_state=0)
#
# Print out length of each set
n_train = len(X_train); n_test = len(X_test)
print (n_train, n_test )

```
3847 962
```
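
# Aside: only about a fifth of the wines are labeled acceptable, so a stratified
# split keeps that ratio identical in both sets; a minimal sketch using
# train_test_split's stratify option:
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    X, y, test_size=0.20, random_state=0, stratify=y)
print(y_tr_s.mean(), y_te_s.mean())  # fraction of acceptable wines in each split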

# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# The scaler is fit on the training set only; the test set is transformed with
# the training-set statistics, so no information leaks from test to train.
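
# Sanity check (optional): after scaling, every training-set feature should have
# mean ~0 and standard deviation ~1. A quick sketch:
print(np.round(X_train.mean(axis=0), 3))  # ~0 for each feature
print(np.round(X_train.std(axis=0), 3))   # ~1 for each feature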

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver ='lbfgs' )
#
# Fit with training set
lr.fit(X_train, y_train)
#
test_score = lr.score (X_test,y_test)*100
print ("The percent accuracy of logistic regression model on test set is ", \
round(test_score,2))

```
The percent accuracy of logistic regression model on test set is 80.04
```
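
# Aside: with ~78% of wines not acceptable, this accuracy sits close to the
# majority-class baseline, and it hides where the model errs. A sketch of a
# fuller picture using sklearn's confusion matrix and per-class report:
from sklearn.metrics import confusion_matrix, classification_report
y_pred_lr = lr.predict(X_test)
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))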

# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier ( max_iter = 300, solver='adam', random_state=1)
mlp.fit(X_train,y_train)
#
# Print out percent accuracy on the test set
print ("Percent of accuracy of ANN on test data is ",100 * mlp.score (X_test,y_test) )

```
Percent of accuracy of ANN on test data is 82.22453222453223
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
% self.max_iter, ConvergenceWarning)
```
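
# The ConvergenceWarning above means the optimizer hit the 300-iteration cap
# before the loss settled. Two standard MLPClassifier remedies, sketched here:
# raise max_iter, and/or let early_stopping hold out a validation slice.
mlp2 = MLPClassifier(max_iter=1000, solver='adam', early_stopping=True, random_state=1)
mlp2.fit(X_train, y_train)
print("Percent of accuracy of ANN (early stopping) on test data is ",
      100 * mlp2.score(X_test, y_test))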

# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    #
    # Print out percent accuracy on the training set
    print(f"Percent of accuracy on training data using k={k} is {100 * knn.score (X_train,y_train)}")
# k=1 scores highest here, but training accuracy at k=1 is trivially 100%
# (each point is its own nearest neighbor), so the test score below is the fairer measure
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# Print out percent accuracy of test set using kNN
print ("Percent of accuracy on test data using kNN with k=1 is ",100 * knn.score(X_test,y_test) )

```
Percent of accuracy on training data using k=1 is 100.0
Percent of accuracy on training data using k=2 is 92.66961268520926
Percent of accuracy on training data using k=3 is 91.39589290356122
Percent of accuracy on training data using k=4 is 88.9004419027814
Percent of accuracy on training data using k=5 is 88.53652196516765
Percent of accuracy on test data using kNN with k=1 is 85.34303534303534
```
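
# A fairer way to choose k: cross-validated accuracy on the training data rather
# than raw training accuracy (which always favors k=1). A minimal sketch with
# sklearn's cross_val_score:
from sklearn.model_selection import cross_val_score
for k in range(1, 6):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5)
    print(f"k={k}: mean 5-fold CV accuracy is {100 * scores.mean():.2f}")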

# Using the Naive Bayes algorithm
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print ("Percent of accuracy using Naive Bayes on test data is ",100 * gnb.score (X_test,y_test) )

```
Percent of accuracy using Naive Bayes on test data is  72.86902286902287
```

# Using the SVM algorithm
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print ("Percent of accuracy using SVM on test data is ",100 * svclassifier.score (X_test,y_test) )

```
Percent of accuracy using SVM on test data is  77.65072765072765
```
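
# Aside: a linear kernel can only draw a flat decision boundary; SVC's RBF
# kernel (its default) can fit non-linear structure in the scaled features.
# Same API, just a different kernel, sketched here:
svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(X_train, y_train)
print("Percent of accuracy using RBF-kernel SVM on test data is ",
      100 * svc_rbf.score(X_test, y_test))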

# Using the decision tree algorithm
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print ("Percent of accuracy using Decision Tree on test data is ",100 * classifier.score (X_test,y_test) )

```
Percent of accuracy using Decision Tree on test data is  82.95218295218295
```
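
# An unconstrained tree memorizes the training set, so its test score can be
# unstable. Limiting depth is the simplest guard against overfitting; a sketch
# sweeping a few arbitrary max_depth values:
for depth in (3, 5, 8, None):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tree.fit(X_train, y_train)
    print(f"max_depth={depth}: test accuracy is {100 * tree.score(X_test, y_test):.2f}")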

# Of the 3 required algorithms, logistic regression performed the worst, predicting only about 80.0% correctly.
# The ANN was the middle of the bunch, clocking in at around 82.2% correct predictions.
# kNN had its best training score at k=1 and performed highest of the pack, with an 85.3% accurate prediction rate on the test set.
# I used the Naive Bayes, SVM, and decision tree algorithms as well.
# Naive Bayes performed worse than the 3 required algorithms, coming in lowest of all at 72.9% correct predictions.
# SVM was slightly better than Naive Bayes, but still worse than the required 3, at 77.7% correct predictions.
# The decision tree actually performed better than everything except kNN, with roughly 83.0% accurate predictions.