import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - load the white-wine dataset (semicolon-delimited CSV) into a dataframe
csv_path = 'white_wine_quality.csv'
df = pd.read_csv(csv_path, sep=';')
df

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
def wine(quality, threshold=7):
    """Binary acceptability label for a wine.

    Parameters
    ----------
    quality : numeric quality score for one wine.
    threshold : numeric, default 7
        Minimum quality considered acceptable (generalized from the
        original hard-coded cutoff of 7; default preserves behavior).

    Returns
    -------
    int : 1 if quality >= threshold, else 0.
    """
    return 1 if quality >= threshold else 0
# Attach the binary target column by mapping the labeler over each quality score.
df['Is the wine acceptable?'] = df['quality'].map(wine)
df

# Step 2. NaN values?
# df.info() prints per-column non-null counts and dtypes, so missing
# values would show up as a non-null count below the row total.
df.info()
# Observed from the output: there are no NaN values
# Observed from the output: 4898 data entries

# Step 3. Visualize how many wines fall into each quality grouping.
sns.countplot(data=df, x='quality')

# Step 4. For each feature, determine whether any data instance is an
# outlier (z-score above a cutoff factor) and delete those instances.
#
# NOTE(review): the original repeated this loop twelve times and deleted
# rows with df.drop([i]), where i was a *position* in the values array.
# After the first deletion the DataFrame index has gaps, so positions and
# index labels no longer agree: later passes dropped the wrong rows or
# raised KeyError.  Boolean-mask selection below is index-safe, and the
# duplicated code is collapsed into one helper.

def drop_outliers(frame, column, factor=5):
    """Return (frame_without_outliers, n_dropped) for one column.

    An outlier is a row whose z-score ((x - mean) / std) in `column`
    exceeds `factor`.  Each offending z-score is printed, matching the
    original script's output.
    """
    vals = frame[column].values
    z = (vals - np.mean(vals)) / np.std(vals)
    outliers = z > factor
    for zv in z[outliers]:
        print(zv)
    return frame.loc[~outliers], int(outliers.sum())

# Same twelve columns (including 'quality') and the same order as the
# original twelve hand-written passes.
for col in ['volatile acidity', 'citric acid', 'residual sugar',
            'fixed acidity', 'chlorides', 'total sulfur dioxide',
            'free sulfur dioxide', 'density', 'pH', 'sulphates',
            'alcohol', 'quality']:
    df, n_dropped = drop_outliers(df, col)
    print("number of data instances dropped is ", n_dropped)

# Step 5. Get the data into the correct form: feature matrix X and target y.
#
# NOTE(review): the original stacked 'quality' into X.  Since the target
# label is derived directly from 'quality' (>= 7), keeping it as a feature
# leaks the answer into the inputs and makes the reported accuracies
# meaningless, so it is excluded here.
feature_columns = ['volatile acidity', 'citric acid', 'residual sugar',
                   'fixed acidity', 'chlorides', 'free sulfur dioxide',
                   'total sulfur dioxide', 'density', 'pH', 'sulphates',
                   'alcohol']
# df[cols].values yields the same row-per-instance layout as the original
# np.vstack(...).T, without twelve intermediate arrays.
X = df[feature_columns].values
# Binary acceptability target built in Step 1.
y = df['Is the wine acceptable?'].values
print(X)
X.shape

# Step 6. Split data into training and test sets with an 80-20 split.
from sklearn.model_selection import train_test_split

# NOTE(review): the original left the definition of `y` commented out, so
# train_test_split raised NameError.  The target is the binary
# acceptability label built in Step 1.
y = df['Is the wine acceptable?'].values
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2)
# Print out the length of each set.
n_train = len(X_train)
n_test = len(X_test)
print(n_train, n_test)

# Step 7. Standardize the features (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit statistics on training data only
X_test = scaler.transform(X_test)        # reuse the training-set statistics

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression

# Build the model and fit it on the training set.
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)

# Percent accuracy on both sets via .score (mean accuracy in [0, 1]).
train_score = 100 * lr.score(X_train, y_train)
test_score = 100 * lr.score(X_test, y_test)
print("The percent accuracy of logistic regression model on training set is ",
      round(train_score, 2))
print("The percent accuracy of logistic regression model on test set is", round(test_score, 2))

# Step 9. Create an ANN (multi-layer perceptron) classifier and train it.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='adam', max_iter=300, random_state=1)
mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)
print("Percent of accuracy on test data is ", 100 * mlp.score(X_test, y_test))

# Step 10. Create kNN classifiers and see what value of k works best.
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Print out percent accuracy of the training set for this k.
    print(f"Precent of accuracy on training data using k={k} is {100 * knn.score(X_train, y_train)}")

# Conclusion k=2 or 5 is best
# NOTE(review): despite the conclusion above, the final model below uses
# k=1 (and the printed message says k=1) -- confirm which k was intended.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# Print out percent accuracy of the test set.
print("Percent of accuracy on test data using kNN with k=1 is ", 100 * knn.score(X_test, y_test))

# Recorded run output (pasted notebook output, kept as comments so the file
# remains valid Python):
#   The percent accuracy of logistic regression model on test set is 99.27
#   Percent of accuracy on test data for ANN classifier is 99.2827868852459
#   Percent of accuracy on test data using kNN with k=1 is 90.3688524590164
# The algorithm with the best test-set accuracy is the ANN classifier, at
# 99.28 percent.