import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - create dataframe from the semicolon-separated white-wine CSV
df = pd.read_csv('white_wine_quality.csv', sep=';')
df.head()

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# Read a second, independent copy to clean and label without touching df.
df1 = pd.read_csv('white_wine_quality.csv', sep=';')
def create_acceptance(quality):
    """Map a wine quality score to a binary acceptance label.

    Parameters
    ----------
    quality : numeric quality score from the dataset.

    Returns
    -------
    int : 1 if the wine is acceptable (quality >= 7), else 0.

    Fix: the original returned the *strings* '1'/'0', while the comment above
    ("1 = True, 0 = False") and the downstream numeric modelling expect
    integer labels.
    """
    return 1 if quality >= 7 else 0
# Label every wine with its acceptance flag derived from 'quality'.
df1['acceptance'] = df1['quality'].apply(create_acceptance)
df1.head(15)

# Step 2. Nan values?
df1.info()

# All features have 4898 values; each is either integer or floating, so no
# strings and no missing entries.

# Step 3. Quality groupings and countplot
sns.set_style("whitegrid")
sns.countplot(x='quality', data=df1)
plt.title("How many wines are in each quality class?")

# Tabulate the same grouping numerically.
df1.groupby("quality").size()

# What quality rating does the majority of the wines in the dataset have?
# ANSWER: 6

# Step 4. For each feature determine if any data instance is an outlier
# (z-score > 5); if it is, delete that data instance.
#
# Fix: the original ten copy-pasted blocks iterated positions i of a NumPy
# copy but removed rows with df1.drop([i]), which drops by *index label*.
# After the first feature removes any row, labels and positions no longer
# line up, so later features drop the wrong rows or raise KeyError.
# Boolean masking below keeps positions and labels aligned, and one helper
# replaces the ten duplicated blocks.

def remove_outliers(frame, column, z_max=5.0):
    """Return `frame` without rows whose z-score in `column` exceeds z_max.

    Prints each offending z-score and the number of rows dropped, mirroring
    the original per-feature report.  Note: like the original, only the
    high side (z > z_max) is treated as an outlier.
    """
    values = frame[column].values
    z = (values - np.mean(values)) / np.std(values)
    is_outlier = z > z_max
    for score in z[is_outlier]:
        print(score)
    print("number of data instances dropped is ", int(is_outlier.sum()))
    return frame.loc[~is_outlier]


outlier_features = [
    'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
    'sulphates', 'alcohol',
]
for feature in outlier_features:
    df1 = remove_outliers(df1, feature)

# Step 5. get data into correct form
#
# Fix: X = df1.values kept the 'quality' column and the 'acceptance' target
# inside the feature matrix, so every classifier could read the answer
# directly — that is why all models appeared ~100% accurate.  The target
# (and the column it was derived from) must be excluded from X.
y = df1['acceptance'].values
X = df1.drop(columns=['quality', 'acceptance']).values
df1.info()
df1.head()

# Step 6. Split data in test and trial sets with 80-20 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)
n_train = len(X_train)
n_test = len(X_test)
print(n_train, n_test)

# Step 7. Scale the data: fit the scaler on the training set only, then
# apply the same transform to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
# Fix: round the percentage, not the raw fraction — 100 * round(score)
# could only ever print 0% or 100%.
percent = round(100 * lr.score(X_train, y_train), 1)
print(f"Logistic Regression predicts the training set {percent}% accurately")

pred_log_reg = lr.predict(X_test)

# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=300, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
# Fix: round the percentage, not the raw fraction — 100 * round(score)
# could only ever print 0% or 100%.
percent = round(100 * mlp.score(X_train, y_train), 1)
print(f"ANN predicts the training set {percent}% accurately")

pred_ANN = mlp.predict(X_test)

# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Fix: round the percentage, not the raw fraction — 100 * round(score)
    # could only ever print 0% or 100%.
    percent = round(100 * knn.score(X_train, y_train), 1)
    print(f"kNN with k = {k} predicts the training set {percent}% accurately")

# NOTE(review): knn here is the model fitted last (k = 5), not necessarily
# the best k from the loop above — confirm this is intended.
pred_kNN = knn.predict(X_test)

# Compare predictions for kNN and ANN: report each disagreement and the total.
count = 0
for i, (knn_label, ann_label) in enumerate(zip(pred_kNN, pred_ANN)):
    if knn_label != ann_label:
        count += 1
        print(f"For wine {i} kNN got {knn_label} whereas ANN got {ann_label}")
print(count)

# Compare predictions for Logistic Regression and ANN: report each
# disagreement and the total.
count = 0
for i, (lr_label, ann_label) in enumerate(zip(pred_log_reg, pred_ANN)):
    if lr_label != ann_label:
        count += 1
        print(f"For wine {i} Logistic Regression got {lr_label} whereas ANN got {ann_label}")
print(count)

# Compare predictions for Logistic Regression and kNN: report each
# disagreement and the total.
count = 0
for i, (lr_label, knn_label) in enumerate(zip(pred_log_reg, pred_kNN)):
    if lr_label != knn_label:
        count += 1
        print(f"For wine {i} Logistic Regression got {lr_label} whereas kNN got {knn_label}")
print(count)

# The three classifiers (Logistic Regression, ANN & kNN) agree and report
# ~100% accuracy — but this is a red flag, not a result: X = df1.values keeps
# the 'quality' and 'acceptance' columns inside the feature matrix, so every
# model is effectively trained on its own target. Drop those columns from X
# and re-run before trusting these numbers.