import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - create dataframe
# The CSV is read with ';' as the field separator.
df = pd.read_csv('white_wine_quality.csv', sep=';')
df.head(6)  # preview; only rendered when run interactively

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# A wine is labelled acceptable when its quality score is 7 or higher.
# The vectorised comparison produces an integer 0/1 column directly, rather
# than filling an object-dtype column with '' placeholders row by row and
# overwriting it afterwards.
df['acceptable'] = (df['quality'] >= 7).astype(int)
df.head(15)  # preview; only rendered when run interactively

# Step 2. Nan values?
# Evaluates to True if any cell in the frame is missing. The result is not
# stored or printed, so it is only visible in an interactive session.
df.isna().values.any()
# Author's recorded observation: no NaN values
row_total = df.shape[0]
print("This data set has", row_total, "instances")

# Step 3. Quality groupings and countplot
# Tally the wines at each quality score once, then report scores 1..10;
# a score that never occurs is reported as 0.
quality_counts = df['quality'].value_counts()
for score in range(1, 11):
    print("Number of wines of quality ", score, " is ",
          int(quality_counts.get(score, 0)))

# Name the column through x= and hand the frame over through data=: the
# first positional parameter of countplot is `data`, not `x`, in current
# seaborn releases, so a bare positional Series is not read as the x variable.
sns.countplot(x='quality', data=df)
# NOTE(review): the original remark here was cut off in the source
# ("most wines have a quality o..."); read the modal score off the plot.

# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
#
# A row counts as an outlier for a column when its value lies more than
# OUTLIER_STD_MULTIPLE standard deviations ABOVE that column's mean (upper
# tail only). Columns are handled in the order listed, and each column's
# mean and standard deviation are computed from the rows that survived the
# columns before it.
#
# Rows are removed with a boolean mask built from the column itself, so the
# rows dropped are exactly the rows that were tested. Dropping by loop
# position (df.drop([i])) is only correct while the index labels are still
# 0..n-1; once any row has been removed, position i and label i diverge and
# the wrong row is dropped or a KeyError is raised.
OUTLIER_STD_MULTIPLE = 5
outlier_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]
for column in outlier_columns:
    values = df[column].values
    # np.std uses its default ddof=0 here (population standard deviation).
    deviation = values - np.mean(values)
    is_outlier = deviation > OUTLIER_STD_MULTIPLE * np.std(values)
    count = int(is_outlier.sum())
    df = df[~is_outlier]
    print(f"number of data instances dropped due to outliers in {column} is ", count)
print("This data set has", len(df.index), "instances remaining")

# Step 5. get data into correct form
#
# y: the 0/1 "acceptable" label as an integer NumPy array.
y = df['acceptable'].values.astype('int')
# X: one row per wine and one column per input feature, in the order listed
# below. 'quality' and 'acceptable' are not included among the inputs.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
X = df[feature_columns].values

# Step 6. Split data into training and test sets with an 80-20 split
from sklearn.model_selection import train_test_split

# random_state is fixed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Print out length of each set
n_train, n_test = len(X_train), len(X_test)
print(n_train, n_test)

# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply those same
# statistics to both the training and the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardise the full matrix with its own, separate scaler. Re-fitting
# `scaler` on X would overwrite the training-set statistics that X_train and
# X_test were scaled with, leaving `scaler` inconsistent with them.
X = StandardScaler().fit_transform(X)

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train on the training split, then report accuracy on the test split as a
# percentage rounded to two decimal places.
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)

test_score = 100 * lr.score(X_test, y_test)
print("The percent accuracy of logistic regression model on test set is ",
      round(test_score, 2))

# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier

# random_state is fixed so repeated runs give the same network.
mlp = MLPClassifier(solver='adam', max_iter=300, random_state=1)
mlp.fit(X_train, y_train)

# Print out percent accuracy on the test set
mlp_test_accuracy = mlp.score(X_test, y_test)
print("Percent of accuracy of ANN on test data is ", 100 * mlp_test_accuracy)

# Step 10. Create kNN classifier and see what value of k is best
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Compare k = 1..5 by 5-fold cross-validated accuracy on the training split.
# Accuracy measured on the very rows the model was fitted to cannot rank k:
# with k=1 every training point is its own nearest neighbour, so that score
# is (near) perfect no matter how well the model generalises.
cv_accuracy = {}
for k in range(1, 6):
    fold_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5
    )
    cv_accuracy[k] = 100 * fold_scores.mean()
    print(f"Percent of cross-validated accuracy on training data using k={k} is {cv_accuracy[k]}")

# Keep the k with the highest cross-validated accuracy (smallest k on ties),
# refit on the whole training split and evaluate once on the test split.
# NOTE: the kNN figure quoted in the closing summary comments of this file
# was recorded with k fixed at 1; re-run to refresh it if best_k differs.
best_k = max(cv_accuracy, key=cv_accuracy.get)
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
# Print out percent accuracy of test set using kNN
print(f"Percent of accuracy on test data using kNN with k={best_k} is ", 100 * knn.score(X_test, y_test))

# using Naive Bayes algorithm
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# The score below is computed on the test split, so the message says
# "test data" (it previously claimed "training data").
print("Percent of accuracy using Naive Bayes on test data is ", 100 * gnb.score(X_test, y_test))

# using SVM algorithm
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel.
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
# The score below is computed on the test split, so the message says
# "test data" (it previously claimed "training data").
print("Percent of accuracy using SVM on test data is ", 100 * svclassifier.score(X_test, y_test))

# using decision tree algorithm
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the tree, and therefore the reported accuracy, is
# reproducible from run to run, like the seeded split and MLP in this script.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# The score below is computed on the test split, so the message says
# "test data" (it previously claimed "training data").
print("Percent of accuracy using Decision Tree on test data is ", 100 * classifier.score(X_test, y_test))

# Of the 3 required algorithms, logistic regression performed the worst, predicting only around 80.4% correctly.
# ANN was in the middle of the bunch, at around 82.2% correct predictions.
# kNN (k-nearest neighbours) did best at k=1 and was the highest of the pack, with an 85.3% accurate prediction rate.
# I used the Naive Bayes, SVM, and Decision Tree algorithms as well.
# Naive Bayes performed worse than the 3 required algorithms, the lowest of all at 72.8% correct predictions.
# SVM was slightly better than Naive Bayes, but still worse than the required 3, at 77.6% correct predictions.
# Decision Tree actually performed better than all except kNN, with 82.9% accurate predictions.