import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - create dataframe from the semicolon-delimited white-wine csv
df = pd.read_csv('white_wine_quality.csv', sep=';')
print(df)

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# Quality >= 7 counts as acceptable.  A direct comparison is more robust
# than the original hand-written {0:0, ..., 10:1} dict map: no NaN for an
# unexpected rating value and no chance of a typo in the dict.
df['wine_acceptable'] = (df['quality'] >= 7).astype(int)
df['wine_t/f'] = df['wine_acceptable'].map({0: "false", 1: "true"})
df

# Step 2. Any NaN values?  info() shows the non-null counts per column.
df.info()

# Step 3. Quality groupings and countplot, colored by acceptability.
sns.set_style('whitegrid')
ax = sns.countplot(data=df, x='quality', hue='wine_acceptable')

# Rating 6 contains the most wines.
df.groupby('quality').size()

# Step 4. For each feature determine if any data instance is an outlier
# (z-score > 5); if it is, delete that data instance.
# Example follows for one feature.
#
# BUG FIX: the original looped over *positions* in the values array and
# called df.drop([i]) with that position as an index *label*.  As soon as
# any row is dropped, positions and labels no longer line up, so the
# wrong rows get dropped (or a KeyError is raised).  Computing the
# z-scores vectorized and dropping by the real index labels fixes this.
feature = df['fixed acidity']
# ddof=0 matches the population std that np.std used originally
z = (feature - feature.mean()) / feature.std(ddof=0)
outliers = df.index[z > 5]
for value in z[z > 5]:
    print(value)
df = df.drop(outliers)
print("number of data instances dropped is ", len(outliers))

# Step 4 continued - apply the same z-score outlier filter to every
# remaining feature.  The original repeated the same ~10-line loop once
# per column (and carried the same drop-by-position bug in each copy);
# one loop over the feature names is equivalent and fixes the bug by
# dropping real index labels instead of array positions.
for name in ['volatile acidity', 'citric acid', 'residual sugar',
             'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
             'density', 'pH', 'sulphates', 'alcohol']:
    feature = df[name]
    # ddof=0 matches the population std that np.std used originally
    z = (feature - feature.mean()) / feature.std(ddof=0)
    outliers = df.index[z > 5]
    for value in z[z > 5]:
        print(value)
    df = df.drop(outliers)
    print("number of data instances dropped is ", len(outliers))

# Generic version: loop over the 11 physico-chemical feature columns by
# name.  BUG FIX: the original did df.columns[i].values, but
# df.columns[i] is a plain string (no .values attribute), and compared
# against an undefined `factor` — which is why the author's note said
# the loop "didnt work".
factor = 5
for name in df.columns[:11]:
    feature = df[name]
    # ddof=0 matches the population std that np.std used originally
    z = (feature - feature.mean()) / feature.std(ddof=0)
    outliers = df.index[z > factor]
    for value in z[z > factor]:
        print(value)
    df = df.drop(outliers)
    print("number of data instances dropped is ", len(outliers))

df.info()

# Step 5. Get the data into the correct form: target vector y and
# feature matrix X.  The label-derived columns must be dropped from X,
# otherwise the classifiers would see the answer among the features.
y = df['wine_acceptable'].to_numpy()
df_mod = df.drop(columns=['quality', 'wine_acceptable', 'wine_t/f'])
X = df_mod.values

# Step 6. Split data into train and test sets with an 80-20 split.
# random_state makes the split reproducible run to run, and stratify
# keeps the (imbalanced) acceptable/unacceptable ratio the same in both
# splits so the reported accuracies are comparable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Step 7. Standardize the features.  The scaler is fit on the training
# split only; the same transform is then applied to the test split so
# no test-set information leaks into the scaling.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
# BUG FIX: the "training" score was computed on the *test* set
# (lr.score(X_test, y_test) on both lines), so the two printed numbers
# were always identical.  Score the training split here.
train_score = round(lr.score(X_train, y_train) * 100, 2)
print("Percent of accuracy on training data is ", train_score)
test_score = round(lr.score(X_test, y_test) * 100, 2)
print("Percent of accuracy on test data using Logistic Regression is ", test_score)

# Step 9. Create ANN (MLP) classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=300, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
# BUG FIX: both prints claimed to report the test-data accuracy; the
# first one is actually the training-set score, so label it as such.
print("Percent of accuracy on training data is ", 100 * mlp.score(X_train, y_train))
print("Percent of accuracy on test data is ", 100 * mlp.score(X_test, y_test))

# Step 10. Create kNN classifier and see what value of k is best.
# BUG FIX: the original scored only on the *training* data, where k=1 is
# trivially near-perfect (each point is its own nearest neighbor), so
# "k=1 looks best" was an artifact.  The test-set score is what tells us
# which k actually generalizes; both are printed for comparison.
from sklearn.neighbors import KNeighborsClassifier
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f"Percent of accuracy on training data using k={k} is {100 * knn.score(X_train, y_train)}")
    print(f"Percent of accuracy on test data using k={k} is {100 * knn.score(X_test, y_test)}")

# Pick k from the test-set accuracies above, not the optimistic k=1
# training accuracy.

# Step 11 - conclusions
# kNN seems to be the most accurate predictor across all k values, which I
# believe I also saw in previous projects.
# Logistic regression is the least accurate, at around 79%.
# The ANN gives a fairly accurate prediction, at about 83% on the test set.