import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - create dataframe
df = pd.read_csv('white_wines.csv')
df.tail()

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# np.select maps each inclusive quality range onto its label; anything
# outside both ranges falls back to the default 0.
quality_bands = [df['quality'].between(0, 6), df['quality'].between(7, 10)]
band_labels = [0, 1]
df['target'] = np.select(quality_bands, band_labels, 0)
df.head()

# Step 2. Nan values?
instances = len(df.index)
print(f'There are {instances} instances of data in the file.')
# Idiomatic truthiness test instead of comparing a boolean to False
# (`== False` is flagged by every Python linter; `not ...` reads directly).
if not df.isnull().values.any():
    print('No Nan values found.')
else:
    print('Some Nan values have been found and they should be removed.')

# Step 3. Quality groupings and countplot
sns.set_style('whitegrid')
sns.countplot(x='quality', data=df)

# value_counts() gives the per-quality counts directly.  Unlike the
# original groupby(...).get_group(i) over a hard-coded range(3, 10), it
# cannot raise KeyError when a quality level has no wines, and it is not
# limited to grades 3..9.
quality_counts = df['quality'].value_counts().sort_index()
print('QUALITY NUMBER OF WINES')
for grade, n_wines in quality_counts.items():
    print(' ', grade, '.........', n_wines)

# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
#
# BUG FIX: the original iterated positional indices i in range(n) and
# called df.drop([i]), which drops by *label*.  As soon as one row is
# removed, positions and labels no longer coincide, so later iterations
# drop the wrong rows or raise KeyError (and the cached f / n went stale
# across columns).  A vectorized z-score mask sidesteps stale indices.
#
factor = 5  # Set the criterion for data elimination: (value-mean)/std > 5
for column in df:  # Looping through the features
    f = df[column].values
    # np.std is the population std (ddof=0); computed once per column, as
    # in the original, before any row of this column is removed.
    deviation = np.abs(f - np.mean(f)) / np.std(f)
    keep = deviation <= factor
    count = int((~keep).sum())
    df = df[keep]
    print(f'{count} instances have been dropped based on the outliers in {column}.')
#
# Determining the number of remaining data instances
#
instances = len(df.index)
print(f'There are {instances} instances of data in the file.')

# Step 5. get data into correct form
#
# Features are every column except the raw quality score and the derived
# target; y is the binary acceptability label built in Step 1.
feature_frame = df.drop(['quality', 'target'], axis=1)
X = feature_frame.values
y = df['target'].to_numpy()
print(X)
print(y)

# Step 6. Split data in test and trial sets with 80-20 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
n_train, n_test = len(X_train), len(X_test)
print(n_train, n_test)

# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler

# Fit on the training set only, then apply the same transform to both
# splits so no test-set information leaks into the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(solver='lbfgs', max_iter=200)
log_reg.fit(X_train, y_train)
test_score_lr = log_reg.score(X_test, y_test)
print(f'The accuracy of predicting the test set with Logistic Regression is {round(100*test_score_lr,3)}%.')

# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=1000, solver='adam')
mlp.fit(X_train, y_train)
test_score_ann = mlp.score(X_test, y_test)
print(f'The accuracy of predicting the test set with Artificial Neural Network is {round(100*test_score_ann,3)}%.')

# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier

# Track the highest training accuracy seen so far and the k achieving it.
# NOTE(review): model selection here scores the *training* set, which
# k = 1 maximizes by construction (each point is its own nearest
# neighbour) — a held-out validation score would be sounder; confirm intent.
best_accuracy = 0
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    # Strict '>' keeps the smallest k on ties: a larger k that does not
    # improve the score only makes the model more computationally
    # expensive (for really large data sets).
    if train_score > best_accuracy:
        best_accuracy = train_score
        best_k = k
    print(f'For the model with k = {k}, the accuracy of the training set is {round(100*train_score,3)}%.')
print(f'The best accuracy has been achieved using k = {best_k}.')

# Refitting the data with the best value of k
final_knn = KNeighborsClassifier(n_neighbors=best_k)
final_knn.fit(X_train, y_train)
test_score_knn = final_knn.score(X_test, y_test)
print(f'For the model with k = {best_k}, the accuracy of predicting the test set is {round(100*test_score_knn,3)}%.')