import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns

# Step 1 - create dataframe
df = pd.read_csv('white_wines.csv')
df.tail()

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False.
# Quality 7-10 maps to 1 (acceptable); everything else maps to 0.
# This single vectorized cast is equivalent to the original np.select over
# [between(0,6), between(7,10)] with a default of 0: both the first branch
# and the default produced 0, so only the 7-10 test matters.
df['target'] = df['quality'].between(7, 10).astype(int)
df.head()

# Step 2. Nan values?
# Report the row count, then verify the dataset has no missing values.
# (Restored the indentation lost in the original paste; replaced the
# `== False` comparison with idiomatic truthiness.)
instances = len(df.index)
print(f'There are {instances} instances of data in the file.')
if df.isnull().values.any():
    print('Some Nan values have been found and they should be removed.')
else:
    print('No Nan values found.')

```
There are 4898 instances of data in the file.
No Nan values found.
```

# Step 3. Quality groupings and countplot
# Visualize the distribution of the raw quality scores, then print the
# per-score counts as a small table. (Restored the loop indentation lost
# in the original paste.)
sns.set_style('whitegrid')
sns.countplot(x='quality', data=df)

quality_groups = df.groupby('quality')
print('QUALITY NUMBER OF WINES')
for i in range(3, 10):
    # NOTE(review): get_group raises KeyError for an absent quality level;
    # this assumes every level 3-9 occurs in the data — confirm for new files.
    quant_group = len(quality_groups.get_group(i))
    print(' ', i, '.........', quant_group)

```
QUALITY NUMBER OF WINES
3 ......... 20
4 ......... 163
5 ......... 1457
6 ......... 2198
7 ......... 880
8 ......... 175
9 ......... 5
```

# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
#
# BUG FIX: the original iterated positional indices i over the column array
# but removed rows with df.drop([i]), which drops by *label*. After the
# first column removed any row, labels no longer lined up with positions,
# so subsequent columns dropped the wrong rows (or referenced labels that
# were already gone). Filtering with a positional boolean mask removes
# exactly the rows whose z-score exceeds the threshold.
#
factor = 5  # Set the criterion for data elimination: (value-mean)/std > 5
for column in df:  # Looping through the features
    f = df[column].to_numpy()
    # np.std defaults to the population std (ddof=0), matching the original.
    dev = np.abs(f - np.mean(f)) / np.std(f)
    outliers = dev > factor
    count = int(outliers.sum())
    df = df[~outliers]  # keep only in-range rows (positional mask)
    print(f'{count} instances have been dropped based on the outliers in {column}.')
#
# Determining the number of remaining data instances
#
instances = len(df.index)
print(f'There are {instances} instances of data in the file.')

```
2 instances have been dropped based on the outliers in fixed acidity.
9 instances have been dropped based on the outliers in volatile acidity.
8 instances have been dropped based on the outliers in citric acid.
1 instances have been dropped based on the outliers in residual sugar.
56 instances have been dropped based on the outliers in chlorides.
7 instances have been dropped based on the outliers in free sulfur dioxide.
2 instances have been dropped based on the outliers in total sulfur dioxide.
3 instances have been dropped based on the outliers in density.
0 instances have been dropped based on the outliers in pH.
1 instances have been dropped based on the outliers in sulphates.
0 instances have been dropped based on the outliers in alcohol.
0 instances have been dropped based on the outliers in quality.
0 instances have been dropped based on the outliers in target.
There are 4809 instances of data in the file.
```

# Step 5. get data into correct form
#
# The feature matrix X is every column except the two label-derived ones;
# the target vector y comes straight from the engineered 'target' column.
feature_frame = df.drop(columns=['quality', 'target'])
X = feature_frame.to_numpy()
y = df['target'].to_numpy()
print(X)
print(y)

```
[[ 7. 0.27 0.36 ... 3. 0.45 8.8 ]
[ 6.3 0.3 0.34 ... 3.3 0.49 9.5 ]
[ 8.1 0.28 0.4 ... 3.26 0.44 10.1 ]
...
[ 6.5 0.24 0.19 ... 2.99 0.46 9.4 ]
[ 5.5 0.29 0.3 ... 3.34 0.38 12.8 ]
[ 6. 0.21 0.38 ... 3.26 0.32 11.8 ]]
[0 0 0 ... 0 1 0]
```

# Step 6. Split data in test and trial sets with 80-20 split
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so every run produces the same split;
# stratify preserves the (imbalanced) 0/1 class ratio in both subsets,
# which keeps the downstream accuracy numbers comparable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
n_train = len(X_train)
n_test = len(X_test)
print(n_train, n_test)

```
3847 962
```

# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same learned
# transformation to both splits (avoids test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression baseline (lbfgs solver, iteration cap raised
# to 200 so the optimizer can converge) and score it on the held-out set.
lr = LogisticRegression(max_iter=200, solver='lbfgs')
lr.fit(X_train, y_train)
test_score_lr = lr.score(X_test, y_test)
print(f'The accuracy of predicting the test set with Logistic Regression is {round(100*test_score_lr,3)}%.')

```
The accuracy of predicting the test set with Logistic Regression is 80.457%.
```

# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron trained with the adam optimizer; the generous
# iteration cap lets training run to convergence before scoring.
ann = MLPClassifier(solver='adam', max_iter=1000)
ann.fit(X_train, y_train)
test_score_ann = ann.score(X_test, y_test)
print(f'The accuracy of predicting the test set with Artificial Neural Network is {round(100*test_score_ann,3)}%.')

```
The accuracy of predicting the test set with Artificial Neural Network is 83.992%.
```

# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
# Initializing the parameter that will track the best accuracy for training with different values of k
best_accuracy = 0
# Running the loop to find the best value of k.
# NOTE(review): selecting k by *training* accuracy always favours k=1,
# because each training point is its own nearest neighbour (100% train
# score, as the output below shows). A validation split or cross-validation
# would choose a more meaningful k — behavior kept as-is for now.
for i in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    # Checking if the accuracy has been improved and keeping track of the best k value
    if train_score > best_accuracy:
        best_accuracy = train_score
        best_k = i
        # Explanation for the IF condition: if the score with the larger k doesn't exceed
        # the score with the smaller k, there's no reason to use the larger k,
        # since it makes the problem more computationally expensive (for really large data sets)
    print(f'For the model with k = {i}, the accuracy of the training set is {round(100*train_score,3)}%.')
print(f'The best accuracy has been achieved using k = {best_k}.')

```
For the model with k = 1, the accuracy of the training set is 100.0%.
For the model with k = 2, the accuracy of the training set is 93.293%.
For the model with k = 3, the accuracy of the training set is 91.812%.
For the model with k = 4, the accuracy of the training set is 88.874%.
For the model with k = 5, the accuracy of the training set is 88.9%.
The best accuracy has been achieved using k = 1.
```

# Refitting the data with the best value of k
# Train a fresh kNN model with the winning k and evaluate on the test set.
# (KNeighborsClassifier.fit returns the fitted estimator, so the chained
# call is equivalent to constructing and fitting on separate lines.)
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
test_score_knn = knn.score(X_test, y_test)
print(f'For the model with k = {best_k}, the accuracy of predicting the test set is {round(100*test_score_knn,3)}%.')

```
For the model with k = 1, the accuracy of predicting the test set is 84.927%.
```