import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
import pandas as pd
import seaborn as sns

# Step 1 - build the dataframe from the semicolon-delimited CSV
# (UCI white wine quality dataset: 11 physicochemical features + quality score)
csv_path = 'white_wine_quality.csv'
df = pd.read_csv(csv_path, sep=';')

# Peek at the first rows (only displays inside a notebook; harmless in a script)
df.head()

# df.head() output, first two columns shown (both float64):
#
#    fixed acidity  volatile acidity
# 0            7.0              0.27
# 1            6.3              0.30
# 2            8.1              0.28
# 3            7.2              0.23
# 4            7.2              0.23

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# Rating of 7 or higher is acceptable.
# A vectorized comparison replaces the exhaustive 0..10 dict map: it gives the
# same labels for every valid quality value and cannot silently produce NaN
# for an unexpected score the way an incomplete .map() dictionary would.
df["target"] = (df["quality"] >= 7).astype(int)

# Step 2. Check whether any cell in the dataframe is missing (NaN).
# (Indentation restored: the notebook export had flattened the if/else bodies,
# which is a SyntaxError in plain Python.)
if df.isnull().values.any():
    print("NaN values found")
else:
    print("No NaN values found")

```
No NaN values found
```

# Step 3. Report how many wines fall into each quality score (3..10).
# (Indentation restored: the export had flattened the loop body.)
for i in range(3, 11):
    print(f"Quality: {i}, Number in Group: {len(df[df.quality == i])}")

```
Quality: 3, Number in Group: 20
Quality: 4, Number in Group: 163
Quality: 5, Number in Group: 1457
Quality: 6, Number in Group: 2198
Quality: 7, Number in Group: 880
Quality: 8, Number in Group: 175
Quality: 9, Number in Group: 5
Quality: 10, Number in Group: 0
```

# Step 3. Bar chart of the number of wines at each quality score.
sns.set_style("whitegrid")
sns.countplot(x="quality", data=df)

# Step 4. For each feature, delete any data instance whose z-score exceeds 5
# (one-sided, matching the original threshold; z-scores are recomputed on the
# surviving rows before each feature is examined).
#
# BUG FIX: the original dropped rows with df.drop([i]) where i was a POSITION
# in the feature array, but DataFrame.drop removes by index LABEL. After the
# first row is dropped, positions and labels no longer line up, so the wrong
# rows were removed. Collecting the actual index labels fixes this.
for feature in df.columns:
    values = df[feature].values
    mean = np.mean(values)
    std = np.std(values)
    z = (values - mean) / std          # vectorized z-score for every row
    outliers = df.index[z > 5]         # index labels, not positions
    df = df.drop(outliers)
    print(f"Number of data instances dropped from {feature} is {len(outliers)}")

```
Number of data instances dropped from fixed acidity is 2
Number of data instances dropped from volatile acidity is 1
Number of data instances dropped from citric acid is 8
Number of data instances dropped from residual sugar is 2
Number of data instances dropped from chlorides is 56
Number of data instances dropped from free sulfur dioxide is 7
Number of data instances dropped from total sulfur dioxide is 2
Number of data instances dropped from density is 2
Number of data instances dropped from pH is 0
Number of data instances dropped from sulphates is 2
Number of data instances dropped from alcohol is 0
Number of data instances dropped from quality is 0
Number of data instances dropped from target is 0
```

# Step 5. Assemble the feature matrix X (11 physicochemical columns) and the
# label vector Y (binary acceptability target) for scikit-learn.
features = df.drop(columns=["quality", "target"])
X = features.values
Y = df["target"].to_numpy()
print(f"X = {len(X[1])} features * {len(X)} entries")
print(f"Y = {len(Y)} data entries")

```
X = 11 features * 4795 entries
Y = 4795 data entries
```

# Step 6. Hold out 20% of the data for testing; the remaining 80% trains the models.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(f"Training: {len(X_train)}, Testing: {len(X_test)}")

```
Training: 3836, Testing: 959
```

# Step 7. Standardize features to zero mean / unit variance. The scaler is
# fit on the training split only, then applied to both splits, so no test-set
# statistics leak into training.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 8. Logistic Regression baseline.
# BUG FIX: the original called lr.fit(X_test, Y_test) — training on the test
# set and then scoring on that same set, which both leaks the test data and
# makes the reported accuracy meaningless. Fit on the training split instead.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver="lbfgs")
lr.fit(X_train, Y_train)
percent = round(100 * lr.score(X_test, Y_test), 2)
print(f"Logistic Regression accuracy is {percent}%")

```
Logistic Regression accuracy is 80.19%
```

# Step 9. Create an ANN (multi-layer perceptron) classifier and train it.
# The pasted run shows a ConvergenceWarning at max_iter=300, so the iteration
# budget is raised to let the adam optimizer actually converge.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=1000, solver="adam")
mlp.fit(X_train, Y_train)
ann_percent = round(100 * mlp.score(X_test, Y_test), 2)
print(f"ANN Accuracy is {ann_percent}%")

```
ANN Accuracy is 83.0%
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
ConvergenceWarning,
```

# Step 10. Create kNN classifiers for k = 1..5 and keep the best.
# BUG FIXES vs. the original:
#   * accuracy was measured on the TRAINING set, so k=1 scored a trivial 100%
#     (every point is its own nearest neighbor); score on the held-out test
#     set instead so the comparison with LR/ANN is fair;
#   * the builtin name `max` was shadowed by the running best score;
#   * `index` was only assigned inside the if, so it could be undefined.
from sklearn.neighbors import KNeighborsClassifier

best_score = 0
best_k = 1
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    score = round(100 * knn.score(X_test, Y_test), 2)
    print(f"Nearest neighbor {k}, Quality = {score}%")
    if score > best_score:
        best_score = score
        best_k = k
print(f"kNN algorithm with {best_k} nearest neighbor has a percent accuracy of {best_score}%")

```
Nearest neighbor 1, Quality = 100.0%
Nearest neighbor 2, Quality = 93.04%
Nearest neighbor 3, Quality = 91.92%
Nearest neighbor 4, Quality = 88.66%
Nearest neighbor 5, Quality = 88.63%
kNN algorithm with 1 nearest neighbor has a percent accuracy of 100.0%
```

# Logistic Regression accuracy: 80.19% (test set)
# ANN accuracy: 83.0% (test set)
# kNN accuracy: 100.0% — NOTE(review): this figure is TRAINING-set accuracy
# (k=1 always classifies its own training points perfectly), so it cannot be
# compared with the test-set accuracies above. Once kNN is scored on the
# held-out test set, the three classifiers should be compared on equal footing
# before concluding which is most reliable.