import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe
# Taken from the first block of code
df = pd.read_csv('white_wine_quality.csv', sep=';')
df.head()
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# A rating of 7 or higher is acceptable
df["target"] = (df["quality"] >= 7).astype(int)
# Step 2. NaN values?
if df.isnull().values.any():
    print("NaN values found")
else:
    print("No NaN values found")
# Step 3. Quality groupings
for i in range(3, 11):
    print(f"Quality: {i}, Number in Group: {len(df[df.quality == i])}")
# Step 3. Countplot
sns.set_style("whitegrid")
sns.countplot(data=df, x="quality")
plt.show()
# Step 4. For each feature, determine whether any data instance is an
# outlier (z-score greater than 5); if so, drop that data instance.
# Only the feature columns are screened, not the quality/target labels.
for feature in df.columns.drop(["quality", "target"]):
    z = (df[feature] - df[feature].mean()) / df[feature].std()
    outliers = z > 5
    print(f"Number of data instances dropped from {feature} is {outliers.sum()}")
    df = df[~outliers]
# Step 5. Get the data into the correct form
df_mod = df.drop(["quality", "target"], axis=1)
X = df_mod.values
Y = df["target"].to_numpy()
print(f"X = {X.shape[1]} features * {X.shape[0]} entries")
print(f"Y = {len(Y)} data entries")
# Step 6. Split the data into training and test sets with an 80-20 split.
# A fixed random_state is assumed here purely so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(f"Training: {len(X_train)}, Testing: {len(X_test)}")
# Step 7. Scale the data. The scaler is fit on the training set only and
# then applied to both sets, so no test-set statistics leak into training.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
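# Optional check: after scaling, each training-set column should have a mean
# of roughly 0 and a standard deviation of roughly 1.
print(np.allclose(X_train.mean(axis=0), 0), np.allclose(X_train.std(axis=0), 1))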
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver="lbfgs")
lr.fit(X_train, Y_train)  # fit on the training set, not the test set
percent = round(100 * lr.score(X_test, Y_test), 2)
print(f"Logistic Regression accuracy is {percent}%")
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=300, solver="adam")
mlp.fit(X_train, Y_train)
ann_percent = round(100 * mlp.score(X_test, Y_test), 2)
print(f"ANN Accuracy is {ann_percent}%")
# Step 10. Create kNN classifier and see what value of k is best.
# Accuracy is measured on the held-out test set; scoring on the training
# set would trivially favor k = 1, since there each point is its own
# nearest neighbor.
from sklearn.neighbors import KNeighborsClassifier
best_score = 0
best_k = 1
for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    knn_percent = round(100 * knn.score(X_test, Y_test), 2)
    print(f"Nearest neighbors: {k}, Accuracy = {knn_percent}%")
    if knn_percent > best_score:
        best_score = knn_percent
        best_k = k
print(f"kNN algorithm with {best_k} nearest neighbor(s) has an accuracy of {best_score}%")
# Results reported from the original run:
#   Logistic Regression accuracy: 80.19%
#   ANN accuracy: 83.0%
#   kNN accuracy: 100.0%
# Note that the 100% kNN figure was computed on the training set, where
# k = 1 matches every point to itself; it reflects memorization rather
# than generalization. Once all three classifiers are scored on the same
# held-out test set (as above), they can be compared on equal footing,
# and the 100% figure should not be expected to hold.