Final Project - Duplicate

import numpy as np import matplotlib.pyplot as plt from pandas import DataFrame import pandas as pd import seaborn as sns

# Step 1 - create dataframe
# The file is parsed with ';' as the field separator.
df = pd.read_csv('white_wine_quality.csv', sep=";")

# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# Scores 0-6 map to 0 and scores 7-10 map to 1. A score with no entry in the
# mapping is left as NaN by .map().
ACCEPTABLE_FROM = 7
quality_to_target = {
    score: int(score >= ACCEPTABLE_FROM) for score in range(11)
}
df['target'] = df['quality'].map(quality_to_target)

# Step 2. Nan values?
# Reduce the per-cell null mask to a single flag, then report it.
has_missing = df.isnull().values.any()
print("Nan values found." if has_missing else "No Nan values found.")

# Step 3. Quality groupings and countplot
# Print how many rows carry each quality score from 3 through 10. A score
# that never occurs is still listed, with a count of 0.
for score in range(3, 11):
    row_count = int((df['quality'] == score).sum())
    # The second field is a row count, so it is labelled as one.
    print(f"Quality: {score}, Count: {row_count}")

# Bar chart of the same distribution.
sns.set_style('whitegrid')
sns.countplot(data=df, x='quality')

# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
print(f"Number of data entires: {len(df)}")

# A row counts as an outlier for a feature when its z-score is above this
# limit. Only the high side is tested, matching the original criterion.
# NOTE(review): switch to z.abs() if low-side outliers should also be removed.
Z_SCORE_LIMIT = 5

# 'quality' and 'target' are the labels rather than features, so they are
# not screened. Assumes every remaining column is numeric - TODO confirm.
feature_columns = [
    column for column in df.columns if column not in ('quality', 'target')
]

for feature in feature_columns:
    values = df[feature]
    # ddof=0 gives the population standard deviation, the same as np.std.
    z = (values - values.mean()) / values.std(ddof=0)
    is_outlier = z > Z_SCORE_LIMIT
    count = int(is_outlier.sum())
    # Filtering with a boolean mask selects rows by alignment with df's own
    # index, so it stays correct after earlier features have removed rows
    # (dropping by positional number would not).
    df = df[~is_outlier]
    print("Number of data instances dropped is ", count, f"({feature})")

# Step 5. get data into correct form
# X is every column except the two label columns; y is the binary label.
label_columns = ['quality', 'target']
df_mod = df.drop(columns=label_columns)
X = df_mod.values
y = df['target'].to_numpy()

# The feature count is read from the width of row 1 of X.
n_features = len(X[1])
n_entries = len(X)
print(f"X = {n_features} features x {n_entries} entires")
print(f"Y = {len(y)} data entries")

# Step 6. Split data in test and trial sets with 80-20 split
from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and every accuracy computed from it,
# repeatable between runs. stratify=y keeps the proportion of acceptable
# wines the same in the training and test sets.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y
)
print(f"Training: {len(X_train)}, Testing: {len(X_test)}")

# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only; that same fitted transform
# is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs')
# Fit on the training split only. The test split is kept for scoring, so the
# printed accuracy is measured on rows the model was not fitted on.
lr.fit(X_train, y_train)
percent = round(100 * lr.score(X_test, y_test), 2)
print(f"Logisitic Regression accuracy = {percent}%")

# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier

# Fit on the training split, then score on the test split.
mlp = MLPClassifier(solver='adam', max_iter=300).fit(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)
percent = round(test_accuracy * 100, 2)
print(f"ANN accuracy = {percent}%")

# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier

# Best test-set accuracy seen so far and the k that produced it. The name
# best_percent is used so the builtin max() is not shadowed.
best_percent = 0
index = None
rng = [1, 2, 3, 4, 5]
for i in rng:
    knni = KNeighborsClassifier(n_neighbors=i)
    # fitting the data
    knni.fit(X_train, y_train)
    # Score on the test split: scoring on the rows the model was fitted on
    # lets k=1 match each point to itself, which inflates the accuracy.
    percent = round(100 * knni.score(X_test, y_test), 2)
    print(f"Nearest neighbor {i}, Quality = {percent}%")
    # The first k always becomes the initial best, so index is always bound
    # by the time the summary line is printed.
    if index is None or percent > best_percent:
        best_percent = percent
        index = i
print(f"kNN algorithm with {index} nearest neighbor percent accuracy = {best_percent}%")

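The model with the highest reported accuracy was the kNN with 1 nearest neighbor. This model obtained 100 percent accuracy, while the LR and ANN models were around 81 to 83 percent, which makes them look like sub-optimal choices. Note, however, that a 1-nearest-neighbor classifier typically reaches 100 percent when it is scored on the same data it was fitted on, because each point is its own nearest neighbor; the kNN figure should therefore be confirmed on the held-out test set before the three models are compared.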