import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns
# Step 1 - create dataframe from the semicolon-delimited CSV file
df = pd.read_csv('white_wine_quality.csv', sep=';')
df.head()
# Step 1 continued. Is the wine acceptable? 1 = True, 0 = False
# Quality scores 0-6 are labelled 0 and scores 7-10 are labelled 1.
outcome = [df['quality']]
df['quality'] = df['quality']
acceptable_by_score = {score: int(score >= 7) for score in range(11)}
df['outcome'] = df['quality'].map(acceptable_by_score)
df.tail()
# Step 2. Nan values? info() lists the non-null count of every column.
df.info()
row_count = len(df)
print(f"{row_count} instances of data")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
4898 instances of data
# Show every column label currently in the dataframe.
column_names = df.columns.values
print(column_names)
['fixed acidity' 'volatile acidity' 'citric acid' 'residual sugar'
'chlorides' 'free sulfur dioxide' 'total sulfur dioxide' 'density' 'pH'
'sulphates' 'alcohol' 'quality' 'outcome']
# Bar chart of how many rows carry each quality score, followed by the
# same counts as a table.
sns.countplot(data=df, x='quality')
df.groupby(by="quality").size()
# There are 7 different classes of data with the majority of the data being a 5 or a 6.
# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
# Example follows for one feature
# The step 4 example for volatile acidity originally failed with
# "NameError: name 'factor' is not defined": the z-score cutoff was never
# assigned anywhere in the visible code. It is assigned below.
# Step 4 example - volatile acidity feature
# z-score cutoff above which a row is treated as an outlier.
# NOTE(review): 3 (the usual three-sigma rule) is an assumption - confirm
# the value intended by the assignment.
factor = 3
f = df['volatile acidity'].values
mean = np.mean(f)
std = np.std(f)
# z-score of every row at once instead of one row per loop iteration.
z = (f - mean) / std
# Only unusually HIGH values are flagged (z > factor), exactly like the
# original test; compare np.abs(z) instead to catch low outliers as well.
is_outlier = z > factor
for score in z[is_outlier]:
    print(score)
count = int(is_outlier.sum())
# Filter with a boolean mask (positional) rather than df.drop([i]): `i`
# was a position in `f`, which only equals the index label while no rows
# have been removed from df yet.
df = df[~is_outlier]
print ("number of data instances dropped is ", count)
NameError: name 'factor' is not defined
# Step 4. For each feature determine if any data instance is an outlier;
# if it is delete that data instance
# This cell handles the 'fixed acidity' feature.
# z-score cutoff above which a row is treated as an outlier. The name was
# never assigned in the visible code, which made this cell raise NameError.
# NOTE(review): 3 (the usual three-sigma rule) is an assumption - confirm
# the value intended by the assignment.
factor = 3
f = df['fixed acidity'].values
mean = np.mean(f)
std = np.std(f)
# z-score of every row at once instead of one row per loop iteration.
z = (f - mean) / std
# Only unusually HIGH values are flagged (z > factor), exactly like the
# original test; compare np.abs(z) instead to catch low outliers as well.
is_outlier = z > factor
for score in z[is_outlier]:
    print(score)
count = int(is_outlier.sum())
# Filter with a boolean mask (positional) rather than df.drop([i]): `i`
# was a position in `f`, not an index label. Once an earlier step has
# removed rows the two no longer agree, so drop([i]) would delete the
# wrong row or raise KeyError.
df = df[~is_outlier]
print ("number of data instances dropped is ", count)
NameError: name 'factor' is not defined
# Step 5. get data into correct form
#
# Features are every column except the raw score and the derived label;
# the label vector is the 0/1 'outcome' column.
df_mod = df.drop(columns=['quality', 'outcome'])
X = df_mod.to_numpy()
y = df['outcome'].values
df_mod.head()
# Step 6. Split data into training and test sets with an 80-20 split
from sklearn.model_selection import train_test_split
# A fixed seed makes the split, and therefore every accuracy reported
# afterwards, repeatable from run to run. The value 1 matches the
# random_state already passed to MLPClassifier in step 9.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Step 7. Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler is fitted on the training rows only; the test rows are then
# transformed with those same fitted statistics.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Step 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
#
# Print out percent accuracy on test set, rounded to two decimals
#
test_score = round(lr.score(X_test, y_test) * 100, 2)
print("Precent of accuracy on test data using Logisitic Regression is ", test_score)
Precent of accuracy on test data using Logisitic Regression is 79.39
# Step 9. Create ANN classifier and train
from sklearn.neural_network import MLPClassifier
# NOTE: the recorded run emitted a ConvergenceWarning - the optimizer hit
# the max_iter=300 cap before converging.
mlp = MLPClassifier(max_iter=300, solver='adam', random_state=1)
mlp.fit(X_train, y_train)
ann_accuracy = 100 * mlp.score(X_test, y_test)
print("Precent of accuracy on test data using ANN is ", ann_accuracy)
Precent of accuracy on test data using ANN is 82.0408163265306
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
% self.max_iter, ConvergenceWarning)
# Step 10. Create kNN classifier and see what value of k is best
from sklearn.neighbors import KNeighborsClassifier
k = np.arange(1, 6)
accuracy = []
# Train one classifier per neighbour count (1 through 5) and record its
# test-set accuracy as a percentage.
for i in k:
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    accuracy.append(100 * knn.score(X_test, y_test))
print(accuracy)
#
# Accuracy as a function of k
#
plt.plot(k, accuracy)
[85.0, 84.08163265306122, 82.55102040816327, 81.53061224489797, 83.06122448979592]