Analysis of Pizza Data
%pip install graphviz
Collecting graphviz
Downloading graphviz-0.16-py2.py3-none-any.whl (19 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.16
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import sklearn
import graphviz
import joblib
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition,datasets,preprocessing,svm,metrics
from sklearn.decomposition import PCA,IncrementalPCA
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.linear_model import RANSACRegressor,HuberRegressor,LinearRegression
from sklearn.svm import SVR,SVC
from sklearn.model_selection import train_test_split,cross_val_score,KFold,RandomizedSearchCV,GridSearchCV
from sklearn.metrics import mean_squared_error,classification_report,r2_score,f1_score,jaccard_score,accuracy_score,confusion_matrix,silhouette_score
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet
from io import StringIO
from IPython.display import SVG
from sklearn.cluster import KMeans,MiniBatchKMeans,SpectralClustering,AgglomerativeClustering,DBSCAN
from IPython.display import IFrame
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
%matplotlib inline
pizza = pd.read_csv("Pizza.csv")
pizza.head()
# brand -- Pizza brand (class label)
labels = pizza['brand']
classes = pizza['brand'].unique()
print(classes)
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J']
pizza['brand'].value_counts()
le1 = preprocessing.LabelEncoder()
pizza['brand'] = le1.fit_transform(pizza['brand'])
labels = pizza['brand']
classes = pizza['brand'].unique()
print(classes)
[0 1 2 3 4 5 6 7 8 9]
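As a sanity check on the encoding, the fitted encoder's classes_ attribute records which brand letter each integer stands for; a minimal sketch using le1 from above:
# Recover the brand letter behind each encoded label (relies on le1 fitted above)
brand_map = dict(enumerate(le1.classes_))
print(brand_map)  # e.g. {0: 'A', 1: 'B', ..., 9: 'J'}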
pizza_no_id = pizza.drop(columns='id').copy()
## Check for null values
pizza_no_id.isnull().sum()
## look at variables data type before we proceed to model
pizza_no_id.dtypes
pizza_no_brand = pizza_no_id.drop(columns='brand').copy()
pizza_no_brand.head()
# look at relationship among variables
d = pd.plotting.scatter_matrix(pizza_no_brand, c = pizza.brand, figsize = (10, 10))
sc_x = StandardScaler()
sc_x.fit(pizza_no_brand.values)
X_scaled = sc_x.transform(pizza_no_brand.values)
X_scaled.shape
sklearn_pca = PCA(n_components = None)
sklearn_transf = sklearn_pca.fit_transform(X_scaled)
varianza_expl = sklearn_pca.explained_variance_ratio_
print(varianza_expl)
#s = list(zip(varianza_expl, pizza_no_brand.columns)) # do not zip the variance ratios with column names -- each component mixes all features (see the loadings sketch below)
print()
print("The first two components explain ~93% of the variability in the types of pizza. Let's visualize it:")
cum_var_exp = np.cumsum(varianza_expl)
plt.figure(figsize = (10, 6))
plt.xlabel('Number of main components')
plt.ylabel('Cumulative explained variance')
plt.title('Cumulative curve of explained variance versus number of principal components')
nc = np.arange(1, varianza_expl.shape[0] + 1)
plt.plot(nc, cum_var_exp, 'g^')
plt.plot(nc, cum_var_exp, '--r')
plt.show()
[5.95968842e-01 3.27208198e-01 5.92231918e-02 1.35963182e-02
3.95385973e-03 4.82299078e-05 1.35982576e-06]
The first two components explain ~92% of the variability in the types of pizza. Let's visualize it:
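Since the first two components carry roughly 92% of the variance, a 2-D scatter of the projected samples coloured by brand is a natural next view; a minimal sketch reusing sklearn_transf and labels from above:
# Scatter of the first two principal components, coloured by encoded brand
plt.figure(figsize=(8, 6))
plt.scatter(sklearn_transf[:, 0], sklearn_transf[:, 1], c=labels, cmap='tab10', s=30)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Pizza samples projected onto the first two principal components')
plt.colorbar(label='brand (encoded)')
plt.show()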
pizza_plot1 = pizza_no_brand.drop(['ash','sodium','cal'], axis = 1)
pizza_plot2 = pizza_no_brand.drop(['mois','prot','fat', 'carb'], axis = 1)
fig = plt.figure(figsize=(15,4))
ax0=fig.add_subplot(131)
ax1=fig.add_subplot(132)
pizza_plot1.boxplot(ax = ax0)
pizza_plot2.boxplot(ax = ax1)
fig.subplots_adjust(wspace = 0.5)
describeBasicStats = pizza_no_brand.describe()
describeBasicStats
# Flag columns whose extremes fall outside the mean +/- 3*std band (3-sigma rule);
# note abs() must not be applied to the lower bound, or a negative bound gets flipped
outlierFat1 = pizza_no_brand['fat'].max() > pizza_no_brand['fat'].mean() + 3 * pizza_no_brand['fat'].std()
outlierFat2 = pizza_no_brand['fat'].min() < pizza_no_brand['fat'].mean() - 3 * pizza_no_brand['fat'].std()
outlierNa1 = pizza_no_brand['sodium'].max() > pizza_no_brand['sodium'].mean() + 3 * pizza_no_brand['sodium'].std()
outlierNa2 = pizza_no_brand['sodium'].min() < pizza_no_brand['sodium'].mean() - 3 * pizza_no_brand['sodium'].std()
outlierCal1 = pizza_no_brand['cal'].max() > pizza_no_brand['cal'].mean() + 3 * pizza_no_brand['cal'].std()
outlierCal2 = pizza_no_brand['cal'].min() < pizza_no_brand['cal'].mean() - 3 * pizza_no_brand['cal'].std()
print('Outliers in the column fat: ' + str(outlierFat1) + ', ' + str(outlierFat2))
print('Outliers in the column sodium: ' + str(outlierNa1) + ', ' + str(outlierNa2))
print('Outliers in the column cal: ' + str(outlierCal1) + ', ' + str(outlierCal2))
Outliers in the column fat: True, True
Outliers in the column sodium: True, True
Outliers in the column cal: False, False
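The 3-sigma rule above assumes roughly bell-shaped columns; as a hedged cross-check, the IQR rule that the boxplots use can count potential outliers per column:
# IQR-based outlier count per column (1.5*IQR is the usual boxplot whisker rule)
for col in ['fat', 'sodium', 'cal']:
    q1, q3 = pizza_no_brand[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (pizza_no_brand[col] < q1 - 1.5 * iqr) | (pizza_no_brand[col] > q3 + 1.5 * iqr)
    print(col, 'IQR outliers:', int(mask.sum()))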
## Data Prep
x1 = pizza_no_id.values[:, 1:8]
print(type(x1))
x = pizza_no_brand
print(type(x))
y = pizza_no_id['brand']
print(type(y))
x.head(2)
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
## check correlation
corr = x.corr()
fig, ax = plt.subplots(figsize=(8,7))
sns.heatmap(corr, annot=True, linewidths=.5, ax=ax)
ax.autoscale(enable=True)
X = StandardScaler().fit_transform(x) # To normalize the data, it also converts x into a ndarray
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
pca = PCA(n_components=2)
ipca = IncrementalPCA(n_components=2)
lr = LinearRegression().fit(X, y) # linear regression is not well suited to predicting a categorical pizza brand; logistic regression fits better -- see the sketch below
print('Without PCA: ',lr.score(X_test, y_test))
pcaFit = pca.fit(X)
X_pca_train = pcaFit.transform(X_train)
X_pca_test = pcaFit.transform(X_test)
lr.fit(X_pca_train, y_train)
print('With PCA: ', lr.score(X_pca_test, y_test))
ipcaFit = ipca.fit(X)
X_ipca_train = ipcaFit.transform(X_train)
X_ipca_test = ipcaFit.transform(X_test)
lr.fit(X_ipca_train, y_train)
print('With IncrementalPCA: ', lr.score(X_ipca_test, y_test))
Without PCA: 0.7102500470496629
With PCA: 0.6197473404292506
With IncrementalPCA: 0.6193888218408723
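Because brand is a categorical target, the comment above points towards logistic regression; a minimal sketch on the same splits (default hyperparameters apart from max_iter, so the scores are only indicative):
# Logistic regression on the scaled features and on the 2-component PCA projection
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
print('LogisticRegression without PCA:', logreg.score(X_test, y_test))
logreg.fit(X_pca_train, y_train)
print('LogisticRegression with PCA:', logreg.score(X_pca_test, y_test))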
pizza_no_id_entropy=DecisionTreeClassifier(criterion="entropy",random_state=100,max_depth=50,min_samples_leaf=1)
pizza_no_id_entropy.fit(x, y)
y_pred = pizza_no_id_entropy.predict(x) ## entropy measures the impurity (mix of classes) of each node
print(accuracy_score(y, y_pred)*100)
print(classification_report(y, y_pred))
99.66666666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        31
           2       0.96      1.00      0.98        27
           3       1.00      0.97      0.98        32
           4       1.00      1.00      1.00        28
           5       1.00      1.00      1.00        30
           6       1.00      1.00      1.00        29
           7       1.00      1.00      1.00        33
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        32

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300
pizza_no_id_gini=DecisionTreeClassifier(criterion="gini",random_state=100,max_depth=50,min_samples_leaf=1)
pizza_no_id_gini.fit(x, y)
y_pred = pizza_no_id_gini.predict(x)
print(accuracy_score(y, y_pred)*100) ## the Gini index measures misclassification impurity
print(classification_report(y, y_pred))
99.66666666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        31
           2       0.96      1.00      0.98        27
           3       1.00      0.97      0.98        32
           4       1.00      1.00      1.00        28
           5       1.00      1.00      1.00        30
           6       1.00      1.00      1.00        29
           7       1.00      1.00      1.00        33
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        32

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300
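Both trees are scored on the very data they were fit on, so the 99.7% figures are in-sample; a quick hedged estimate with the already-imported cross_val_score gives a fairer picture (the fold count of 5 is an arbitrary choice):
# 5-fold cross-validated accuracy for both trees on the unscaled features x
for name, clf in [('entropy', pizza_no_id_entropy), ('gini', pizza_no_id_gini)]:
    scores = cross_val_score(clf, x, y, cv=5)
    print(name, 'CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))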
dot_data = export_graphviz(pizza_no_id_entropy)  # DOT source for the fitted entropy tree (features appear as X[0]..X[6])
graph = graphviz.Source(dot_data)
SVG(graph.pipe(format='svg'))
print(graph.source)
digraph Tree {
node [shape=box] ;
0 [label="X[0] <= 41.935\nentropy = 3.319\nsamples = 300\nvalue = [29, 31, 27, 32, 28, 30, 29, 33, 29, 32]"] ;
1 [label="X[0] <= 33.215\nentropy = 2.32\nsamples = 149\nvalue = [29, 0, 0, 0, 28, 30, 29, 33, 0, 0]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="X[3] <= 3.205\nentropy = 1.656\nsamples = 88\nvalue = [28, 0, 0, 0, 0, 30, 29, 1, 0, 0]"] ;
1 -> 2 ;
3 [label="X[1] <= 8.205\nentropy = 1.105\nsamples = 60\nvalue = [0, 0, 0, 0, 0, 30, 29, 1, 0, 0]"] ;
2 -> 3 ;
4 [label="X[3] <= 1.435\nentropy = 0.969\nsamples = 40\nvalue = [0, 0, 0, 0, 0, 29, 10, 1, 0, 0]"] ;
3 -> 4 ;
5 [label="X[1] <= 7.845\nentropy = 1.245\nsamples = 19\nvalue = [0, 0, 0, 0, 0, 9, 9, 1, 0, 0]"] ;
4 -> 5 ;
6 [label="X[6] <= 3.465\nentropy = 0.592\nsamples = 7\nvalue = [0, 0, 0, 0, 0, 6, 0, 1, 0, 0]"] ;
5 -> 6 ;
7 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"] ;
6 -> 7 ;
8 [label="entropy = 0.0\nsamples = 6\nvalue = [0, 0, 0, 0, 0, 6, 0, 0, 0, 0]"] ;
6 -> 8 ;
9 [label="X[5] <= 45.255\nentropy = 0.811\nsamples = 12\nvalue = [0, 0, 0, 0, 0, 3, 9, 0, 0, 0]"] ;
5 -> 9 ;
10 [label="X[2] <= 19.665\nentropy = 0.811\nsamples = 4\nvalue = [0, 0, 0, 0, 0, 3, 1, 0, 0, 0]"] ;
9 -> 10 ;
11 [label="entropy = 0.0\nsamples = 3\nvalue = [0, 0, 0, 0, 0, 3, 0, 0, 0, 0]"] ;
10 -> 11 ;
12 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"] ;
10 -> 12 ;
13 [label="entropy = 0.0\nsamples = 8\nvalue = [0, 0, 0, 0, 0, 0, 8, 0, 0, 0]"] ;
9 -> 13 ;
14 [label="X[5] <= 48.235\nentropy = 0.276\nsamples = 21\nvalue = [0, 0, 0, 0, 0, 20, 1, 0, 0, 0]"] ;
4 -> 14 ;
15 [label="entropy = 0.0\nsamples = 20\nvalue = [0, 0, 0, 0, 0, 20, 0, 0, 0, 0]"] ;
14 -> 15 ;
16 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"] ;
14 -> 16 ;
17 [label="X[0] <= 30.795\nentropy = 0.286\nsamples = 20\nvalue = [0, 0, 0, 0, 0, 1, 19, 0, 0, 0]"] ;
3 -> 17 ;
18 [label="entropy = 0.0\nsamples = 19\nvalue = [0, 0, 0, 0, 0, 0, 19, 0, 0, 0]"] ;
17 -> 18 ;
19 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"] ;
17 -> 19 ;
20 [label="entropy = 0.0\nsamples = 28\nvalue = [28, 0, 0, 0, 0, 0, 0, 0, 0, 0]"] ;
2 -> 20 ;
21 [label="X[4] <= 0.435\nentropy = 1.101\nsamples = 61\nvalue = [1, 0, 0, 0, 28, 0, 0, 32, 0, 0]"] ;
1 -> 21 ;
22 [label="X[0] <= 35.4\nentropy = 0.787\nsamples = 34\nvalue = [0, 0, 0, 0, 8, 0, 0, 26, 0, 0]"] ;
21 -> 22 ;
23 [label="X[6] <= 3.335\nentropy = 0.98\nsamples = 12\nvalue = [0, 0, 0, 0, 7, 0, 0, 5, 0, 0]"] ;
22 -> 23 ;
24 [label="entropy = 0.0\nsamples = 4\nvalue = [0, 0, 0, 0, 4, 0, 0, 0, 0, 0]"] ;
23 -> 24 ;
25 [label="X[4] <= 0.425\nentropy = 0.954\nsamples = 8\nvalue = [0, 0, 0, 0, 3, 0, 0, 5, 0, 0]"] ;
23 -> 25 ;
26 [label="X[4] <= 0.395\nentropy = 0.65\nsamples = 6\nvalue = [0, 0, 0, 0, 1, 0, 0, 5, 0, 0]"] ;
25 -> 26 ;
27 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"] ;
26 -> 27 ;
28 [label="entropy = 0.0\nsamples = 5\nvalue = [0, 0, 0, 0, 0, 0, 0, 5, 0, 0]"] ;
26 -> 28 ;
29 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 0, 0, 0, 2, 0, 0, 0, 0, 0]"] ;
25 -> 29 ;
30 [label="X[5] <= 37.725\nentropy = 0.267\nsamples = 22\nvalue = [0, 0, 0, 0, 1, 0, 0, 21, 0, 0]"] ;
22 -> 30 ;
31 [label="X[3] <= 1.355\nentropy = 0.918\nsamples = 3\nvalue = [0, 0, 0, 0, 1, 0, 0, 2, 0, 0]"] ;
30 -> 31 ;
32 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 0, 0, 0, 0, 0, 0, 2, 0, 0]"] ;
31 -> 32 ;
33 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"] ;
31 -> 33 ;
34 [label="entropy = 0.0\nsamples = 19\nvalue = [0, 0, 0, 0, 0, 0, 0, 19, 0, 0]"] ;
30 -> 34 ;
35 [label="X[3] <= 1.42\nentropy = 0.979\nsamples = 27\nvalue = [1, 0, 0, 0, 20, 0, 0, 6, 0, 0]"] ;
21 -> 35 ;
36 [label="X[0] <= 35.345\nentropy = 0.65\nsamples = 6\nvalue = [0, 0, 0, 0, 1, 0, 0, 5, 0, 0]"] ;
35 -> 36 ;
37 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"] ;
36 -> 37 ;
38 [label="entropy = 0.0\nsamples = 5\nvalue = [0, 0, 0, 0, 0, 0, 0, 5, 0, 0]"] ;
36 -> 38 ;
39 [label="X[5] <= 18.045\nentropy = 0.549\nsamples = 21\nvalue = [1, 0, 0, 0, 19, 0, 0, 1, 0, 0]"] ;
35 -> 39 ;
40 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"] ;
39 -> 40 ;
41 [label="X[3] <= 1.635\nentropy = 0.286\nsamples = 20\nvalue = [0, 0, 0, 0, 19, 0, 0, 1, 0, 0]"] ;
39 -> 41 ;
42 [label="entropy = 0.0\nsamples = 17\nvalue = [0, 0, 0, 0, 17, 0, 0, 0, 0, 0]"] ;
41 -> 42 ;
43 [label="X[1] <= 8.055\nentropy = 0.918\nsamples = 3\nvalue = [0, 0, 0, 0, 2, 0, 0, 1, 0, 0]"] ;
41 -> 43 ;
44 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"] ;
43 -> 44 ;
45 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 0, 0, 0, 2, 0, 0, 0, 0, 0]"] ;
43 -> 45 ;
46 [label="X[5] <= 12.075\nentropy = 2.319\nsamples = 151\nvalue = [0, 31, 27, 32, 0, 0, 0, 0, 29, 32]"] ;
0 -> 46 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
47 [label="X[1] <= 17.785\nentropy = 1.581\nsamples = 90\nvalue = [0, 31, 27, 32, 0, 0, 0, 0, 0, 0]"] ;
46 -> 47 ;
48 [label="entropy = 0.0\nsamples = 31\nvalue = [0, 31, 0, 0, 0, 0, 0, 0, 0, 0]"] ;
47 -> 48 ;
49 [label="X[1] <= 23.45\nentropy = 0.995\nsamples = 59\nvalue = [0, 0, 27, 32, 0, 0, 0, 0, 0, 0]"] ;
47 -> 49 ;
50 [label="entropy = 0.0\nsamples = 30\nvalue = [0, 0, 0, 30, 0, 0, 0, 0, 0, 0]"] ;
49 -> 50 ;
51 [label="X[2] <= 17.745\nentropy = 0.362\nsamples = 29\nvalue = [0, 0, 27, 2, 0, 0, 0, 0, 0, 0]"] ;
49 -> 51 ;
52 [label="X[2] <= 17.455\nentropy = 0.65\nsamples = 12\nvalue = [0, 0, 10, 2, 0, 0, 0, 0, 0, 0]"] ;
51 -> 52 ;
53 [label="X[4] <= 0.605\nentropy = 0.439\nsamples = 11\nvalue = [0, 0, 10, 1, 0, 0, 0, 0, 0, 0]"] ;
52 -> 53 ;
54 [label="entropy = 0.0\nsamples = 9\nvalue = [0, 0, 9, 0, 0, 0, 0, 0, 0, 0]"] ;
53 -> 54 ;
55 [label="entropy = 1.0\nsamples = 2\nvalue = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]"] ;
53 -> 55 ;
56 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"] ;
52 -> 56 ;
57 [label="entropy = 0.0\nsamples = 17\nvalue = [0, 0, 17, 0, 0, 0, 0, 0, 0, 0]"] ;
51 -> 57 ;
58 [label="X[0] <= 50.665\nentropy = 0.998\nsamples = 61\nvalue = [0, 0, 0, 0, 0, 0, 0, 0, 29, 32]"] ;
46 -> 58 ;
59 [label="entropy = 0.0\nsamples = 32\nvalue = [0, 0, 0, 0, 0, 0, 0, 0, 0, 32]"] ;
58 -> 59 ;
60 [label="entropy = 0.0\nsamples = 29\nvalue = [0, 0, 0, 0, 0, 0, 0, 0, 29, 0]"] ;
58 -> 60 ;
}
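If a file on disk is preferred over the inline SVG, the same Source object can be rendered directly; a minimal sketch (the output name 'pizzaTree' is just an example, and the Graphviz binaries must be installed):
# Write the rendered tree to pizzaTree.png and remove the intermediate DOT file
graph.render('pizzaTree', format='png', cleanup=True)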
x = pizza_no_brand
X = preprocessing.StandardScaler().fit(x).transform(x.astype(float))
y = pizza_no_id['brand'].values
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.15, random_state=4,stratify=y)
Ks = 50
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
for n in range(1, Ks):
    knc = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = knc.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n-1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
plt.plot(range(1,Ks),mean_acc,'g')
plt.fill_between(range(1,Ks), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
plt.legend(('Accuracy', '+/- 1 std'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()
print( "The best test accuracy was with", mean_acc.max(), "with k =", mean_acc.argmax()+1)
print("Train set Accuracy: ", metrics.accuracy_score(y_train, knc.predict(X_train)))
The best test accuracy was with 0.8888888888888888 with k = 31
Train set Accuracy: 0.6980392156862745
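To close the loop, KNN can be refit at the best k found above and inspected on the held-out test set with the metrics already imported; a minimal sketch:
# Refit KNN at the best k from the search and report test-set performance
best_k = int(mean_acc.argmax() + 1)
knn_best = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_best = knn_best.predict(X_test)
print(confusion_matrix(y_test, y_best))
print(classification_report(y_test, y_best))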