# import libraries and K-Means function
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Load the cereal data set into a dataframe and preview the first rows
df = pd.read_csv("cereal.csv")
df.head(10)
# Keep only the columns used in this analysis
columns_of_interest = ["name", "mfr", "sugars", "carbo", "rating"]
df = df[columns_of_interest]
df.head(10)
# Use .info to check whether any column contains missing (NaN) entries
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 77 non-null object
1 mfr 77 non-null object
2 sugars 77 non-null int64
3 carbo 77 non-null float64
4 rating 77 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.1+ KB
# Look for cereals with negative values for sugars or carbohydrates. If two or
# fewer cereals are affected, drop those rows; otherwise replace the negative
# values with 0.
negative_sugars = df["sugars"] < 0
negative_carbo = df["carbo"] < 0
print(df[negative_sugars])
print(df[negative_carbo])
name mfr sugars carbo rating
57 Quaker Oatmeal Q -1 -1.0 50.828392
name mfr sugars carbo rating
57 Quaker Oatmeal Q -1 -1.0 50.828392
# Drop the rows with negative values: filter on sugars first (printing how
# many rows remain), then on carbohydrates
x = df.loc[df["sugars"] >= 0]
print(len(x))
y = x.loc[x["carbo"] >= 0]
df = y
76
# Use a white background with grid lines for the plots
sns.set_style("whitegrid")
# Plot the number of products per manufacturer
sns.histplot(x="mfr", data=df)
# Manufacturer 'A' has only one cereal, so that sample is dropped.
# df[df.mfr == 'A'] lists every row from this manufacturer; the filtered
# result must be assigned back to df, and the second histogram confirms
# that the row is gone.
from_manufacturer_a = df["mfr"] == "A"
df = df[~from_manufacturer_a]
sns.histplot(x="mfr", data=df)
# Plots read better with the manufacturer's full name than with a one-letter
# code such as "N" or "Q", so translate the code into a new 'mfr_Name' column
# and then drop the 'mfr' column.
#
# Code -> manufacturer. 'A' (American Home Food Products) is not listed, so any
# row carrying that code would get NaN as its name.
mfr_names = {
    "G": "General Mills",
    "K": "Kelloggs",
    "N": "Nabisco",
    "P": "Post",
    "Q": "Quaker Oats",
    "R": "Ralston Purina",
}
# df is the result of earlier row filtering, and assigning a column to it in
# place raises pandas' SettingWithCopyWarning. assign() builds a new dataframe
# instead, so the new column is always stored on the frame we keep.
df = df.assign(mfr_Name=df["mfr"].map(mfr_names)).drop(columns=["mfr"])
df.head(5)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import sys
# Plot rating against manufacturer to see which one has the highest rated cereals.
# Each manufacturer gets an integer code ('mfr_Brand') that is used as the x
# position, while the full name colours the points.
brand_codes = {
    "General Mills": 0,
    "Kelloggs": 1,
    "Nabisco": 2,
    "Post": 3,
    "Quaker Oats": 4,
    "Ralston Purina": 5,
}
df["mfr_Brand"] = df["mfr_Name"].map(brand_codes)
sns.relplot(x="mfr_Brand", y="rating", data=df, hue="mfr_Name")
# Which cereal is rated highest? Find the top rating, then select its row(s)
cmax = df["rating"].to_numpy().max()
df[df["rating"] == cmax]
# Plot sugars for each brand
sns.relplot(data=df, x="mfr_Brand", y="sugars", hue="mfr_Name")
# Get the sugar values for clustering. X pairs every value with a constant 0,
# and ykmeans holds one 0 per value so the points are drawn on a single
# horizontal line.
xkmeans = df["sugars"].values
X = [[sugar, 0] for sugar in xkmeans]
ykmeans = [0] * len(xkmeans)
plt.plot(xkmeans, ykmeans, "ro")
plt.show()
# Form the model, fit it to the sugar values and show each cereal's cluster
print(len(df.sugars))
# KMeans.fit expects a 2-D array of shape (n_samples, n_features). Selecting
# the column with a list gives an (n_rows, 1) array for any number of rows.
# Calling np.reshape on the Series with a hard-coded (75, 1) raised
# "ValueError: Data must be 1-dimensional" and would break on any other row count.
ZZ = df[["sugars"]].to_numpy()
kmeans = KMeans(n_clusters=3, random_state=0).fit(ZZ)
kmeans.labels_
75
ValueError: Data must be 1-dimensional
# Print the fitted cluster centers, then store each cereal's cluster number
# in a new dataframe column named sugars_clusters
print(kmeans.cluster_centers_)
df = df.assign(sugars_clusters=kmeans.labels_)
df.head(5)
[[1.5]
[4.5]
[3. ]]
ValueError: Length of values (5) does not match length of index (75)
# Plot clusters
sns.relplot(x="mfr_Brand", y="sugars", data=df, hue="sugars_clusters")
# KMeans numbers its clusters arbitrarily, so a fixed table such as
# {0: "highest", ...} is only right by coincidence. Rank the clusters by their
# mean sugars to find which number is the lowest, middle and highest level,
# then create the new label column using .map
sugar_cluster_means = df.groupby("sugars_clusters")["sugars"].mean().sort_values()
sugar_levels = dict(zip(sugar_cluster_means.index, ["lowest", "middle", "highest"]))
df["sugars_clusters_label"] = df["sugars_clusters"].map(sugar_levels)
df.head(10)
# How are cereals distributed among the 3 levels? Count by label so the
# numbers always match the headings printed above them.
sugar_level_counts = df["sugars_clusters_label"].value_counts()
print("high")
print(sugar_level_counts.get("highest", 0))
print("low")
print(sugar_level_counts.get("lowest", 0))
print("middle")
print(sugar_level_counts.get("middle", 0))
high
26
low
24
middle
25
# Which cereals have the highest sugar levels?
# Select by the level label rather than the raw cluster number, because the
# number KMeans gives to a cluster is arbitrary.
df[df["sugars_clusters_label"] == "highest"]
# Which cereals have the lowest sugar levels?
df[df["sugars_clusters_label"] == "lowest"]
# If you eat a particular cereal like Apple Jacks, Froot Loops, etc. what cluster is it in?
my_cereal = 'Apple Jacks'
# Use df["name"] rather than df.name so the lookup always means the column,
# and check for a match first so an unknown cereal does not raise IndexError.
cereal_levels = df.loc[df["name"] == my_cereal, "sugars_clusters_label"]
if cereal_levels.empty:
    print(f"{my_cereal} is not in the data set")
else:
    print(f"The data instance and sugar cluster for {my_cereal} is ", cereal_levels.iloc[0])
The data instance and sugar cluster for Apple Jacks is highest
# Repeat the clustering for carbohydrates. Every carbo value is paired with a
# constant 0 as a second feature before fitting.
xkmeans = df["carbo"].values
X_carb = [[carbs, 0] for carbs in xkmeans]
ykmeans = [0] * len(xkmeans)
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_carb)
kmeans.labels_
# Print the cluster centers and store each cereal's cluster number
print(kmeans.cluster_centers_)
df["carbo_clusters"] = kmeans.labels_
df.head(5)
[[20.41176471 0. ]
[10.38636364 0. ]
[14.81944444 0. ]]
# Plot the carbohydrate clusters
sns.relplot(x="mfr_Brand", y="carbo", data=df, hue="carbo_clusters")
# KMeans numbers its clusters arbitrarily, so rank the clusters by their mean
# carbo to find which number is the lowest, middle and highest level instead
# of relying on a fixed {0: "highest", ...} table.
carbo_cluster_means = df.groupby("carbo_clusters")["carbo"].mean().sort_values()
carbo_levels = dict(zip(carbo_cluster_means.index, ["lowest", "middle", "highest"]))
df["carbo_clusters_label"] = df["carbo_clusters"].map(carbo_levels)
df.head(10)
# Count the cereals in each level by label so the numbers always match the
# headings printed above them.
carbo_level_counts = df["carbo_clusters_label"].value_counts()
print("high")
print(carbo_level_counts.get("highest", 0))
print("low")
print(carbo_level_counts.get("lowest", 0))
print("middle")
print(carbo_level_counts.get("middle", 0))
high
17
low
22
middle
36
# Cereals in the highest carbohydrate level that are also in the lowest sugar level
temp = df.loc[df["carbo_clusters_label"] == "highest"]
temp.loc[temp["sugars_clusters_label"] == "lowest"]
# Cereals in the lowest carbohydrate level that are also in the lowest sugar level
temp = df.loc[df["carbo_clusters_label"] == "lowest"]
temp.loc[temp["sugars_clusters_label"] == "lowest"]