# Import libraries and the K-Means estimator
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Create dataframe for data; the CSV is read from the current working directory
df = pd.read_csv('cereal.csv')
df.head(6)
# Keep only the features used below. Selecting the columns directly (rather than
# building a frame from a list of Series and transposing it) keeps each column's
# own dtype; the transpose route leaves every column as generic `object`.
# .copy() makes df2 independent of df, so later edits never touch the raw data.
df2 = df[['name', 'mfr', 'sugars', 'carbo', 'rating']].copy()
# Check to see if any instances have NaN for entries. info is a method and has
# to be called; it prints the non-null count and dtype of every column.
df2.info()
#no NaN entries
# Check to see if any cereals have negative values for sugars or carbohydrates; if 2 or less cereals, drop those
# data instances; otherwise replace negative values with 0
negative_rows = (df2.sugars < 0) | (df2.carbo < 0)
df2[negative_rows]
if negative_rows.sum() <= 2:
    # Few offenders: drop them. The rows are located through the mask instead of
    # a hard-coded row label, so this keeps working if the CSV is reordered.
    df2 = df2.drop(index=df2[negative_rows].index)
else:
    # Many offenders: keep the rows and clamp the negative entries to 0.
    df2.loc[df2.sugars < 0, 'sugars'] = 0
    df2.loc[df2.carbo < 0, 'carbo'] = 0
#only one instance (row 57) in the data this was written against, so it is dropped
# Use seaborn's white-grid background for the plots that follow
sns.set_style(style='whitegrid')
# Bar chart: number of products per manufacturer code
df2.mfr.value_counts().plot.bar()
# There is only one cereal from American Home Foods Company so we drop that data sample
#
# df2[df2.mfr == 'A'] gives all data from this manufacturer.
# The result of drop has to be assigned back to df2 for the removal to stick.
#
df2[df2.mfr == 'A']
# Locate the rows through the manufacturer code instead of a hard-coded row
# label (43), so the right rows are removed even if the row labels change.
df2 = df2.drop(index=df2[df2.mfr == 'A'].index)
# For plots we would like the name of manufacturer instead of just "N" or "Q"
manufacturer_names = {
    "A": "American Home Food Products",
    "G": "General Mills",
    "K": "Kelloggs",
    "N": "Nabisco",
    "P": "Post",
    "Q": "Quaker Oats",
    "R": "Ralston Purina",
}
# Translate the one-letter codes of df2 itself (the previous version built its
# row masks from df, a different frame that still holds the rows removed from
# df2). replace() leaves any code missing from the table unchanged.
df2['manufacturer'] = df2['mfr'].replace(manufacturer_names)
# Then drop 'mfr' column. drop() returns a new frame, so the result must be
# assigned back; calling it without the assignment leaves 'mfr' in place.
df2 = df2.drop(columns=['mfr'])
#
# Plot to see which manufacturer has highest rated cereals:
# one categorical strip per manufacturer, coloured by manufacturer
sns.catplot(data=df2, x='manufacturer', y='rating', hue='manufacturer')
# Tilt the manufacturer names so the long labels do not overlap
plt.xticks(rotation=45)
# Observation: Kelloggs has the highest individual rating, but Nabisco appears
# to have the highest average ratings when viewing the data as a whole.
# Which cereal is rated highest?
# Kelloggs has the highest individual rating, with the specific cereal being
# "All-Bran with Extra Fiber."
# Look at sugars per brand by plotting:
# one categorical strip per manufacturer, coloured by manufacturer
sns.catplot(data=df2, x='manufacturer', y='sugars', hue='manufacturer')
# Tilt the manufacturer names so the long labels do not overlap
plt.xticks(rotation=45)
# Observation: Nabisco appears to have the lowest overall sugar content in
# their cereals, with numerous Nabisco products having 0 grams of sugar.
# Get data for clustering. KMeans.fit takes a 2-D (n_samples, n_features) array;
# selecting with a list of one column yields that shape directly.
X = df2[['sugars']].to_numpy()
# Form model, fit data and print out cluster centers.
# n_init is spelled out because its default differs between scikit-learn
# releases; random_state fixes the random initialisation.
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
y_km = km.fit(X)  # fit() returns the fitted estimator itself, not the labels
print("Cluster centers for sugars")
print()
print(km.cluster_centers_)  # one row per cluster; a single column because X has one feature
# Centers recorded from an earlier run:
# Cluster centers for sugars
# [[ 1.95833333]
# [ 6.84 ]
# [12.03846154]]
# Add column to dataframe for this clusters, say sugars_clusters
df2['sugars_clusters'] = km.labels_
df2.head()
# Plot the sugar values grouped by the cluster label each cereal was assigned
sns.catplot(data=df2, x='sugars_clusters', y='sugars')
# Determine which cluster number corresponds to lowest, middle and highest level and create a new
# column in dataframe using .map
#
# K-Means numbers its clusters arbitrarily, so the label -> level table is
# derived from the fitted centers instead of being typed in by hand:
# argsort lists the cluster labels from the smallest center to the largest.
sugar_labels_low_to_high = np.argsort(km.cluster_centers_.ravel())
sugar_level_names = {
    int(label): level
    for label, level in zip(sugar_labels_low_to_high,
                            ['low sugar', 'mid sugar', 'high sugar'])
}
df2['sugars_clusters_levels'] = df2['sugars_clusters'].map(sugar_level_names)
df2.head()
# How are cereals distributed among the 3 levels?
# Sugar values per level, coloured by manufacturer
sns.catplot(data=df2, x='sugars_clusters_levels', y='sugars', hue='manufacturer')
# Which cereals have the highest sugar levels: rows labelled 'high sugar'
df2.loc[df2['sugars_clusters_levels'] == 'high sugar']
# Which cereals have the lowest sugar levels: rows labelled 'low sugar'
df2.loc[df2['sugars_clusters_levels'] == 'low sugar']
# If you eat a particular cereal like Apple Jacks, Froot Loops, etc. what cluster is it in?
my_cereal = 'Apple Jacks'
# Select the level column for the rows whose name matches; the result is a
# Series, so printing it shows the row label next to the level.
my_cereal_level = df2.loc[df2['name'] == my_cereal, 'sugars_clusters_levels']
print(f"The data instance and sugar cluster level for {my_cereal} is ",
      my_cereal_level)
# Output recorded from an earlier run:
# The data instance and sugar cluster level for Apple Jacks is 6 high sugar
# Name: sugars_clusters_levels, dtype: object
# Get data for clustering. KMeans.fit takes a 2-D (n_samples, n_features) array;
# selecting with a list of one column yields that shape directly.
X = df2[['carbo']].to_numpy()
# Form model, fit data and print out cluster centers.
# n_init is spelled out because its default differs between scikit-learn
# releases; random_state fixes the random initialisation.
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
y_km = km.fit(X)  # fit() returns the fitted estimator itself, not the labels
print("Cluster centers for carbohydrates")
print()
print(km.cluster_centers_)  # one row per cluster; a single column because X has one feature
# Centers recorded from an earlier run:
# Cluster centers for carbohydrates
# [[15.59677419]
# [11.08333333]
# [20.92857143]]
# Add column to dataframe for this clusters, say carbs_clusters
df2['carbs_clusters'] = km.labels_
df2.head()
# Determine which cluster number corresponds to lowest, middle and highest level and create a new
# column in dataframe using .map
#
# K-Means numbers its clusters arbitrarily, so the label -> level table is
# derived from the fitted centers instead of being typed in by hand:
# argsort lists the cluster labels from the smallest center to the largest.
carb_labels_low_to_high = np.argsort(km.cluster_centers_.ravel())
carb_level_names = {
    int(label): level
    for label, level in zip(carb_labels_low_to_high,
                            ['low carbs', 'mid carbs', 'high carbs'])
}
df2['carbs_clusters_levels'] = df2['carbs_clusters'].map(carb_level_names)
df2.head()
# Plot the carbohydrate values per level, coloured by manufacturer
sns.catplot(data=df2, x='carbs_clusters_levels', y='carbo', hue='manufacturer')
# Row masks for the level combinations queried below
is_low_sugar = df2['sugars_clusters_levels'] == 'low sugar'
is_high_sugar = df2['sugars_clusters_levels'] == 'high sugar'
is_low_carbs = df2['carbs_clusters_levels'] == 'low carbs'
is_high_carbs = df2['carbs_clusters_levels'] == 'high carbs'
# What cereals are low sugar and low carbs?
df2[is_low_sugar & is_low_carbs]
# What cereals are low sugar and high carbs?
df2[is_low_sugar & is_high_carbs]
# What cereals are high sugar and high carbs?
df2[is_high_sugar & is_high_carbs]
# none (by our clusters)