# import libraries and K-Means function
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#
from sklearn.cluster import KMeans
# Load the raw cereal data set into a dataframe
file = 'cereal.csv'
cereal = pd.read_csv(file)
# Discard every feature this analysis does not use; the columns that remain are
# name, mfr, carbo, sugars, and rating
unused_columns = [
    'type', 'calories', 'protein', 'fat', 'sodium', 'fiber',
    'potass', 'vitamins', 'shelf', 'weight', 'cups',
]
cereal = cereal.drop(columns=unused_columns)
# Row count only (displaying the whole frame or .head() formatted badly in the pdf export)
cereal.shape[0]
# .info() lists the non-null count per column, which reveals any NaN entries
cereal.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 77 non-null object
1 mfr 77 non-null object
2 carbo 77 non-null float64
3 sugars 77 non-null int64
4 rating 77 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.1+ KB
# Check to see if any cereals have negative values for sugars or carbohydrates.
# Policy: if 2 or fewer cereals are affected, drop those data instances; otherwise replace
# the negative values with 0.
# The affected rows are found with a boolean mask instead of a hard-coded row label, so the
# cleanup still targets the right rows if the CSV is reordered or updated.
negative_rows = (cereal.sugars < 0) | (cereal.carbo < 0)
cereal[negative_rows]
# Address negative values
if negative_rows.sum() <= 2:
    # Few offenders: remove them (the remaining rows keep their original index labels)
    cereal = cereal.drop(index=cereal[negative_rows].index)
else:
    # Many offenders: keep the rows but floor the two features at 0
    cereal[['sugars', 'carbo']] = cereal[['sugars', 'carbo']].clip(lower=0)
# testing to see if the negative rows were dropped
len(cereal)
# testing to check if negative values are still present (both selections should be empty)
cereal[cereal.sugars < 0]
cereal[cereal.carbo < 0]
# set background grid for plots
sns.set_style('whitegrid')
# Plot number of products per manufacturer
mfr = sns.countplot(x='mfr', data=cereal)
# Manufacturer 'A' (American Home Food Products) has only one cereal, so we drop that
# data sample.
#
# cereal[cereal.mfr == 'A'] shows all data from this manufacturer. The rows to drop are
# selected by manufacturer code rather than by a hard-coded row label, so the right rows are
# removed even if the CSV ordering changes.
#
cereal[cereal.mfr == 'A']
cereal = cereal.drop(index=cereal[cereal.mfr == 'A'].index)
# Re-plot to confirm manufacturer 'A' is gone
mfr = sns.countplot(x='mfr', data=cereal)
# Plots read better with the full manufacturer name than with a one-letter code such as
# "N" or "Q": translate each code with .map, then discard the original 'mfr' column
manufacturer_names = {
    'A': 'American Home Food Products',
    'G': 'General Mills',
    'K': 'Kelloggs',
    'N': 'Nabisco',
    'P': 'Post',
    'Q': 'Quaker Oats',
    'R': 'Ralston Purina',
}
cereal['mfr_name'] = cereal['mfr'].map(manufacturer_names)
cereal = cereal.drop('mfr', axis=1)
# Ratings per manufacturer: shows which manufacturer has the highest rated cereals
sns.catplot(data=cereal, x='mfr_name', y='rating')
plt.xticks(rotation=30)
# Row(s) holding the single highest rating
top_rating = cereal['rating'].max()
cereal[cereal['rating'] == top_rating]
# Sugar content per manufacturer
mfr = sns.catplot(data=cereal, x='mfr_name', y='sugars')
plt.xticks(rotation=30)
# Get data for clustering: the sugars column is reshaped into a single-feature column
# vector of shape (n_samples, 1), the 2-D layout the estimator is fitted on
X = cereal.sugars.values.reshape(-1, 1)
# Form model and fit data.
# n_init is pinned explicitly so the number of random restarts does not depend on the
# installed scikit-learn version's default (some releases also warn when it is omitted).
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
# NOTE: fit() returns the fitted estimator itself, so y_km is the same object as km;
# the per-sample cluster labels live in km.labels_
y_km = km.fit(X)
# Print out cluster centers
print ("Sugar cluster centers\n")
print (f"{km.cluster_centers_}")
Sugar cluster centers
[[ 1.95833333]
[ 6.84 ]
[12.03846154]]
# Add a column to the dataframe holding each cereal's sugar cluster label
cereal['sugar_cluster'] = km.labels_
cereal.tail()
# Plot clusters
sns.catplot(x='sugar_cluster', y='sugars', data=cereal)
# Determine which cluster number corresponds to the lowest, middle and highest level and
# create a new column in the dataframe using .map.
# KMeans numbers its clusters arbitrarily, so a fixed {0: low, 1: medium, 2: high} mapping is
# only right by coincidence. Instead, rank the clusters by their mean sugar content and name
# them in ascending order.
sugar_cluster_means = cereal.groupby('sugar_cluster')['sugars'].mean().sort_values()
sugar_level_by_cluster = {
    int(cluster): level
    for cluster, level in zip(sugar_cluster_means.index, ['low', 'medium', 'high'])
}
cereal['sugar_level'] = cereal['sugar_cluster'].map(sugar_level_by_cluster)
cereal.tail()
# Distribution of each manufacturer's cereals across the 3 sugar levels
sns.catplot(x="sugar_level", y="mfr_name", order=['low', 'medium', 'high'], data=cereal)
# Cereals in the highest sugar level
cereal[cereal['sugar_level'] == 'high']
# Cereals in the lowest sugar level
cereal[cereal['sugar_level'] == 'low']
# Look up one particular cereal (Apple Jacks, Froot Loops, etc.) and report its cluster
my_cereal = 'Apple Jacks'
# Select the matching row(s) once and read every reported field from that selection
my_cereal_rows = cereal[cereal['name'] == my_cereal]
my_cereal_index = my_cereal_rows.index.values
my_cereal_cluster = my_cereal_rows['sugar_cluster'].values
my_cereal_level = my_cereal_rows['sugar_level'].values
print(f"The data instance for {my_cereal} is {my_cereal_index}")
print(f"The sugar cluster for {my_cereal} is {my_cereal_cluster}")
print(f"The sugar level for {my_cereal} is {my_cereal_level}")
The data instance for Apple Jacks is [6]
The sugar cluster for Apple Jacks is [2]
The sugar level for Apple Jacks is ['high']
# Look at carbs per brand by plotting
mfr = sns.catplot(x='mfr_name', y='carbo', data=cereal)
plt.xticks(rotation=30)
# Get data for clustering: the carbo column is reshaped into a single-feature column
# vector of shape (n_samples, 1), the 2-D layout the estimator is fitted on
X = cereal.carbo.values.reshape(-1, 1)
# Form model and fit data.
# n_init is pinned explicitly so the number of random restarts does not depend on the
# installed scikit-learn version's default (some releases also warn when it is omitted).
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
# NOTE: fit() returns the fitted estimator itself, so y_km is the same object as km;
# the per-sample cluster labels live in km.labels_
y_km = km.fit(X)
# Print out cluster centers
print ("carb cluster centers")
print()
print (km.cluster_centers_ )
carb cluster centers
[[15.59677419]
[11.08333333]
[20.92857143]]
# Add a column to the dataframe holding each cereal's carb cluster label
cereal['carb_cluster'] = km.labels_
cereal.tail()
# Plot clusters
sns.catplot(x='carb_cluster', y='carbo', data=cereal)
# Determine which cluster number corresponds to the lowest, middle and highest level and
# create a new column in the dataframe using .map.
# KMeans numbers its clusters arbitrarily, so a fixed {0: low, 1: medium, 2: high} mapping
# mislabels the levels whenever the centers are not already in ascending order (for the carb
# fit recorded in this file, cluster 1 had the lowest center, not cluster 0). Instead, rank
# the clusters by their mean carbohydrate content and name them in ascending order.
carb_cluster_means = cereal.groupby('carb_cluster')['carbo'].mean().sort_values()
carb_level_by_cluster = {
    int(cluster): level
    for cluster, level in zip(carb_cluster_means.index, ['low', 'medium', 'high'])
}
cereal['carb_level'] = cereal['carb_cluster'].map(carb_level_by_cluster)
cereal.tail()
# Distribution of each manufacturer's cereals across the 3 carb levels
sns.catplot(x="carb_level", y="mfr_name", order=['low', 'medium', 'high'], data=cereal)
# Cereals in the highest carb level
cereal[cereal['carb_level'] == 'high']
# Cereals in the lowest carb level
cereal[cereal['carb_level'] == 'low']
# Look up one particular cereal (Apple Jacks, Froot Loops, etc.) and report its cluster
my_cereal = 'Apple Jacks'
# Select the matching row(s) once and read every reported field from that selection
my_cereal_rows = cereal[cereal['name'] == my_cereal]
my_cereal_index = my_cereal_rows.index.values
my_cereal_cluster = my_cereal_rows['carb_cluster'].values
my_cereal_level = my_cereal_rows['carb_level'].values
print(f"The data instance for {my_cereal} is {my_cereal_index}")
print(f"The carb cluster for {my_cereal} is {my_cereal_cluster}")
print(f"The carb level for {my_cereal} is {my_cereal_level}")
The data instance for Apple Jacks is [6]
The carb cluster for Apple Jacks is [1]
The carb level for Apple Jacks is ['medium']
# Both queries share the same low-sugar condition, so build that mask once
is_low_sugar = cereal['sugar_level'] == 'low'
# What cereals are high carbs and low sugar?
cereal[(cereal['carb_level'] == 'high') & is_low_sugar]
# What cereals are low carbs and low sugar?
cereal[(cereal['carb_level'] == 'low') & is_low_sugar]