# import libraries and K-Means function
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
#Import models
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Load the cereal data set into a dataframe
df = pd.read_csv('cereal.csv')
df.head()
# Drop the features we aren't using, in a single call instead of one call per
# column. The analysis below uses name, manufacturer, sugars, carbohydrates
# and rating.
# NOTE(review): 'vitamins' is not in the list of wanted features above but it
# is kept here, exactly as in the original column selection - confirm whether
# it should be dropped as well.
unused_columns = ['type', 'calories', 'protein', 'fat', 'sodium',
                  'fiber', 'potass', 'shelf', 'weight', 'cups']
df = df.drop(columns=unused_columns)
df.head()
# Check to see if any instances have NaN for entries using .info
df.info()
# Output of df.info():
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 77 entries, 0 to 76
# Data columns (total 6 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   name      77 non-null     object
#  1   mfr       77 non-null     object
#  2   carbo     77 non-null     float64
#  3   sugars    77 non-null     int64
#  4   vitamins  77 non-null     int64
#  5   rating    77 non-null     float64
# dtypes: float64(2), int64(2), object(2)
# memory usage: 3.7+ KB
# Check whether any cereals have negative values for sugars or carbohydrates.
# Rule: if 2 or fewer cereals are affected, drop those data instances;
# otherwise replace the negative values with 0.
negative_sugars = df.sugars < 0
negative_carbo = df.carbo < 0
print("Cereals with negative sugars:", int(negative_sugars.sum()))
print("Cereals with negative carbohydrates:", int(negative_carbo.sum()))
# Address negative values
# The original notebook recorded that no cereals had negative values for
# sugars and carbs. The rule is applied in code anyway so that the result does
# not depend on a manual check; it is a no-op when nothing is negative.
has_negative = negative_sugars | negative_carbo
n_negative = int(has_negative.sum())
if 0 < n_negative <= 2:
    # Drop by index label so the labels of the remaining rows are unchanged.
    df = df.drop(index=df.index[has_negative])
elif n_negative > 2:
    df[['sugars', 'carbo']] = df[['sugars', 'carbo']].clip(lower=0)
# Use a white grid as the background of every plot
sns.set_style('whitegrid')
# Bar chart of the number of products each manufacturer has in the data
sns.countplot(data=df, x='mfr')
# Tilt the tick labels so they do not overlap
plt.xticks(rotation=-45)
# There is only one cereal from manufacturer 'A' (American Home Food Products
# in the name mapping used later in this file), so we drop that data sample.
#
# Select the rows by manufacturer code instead of a hard-coded row label, so
# the step keeps working if the row order or the index of the file changes.
#
is_mfr_a = df.mfr == 'A'
df[is_mfr_a]
df = df.drop(index=df.index[is_mfr_a])
# Make sure it was deleted: the count printed below must be 0.
print("Cereals remaining from manufacturer 'A':", int((df.mfr == 'A').sum()))
df.head()
# Plots read better with the full manufacturer name than with a one-letter
# code such as "N" or "Q", so translate the codes with a lookup table.
manufacturer_names = {
    'A': 'American Home Food Products',
    'G': 'General Mills',
    'K': 'Kelloggs',
    'N': 'Nabisco',
    'P': 'Post',
    'Q': 'Quaker Oats',
    'R': 'Ralston Purina',
}
df['mfr_name'] = df['mfr'].map(manufacturer_names)
# The one-letter code is no longer needed once the full name is available.
df = df.drop('mfr', axis=1)
df.head()
# Plot ratings per manufacturer to see which one has the highest rated cereal
sns.catplot(x='mfr_name', y='rating', data=df)
plt.xticks(rotation=-45)
# Which cereal is rated highest?
# Compare against the column maximum rather than a hard-coded rating value:
# equality with a typed-in float literal only matches if the stored value has
# exactly that representation, and a fixed threshold breaks if the data change.
df[df.rating == df.rating.max()]
# White grid background for the Seaborn plots
sns.set_style('whitegrid')
# One point per cereal: sugar content grouped by manufacturer
sns.catplot(data=df, x='mfr_name', y='sugars')
# Tilt the manufacturer names so they stay readable
plt.xticks(rotation=-45)
# Build the clustering input: the sugars column as an (n, 1) column vector,
# because the estimator is fitted on a 2-D array with one row per sample.
X = df['sugars'].values.reshape(-1, 1)
n = X.shape[0]
# Fit a 3-cluster K-Means model with a fixed seed and show the cluster centers
km = KMeans(n_clusters=3, init='random', random_state=0)
y_km = km.fit(X)
print("Cluster centers for sugars clusters")
print(km.cluster_centers_)
# Output:
# Cluster centers for sugars clusters
# [[ 1.84      ]
#  [ 6.84      ]
#  [12.03846154]]
# Add a column holding the sugars cluster number of every cereal
df['sugars_clusters'] = km.labels_
df.head()
# Plot clusters
sns.catplot(x='sugars_clusters', y='sugars', data=df)
plt.xticks(rotation=-45)
# Determine which cluster number corresponds to the lowest, middle and highest
# level and create a new column in the dataframe using .map
#
# K-Means numbers its clusters arbitrarily, so the mapping is derived from the
# fitted centers (sorted ascending) instead of assuming 0 < 1 < 2.
sugars_center_order = np.argsort(km.cluster_centers_.ravel())
sugars_level_names = ['low sugar', 'medium sugar', 'high sugar']
sugars_cluster_to_level = {
    int(cluster): level
    for cluster, level in zip(sugars_center_order, sugars_level_names)
}
df['sugars_levels'] = df['sugars_clusters'].map(sugars_cluster_to_level)
df.head()
# How are cereals distributed among the 3 levels?
df['sugars_levels'].value_counts()
# Observation recorded in the original notebook: the cereals are spread fairly
# evenly across the low, medium and high sugar levels.
#
# K-Means numbers its clusters arbitrarily, so the highest- and lowest-sugar
# clusters are identified from the mean sugar content of each cluster rather
# than from a hard-coded cluster number.
sugar_cluster_means = df.groupby('sugars_clusters')['sugars'].mean()
# Which cereals have the highest sugar levels
df[df.sugars_clusters == sugar_cluster_means.idxmax()]
# Which cereals have the lowest sugar levels
df[df.sugars_clusters == sugar_cluster_means.idxmin()]
# Look up the sugar cluster of one particular cereal (Apple Jacks, Froot
# Loops, etc.) by name.
my_cereal = 'Apple Jacks'
cereal_cluster = df.loc[df['name'] == my_cereal, 'sugars_clusters']
print(f"The data instance and sugar cluster for {my_cereal} is ", cereal_cluster)
# Output:
# The data instance and sugar cluster for Apple Jacks is  6    2
# Name: sugars_clusters, dtype: int32
# One point per cereal: carbohydrate content grouped by manufacturer
sns.catplot(data=df, x='mfr_name', y='carbo')
plt.xticks(rotation=-45)
# Clustering input: the carbo column as an (n, 1) column vector
X = df['carbo'].values.reshape(-1, 1)
n = X.shape[0]
# Fit a 3-cluster K-Means model with a fixed seed and show the cluster centers
km = KMeans(n_clusters=3, init='random', random_state=0)
y_km = km.fit(X)
print("Cluster centers for carbohydrates clusters")
print()
print(km.cluster_centers_)
# Output:
# Cluster centers for carbohydrates clusters
#
# [[19.52173913]
#  [ 8.8       ]
#  [13.86842105]]
# Add a column holding the carbohydrates cluster number of every cereal
df['carbo_clusters'] = km.labels_
df.head()
# Plot clusters
sns.catplot(x='carbo_clusters', y='carbo', data=df)
plt.xticks(rotation=-45)
# Translate cluster numbers into low / medium / high carb levels.
# K-Means numbers its clusters arbitrarily: in the recorded run the centers
# were about 19.5, 8.8 and 13.9 for clusters 0, 1 and 2, so a fixed
# 0 -> low, 1 -> medium, 2 -> high mapping mislabels every cluster. The
# mapping is therefore derived from the fitted centers, sorted ascending.
carbo_center_order = np.argsort(km.cluster_centers_.ravel())
carbo_level_names = ['low carb', 'medium carb', 'high carb']
carbo_cluster_to_level = {
    int(cluster): level
    for cluster, level in zip(carbo_center_order, carbo_level_names)
}
df['carbo_levels'] = df['carbo_clusters'].map(carbo_cluster_to_level)
df.head()
# Cluster numbers are assigned arbitrarily by K-Means, so the lowest-sugar and
# the highest-/lowest-carb clusters are identified from the mean value of each
# cluster instead of hard-coded cluster numbers.
# NOTE(review): the original filtered on carbo cluster 2 and then 0, which the
# level mapping earlier in this file treats as 'high carb' and 'low carb';
# that intent is assumed here - confirm.
sugar_means_by_cluster = df.groupby('sugars_clusters')['sugars'].mean()
carbo_means_by_cluster = df.groupby('carbo_clusters')['carbo'].mean()
lowest_sugar_cluster = sugar_means_by_cluster.idxmin()
highest_carb_cluster = carbo_means_by_cluster.idxmax()
lowest_carb_cluster = carbo_means_by_cluster.idxmin()
# Lowest-sugar cereals, then highest-carb cereals
df[df.sugars_clusters == lowest_sugar_cluster]
df[df.carbo_clusters == highest_carb_cluster]
# Lowest-sugar cereals, then lowest-carb cereals
df[df.sugars_clusters == lowest_sugar_cluster]
df[df.carbo_clusters == lowest_carb_cluster]