# import libraries and K-Means function
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
from sklearn.cluster import KMeans
# Load the raw cereal data set and peek at its last rows.
dfi = pd.read_csv('cereal.csv')
dfi.tail()
# Keep only the features used in this analysis.  Selecting with [[...]] raises
# a KeyError for a misspelled column name (DataFrame(dfi, columns=...) would
# silently create an all-NaN column instead), and .copy() yields an
# independent frame that can be modified in place without touching dfi.
df = dfi[['name', 'mfr', 'sugars', 'carbo', 'rating']].copy()
df.head(10)
# Check whether any column contains missing (NaN) entries.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 77 non-null object
1 mfr 77 non-null object
2 sugars 77 non-null int64
3 carbo 77 non-null float64
4 rating 77 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.1+ KB
# Find cereals with a negative value for sugars or carbohydrates.
negative_mask = (df.sugars < 0) | (df.carbo < 0)
df[negative_mask]
# Address negative values: when 2 or fewer cereals are affected, drop those
# data instances; otherwise keep the rows and replace negative values with 0.
if negative_mask.sum() <= 2:
    df.drop(df[negative_mask].index, inplace=True)
else:
    df[['sugars', 'carbo']] = df[['sugars', 'carbo']].clip(lower=0)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 0 to 76
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 76 non-null object
1 mfr 76 non-null object
2 sugars 76 non-null int64
3 carbo 76 non-null float64
4 rating 76 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.6+ KB
# Use a white background with grid lines for the plots.
sns.set_style('whitegrid')
# Bar chart: number of products for each manufacturer code.
df['mfr'].value_counts().plot.bar(xlabel='Manufacturer', ylabel='Number of Products')
# Manufacturer code 'A' (American Home Foods Company) has only one cereal in
# the data, so that single sample is removed.
#
# df[df.mfr == 'A'] lists the rows of this manufacturer; the df.info() call
# below confirms the row count after the removal.
#
df.drop(index=df.loc[df['mfr'] == 'A'].index, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 75 entries, 0 to 76
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 75 non-null object
1 mfr 75 non-null object
2 sugars 75 non-null int64
3 carbo 75 non-null float64
4 rating 75 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.5+ KB
# Plots read better with the manufacturer's name than with a one-letter code
# such as "N" or "Q", so translate every code through a lookup table into a
# new 'Manufacturer' column.
#
# The original 'mfr' column is removed afterwards.
#
df['Manufacturer'] = df['mfr'].map({
    'G': 'General Mills',
    'K': 'Kellogs',
    'N': 'Nabisco',
    'P': 'Post',
    'Q': 'Quaker Oats',
    'R': 'Ralston Purina',
})
df.drop(columns='mfr', inplace=True)
df.head()
# Bar plot of rating per manufacturer, to see who has the highest rated cereals.
sns.barplot(data=df, x='Manufacturer', y='rating')
plt.xticks(rotation=-45)
plt.show()
# Finding: in general, Nabisco has the highest rated cereals.
# Which single cereal carries the highest rating?
max_rating = df['rating'].max()
df.loc[df['rating'] == max_rating]
# Finding: the highest rated single product is All-Bran with Extra Fiber by Kellogg's.
# Bar plot of sugars per manufacturer.
sns.barplot(data=df, x='Manufacturer', y='sugars')
plt.xticks(rotation=-45)
plt.show()
# Collect the numeric features used for clustering; the column order
# (sugars, carbo, rating) is also the column order of km.cluster_centers_.
df_num = df[['sugars', 'carbo', 'rating']]
df_array = df_num.to_numpy()
# NOTE(review): the features are clustered unscaled, so the feature with the
# widest numeric range influences the distances the most -- confirm that this
# is intended.
#
# Form the model, fit the data and print the cluster centers.  n_init is
# passed explicitly: 10 is the number of restarts scikit-learn uses for
# init='random' both before and after its default became 'auto', and stating
# it avoids the FutureWarning emitted by releases that announce that change.
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
km.fit(df_array)
print(km.cluster_centers_)
[[ 4.7 16.75 45.29564313]
[11.06060606 13.34848485 30.84906442]
[ 2.08333333 13.83333333 66.89096292]]
# Store the cluster label of every cereal in a new 'sugars_clusters' column.
df['sugars_clusters'] = km.labels_
df.head()
# Bar plot of sugars for each cluster label.
sns.barplot(data=df, x='sugars_clusters', y='sugars')
# Determine which cluster number corresponds to the lowest, middle and highest
# sugars level and store the level in a new column using .map.
#
# The cluster numbers assigned by KMeans are arbitrary, so the levels are
# derived from the sugars coordinate of the cluster centers (column 0, because
# the model was fitted on the columns sugars, carbo, rating in that order)
# rather than hard-coded: argsort lists the cluster numbers from the smallest
# to the largest sugars center.
#
sugars_order = np.argsort(km.cluster_centers_[:, 0])
df['sugars_content'] = df['sugars_clusters'].map(
    {int(label): level
     for label, level in zip(sugars_order, ['lowest', 'middle', 'highest'])}
)
#
df.head()
# How are cereals distributed among the 3 levels?
sns.countplot(x='sugars_content', data=df)
# Cereals whose sugars value equals the maximum in the data.
#
max_sugars = df['sugars'].max()
df.loc[df['sugars'] == max_sugars]
# Cereals whose sugars value equals the minimum in the data.
#
min_sugars = df['sugars'].min()
df.loc[df['sugars'] == min_sugars]
# Look up the cluster of one particular cereal (e.g. Apple Jacks, Froot Loops);
# the printed Series shows the row index together with the cluster label.
my_cereal = 'All-Bran with Extra Fiber'
print(f"The data instance and sugar cluster for {my_cereal} is ",
      df.loc[df['name'] == my_cereal, 'sugars_clusters'])
The data instance and sugar cluster for All-Bran with Extra Fiber is 3 2
Name: sugars_clusters, dtype: int32
# Bar plot of carbohydrates for each cluster label (the cluster labels are the
# same ones used for sugars; no new clustering is performed).
sns.barplot(data=df, x='sugars_clusters', y='carbo')
# Map the lowest, middle and highest carbohydrate level to a new column.
#
# The cluster numbers assigned by KMeans are arbitrary, so the levels are
# derived from the carbo coordinate of the cluster centers (column 1, because
# the model was fitted on the columns sugars, carbo, rating in that order)
# rather than hard-coded: argsort lists the cluster numbers from the smallest
# to the largest carbo center.
#
carbs_order = np.argsort(km.cluster_centers_[:, 1])
df['carbs_content'] = df['sugars_clusters'].map(
    {int(label): level
     for label, level in zip(carbs_order, ['lowest', 'middle', 'highest'])}
)
#
df.head()
# The distribution of cereals for carbohydrate content
sns.countplot(x='carbs_content', data=df)
# Cereals whose carbo value equals the maximum in the data.
#
max_carbs = df['carbo'].max()
df.loc[df['carbo'] == max_carbs]
# Cereals whose carbo value equals the minimum in the data.
#
min_carbs = df['carbo'].min()
df.loc[df['carbo'] == min_carbs]
# Scatter plot of carbo against sugars, colored by manufacturer.
sns.relplot(data=df, x='sugars', y='carbo', hue='Manufacturer')