# import libraries and K-Means function
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cereal data set into a dataframe and preview the first rows
df = pd.read_csv('cereal.csv')
df.head()
# Build a working copy without the features we are not using
unused_features = [
    'name', 'type', 'calories', 'protein', 'fat', 'sodium',
    'fiber', 'potass', 'vitamins', 'shelf', 'weight', 'cups',
]
df_mod = df.drop(columns=unused_features)
df_mod.head()
# Use .info to check the non-null count and dtype of every remaining column
df_mod.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mfr 77 non-null object
1 carbo 77 non-null float64
2 sugars 77 non-null int64
3 rating 77 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 2.5+ KB
# Check to see if any cereals have negative values for sugars OR carbohydrates.
# If 2 or fewer cereals are affected, drop those data instances; otherwise
# replace the negative values with 0.
negative_mask = (df_mod.sugars < 0) | (df_mod.carbo < 0)
print(df_mod[negative_mask])
if negative_mask.sum() <= 2:
    # Drop by the row labels the mask selects, so this keeps working if the
    # row order of the CSV changes.
    df_mod = df_mod.drop(index=df_mod.index[negative_mask])
else:
    # clip(lower=0) replaces every negative entry with 0 and leaves the rest unchanged
    df_mod[['carbo', 'sugars']] = df_mod[['carbo', 'sugars']].clip(lower=0)
df_mod.head(58)
mfr carbo sugars rating
57 Q -1.0 -1 50.828392
# Use a white background with grid lines for all following plots
sns.set_style('whitegrid')
# Bar chart of how many products each manufacturer code has;
# tilt the tick labels so they do not overlap
sns.countplot(data=df_mod, x='mfr')
plt.xticks(rotation=-45)
# There is only one cereal from American Home Foods Company (code 'A'),
# so we drop that data sample.
is_mfr_a = df_mod.mfr == 'A'
print(df_mod[is_mfr_a])
# Drop by the row labels the mask selects rather than a hard-coded label,
# and assign the result back because .drop returns a new dataframe.
df_mod = df_mod.drop(index=df_mod.index[is_mfr_a])
df_mod.head()
mfr carbo sugars rating
43 A 16.0 3 54.850917
# For plots we would like the name of the manufacturer instead of just "N" or "Q"
print(df_mod.mfr)
# Translate each one-letter manufacturer code into its full name with .map;
# a code that is missing from this dictionary would become NaN.
mfr_full_names = {
    'G': 'General Mills',
    'K': "Kelloggs",
    'N': 'Nabisco',
    'P': "Post",
    'Q': "Quaker Oats",
    'R': "Ralston Purina",
}
df_mod['mfr_name'] = df_mod['mfr'].map(mfr_full_names)
df_mod.head()
# Then drop the 'mfr' column, which 'mfr_name' now replaces
df_mod = df_mod.drop(columns=['mfr'])
df_mod.head()
0 N
1 Q
2 K
3 K
4 R
..
72 G
73 G
74 R
75 G
76 G
Name: mfr, Length: 75, dtype: object
# Plot to see which manufacturer has highest rated cereals: show the rating of
# every cereal grouped by manufacturer (a count plot would only show how many
# products each manufacturer has and would ignore the ratings).
sns.catplot(x='mfr_name', y='rating', data=df_mod)
plt.xticks(rotation=-45)
# Which cereal is rated highest?
max_rating = df_mod.rating.max()
print(df_mod[df_mod.rating == max_rating])
carbo sugars rating mfr_name
3 8.0 0 93.704912 Kelloggs
# Compare sugar content across brands: one point per cereal, grouped by
# manufacturer name, with tilted tick labels so the names stay readable
sns.catplot(data=df_mod, x='mfr_name', y='sugars')
plt.xticks(rotation=-45)
# Get data for clustering. K-Means expects a 2-D input of shape
# (n_samples, n_features), so select 'sugars' as a one-column dataframe.
sugars_data = df_mod[['sugars']]

# Form model, fit data and print out cluster centers.
# random_state fixes the centroid initialisation so the cluster numbers are
# the same on every run.
sugars_model = KMeans(n_clusters=3, n_init=10, random_state=0)
sugars_model.fit(sugars_data)
print(sugars_model.cluster_centers_)

# Add a column to the dataframe holding the cluster number of every cereal.
# The numeric 'sugars' column is kept unchanged.
df_mod['sugars_clusters'] = sugars_model.labels_

# Plot clusters
sns.catplot(x='sugars_clusters', y='sugars', data=df_mod)

# Determine which cluster number corresponds to the lowest, middle and highest
# level by ranking the cluster centers, then create a new column using .map
sugar_lvls = ['low', 'medium', 'high']
centers = sugars_model.cluster_centers_.ravel()
cluster_to_level = {
    int(cluster): sugar_lvls[rank]
    for rank, cluster in enumerate(np.argsort(centers))
}
df_mod['sugar_level'] = df_mod['sugars_clusters'].map(cluster_to_level)

# How are cereals distributed among the 3 levels?
print(df_mod['sugar_level'].value_counts())

# Which cereals have the highest sugar levels?
# df_mod no longer has the 'name' column, but it still shares its row labels
# with df, so the names are looked up there.
print(df.loc[df_mod.index[df_mod.sugar_level == 'high'], 'name'])

# Which cereals have the lowest sugar levels?
print(df.loc[df_mod.index[df_mod.sugar_level == 'low'], 'name'])

# If you eat a particular cereal like Apple Jacks, Froot Loops, etc. what cluster is it in?
my_cereal = 'Apple Jacks'
# Find the cereal by name in df, keep only rows that were not dropped from
# df_mod, and read the cluster number from df_mod (df has no cluster column).
cereal_rows = df.index[df['name'] == my_cereal].intersection(df_mod.index)
print(f"The data instance and sugar cluster for {my_cereal} is ",
      df_mod.loc[cereal_rows, 'sugars_clusters'])