# import libraries and K-Means function
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import pandas as pd
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
# Load the cereal data set into a dataframe
df = pd.read_csv('cereal.csv')
df
# Only name, manufacturer, carbohydrates, sugars and rating are used below,
# so every other column is dropped
unused_columns = [
    'type', 'calories', 'protein', 'fat', 'sodium', 'fiber',
    'potass', 'vitamins', 'shelf', 'weight', 'cups',
]
df = df.drop(unused_columns, axis='columns')
df.head()
# Use .info to check whether any column contains NaN entries
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 77 non-null object
1 mfr 77 non-null object
2 carbo 77 non-null float64
3 sugars 77 non-null int64
4 rating 77 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.1+ KB
# Check to see if any cereals have negative values for sugars or carbohydrates.
# Rule: if 2 or fewer cereals are affected, drop those data instances;
# otherwise replace the negative values with 0.
df[df.sugars < 0]
df[df.carbo < 0]
# Address negative values.
# The affected rows are located by condition rather than by a hard-coded index
# label, so this step does not depend on row order and can be re-run safely.
# (Original note: this dropped Quaker Oatmeal, row 57, which had negative
# sugars and carbs.)
negative_rows = df.index[(df.sugars < 0) | (df.carbo < 0)]
if len(negative_rows) <= 2:
    df = df.drop(index=negative_rows)
else:
    df[['sugars', 'carbo']] = df[['sugars', 'carbo']].clip(lower=0)
print(type(df.sugars))
df.sugars
<class 'pandas.core.series.Series'>
# Use a white background with grid lines for the plots
sns.set_style('whitegrid')
# Count plot of the number of products per manufacturer
# (seaborn count plot with the manufacturer code on the x axis)
sns.countplot(data=df, x='mfr')
# Manufacturer K (Kelloggs) has the most cereals
# There is only one cereal from American Home Foods Company (mfr == 'A'),
# so we drop that data sample.
#
# df[df.mfr == 'A'] shows all data from this manufacturer.
df[df.mfr == 'A']
# Drop by condition instead of by a hard-coded index label so the step does
# not depend on row order and can be re-run without raising KeyError.
df = df.drop(index=df.index[df.mfr == 'A'])
# Re-plot to make sure the manufacturer was deleted
sns.countplot(x='mfr', data=df)
# For plots we would like the name of manufacturer instead of just "N" or "Q"
# Use .map or .apply
#
# Then drop 'mfr' column
#
# One-letter manufacturer code -> full manufacturer name used as plot label.
_MANUFACTURER_NAMES = {
    'N': 'Nabisco',
    'Q': 'Quaker Oats',
    'K': 'Kelloggs',
    'R': 'Ralston Purina',
    'G': 'General Mills',
    'P': 'Post',
}


def name(mfr):
    """Return the full manufacturer name for a one-letter manufacturer code.

    Args:
        mfr: Manufacturer code as stored in the 'mfr' column
            ('N', 'Q', 'K', 'R', 'G' or 'P').

    Returns:
        The manufacturer's full name, or the string "NA" when the code is
        not one of the known codes.
    """
    return _MANUFACTURER_NAMES.get(mfr, "NA")
# Add the full manufacturer name as a new column and remove the coded 'mfr'
# column it was derived from
df = df.assign(Manufacturer=df['mfr'].map(name)).drop(columns='mfr')
df.head()
# Plot ratings per manufacturer to see who has the highest rated cereals
sns.catplot(data=df, x='Manufacturer', y='rating')
# Tilt the manufacturer names
plt.xticks(rotation=-45)
# Which cereal is rated highest?
column = df['rating']
max_value = column.max()
print(max_value)
# Filter on the computed maximum instead of a hand-copied literal, so the
# lookup stays correct if the data (or the rows kept above) change.
df[df.rating == max_value]
# All-Bran with Extra Fiber is the cereal with the highest rating
93.704912
# Plot sugar content per manufacturer
sns.catplot(data=df, x='Manufacturer', y='sugars')
# Tilt the manufacturer names
plt.xticks(rotation=-45)
# Get data for clustering: the sugars column is reshaped into an (n, 1)
# array, i.e. one row per cereal and a single feature column.
X = df.sugars.values
n = len(X)
X = np.reshape(X, (n, 1))
# Form model and fit data.
# n_init is given explicitly: 10 restarts is what the implicit default
# resolves to for init='random', and spelling it out avoids the FutureWarning
# newer scikit-learn releases emit when it is left unset.
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
# fit() returns the estimator itself, so its result is not stored; the
# cluster labels are read from km.labels_ afterwards.
km.fit(X)
# Print out cluster centers
print("Cluster centers for sugar clusters")
print()
print(km.cluster_centers_)
Cluster centers for sugar clusters
[[ 1.95833333]
[ 6.84 ]
[12.03846154]]
# Store each cereal's cluster label in a new sugar_clusters column
df = df.assign(sugar_clusters=km.labels_)
df.head()
# Plot sugar content per cluster
sns.catplot(data=df, x='sugar_clusters', y='sugars')
plt.xticks(rotation=45)
# Determine which cluster number corresponds to the lowest, middle and highest
# level and create a new column in the dataframe using .map.
#
# KMeans numbers its clusters arbitrarily, so the level of each cluster is
# derived from its mean sugar content instead of being hard-coded: clusters
# are sorted by mean and named low / mid / high in that order.
sugar_means = df.groupby('sugar_clusters')['sugars'].mean().sort_values()
sugar_levels = dict(
    zip(sugar_means.index, ['low sugar', 'mid sugar', 'high sugar'])
)
df['sugar_clusters_levels'] = df['sugar_clusters'].map(sugar_levels)
#
df.head()
# Show how each manufacturer's cereals are spread over the 3 sugar levels
sns.catplot(data=df, x='Manufacturer', y='sugar_clusters_levels')
plt.xticks(rotation=-100)
# Which cereals have the highest sugar levels
# (filter on the level name rather than a raw cluster number, because the
# numbers KMeans assigns to clusters are arbitrary)
df[df.sugar_clusters_levels == 'high sugar']
# Which cereals have the lowest sugar levels
#
df[df.sugar_clusters_levels == 'low sugar']
# Look up the sugar cluster of one particular cereal (Apple Jacks, Froot Loops, etc.)
my_cereal = 'Apple Jacks'
cereal_cluster = df.loc[df['name'] == my_cereal, 'sugar_clusters']
print(f"The data instance and sugar cluster for {my_cereal} is ", cereal_cluster)
The data instance and sugar cluster for Apple Jacks is 6 2
Name: sugar_clusters, dtype: int32
# Get data for clustering carbs: the carbo column is reshaped into an (n, 1)
# array, i.e. one row per cereal and a single feature column.
X = df.carbo.values
n = len(X)
X = np.reshape(X, (n, 1))
# Form model and fit data.
# n_init is given explicitly: 10 restarts is what the implicit default
# resolves to for init='random', and spelling it out avoids the FutureWarning
# newer scikit-learn releases emit when it is left unset.
km = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
# fit() returns the estimator itself, so its result is not stored; the
# cluster labels are read from km.labels_ afterwards.
km.fit(X)
# Print out cluster centers
print("Cluster centers for carb clusters")
print()
print(km.cluster_centers_)
Cluster centers for carb clusters
[[15.59677419]
[11.08333333]
[20.92857143]]
# Store each cereal's cluster label in a new carbo_clusters column
df = df.assign(carbo_clusters=km.labels_)
df.head()
# Plot carbohydrate content per cluster
sns.catplot(data=df, x='carbo_clusters', y='carbo')
plt.xticks(rotation=45)
# Determine which cluster number corresponds to the lowest, middle and highest
# level and create a new column in the dataframe using .map.
#
# KMeans numbers its clusters arbitrarily, so the level of each cluster is
# derived from its mean carbohydrate content instead of being hard-coded:
# clusters are sorted by mean and named low / mid / high in that order.
carbo_means = df.groupby('carbo_clusters')['carbo'].mean().sort_values()
carbo_levels = dict(
    zip(carbo_means.index, ['low carbs', 'mid carbs', 'high carbs'])
)
df['carbo_clusters_levels'] = df['carbo_clusters'].map(carbo_levels)
#
df.head()
# Show how each manufacturer's cereals are spread over the 3 carb levels
sns.catplot(data=df, x='Manufacturer', y='carbo_clusters_levels')
plt.xticks(rotation=-100)
# The filters below use the level names rather than raw cluster numbers,
# because the numbers KMeans assigns to clusters are arbitrary.
# Which cereals have the highest carb levels
#
df[df.carbo_clusters_levels == 'high carbs']
# Which cereals have the lowest carb levels
#
df[df.carbo_clusters_levels == 'low carbs']
# Cereals that are high carbs and low sugar
df[(df.carbo_clusters_levels == 'high carbs') & (df.sugar_clusters_levels == 'low sugar')]
# Cereals that are low carbs and low sugar
df[(df.carbo_clusters_levels == 'low carbs') & (df.sugar_clusters_levels == 'low sugar')]