# import libraries and K-Means function
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Read the raw cereal data from disk into a dataframe
df = pd.read_csv("cereal.csv")
df
# Narrow the dataframe down to the handful of features this analysis works with
wanted_columns = ["name", "mfr", "sugars", "carbo", "rating"]
df = df[wanted_columns]
df.head(10)
# .info() lists the non-null count of every column, which exposes NaN entries
df.info()
# Check to see if any cereals have negative values for sugars or carbohydrates.
# Rule: if 2 or fewer cereals are affected, drop those data instances;
# otherwise keep the rows and replace the negative values with 0.
neg_sugars = df[df.sugars < 0]
print(neg_sugars)
neg_carbo = df[df.carbo < 0]
print(neg_carbo)
# Row counts that would remain after filtering on each feature separately
x_sugars = df[df.sugars >= 0]
print(len(x_sugars))
x_carbo = df[df.carbo >= 0]
print(len(x_carbo))
# Address negative values: actually apply the rule to df. Previously the filtered
# frames above were only printed, so the negative rows stayed in the data.
# NOTE: removing or changing rows alters the input of the clustering steps further
# down, so cluster numbering can differ from earlier runs.
has_negative = (df.sugars < 0) | (df.carbo < 0)
if has_negative.sum() <= 2:
    # Few offenders: drop them. .copy() gives an independent frame to modify later.
    df = df[~has_negative].copy()
else:
    # Many offenders: keep every cereal but floor both features at 0.
    df = df.copy()
    df[["sugars", "carbo"]] = df[["sugars", "carbo"]].clip(lower=0)
print(len(df))
# Use a white background with grid lines for the plots
sns.set_style("whitegrid")
# Histogram of how many products each manufacturer code has
sns.histplot(x="mfr", data=df)
# Manufacturer code 'A' (American Home Foods Company) contributes only one cereal,
# so that data sample is removed. To inspect it first, look at df[df.mfr == 'A'].
# The filtered result has to be assigned back to df for the removal to stick.
is_not_manufacturer_a = df["mfr"] != "A"
df = df[is_not_manufacturer_a]
# Re-plot the counts to confirm that code 'A' is gone
sns.histplot(x="mfr", data=df)
# For plots we would like the full manufacturer name instead of just "N" or "Q".
mfr_names = {
    "G": "General Mills",
    "K": "Kelloggs",
    "N": "Nabisco",
    "P": "Post",
    "Q": "Quaker Oats",
    "R": "Ralston Purina",
}
# .map translates each code through the dictionary (codes that are not in the
# dictionary become NaN). .assign returns a new dataframe with the extra column
# instead of writing into a frame that was produced by filtering.
df = df.assign(mfr_name=df["mfr"].map(mfr_names))
# Then drop the 'mfr' column. DataFrame.drop returns a new dataframe and leaves the
# original untouched, so the result must be assigned back to df.
df = df.drop(columns=["mfr"])
# Plot every cereal's rating per manufacturer to see which manufacturer has the
# highest rated cereals
sns.relplot(x="mfr_name", y="rating", data=df, hue="mfr_name").set(title="Manufacturers and their ratings")
plt.xticks(rotation=45)
# Which cereal is rated highest? Compare against the maximum rating rather than a
# hand-picked threshold, so the answer stays correct if the data changes (ties for
# first place are all returned).
df[df.rating == df.rating.max()]
# NOTE: DataFrame.drop returns a new dataframe rather than modifying df in place, so the 'mfr' column only disappears if the result is assigned back (df = df.drop(columns=["mfr"]))
# Look at sugars per brand by plotting
sns.catplot(x="mfr_name", y="sugars", data=df, hue="mfr_name").set(title="Sugars by brand")
plt.xticks(rotation=45)
# Anchor the legend's upper-left corner just past the right edge of the axes so it
# no longer sits in front of data points.
plt.legend(loc="upper left", bbox_to_anchor=(1.02, 1))
# Get data for clustering: reshape the sugars values into a single-feature column,
# i.e. an array of shape (n, 1)
x = df.sugars.values
n = len(x)
x = np.reshape(x, (n, 1))
# Form model and fit data. n_init=10 is spelled out so the number of random
# restarts does not depend on the installed scikit-learn version's default.
km = KMeans(n_clusters=3, init="random", n_init=10, random_state=0)
y_km = km.fit(x)  # fit() returns the fitted estimator; the labels live in km.labels_
# KMeans numbers its clusters arbitrarily, so re-number them by ascending cluster
# center: afterwards 0 is always the lowest-sugar cluster and 2 the highest.
sugars_order = np.argsort(km.cluster_centers_.ravel())  # original ids, low -> high
sugars_rank = np.empty_like(sugars_order)
sugars_rank[sugars_order] = np.arange(len(sugars_order))  # original id -> rank
# Print the cluster centers in the same low -> high order as the new numbering
print("Cluster centers for sugars clusters:")
print(km.cluster_centers_[sugars_order])
# Add column to dataframe for these clusters, using the re-numbered ids
df["sugars_clusters"] = sugars_rank[km.labels_]
df.head()
# Plot clusters
sns.catplot(x="sugars_clusters", y="sugars", data=df)
# Because the ids are ordered by center, this mapping to lowest, middle and highest
# level holds regardless of how KMeans happened to number the clusters.
df["sugars_clusters_label"] = df["sugars_clusters"].map({0: "low sugar", 1: "medium sugar", 2: "high sugar"})
df.head(10)
# How are cereals distributed among the 3 levels? Count the cereals per level
# instead of printing a fixed statement, so the answer always reflects the data.
print("Number of cereals per sugar level:")
print(df["sugars_clusters_label"].value_counts())
# Which cereals have the highest sugar levels
df[df.sugars_clusters_label == "high sugar"]
# Which cereals have the lowest sugar levels
df[df.sugars_clusters_label == "low sugar"]
# If you eat a particular cereal like Apple Jacks, Froot Loops, etc. what cluster is it in?
my_cereal = 'Fruity Pebbles'
# Select only the cluster value(s) of the matching row(s); the result is empty when
# the name is not in the data.
cereal_cluster = df.loc[df["name"] == my_cereal, "sugars_clusters"]
if cereal_cluster.empty:
    print(f"{my_cereal} was not found in the data")
else:
    # Print the row index and the cluster id rather than the whole Series repr
    print(f"The data instance and sugar cluster for {my_cereal} is {cereal_cluster.index[0]} and {cereal_cluster.iloc[0]}")
# Look at carbohydrates per brand by plotting
sns.catplot(x="mfr_name", y="carbo", data=df, hue="mfr_name")
plt.xticks(rotation=45)
# Get data for clustering: reshape the carbo values into a single-feature column,
# i.e. an array of shape (n, 1)
X = df.carbo.values
n = len(X)
X = np.reshape(X, (n, 1))
# Form model and fit data. n_init=10 is spelled out so the number of random
# restarts does not depend on the installed scikit-learn version's default.
km = KMeans(n_clusters=3, init="random", n_init=10, random_state=0)
y_km = km.fit(X)  # fit() returns the fitted estimator; the labels live in km.labels_
# KMeans numbers its clusters arbitrarily, so re-number them by ascending cluster
# center: afterwards 0 is always the lowest-carb cluster and 2 the highest.
carbo_order = np.argsort(km.cluster_centers_.ravel())  # original ids, low -> high
carbo_rank = np.empty_like(carbo_order)
carbo_rank[carbo_order] = np.arange(len(carbo_order))  # original id -> rank
# Print the cluster centers in the same low -> high order as the new numbering
print("Clusters centers for carbo clusters:")
print(km.cluster_centers_[carbo_order])
# Add column to dataframe for these clusters, using the re-numbered ids
df["carbo_clusters"] = carbo_rank[km.labels_]
df.head(10)
# Plot clusters
sns.catplot(x="carbo_clusters", y="carbo", data=df)
# Because the ids are ordered by center, this mapping to low, medium and high
# holds regardless of how KMeans happened to number the clusters.
df["carbo_clusters_label"] = df["carbo_clusters"].map({0: "low carbs", 1: "medium carbs", 2: "high carbs"})
df.head(10)
# High carbs and low sugar
# Two separate filter expressions are evaluated independently and never
# intersected. To require both conditions at once, combine the boolean masks with
# & (each comparison needs its own parentheses).
df[(df.sugars_clusters_label == "low sugar") & (df.carbo_clusters_label == "high carbs")]
# Low carbs and low sugar
# Same pattern: one combined mask, so only cereals meeting both conditions remain.
df[(df.sugars_clusters_label == "low sugar") & (df.carbo_clusters_label == "low carbs")]