# Import libraries and the K-Means estimator
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cereal data set from disk into a DataFrame, then preview the
# first rows (the preview is only displayed in an interactive session).
df = pd.read_csv(filepath_or_buffer='cereal.csv')
df.head()
# Drop the features we aren't using (alternatively, create a new dataframe
# containing only the features we want). The result is kept in df_mod; df
# itself is left unchanged because drop() returns a new DataFrame.
# NOTE(review): the column list on the next line is cut off mid-string in this
# copy of the file (it ends at 'vit), so the statement is not valid Python as
# shown -- restore the remaining column names and the closing "])" before running.
df_mod = df.drop(columns =['name','type','calories','protein','fat','sodium','fiber','potass','vit
df_mod.head()
# Check to see if any instances have NaN for entries using .info
# Check whether any cereals have negative values for sugars or carbohydrates; if two or fewer cereals do,
# drop those data instances; otherwise replace the negative values with 0
# Address negative values
# Use seaborn's 'whitegrid' style (white background with grid lines) for the plots
sns.set_style(style='whitegrid')
# Plot number of products per manufacturer
# There is only one cereal from American Home Foods Company so we drop that data sample
#
# Type df[df.mfr == 'A'] and it will give you all data from this manufacturer.
# Make sure you set df = drop command; use command to make sure it was deleted
#
# For plots we would like the name of manufacturer instead of just "N" or "Q"
# Use .map or .apply
#
# Then drop 'mfr' column
#
# Plot to see which manufacturer has highest rated cereals
# Which cereal is rated highest?
# Look at sugars per brand by plotting
# Get data for clustering
# Form model, fit data and print out cluster centers
# Add a column to the dataframe for these clusters, say sugars_clusters
# Plot clusters
# Determine which cluster number corresponds to lowest, middle and highest level and create a new
# column in dataframe using .map
#
#
# How are cereals distributed among the 3 levels?
# Which cereals have the highest sugar levels
#
# Which cereals have the lowest sugar levels
#
# Look up which sugar cluster a particular cereal (e.g. Apple Jacks,
# Froot Loops, ...) was assigned to. Relies on df already having the
# 'sugars_clusters' column added by the clustering step.
my_cereal = 'Apple Jacks'
print(
    f"The data instance and sugar cluster for {my_cereal} is ",
    df.loc[df['name'] == my_cereal, 'sugars_clusters'],
)