# Python Project 5 - K-Means Clustering

# import libraries and K-Means function
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

# Load the cereal data set from CSV into a dataframe and display it.
df = pd.read_csv('cereal.csv')
df

# Keep only the features used in this project (all other columns are
# discarded), then preview the first ten rows.
df = df[
    [
        "name",
        "mfr",
        "sugars",
        "carbo",
        "rating",
    ]
]
df.head(10)

# Use .info() to check for NaN entries: a column whose non-null count is
# below the number of rows contains missing values.
df.info()

# Look for cereals with negative sugars or carbohydrates. Rule for handling
# them: if 2 or fewer cereals are affected, drop those data instances;
# otherwise replace the negative values with 0.
neg_sugars = df.loc[df["sugars"] < 0]
print(neg_sugars)
neg_carbo = df.loc[df["carbo"] < 0]
print(neg_carbo)

# Address negative values
# Count how many cereals remain once negative entries are excluded.
x_sugars = df[df.sugars >= 0]
print(len(x_sugars))
x_carbo = df[df.carbo >= 0]
print(len(x_carbo))

# The filtered frames above are only counted; df itself must be updated or
# the negative entries stay in the data used for plotting and clustering.
# Rule: with 2 or fewer affected cereals drop those rows, otherwise replace
# the negative values with 0.
has_negative = (df["sugars"] < 0) | (df["carbo"] < 0)
if has_negative.sum() <= 2:
    df = df[~has_negative].copy()
else:
    df = df.copy()
    df[["sugars", "carbo"]] = df[["sugars", "carbo"]].clip(lower=0)

# Give every following plot seaborn's white background with grid lines.
sns.set_style("whitegrid")

# Histogram of how many products each manufacturer (one-letter code) has.
sns.histplot(x="mfr", data=df)

# There is only one cereal from American Home Foods Company (code "A"), so
# that data sample is dropped. df[df.mfr == "A"] lists all rows from this
# manufacturer.
# The filtered result is assigned back to df. .copy() gives df its own data,
# so columns added to it later do not raise pandas' SettingWithCopyWarning.
df = df[df.mfr != "A"].copy()  # "!=" is one operator; "! =" is a syntax error
# Re-plot to confirm that manufacturer "A" is gone.
sns.histplot(data=df, x="mfr")

# For plots we want the manufacturer's name instead of just "N" or "Q".
mfr_names = {
    "G": "General Mills",
    "K": "Kelloggs",
    "N": "Nabisco",
    "P": "Post",
    "Q": "Quaker Oats",
    "R": "Ralston Purina",
}
# Add the readable name with .map, then drop the 'mfr' code column.
# DataFrame.drop returns a NEW frame and leaves the original untouched, so
# the result must be assigned back to df for the column to disappear.
df = df.assign(mfr_name=df["mfr"].map(mfr_names)).drop(columns=["mfr"])
df

# Scatter the ratings per manufacturer to see who makes the highest rated
# cereals; x tick labels are rotated 45 degrees.
(
    sns.relplot(data=df, x="mfr_name", y="rating", hue="mfr_name")
    .set(title="Manufacturers and their ratings")
)
plt.xticks(rotation=45)

# Which cereal is rated highest?
# Select on the maximum rating itself rather than a hand-picked cutoff, so
# the answer stays correct if the data changes.
# NOTE: DataFrame.drop returns a new frame instead of modifying df, so the
# 'mfr' column still shows up here unless the result of the earlier drop was
# assigned back to df.
df[df["rating"] == df["rating"].max()]

# Look at sugars per brand by plotting
sns.catplot(x="mfr_name", y="sugars", data=df, hue="mfr_name").set(title="Sugars by brand")
plt.xticks(rotation=45)
# Anchor the legend's upper-left corner just outside the right edge of the
# axes so that it no longer sits in front of any data points.
plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0)

# Collect the sugar values as an (n, 1) array: one row per cereal and a
# single feature column, ready to be passed to KMeans.fit.
x = df["sugars"].to_numpy()
n = len(x)
x = x.reshape(n, 1)

# Build a 3-cluster K-Means model (random initial centers, fixed seed), fit
# it to the sugar values and print the resulting cluster centers.
km = KMeans(
    n_clusters=3,
    init="random",
    random_state=0,
)
y_km = km.fit(x)  # fit() returns the fitted model itself
print("Cluster centers for sugars clusters:")
print(km.cluster_centers_)

# Record each cereal's sugar cluster id in a new sugars_clusters column and
# preview the result.
df["sugars_clusters"] = km.labels_
df.head()

# Plot the sugar values grouped by cluster id.
sns.catplot(data=df, x="sugars_clusters", y="sugars")

# Determine which cluster number corresponds to the lowest, middle and
# highest sugar level and store the level name in a new column using .map.
# K-Means numbers its clusters arbitrarily, so the order is derived from each
# cluster's mean sugar content instead of assuming 0 < 1 < 2.
sugar_order = df.groupby("sugars_clusters")["sugars"].mean().sort_values().index
sugar_levels = ["low sugar", "medium sugar", "high sugar"]
df["sugars_clusters_label"] = df["sugars_clusters"].map(
    dict(zip(sugar_order, sugar_levels))
)
df.head(10)

# How are cereals distributed among the 3 levels?
# Print the number of cereals in each sugar level so the answer comes from
# the data rather than from a fixed statement.
print(df["sugars_clusters_label"].value_counts())

# Which cereals have the highest sugar levels
# Select by the named level: raw K-Means cluster ids carry no inherent order.
df[df["sugars_clusters_label"] == "high sugar"]

# Which cereals have the lowest sugar levels
# Select by the named level: raw K-Means cluster ids carry no inherent order.
df[df["sugars_clusters_label"] == "low sugar"]

# Look up the sugar cluster of a particular cereal you eat (Apple Jacks,
# Froot Loops, etc.). The printed Series shows the row index and cluster id.
my_cereal = 'Fruity Pebbles'
print(
    f"The data instance and sugar cluster for {my_cereal} is "
    f"{df.loc[df['name'] == my_cereal, 'sugars_clusters']}"
)

# Plot carbohydrates per manufacturer, colored by manufacturer, with the
# x tick labels rotated 45 degrees.
sns.catplot(data=df, x="mfr_name", y="carbo", hue="mfr_name")
plt.xticks(rotation=45)

# Collect the carbohydrate values as an (n, 1) array, fit a 3-cluster
# K-Means model (random initial centers, fixed seed) and print its centers.
X = df["carbo"].to_numpy()
n = len(X)
X = X.reshape(n, 1)
km = KMeans(
    n_clusters=3,
    init="random",
    random_state=0,
)
y_km = km.fit(X)  # fit() returns the fitted model itself
print("Clusters centers for carbo clusters:")
print(km.cluster_centers_)

# Record each cereal's carbohydrate cluster id in a new carbo_clusters column
# and preview the first ten rows.
df["carbo_clusters"] = km.labels_
df.head(10)

# Plot the carbohydrate values grouped by cluster id.
sns.catplot(data=df, x="carbo_clusters", y="carbo")

# Name each carbohydrate cluster as low, medium or high in a new column.
# K-Means numbers its clusters arbitrarily, so the order is derived from each
# cluster's mean carbohydrate content instead of assuming 0 < 1 < 2.
carbo_order = df.groupby("carbo_clusters")["carbo"].mean().sort_values().index
carbo_levels = ["low carbs", "medium carbs", "high carbs"]
df["carbo_clusters_label"] = df["carbo_clusters"].map(
    dict(zip(carbo_order, carbo_levels))
)
df.head(10)

# High carbs and low sugar
# Two separate selections in one cell are evaluated independently and only
# the last one is displayed. To require both conditions at once, combine the
# boolean masks element-wise with & (each comparison in its own parentheses).
df[
    (df["sugars_clusters_label"] == "low sugar")
    & (df["carbo_clusters_label"] == "high carbs")
]

# Low carbs and low sugar
# Two separate selections in one cell are evaluated independently and only
# the last one (low carbs, any sugar level) is displayed, which is why medium
# sugar cereals appeared. Combine both boolean masks element-wise with &
# (each comparison in its own parentheses) to require both conditions.
df[
    (df["sugars_clusters_label"] == "low sugar")
    & (df["carbo_clusters_label"] == "low carbs")
]