# Final Project Data 601

# Import libraries needed to clean data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import figure

# Load the cosmetics data set retrieved from
# https://www.kaggle.com/kingabzpro/cosmetics-datasets
# header=None makes pandas treat every line of the file as data, so the
# column names are assigned by hand right after the read.
df_C1 = pd.read_csv('cosmetics.csv', header=None)
df_C1.columns = [
    "Label",
    "Brand",
    "Name",
    "Price",
    "Rank",
    "Ingredients",
    "Combination",
    "Dry",
    "Normal",
    "Oily",
    "Sensitive",
]

# Preview the data
df_C1.head()

# Find the length of the data frame to see the scale of data we are working with
len(df_C1)

# Drop the row labelled 0, i.e. the first line read from the file
# (presumably the file's own header line -- confirm against the CSV).
df_C1 = df_C1.drop(index=0)

# Remove products whose Name contains any of these patterns. Rows with a
# missing Name are kept (na=False counts them as "no match").
# NOTE(review): "Mini" is a plain substring match, so it also hits names such
# as "...Minimizing..." -- confirm that this is intended.
for name_pattern in ("Limited Edition", "Limited-Edition", "Mini"):
    df_C1 = df_C1[~df_C1["Name"].str.contains(name_pattern, na=False)]

# Inspect the column dtypes
df_C1.dtypes

# Convert Price to integers by going through str first, exactly as read.
df_C1 = df_C1.assign(Price=df_C1["Price"].astype(str).astype(int))

# Create a data set of just moisturizers from the original data set, using
# the Label column. Rows with a missing Label are excluded.
is_moisturizer = df_C1["Label"].str.contains("Moisturizer", na=False)
df_M = df_C1[is_moisturizer]

# Sort by brand to see whether any product appears twice
df_M = df_M.sort_values(by="Brand")
df_M

# Sort the moisturizers by Rank to find the top and bottom rated products.
# NOTE(review): only Price is cast to a number in the code shown, so this
# presumably orders Rank as text -- confirm.
df_M = df_M.sort_values(by="Rank")
df_M

# Plot the number of moisturizers at each rank
sns.set_theme(style="ticks", color_codes=True)
moisturizer_counts = sns.catplot(
    data=df_M,
    x="Rank",
    kind="count",
    palette="Greens_r",
)
moisturizer_counts.set(title="Distribution of Mosturizers at Various Ranks")

# Plot Price against Rank to see whether the two are correlated
moisturizer_prices = sns.catplot(
    data=df_M,
    x="Rank",
    y="Price",
    kind="swarm",
    height=8.27,
    aspect=11.7 / 8.27,
)
moisturizer_prices.set(title="Rating V.S. Price for Mosturizers")

def _skin_type_by_rank(frame, skin_column):
    """Return the rows of *frame* kept for *skin_column*, ordered by Rank.

    The frame is first ordered by *skin_column*; rows whose value in that
    column contains the character "0" are then dropped (rows with a missing
    value are kept), and what remains is ordered by "Rank".
    """
    ordered = frame.sort_values(skin_column)
    flagged_zero = ordered[skin_column].str.contains("0", na=False)
    return ordered[~flagged_zero].sort_values("Rank")


# Small data frame for the combination skin type, sorted by rank
df_MCombo = _skin_type_by_rank(df_M, "Combination")
df_MCombo.tail()

# Small data frame for the dry skin type, sorted by rank
df_Mdry = _skin_type_by_rank(df_M, "Dry")
df_Mdry.tail()

# Small data frame for the normal skin type, sorted by rank
df_MNorm = _skin_type_by_rank(df_M, "Normal")
df_MNorm.tail()

# Small data frame for the oily skin type, sorted by rank
df_Moil = _skin_type_by_rank(df_M, "Oily")
df_Moil.tail()

# Small data frame for the sensitive skin type, sorted by rank
df_Msen = _skin_type_by_rank(df_M, "Sensitive")
df_Msen.tail()

# Create the cleansers data frame from the original data set.
# Rows with a missing Label are excluded.
is_cleanser = df_C1["Label"].str.contains("Cleanser", na=False)
df_c = df_C1[is_cleanser]

# Sort the cleansers by Rank to find the top and bottom rated products
df_c = df_c.sort_values(by="Rank")
df_c

# Plot the number of cleansers at each rank
cleanser_counts = sns.catplot(
    data=df_c,
    x="Rank",
    kind="count",
    palette="Greens_r",
)
cleanser_counts.set(title="Distribution of Cleansers at Various Ranks")

# Plot Price against Rank to see whether the two are correlated
cleanser_prices = sns.catplot(
    data=df_c,
    x="Rank",
    y="Price",
    kind="swarm",
    height=8.27,
    aspect=11.7 / 8.27,
)
cleanser_prices.set(title="Rating V.S. Price for Cleansers")

# Make a face-masks data frame from the larger data frame.
# Rows with a missing Label are excluded.
is_face_mask = df_C1["Label"].str.contains("Face Mask", na=False)
df_FM = df_C1[is_face_mask]

# Order by Rank and look at the last rows
df_FM = df_FM.sort_values(by="Rank")
df_FM.tail()

# Plot the number of face masks at each rank
face_mask_counts = sns.catplot(
    data=df_FM,
    x="Rank",
    kind="count",
    palette="Greens_r",
)
face_mask_counts.set(title="Distribution of Face Masks at Various Ranks")

# Plot Price against Rank
face_mask_prices = sns.catplot(
    data=df_FM,
    x="Rank",
    y="Price",
    kind="swarm",
    height=8.27,
    aspect=11.7 / 8.27,
)
face_mask_prices.set(title="Rating V.S. Price for Face Masks")

# Read the chemicals CSV into a pandas data frame. header=None makes pandas
# treat every line of the file as data, so the column names are assigned by
# hand right after the read.
df_Chem = pd.read_csv('Chemicals_in_Cosmetics.csv', header=None)
df_Chem.columns = [
    "CDPHId",
    "ProductName",
    "CSFId",
    "CSF",
    "CompanyId",
    "CompanyName",
    "BrandName",
    "PrimaryCategoryId",
    "PrimaryCategory",
    "SubCategoryId",
    "SubCategory",
    "CasId",
    "CasNumber",
    "ChemicalId",
    "ChemicalName",
    "InitialDateReported",
    "MostRecentDateReported",
    "DiscontinuedDate",
    "ChemicalCreatedAt",
    "ChemicalUpdatedAt",
    "ChemicalDateRemoved",
    "ChemicalCount",
]
df_Chem.head()

# Clean the data set by removing, in place, the columns that are not used in
# the analysis.
for unused_column in (
    "CDPHId",
    "CSFId",
    "CSF",
    "CompanyId",
    "CasId",
    "CasNumber",
    "InitialDateReported",
    "MostRecentDateReported",
    "DiscontinuedDate",
    "ChemicalCreatedAt",
    "ChemicalUpdatedAt",
    "ChemicalDateRemoved",
):
    del df_Chem[unused_column]

# Keep only the rows whose PrimaryCategory contains "Skin Care Products"
# (rows with a missing category are excluded).
in_skin_care = df_Chem["PrimaryCategory"].str.contains("Skin Care Products", na=False)
df_Chem = df_Chem[in_skin_care]
df_Chem.head()

# Remove cosmetic and "Other" sub-categories so that the focus is solely on
# skin care products. Rows with a missing SubCategory are kept.
for excluded_subcategory in ("cosmetic", "Other"):
    df_Chem = df_Chem[~df_Chem["SubCategory"].str.contains(excluded_subcategory, na=False)]

# Create a smaller data frame of facial moisturizers using the
# "Facial Cream" sub-category.
is_facial_cream = df_Chem["SubCategory"].str.contains("Facial Cream", na=False)
df_MChem = df_Chem[is_facial_cream]
df_MChem.head()

# Configure the figure defaults BEFORE drawing: rcParams only apply to
# figures and ticks created after they are set, so setting them after the
# plot (as before) left this chart at the previous defaults.
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams["xtick.labelsize"] = 10

# Count how often each ChemicalName appears in the facial-cream rows.
# A fresh figure is created so the chart picks up the size configured above
# instead of drawing onto whatever figure happens to be current.
fig, ax = plt.subplots()
sns.countplot(x="ChemicalName", data=df_MChem, ax=ax)

# Rotate the existing tick labels rather than re-setting their text.
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Remove titanium dioxide to see better separation of the other chemicals.
# Rows with a missing ChemicalName are kept.
# NOTE(review): no re-plot of the filtered frame is visible in this section.
df_MChem = df_MChem[~df_MChem["ChemicalName"].str.contains("Titanium dioxide", na=False)]

# Show the moisturizers whose Ingredients text mentions each chemical.
# na=False makes a missing ingredient list count as "no match"; without it a
# NaN ends up in the mask and the boolean indexing raises. This also matches
# how every other str.contains filter in this file is written.
df_M[df_M["Ingredients"].str.contains("Titanium Dioxide", na=False)]

df_M[df_M["Ingredients"].str.contains("Benzophenon", na=False)]

df_M[df_M["Ingredients"].str.contains("Triethanolamine", na=False)]

# Create a small data frame of skin cleansers from the chemicals data set.
# Rows with a missing SubCategory are excluded.
df_CChem = df_Chem[df_Chem["SubCategory"].str.contains("Skin Cleansers", na=False)]
df_CChem.head()

# Configure the figure defaults BEFORE drawing: rcParams only apply to
# figures and ticks created after they are set.
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams["xtick.labelsize"] = 10

# Count how often each ChemicalName appears in the skin-cleanser rows.
# A fresh figure is created so the chart picks up the size configured above
# instead of drawing onto whatever figure happens to be current.
fig, ax = plt.subplots()
sns.countplot(x="ChemicalName", data=df_CChem, ax=ax)

# Rotate the existing tick labels rather than re-setting their text.
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Remove titanium dioxide from the cleanser chemicals.
# Rows with a missing ChemicalName are kept.
# NOTE(review): no re-plot of the filtered frame is visible in this section.
df_CChem = df_CChem[~df_CChem["ChemicalName"].str.contains("Titanium dioxide", na=False)]

# Show the cleansers whose Ingredients text mentions each chemical.
# na=False makes a missing ingredient list count as "no match"; without it a
# NaN ends up in the mask and the boolean indexing raises. This also matches
# how every other str.contains filter in this file is written.
df_c[df_c["Ingredients"].str.contains("Titanium Dioxide", na=False)]

# Author's note: cocamide diethanolamine is known as triethanolamine in
# product formulations.
# NOTE(review): confirm this equivalence before relying on the result.
df_c[df_c["Ingredients"].str.contains("Triethanolamine", na=False)]

df_c[df_c["Ingredients"].str.contains("Retinol", na=False)]