# Import the libraries needed to clean and plot the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
# Import the cosmetics data set retrieved from https://www.kaggle.com/kingabzpro/cosmetics-datasets
# header=0 consumes the file's own header line and `names` replaces it, so the
# header is never loaded as a data row (previously it was row 0 and was counted
# by the preview and length check below before being dropped).
# dtype=str pins every column to text instead of relying on type inference,
# because the .str-based filters applied to this frame need string values.
df_C1 = pd.read_csv(
    "cosmetics.csv",
    header=0,
    names=[
        "Label",
        "Brand",
        "Name",
        "Price",
        "Rank",
        "Ingredients",
        "Combination",
        "Dry",
        "Normal",
        "Oily",
        "Sensitive",
    ],
    dtype=str,
)
# Preview the data
df_C1.head()
# Find the length of the data frame to see the scale of data that we are working with
len(df_C1)
# Drop limited-edition and mini-sized products in a single pass.
# "Mini" is matched as a whole word (optionally plural) so that names such as
# "Minimizing" or "Minimalist" are not removed by accident.
df_C1 = df_C1[
    ~df_C1["Name"].str.contains(r"Limited[ -]Edition|\bMinis?\b", na=False)
]
df_C1.dtypes
# Price was read as text; convert it to integers for plotting
df_C1["Price"] = df_C1["Price"].astype(int)
# Build the moisturizer-only data set by matching on the Label column
df_M = df_C1.loc[df_C1["Label"].str.contains("Moisturizer", na=False)]
# Order by brand so that any duplicated products end up next to each other
df_M = df_M.sort_values(by="Brand")
df_M
# Re-order by rank to surface the top- and bottom-rated moisturizers
df_M = df_M.sort_values(by="Rank")
df_M
# Count how many moisturizers fall into each rank
sns.set_theme(style="ticks", color_codes=True)
sns.catplot(
    data=df_M,
    x="Rank",
    kind="count",
    palette="Greens_r",
).set(title="Distribution of Mosturizers at Various Ranks")
# Plot price against rank to look for a correlation between the two
sns.catplot(
    data=df_M,
    x="Rank",
    y="Price",
    kind="swarm",
    height=8.27,
    aspect=11.7 / 8.27,
).set(title="Rating V.S. Price for Mosturizers")
def _rank_sorted_subset(products, flag_column):
    """Return the rows of *products* flagged for one skin type, ordered by Rank.

    Parameters:
        products: data frame that has a "Rank" column and the given flag column.
        flag_column: name of a text column; rows whose value contains "0"
            are dropped. Rows with a missing flag are kept (na=False).

    Returns:
        A new data frame sorted by "Rank".
    """
    # The pre-sort on the flag column is kept from the original code so the
    # order of rows that share a Rank (and hence the .tail() preview) is unchanged.
    ordered = products.sort_values(flag_column)
    flagged = ordered[~ordered[flag_column].str.contains("0", na=False)]
    return flagged.sort_values("Rank")


# Moisturizers for the combination skin type, sorted by rank
df_MCombo = _rank_sorted_subset(df_M, "Combination")
df_MCombo.tail()
# Moisturizers for the dry skin type, sorted by rank
df_Mdry = _rank_sorted_subset(df_M, "Dry")
df_Mdry.tail()
# Moisturizers for the normal skin type, sorted by rank
df_MNorm = _rank_sorted_subset(df_M, "Normal")
df_MNorm.tail()
# Moisturizers for the oily skin type, sorted by rank
df_Moil = _rank_sorted_subset(df_M, "Oily")
df_Moil.tail()
# Moisturizers for the sensitive skin type, sorted by rank
df_Msen = _rank_sorted_subset(df_M, "Sensitive")
df_Msen.tail()
# Build the cleanser-only data set from the original data set and
# order it by rank to surface the top- and bottom-rated products
df_c = df_C1.loc[df_C1["Label"].str.contains("Cleanser", na=False)]
df_c = df_c.sort_values(by="Rank")
df_c
# Count how many cleansers fall into each rank
sns.catplot(
    data=df_c,
    x="Rank",
    kind="count",
    palette="Greens_r",
).set(title="Distribution of Cleansers at Various Ranks")
# Plot price against rank to look for a correlation between the two
sns.catplot(
    data=df_c,
    x="Rank",
    y="Price",
    kind="swarm",
    height=8.27,
    aspect=11.7 / 8.27,
).set(title="Rating V.S. Price for Cleansers")
# Build the face-mask-only data set from the larger data frame, ordered by rank
df_FM = df_C1.loc[df_C1["Label"].str.contains("Face Mask", na=False)]
df_FM = df_FM.sort_values(by="Rank")
df_FM.tail()
# Count how many face masks fall into each rank
sns.catplot(
    data=df_FM,
    x="Rank",
    kind="count",
    palette="Greens_r",
).set(title="Distribution of Face Masks at Various Ranks")
# Plot price against rank for the face masks
sns.catplot(
    data=df_FM,
    x="Rank",
    y="Price",
    kind="swarm",
    height=8.27,
    aspect=11.7 / 8.27,
).set(title="Rating V.S. Price for Face Masks")
# Read the chemicals CSV into a pandas DataFrame.
# header=0 consumes the file's own header line and `names` replaces it, so the
# header no longer shows up as the first data row of the preview below and no
# longer forces every column to be parsed as text.
df_Chem = pd.read_csv(
    "Chemicals_in_Cosmetics.csv",
    header=0,
    names=[
        "CDPHId",
        "ProductName",
        "CSFId",
        "CSF",
        "CompanyId",
        "CompanyName",
        "BrandName",
        "PrimaryCategoryId",
        "PrimaryCategory",
        "SubCategoryId",
        "SubCategory",
        "CasId",
        "CasNumber",
        "ChemicalId",
        "ChemicalName",
        "InitialDateReported",
        "MostRecentDateReported",
        "DiscontinuedDate",
        "ChemicalCreatedAt",
        "ChemicalUpdatedAt",
        "ChemicalDateRemoved",
        "ChemicalCount",
    ],
)
df_Chem.head()
# Clean the data set by removing the columns that are not used in the analysis
df_Chem = df_Chem.drop(
    columns=[
        "CDPHId",
        "CSFId",
        "CSF",
        "CompanyId",
        "CasId",
        "CasNumber",
        "InitialDateReported",
        "MostRecentDateReported",
        "DiscontinuedDate",
        "ChemicalCreatedAt",
        "ChemicalUpdatedAt",
        "ChemicalDateRemoved",
    ]
)
# Keep only the rows in the skin care products category
df_Chem = df_Chem[
    df_Chem["PrimaryCategory"].str.contains("Skin Care Products", na=False)
]
df_Chem.head()
# Remove the "cosmetic" and "Other" sub categories so that the analysis
# focuses solely on skin care products
df_Chem = df_Chem[~df_Chem["SubCategory"].str.contains("cosmetic|Other", na=False)]
# Create a smaller data frame of facial moisturizers using the facial cream sub category
df_MChem = df_Chem[df_Chem["SubCategory"].str.contains("Facial Cream", na=False)]
df_MChem.head()
# Figure size and tick label size are read from rcParams when a figure and its
# ticks are created, so they have to be set before the first plot is drawn.
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams["xtick.labelsize"] = 10

# Count how often each chemical is reported for facial creams.
# A fresh figure is opened so the bars are not drawn onto axes left open by an
# earlier plot (countplot draws on the current Axes when none is passed).
plt.figure()
ax = sns.countplot(x="ChemicalName", data=df_MChem)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Remove titanium dioxide to see better separation of the other chemicals
df_MChem = df_MChem[~df_MChem["ChemicalName"].str.contains("Titanium dioxide", na=False)]
plt.figure()
ax = sns.countplot(x="ChemicalName", data=df_MChem)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Look up moisturizers whose ingredient list mentions the reported chemicals.
# na=False treats a missing ingredient list as "no match" instead of raising
# when the mask is used for indexing.
df_M[df_M["Ingredients"].str.contains("Titanium Dioxide", na=False)]
df_M[df_M["Ingredients"].str.contains("Benzophenon", na=False)]
df_M[df_M["Ingredients"].str.contains("Triethanolamine", na=False)]
# Create a small data frame of skin cleansers from the chemicals database
df_CChem = df_Chem[df_Chem["SubCategory"].str.contains("Skin Cleansers", na=False)]
df_CChem.head()

# Figure size and tick label size are read from rcParams when a figure and its
# ticks are created, so they have to be set before the first plot is drawn.
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams["xtick.labelsize"] = 10

# Count how often each chemical is reported for skin cleansers.
# A fresh figure is opened so the bars are not drawn onto axes left open by an
# earlier plot (countplot draws on the current Axes when none is passed).
plt.figure()
ax = sns.countplot(x="ChemicalName", data=df_CChem)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Remove titanium dioxide to see better separation of the other chemicals
df_CChem = df_CChem[~df_CChem["ChemicalName"].str.contains("Titanium dioxide", na=False)]
plt.figure()
ax = sns.countplot(x="ChemicalName", data=df_CChem)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Look up cleansers whose ingredient list mentions the reported chemicals.
# na=False treats a missing ingredient list as "no match" instead of raising
# when the mask is used for indexing.
df_c[df_c["Ingredients"].str.contains("Titanium Dioxide", na=False)]
# Cocamide diethanolamine is searched for as "Triethanolamine" in product formulations
# NOTE(review): this equivalence is the original author's assumption - confirm
df_c[df_c["Ingredients"].str.contains("Triethanolamine", na=False)]
df_c[df_c["Ingredients"].str.contains("Retinol", na=False)]