Python - Chocolate Bar Cocoa Source Analysis

import missingno as msno
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read the chocolate bar dataset from the CSV in the working directory
df = pd.read_csv('chocolate_bars.csv')
# Preview the first rows
df.head()

# Summary statistics for every column, numeric or not
df.describe(include='all')

# Most frequent company location and its share of all rows
df['company_location'].value_counts(normalize=True).head(1)

# Per-column count of missing values, computed once and reused below
missing_counts = df.isnull().sum()

# Table of missing values: absolute count plus percentage of all rows
null = pd.DataFrame(missing_counts, columns=['null_values'])
null['percent_missing'] = (missing_counts / len(df) * 100).round(2)
null

# Matrix plot showing where in the dataframe the missing values sit
msno.matrix(df)

# Mean rating per bean origin (one row per origin, indexed by bean_origin)
ratings_by_origin = df.groupby('bean_origin')
avg_rating = ratings_by_origin.agg(avg_rating=('rating', 'mean'))

# Ten origins with the highest mean rating
avg_rating.nlargest(10, 'avg_rating')

# The single origin with the lowest mean rating
avg_rating.nsmallest(1, 'avg_rating')

# Number of (non-null) ratings per bean origin, indexed by bean_origin
n_rating = (
    df.groupby('bean_origin')
    .agg(n_rating=('rating', 'count'))
)

# Ten origins with the most ratings
n_rating.nlargest(10, 'n_rating')

# Top ten origins by mean rating and by number of ratings
top_ten_avg = avg_rating.nlargest(10, 'avg_rating')
top_ten_n = n_rating.nlargest(10, 'n_rating')

# Despite its name, this list holds the origins from the top ten by
# *average* rating; it is used to filter the top ten by count.
n_rating_list = top_ten_avg.index.values.tolist()

# Origins that appear in both top-ten tables
in_both_top_tens = top_ten_n.index.isin(n_rating_list)
top_ten_n[in_both_top_tens]

# Count the origins whose number of ratings is at or below each threshold.
# NOTE: `under_50` is a misnomer - the cutoff actually applied is 20.
under_50 = n_rating[n_rating['n_rating'] <= 20].count()
under_10 = n_rating[n_rating['n_rating'] <= 10].count()

# DataFrame.count() returns a Series labelled by column name ('n_rating'),
# so the value is read by position with .iloc. A bare [0] relies on the
# deprecated positional fallback of Series.__getitem__ and is treated as a
# label lookup (KeyError) on newer pandas.
print('Number of origins with less than or equal to 20 ratings = {}'.format(under_50.iloc[0]))
print('Number of origins with less than or equal to 10 ratings = {}'.format(under_10.iloc[0]))

# One row per bean origin with its mean rating and its number of ratings;
# bean_origin stays a regular column because as_index=False
df_avg = (
    df.groupby('bean_origin', as_index=False)
    .agg(
        avg_rating=('rating', 'mean'),
        n_rating=('rating', 'count'),
    )
)

# Keep only origins with at least 20 ratings, most-rated first
has_enough_ratings = df_avg['n_rating'] >= 20
over_20 = df_avg[has_enough_ratings].sort_values(
    'n_rating', ascending=False, ignore_index=True
)
over_20.head()

# Single-panel figure with a shared x axis and a secondary y axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Bars on the primary y axis: number of ratings per origin
rating_count_bars = go.Bar(
    x=over_20['bean_origin'],
    y=over_20['n_rating'],
    name='n_rating',
)
fig.add_trace(rating_count_bars, secondary_y=False)

# Line on the secondary y axis: average rating per origin
avg_rating_line = go.Scatter(
    x=over_20['bean_origin'],
    y=over_20['avg_rating'],
    name='avg rating',
)
fig.add_trace(avg_rating_line, secondary_y=True)

# Figure title and axis titles
fig.update_layout(title_text='Bean origin by count and average rating')
fig.update_xaxes(title_text='bean origin')
fig.update_yaxes(title_text='<b>number</b> of ratings', secondary_y=False)
fig.update_yaxes(title_text='<b>average</b> of ratings', secondary_y=True)

# Render the chart
fig.show()

# Split the per-origin averages into the 'Blend' row(s) and everything else
is_blend = over_20['bean_origin'] == 'Blend'

# Both figures are plain means of the per-origin avg_rating values, i.e. the
# "other countries" number is an unweighted mean of averages, not a mean
# over individual bars.
blend = over_20[is_blend]['avg_rating'].mean().round(2)
no_blend = over_20[~is_blend]['avg_rating'].mean().round(2)

print(f'Avg rating for Blend: {blend}')
print(f'Avg rating for other countries: {no_blend}')

# Correlation heatmap between average rating and number of ratings.
# Only the numeric columns are passed to corr(): df_avg also carries the
# string column 'bean_origin', which makes DataFrame.corr() raise on pandas
# versions where non-numeric columns are no longer dropped silently.
fig = px.imshow(df_avg[['avg_rating', 'n_rating']].corr())
fig.show()

# Bars rated 3.5 or higher
is_high_rated = df['rating'] >= 3.5
high_rating = df[is_high_rated]

# Mean cocoa percentage among those bars, to three decimals
mean_cocoa = round(high_rating['cocoa_percent'].mean(), 3)
print(f'Average cocoa percentage: {mean_cocoa}')

# Most common cocoa percentage among those bars and its share
high_rating['cocoa_percent'].value_counts(normalize=True).head(1)

# Histogram of cocoa percentage for the bars in high_rating, with axis
# titles applied in the same chain (the update_* calls return the figure)
fig = (
    px.histogram(
        high_rating,
        x="cocoa_percent",
        nbins=20,
        title='Distribution of cocoa percentage for "Highly Recommended" bars',
    )
    .update_xaxes(title_text='cocoa percentage')
    .update_yaxes(title_text='number of chocolate bars')
)

# Render the chart
fig.show()

# Work on a copy so the original dataframe keeps its columns unchanged
df_copy = df.copy()

# Boolean flag: does the ingredients string contain an upper-case 'L'?
# (presumably the ingredient code for lecithin - confirm against the data
# dictionary). na=False maps rows with missing ingredients to False
# explicitly, instead of relying on `NaN == True` evaluating to False, which
# leaves <NA> values behind when the column uses a nullable string dtype.
df_copy['has_lecithin'] = df_copy['ingredients'].str.contains('L', na=False)

# Share of bars with / without the flag
df_copy['has_lecithin'].value_counts(normalize=True)

# Mean rating for each group, rounded to two decimals
round(df_copy.groupby('has_lecithin')[['rating']].mean(), 2)

# Overlaid histograms of rating, one per has_lecithin value.
# The chart title previously read 'Distirbution'; the typo is fixed here.
fig = px.histogram(
    df_copy,
    x='rating',
    color='has_lecithin',
    barmode='overlay',
    nbins=10,
    title='Distribution of ratings by bars with/without lecithin',
    labels={'has_lecithin': 'has lecithin'},
)

# Label the y axis
fig.update_yaxes(title_text='number of ratings')

# Render the chart
fig.show()