import pandas as pd
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Read the chocolate bar data from the CSV file in the working directory.
df = pd.read_csv('chocolate_bars.csv')

# First look at the data. These bare expressions only render output when
# run in a notebook/REPL; in a plain script they are evaluated and discarded.
df.head()
df.describe(include='all')

# Relative frequency of the single most common company location.
df['company_location'].value_counts(normalize=True).iloc[:1]
# Count the missing entries per column once and reuse the result below.
missing_per_column = df.isna().sum()

# Summary table: absolute number and percentage of missing values per column.
null = pd.DataFrame(missing_per_column, columns=['null_values'])
null['percent_missing'] = (missing_per_column / len(df) * 100).round(2)
null

# Plot where the missing values sit inside the dataframe.
msno.matrix(df)
# Mean rating and number of ratings for every bean origin.
avg_rating = df.groupby('bean_origin').agg(avg_rating=('rating', 'mean'))
n_rating = df.groupby('bean_origin').agg(n_rating=('rating', 'count'))

# Ten origins with the highest mean rating / the most ratings. Computed once
# and reused for both display and the overlap check below.
top_ten_avg = avg_rating.nlargest(10, 'avg_rating')
top_ten_n = n_rating.nlargest(10, 'n_rating')
top_ten_avg
avg_rating.nsmallest(1, 'avg_rating')  # origin with the lowest mean rating
top_ten_n

# Origins that appear in both top-ten lists.
n_rating_list = top_ten_avg.index.tolist()
top_ten_n[top_ten_n.index.isin(n_rating_list)]

# Number of origins with only a few ratings. `.count()` on the filtered
# frame returns a one-element Series keyed by the column name 'n_rating'.
under_20 = n_rating[n_rating['n_rating'] <= 20].count()
under_10 = n_rating[n_rating['n_rating'] <= 10].count()
# Legacy alias: this value was historically called `under_50` although the
# threshold is 20. Kept so any later code using the old name keeps working.
under_50 = under_20

# `.iloc[0]` reads the single value by position explicitly; plain `[0]` on a
# label-indexed Series is deprecated positional access in pandas >= 2.1.
print(f'Number of origins with less than or equal to 20 ratings = {under_20.iloc[0]}')
print(f'Number of origins with less than or equal to 10 ratings = {under_10.iloc[0]}')
# One row per bean origin holding its mean rating and its number of ratings.
# as_index=False keeps 'bean_origin' as a regular column.
df_avg = df.groupby('bean_origin', as_index=False).agg(
    avg_rating=pd.NamedAgg(column='rating', aggfunc='mean'),
    n_rating=pd.NamedAgg(column='rating', aggfunc='count'),
)

# Keep only origins with at least 20 ratings, most-rated first, with a
# fresh 0..n-1 index.
has_enough_ratings = df_avg['n_rating'].ge(20)
over_20 = df_avg.loc[has_enough_ratings].sort_values(
    by='n_rating', ascending=False, ignore_index=True
)
over_20.head()
# Build the two traces first; both share the bean origins on the x-axis.
# Bar trace: number of ratings per origin.
count_bars = go.Bar(
    x=over_20['bean_origin'],
    y=over_20['n_rating'],
    name='n_rating',
)
# Scatter trace: average rating per origin.
mean_trace = go.Scatter(
    x=over_20['bean_origin'],
    y=over_20['avg_rating'],
    name='avg rating',
)

# Single subplot with a secondary y-axis so the two differently scaled
# measures can be drawn over the same x-axis.
fig = make_subplots(specs=[[{'secondary_y': True}]])
fig.add_trace(count_bars, secondary_y=False)
fig.add_trace(mean_trace, secondary_y=True)

# Figure title and axis labels (one label per y-axis).
fig.update_layout(title_text='Bean origin by count and average rating')
fig.update_xaxes(title_text='bean origin')
fig.update_yaxes(title_text='<b>number</b> of ratings', secondary_y=False)
fig.update_yaxes(title_text='<b>average</b> of ratings', secondary_y=True)

# Render the figure.
fig.show()
# Split the origins in `over_20` into the 'Blend' entry and everything else.
is_blend = over_20['bean_origin'].eq('Blend')

# Mean of the per-origin average ratings on each side of the split,
# rounded to two decimals. The second value is an unweighted mean of
# origin-level averages, not a mean over individual ratings.
blend = over_20.loc[is_blend, 'avg_rating'].mean().round(2)
no_blend = over_20.loc[~is_blend, 'avg_rating'].mean().round(2)

print(f'Avg rating for Blend: {blend}')
print(f'Avg rating for other countries: {no_blend}')
# Heatmap of the correlation between per-origin mean rating and rating count.
# Only the two numeric columns are passed to corr(): df_avg also carries the
# text column 'bean_origin', and on pandas >= 2.0 DataFrame.corr() no longer
# drops non-numeric columns silently but raises instead.
fig = px.imshow(df_avg[['avg_rating', 'n_rating']].corr())
fig.show()
# Bars with a rating of 3.5 or above (labelled "Highly Recommended" in the
# chart title below).
high_rating = df.loc[df['rating'] >= 3.5]

# Mean cocoa percentage of those bars, rounded to three decimals.
mean_cocoa = round(high_rating['cocoa_percent'].mean(), 3)
print(f'Average cocoa percentage: {mean_cocoa}')

# Most common cocoa percentage among those bars and its share.
high_rating['cocoa_percent'].value_counts(normalize=True).head(1)

# Histogram of cocoa percentage for the highly rated bars.
fig = px.histogram(
    high_rating,
    x='cocoa_percent',
    nbins=20,
    title='Distribution of cocoa percentage for "Highly Recommended" bars',
)

# Axis labels.
fig.update_xaxes(title_text='cocoa percentage')
fig.update_yaxes(title_text='number of chocolate bars')

# Render the figure.
fig.show()
# Work on a copy so the added column does not end up in the original data.
df_copy = df.copy()

# Boolean flag: True when the ingredients string contains an upper-case 'L'.
# NOTE(review): presumably 'L' is the ingredient code for lecithin; confirm
# against the dataset's data dictionary.
# regex=False matches the literal character; na=False maps rows with missing
# ingredients to False directly (same result as the former `== True`).
df_copy['has_lecithin'] = df_copy['ingredients'].str.contains('L', regex=False, na=False)

# Share of bars with / without lecithin.
df_copy['has_lecithin'].value_counts(normalize=True)

# Mean rating per group, rounded to two decimals.
df_copy.groupby('has_lecithin')[['rating']].mean().round(2)

# Overlaid rating histograms for bars with and without lecithin.
fig = px.histogram(
    df_copy,
    x='rating',
    color='has_lecithin',
    barmode='overlay',
    nbins=10,
    title='Distribution of ratings by bars with/without lecithin',
    labels={'has_lecithin': 'has lecithin'},
)

# Y-axis label.
fig.update_yaxes(title_text='number of ratings')

# Render the figure.
fig.show()