# Analysis of Countries with the Most Highly Rated Coffee

# Third-party libraries used by the analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Raise pandas' display limits so wide/long frames are shown with less truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Read both source tables into pandas DataFrames:
#   df     - the arabica coffee CSV fetched from GitHub
#   df_gdp - the local GDP CSV; its first three lines are skipped before parsing
ARABICA_CSV_URL = 'https://github.com/jldbc/coffee-quality-database/raw/master/data/arabica_data_cleaned.csv'
GDP_CSV_PATH = 'gdp.csv'

df = pd.read_csv(ARABICA_CSV_URL)
df_gdp = pd.read_csv(GDP_CSV_PATH, skiprows=3)

# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

# Dimensions of the dataframe, reported as a (rows, columns) tuple.
df.shape

# Print a concise summary of the frame: index dtype, column dtypes,
# non-null counts, and memory usage.
df.info()

# Let pandas pick the best-fitting dtype for each column based on its contents,
# then list the resulting dtypes.
df = df.convert_dtypes()
df.dtypes

# Sanity check that every entry belongs to the same species:
# only Arabica beans are expected to show up in the tally.
df.loc[:, 'Species'].value_counts()

# Build the ratings frame: country of origin plus the grading columns
# ('Aroma' through 'Category.One.Defects', and 'Category.Two.Defects').
countries = df.loc[:, 'Country.of.Origin']
df_ratings = pd.concat(
    [
        countries,
        df.loc[:, 'Aroma':'Category.One.Defects'],
        df['Category.Two.Defects'],
    ],
    axis=1,
)

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_ratings.info()
print(df_ratings.shape)
df_ratings

# Which entry does not have a country of origin?
# The filter keeps every row with a missing value in *any* column, so it would
# also surface rows that lack a rating.
rows_with_missing = df_ratings.isna().any(axis='columns')
df_ratings[rows_with_missing]

# Show the complete original record with index label 1197.
# NOTE(review): presumably the row surfaced by the missing-value check; the
# label is hard-coded - confirm it still matches if the source data changes.
df.loc[1197, :]

# Discard every row of the ratings frame that contains a missing value.
df_ratings = df_ratings.dropna(axis='index', how='any')

# Spot-check the frame's consistency by looking at 12 randomly sampled rows.
df_ratings.sample(n=12)

# Tally the entries per country to spot any unexpected values.
df_ratings.loc[:, 'Country.of.Origin'].value_counts()

# Keep only the countries represented by at least 10 entries, then recount.
counts = df_ratings['Country.of.Origin'].value_counts()
well_sampled_countries = counts[counts >= 10].index
is_well_sampled = df_ratings['Country.of.Origin'].isin(well_sampled_countries)
df_cleaned = df_ratings[is_well_sampled]
df_cleaned['Country.of.Origin'].value_counts()

# Prepare the 2018 GDP figures for joining: keep just the country name and the
# 2018 column, renamed so the key matches the coffee dataframe's country column.
df_gdp = df_gdp.loc[:, ['Country Name', '2018']].rename(
    columns={"Country Name": "Country.of.Origin", "2018": "GDP"}
)

# Rewrite country names so they match between the two dataframes; rows whose
# names differ would otherwise be lost when the frames are joined.
country_aliases = {
    'United States (Hawaii)': 'United States',
    'Tanzania, United Republic Of': 'Tanzania',
}
df_cleaned = df_cleaned.replace(country_aliases)
print(df_cleaned.shape)
df_cleaned['Country.of.Origin'].value_counts()

# Attach the GDP column to every coffee entry by joining on the country name.
# An inner join silently discards coffee rows whose country has no identically
# spelled entry in the GDP table, so those countries are reported first to make
# any data loss visible. The merge result itself is unchanged.
unmatched_countries = sorted(
    set(df_cleaned['Country.of.Origin'].dropna())
    - set(df_gdp['Country.of.Origin'].dropna())
)
if unmatched_countries:
    print('Countries without GDP data (dropped by the merge):', unmatched_countries)

df_merged = pd.merge(df_cleaned, df_gdp, on='Country.of.Origin', how='inner')
print(df_merged.shape)
df_merged

# Print the minimum and maximum of every column from 'Aroma' onwards as an
# aligned three-column table: column name, min, max.
print(f"{'':30} {'min':30} max")
for column in df_merged.loc[:, 'Aroma':]:
    label = f'{column}:'
    lowest = df_merged[column].min()
    highest = df_merged[column].max()
    # Values are rendered as text and the min is padded to the header's width,
    # so both numbers sit directly under their "min" / "max" headings.
    print(f'{label:30} {lowest!s:30} {highest!s}')

# Group the merged rows by country so per-country statistics can be computed.
df_ratings_grouped = df_merged.groupby(by='Country.of.Origin')

# Descriptive statistics of each column, computed separately for every country.
details_by_country = df_ratings_grouped.describe()
details_by_country

# Plot the per-country mean of every column as its own green bar chart.
mean_by_country = df_ratings_grouped.mean()
for category in mean_by_country:
    # Give each chart a fresh figure: drawing on pyplot's implicit current axes
    # stacks every chart onto the same plot whenever plt.show() leaves the
    # figure open (e.g. with a non-interactive backend).
    _, ax = plt.subplots()
    mean_by_country[category].plot(kind='bar', color='green', ax=ax)
    ax.set_title(category, fontweight="bold")
    ax.set_xlabel('Country', fontsize=13)
    ax.set_ylabel('Score', fontsize=13)
    plt.show()