import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Raise pandas' display limits so wide/long frames are not truncated when shown.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Coffee records are read straight from the remote CSV.
df = pd.read_csv(
    'https://github.com/jldbc/coffee-quality-database/raw/master/data/arabica_data_cleaned.csv'
)
# GDP table comes from a local file; its first three lines are skipped
# before parsing (presumably export metadata -- confirm against gdp.csv).
df_gdp = pd.read_csv(r'gdp.csv', skiprows=3)
# First look at the raw data. The bare expressions in this section only
# display a value in an interactive session; df.info() prints on its own.
df.shape        # (rows, columns)
df.head(5)      # first rows
# Summary of the index dtype, columns, non-null counts and memory usage.
df.info()

# Let pandas pick the best-fitting dtype for each column from its contents.
df = df.convert_dtypes()
df.dtypes

# Sanity check on the species column: only arabica beans are expected.
df.loc[:, 'Species'].value_counts()
# Keep the country of origin plus the grading columns: every column from
# 'Aroma' through 'Category.One.Defects', and 'Category.Two.Defects'.
countries = df.loc[:, 'Country.of.Origin']
df_ratings = pd.concat(
    [
        df['Country.of.Origin'],
        df.loc[:, 'Aroma':'Category.One.Defects'],
        df['Category.Two.Defects'],
    ],
    axis=1,
)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
df_ratings.info()
print(df_ratings.shape)
df_ratings

# Which entries have a missing value (e.g. no country of origin)?
has_missing = df_ratings.isna().any(axis=1)
df_ratings[has_missing]
# Full original records for those rows, selected with the mask rather than a
# hard-coded row label so the lookup still works if the data changes.
df.loc[has_missing]
df_ratings = df_ratings.dropna()

# Check dataframe consistency by sampling random rows.
df_ratings.sample(12)

# Check the country names for unexpected values, then keep only countries
# that still have at least 10 entries.
counts = df_ratings['Country.of.Origin'].value_counts()
counts
df_cleaned = df_ratings[df_ratings['Country.of.Origin'].isin(counts.index[counts >= 10])]
df_cleaned['Country.of.Origin'].value_counts()
# Reduce the GDP table to the country name and the 2018 column, renamed so
# that it shares the 'Country.of.Origin' key with the coffee data.
gdp_column_names = {"Country Name": "Country.of.Origin", "2018": "GDP"}
df_gdp = df_gdp[list(gdp_column_names)].rename(columns=gdp_column_names)

# Rewrite the country spellings that differ between the two tables; rows
# whose names do not match are dropped by the inner join below.
country_aliases = {
    'United States (Hawaii)': 'United States',
    'Tanzania, United Republic Of': 'Tanzania',
}
df_cleaned = df_cleaned.replace(country_aliases)
print(df_cleaned.shape)
df_cleaned['Country.of.Origin'].value_counts()

# Attach GDP to every rating row; compare the two printed shapes to see
# whether the join lost any rows.
df_merged = df_cleaned.merge(df_gdp, on='Country.of.Origin', how='inner')
print(df_merged.shape)
df_merged
# Print the minimum and maximum of every column from 'Aroma' onward as a
# three-column table (name, min, max). The name and min cells are padded to
# 30 characters so each row lines up with the header.
print('{0:30}'.format(''), '{0:30}'.format('min'), 'max')
for column in df_merged.loc[:, 'Aroma':]:
    col_min = df_merged[column].min()
    col_max = df_merged[column].max()
    # '<' left-aligns the number; numbers are right-aligned by default.
    print('{0:30}'.format(column + ':'), '{0:<30}'.format(col_min), col_max)
# Per-country summary statistics and per-country means of the merged data.
df_ratings_grouped = df_merged.groupby('Country.of.Origin')
details_by_country = df_ratings_grouped.describe()
details_by_country
mean_by_country = df_ratings_grouped.mean()

# One bar chart per column of the means table: the bars are that column's
# mean for each country, and the column name is used as the chart title.
for category in mean_by_country:
    mean_by_country[category].plot(kind='bar', color='green')
    plt.title(category, fontweight="bold")
    plt.xlabel('Country', fontsize=13)
    plt.ylabel('Score', fontsize=13)
    # Show each chart before the next column is plotted.
    plt.show()