import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# load .csv as a pandas dataframe
df = pd.read_csv('https://github.com/jldbc/coffee-quality-database/raw/master/data/arabica_data_cleaned.csv')
df_gdp = pd.read_csv(r'gdp.csv', skiprows=3)
# check characteristics of the data
df.head()
# get size of dataframe displayed as (rows, columns)
df.shape
# concise summary of index dtype and columns, non-null values, and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 44 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 1311 non-null int64
1 Species 1311 non-null object
2 Owner 1304 non-null object
3 Country.of.Origin 1310 non-null object
4 Farm.Name 955 non-null object
5 Lot.Number 270 non-null object
6 Mill 1001 non-null object
7 ICO.Number 1165 non-null object
8 Company 1102 non-null object
9 Altitude 1088 non-null object
10 Region 1254 non-null object
11 Producer 1081 non-null object
12 Number.of.Bags 1311 non-null int64
13 Bag.Weight 1311 non-null object
14 In.Country.Partner 1311 non-null object
15 Harvest.Year 1264 non-null object
16 Grading.Date 1311 non-null object
17 Owner.1 1304 non-null object
18 Variety 1110 non-null object
19 Processing.Method 1159 non-null object
20 Aroma 1311 non-null float64
21 Flavor 1311 non-null float64
22 Aftertaste 1311 non-null float64
23 Acidity 1311 non-null float64
24 Body 1311 non-null float64
25 Balance 1311 non-null float64
26 Uniformity 1311 non-null float64
27 Clean.Cup 1311 non-null float64
28 Sweetness 1311 non-null float64
29 Cupper.Points 1311 non-null float64
30 Total.Cup.Points 1311 non-null float64
31 Moisture 1311 non-null float64
32 Category.One.Defects 1311 non-null int64
33 Quakers 1310 non-null float64
34 Color 1095 non-null object
35 Category.Two.Defects 1311 non-null int64
36 Expiration 1311 non-null object
37 Certification.Body 1311 non-null object
38 Certification.Address 1311 non-null object
39 Certification.Contact 1311 non-null object
40 unit_of_measurement 1311 non-null object
41 altitude_low_meters 1084 non-null float64
42 altitude_high_meters 1084 non-null float64
43 altitude_mean_meters 1084 non-null float64
dtypes: float64(16), int64(4), object(24)
memory usage: 450.8+ KB
# Converting the data type base on the contents of the column.
df = df.convert_dtypes()
df.dtypes
# checking to see if all are of the same species being considered
# expect to see only arabica beans
df['Species'].value_counts()
# slicing dataframe to contain country and grading characteristics
countries = df.loc[:,'Country.of.Origin']
df_ratings = pd.concat([df['Country.of.Origin'] ,df.loc[:, 'Aroma':'Category.One.Defects' ],df['Category.Two.Defects'] ], axis= 1)
print(df_ratings.info())
print(df_ratings.shape)
df_ratings
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country.of.Origin 1310 non-null string
1 Aroma 1311 non-null float64
2 Flavor 1311 non-null float64
3 Aftertaste 1311 non-null float64
4 Acidity 1311 non-null float64
5 Body 1311 non-null float64
6 Balance 1311 non-null float64
7 Uniformity 1311 non-null float64
8 Clean.Cup 1311 non-null float64
9 Sweetness 1311 non-null float64
10 Cupper.Points 1311 non-null float64
11 Total.Cup.Points 1311 non-null float64
12 Moisture 1311 non-null float64
13 Category.One.Defects 1311 non-null Int64
14 Category.Two.Defects 1311 non-null Int64
dtypes: Int64(2), float64(12), string(1)
memory usage: 156.3 KB
None
(1311, 15)
# Which entry does not have country of origin?
df_ratings[df_ratings.isna().any(axis=1)]
df.loc[1197]
df_ratings = df_ratings.dropna()
# checking dataframe consistency by sampling random columns
df_ratings.sample(12)
# checking countries to see if there are any other unexpected values
df_ratings['Country.of.Origin'].value_counts()
counts = df_ratings['Country.of.Origin'].value_counts()
df_cleaned = df_ratings[df_ratings['Country.of.Origin'].isin(counts.index[counts>=10])]
df_cleaned['Country.of.Origin'].value_counts()
# preparing to add gdp for 2018 to my dataframe
df_gdp = df_gdp[['Country Name', '2018']]
df_gdp = df_gdp.rename(columns={"Country Name": "Country.of.Origin", "2018": "GDP"})
# make sure that names match dataframes to prevent data loss
df_cleaned = df_cleaned.replace({'United States (Hawaii)': 'United States', 'Tanzania, United Republic Of' : 'Tanzania' })
print(df_cleaned.shape)
df_cleaned['Country.of.Origin'].value_counts()
(1255, 15)
df_merged = pd.merge(df_cleaned, df_gdp, on='Country.of.Origin', how='inner')
print(df_merged.shape)
df_merged
(1180, 16)
# check for min and max values
print('{0:30}'.format(''),'{0:30}'.format('min'), 'max')
for _ in df_merged.loc[:, 'Aroma':]:
minmax = [df_merged[_].min(), df_merged[_].max()]
print('{0:30}'.format(_+':'), minmax[0],'{0:30}'.format(minmax[1]) )
min max
Aroma: 0.0 8.75
Flavor: 0.0 8.83
Aftertaste: 0.0 8.67
Acidity: 0.0 8.75
Body: 0.0 8.58
Balance: 0.0 8.75
Uniformity: 0.0 10.0
Clean.Cup: 0.0 10.0
Sweetness: 0.0 10.0
Cupper.Points: 0.0 9.25
Total.Cup.Points: 0.0 90.58
Moisture: 0.0 0.28
Category.One.Defects: 0 31
Category.Two.Defects: 0 55
GDP: 9712994564.91596 20611860934000.0
df_ratings_grouped = df_merged.groupby('Country.of.Origin')
details_by_country = df_ratings_grouped.describe()
details_by_country
mean_by_country = df_ratings_grouped.mean()
for category in mean_by_country:
# print(category[0]+'\n',details_by_country[category[0]][category[1]])
mean_by_country[category].plot(kind='bar',color='green')
plt.title(category,fontweight="bold")
plt.xlabel('Country',fontsize=13)
plt.ylabel('Score',fontsize=13)
plt.show()
# mean_by_country