import os
import time
import math
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from googletrans import Translator
import pycountry
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('stopwords')
nltk.download('wordnet')
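# depending on the NLTK version, nltk.download('omw-1.4') may also be required by the lemmatizer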
df_chunk = pd.read_csv(
    filepath_or_buffer='/datasets/openfoodfact/en.openfoodfacts.org.products.csv',
    sep='\t', encoding='utf-8', chunksize=200000, low_memory=False)
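# With chunksize set, read_csv returns an iterator (TextFileReader) that yields
# 200,000-row DataFrames, so the full dump never has to be loaded into memory at once.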
# general identification columns, cast to string
def generate_general_info(chunk):
    tmp = chunk.loc[:, [
        'code',
        'product_name',
        'generic_name'
    ]]
    tmp['code'] = tmp['code'].astype('str')
    tmp['product_name'] = tmp['product_name'].astype('str')
    tmp['generic_name'] = tmp['generic_name'].astype('str')
    return tmp
# brand, category, label and origin tag columns, cast to string
def generate_tags(chunk):
    tmp = chunk.loc[:, [
        'brands_tags',
        'categories_tags',
        'labels_tags',
        'origins_tags'
    ]]
    tmp['brands_tags'] = tmp['brands_tags'].astype('str')
    tmp['categories_tags'] = tmp['categories_tags'].astype('str')
    tmp['labels_tags'] = tmp['labels_tags'].astype('str')
    tmp['origins_tags'] = tmp['origins_tags'].astype('str')
    return tmp
# ingredient and allergen columns, cast to string
def generate_ingredients(chunk):
    tmp = chunk.loc[:, [
        'ingredients_text',
        'allergens',
        'traces_tags'
    ]]
    tmp['ingredients_text'] = tmp['ingredients_text'].astype('str')
    tmp['allergens'] = tmp['allergens'].astype('str')
    tmp['traces_tags'] = tmp['traces_tags'].astype('str')
    return tmp
# additives, palm-oil flags, scores and grouping columns, cast to string
def generate_misc_data(chunk):
    tmp = chunk.loc[:, [
        'additives_tags',
        'ingredients_from_palm_oil_tags',
        'ingredients_that_may_be_from_palm_oil_tags',
        'nutriscore_grade',
        'pnns_groups_1',
        'pnns_groups_2',
        'brand_owner',
        'ecoscore_grade_fr',
        'main_category'
    ]]
    tmp['additives_tags'] = tmp['additives_tags'].astype('str')
    tmp['ingredients_from_palm_oil_tags'] = tmp['ingredients_from_palm_oil_tags'].astype('str')
    tmp['ingredients_that_may_be_from_palm_oil_tags'] = tmp['ingredients_that_may_be_from_palm_oil_tags'].astype('str')
    tmp['nutriscore_grade'] = tmp['nutriscore_grade'].astype('str')
    tmp['pnns_groups_1'] = tmp['pnns_groups_1'].astype('str')
    tmp['pnns_groups_2'] = tmp['pnns_groups_2'].astype('str')
    tmp['brand_owner'] = tmp['brand_owner'].astype('str')
    tmp['ecoscore_grade_fr'] = tmp['ecoscore_grade_fr'].astype('str')
    tmp['main_category'] = tmp['main_category'].astype('str')
    return tmp
# nutrition facts per 100g, cast to float
def generate_nutrition_fact(chunk):
    tmp = chunk.loc[:, [
        'energy-kcal_100g',
        'proteins_100g',
        'sugars_100g',
        'saturated-fat_100g',
        'omega-3-fat_100g',
        'cholesterol_100g',
        'fiber_100g',
        'sodium_100g',
        'alcohol_100g',
        'vitamin-a_100g',
        'vitamin-d_100g',
        'vitamin-e_100g',
        'vitamin-k_100g',
        'vitamin-c_100g',
        'vitamin-b1_100g',
        'vitamin-b2_100g',
        'vitamin-pp_100g',
        'vitamin-b6_100g',
        'vitamin-b9_100g',
        'vitamin-b12_100g',
        'calcium_100g',
        'magnesium_100g',
        'caffeine_100g',
        'ph_100g',
        'fruits-vegetables-nuts_100g',
    ]]
    for column in tmp.columns:
        tmp[column] = tmp[column].astype('float64')
    return tmp
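# Note: astype('float64') raises if a nutrition column contains a stray non-numeric value.
# A more forgiving variant (a sketch, not what is used above) would coerce bad values to NaN:
#     tmp[column] = pd.to_numeric(tmp[column], errors='coerce')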
def clean_columns(df):
    frames = []
    for chunk in df:
        # keep only products sold in France (countries_tags contains 'en:france')
        french_countries = chunk['countries_tags'].str.contains('en:france', na=False)
        chunk = chunk[french_countries]
        # select and cast the relevant column groups
        general_information = generate_general_info(chunk)
        tags = generate_tags(chunk)
        ingredients = generate_ingredients(chunk)
        misc_data = generate_misc_data(chunk)
        nutrition_fact = generate_nutrition_fact(chunk)
        # reassemble the column groups side by side
        frame = pd.concat([
            general_information,
            tags,
            ingredients,
            misc_data,
            nutrition_fact], axis=1)
        frames.append(frame)
    return pd.concat(frames)
# df_cleaned_columns = clean_columns(df_chunk)
# df_cleaned_columns
# df_mem_size = df_cleaned_columns.memory_usage().sum() / 1000000
# print('Dataframe memory usage:', f'{df_mem_size} MB')
# if not os.path.exists('./dataset'):
#     os.mkdir('./dataset')
# df_cleaned_columns.to_csv('./dataset/openFoodFact.csv', sep=';', encoding='utf-8')
df_dtypes = {
    'code': 'str',
    'product_name': 'str',
    'generic_name': 'str',
    'brands_tags': 'str',
    'categories_tags': 'str',
    'origins_tags': 'str',
    'labels_tags': 'str',
    'ingredients_text': 'str',
    'allergens': 'str',
    'traces_tags': 'str',
    'additives_tags': 'str',
    'ingredients_from_palm_oil_tags': 'str',
    'ingredients_that_may_be_from_palm_oil_tags': 'str',
    'nutriscore_grade': 'str',
    'pnns_groups_1': 'str',
    'pnns_groups_2': 'str',
    'brand_owner': 'str',
    'ecoscore_grade_fr': 'str',
    'main_category': 'str',
    'energy-kcal_100g': 'float64',
    'proteins_100g': 'float64',
    'sugars_100g': 'float64',
    'lactose_100g': 'float64',
    'saturated-fat_100g': 'float64',
    'omega-3-fat_100g': 'float64',
    'cholesterol_100g': 'float64',
    'fiber_100g': 'float64',
    'sodium_100g': 'float64',
    'alcohol_100g': 'float64',
    'vitamin-a_100g': 'float64',
    'vitamin-d_100g': 'float64',
    'vitamin-e_100g': 'float64',
    'vitamin-k_100g': 'float64',
    'vitamin-c_100g': 'float64',
    'vitamin-b1_100g': 'float64',
    'vitamin-b2_100g': 'float64',
    'vitamin-pp_100g': 'float64',
    'vitamin-b6_100g': 'float64',
    'vitamin-b9_100g': 'float64',
    'vitamin-b12_100g': 'float64',
    'calcium_100g': 'float64',
    'magnesium_100g': 'float64',
    'caffeine_100g': 'float64',
    'ph_100g': 'float64',
    'fruits-vegetables-nuts_100g': 'float64'
}
df_import = pd.read_csv(
    './dataset/openFoodFact.csv',
    sep=';',
    encoding='utf-8',
    dtype=df_dtypes,
    low_memory=False
)
df_import = df_import.iloc[:, 1:]  # drop the unnamed index column written by to_csv
df_import
df_final = df_import.copy()
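# Cast the two grade columns to the 'category' dtype, mapping missing values to an explicit 'unknown' level.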
df_final['nutriscore_grade'] = df_import['nutriscore_grade'].fillna('unknown').astype('category')
df_final['ecoscore_grade_fr'] = df_import['ecoscore_grade_fr'].fillna('unknown').astype('category')
translator = Translator()
lemmatizer = WordNetLemmatizer()
regex_tokenizer = RegexpTokenizer(r"[\w']+")  # tokens are runs of word characters and apostrophes
stop_words = set(stopwords.words('english'))
list_country = [x.name.lower() for x in pycountry.countries]
def lemmatize_text(text):
    # lemmatize as a noun first (the default), then as a verb, so both kinds of inflection are reduced
    noun_lemma = lemmatizer.lemmatize(text)
    return lemmatizer.lemmatize(noun_lemma, pos='v')
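# For example, lemmatize_text('berries') returns 'berry' (noun pass) and
# lemmatize_text('running') returns 'run' (verb pass).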
def clean_tags(text, is_country=False):
    if type(text) is not str:
        return ''
    # split string by comma
    text_list = text.split(',')
    # keep only tags that start with 'en:'
    text_list = [x for x in text_list if x.startswith('en:')]
    # remove the 'en:' prefix
    text_list = [x[3:] for x in text_list]
    if not is_country:
        # split each tag on dashes
        text_list = [x.split('-') for x in text_list]
        # flatten the nested lists
        text_list = [x for y in text_list for x in y]
        # remove stop words
        text_list = [x for x in text_list if x not in stop_words]
        # lemmatize words
        text_list = [lemmatize_text(word) for word in text_list]
    # remove duplicates
    text_list = list(set(text_list))
    return ','.join(text_list)
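# For example, clean_tags('en:organic,fr:bio', False) keeps only the English tag and
# returns 'organic', while clean_tags('en:france', True) returns 'france' as a whole tag.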
def translate_to_en(text):
    return translator.translate(text, dest='en').text
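# Note: googletrans calls an external web service, so translating every row is slow and
# may hit rate limits; translation is therefore left commented out below and is only
# practical on small samples.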
def split_into_words(text):
    if type(text) is not str:
        return ''
    # text = translate_to_en(text)  # uncomment to translate first (only practical on a sample, e.g. 100,000 rows)
    text = text.lower()
    tokens = regex_tokenizer.tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatize_text(token) for token in tokens]
    return ','.join(tokens)
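# For example, split_into_words('Sugar, cocoa butter and vanilla') tokenizes, drops the
# stop word 'and', lemmatizes, and returns 'sugar,cocoa,butter,vanilla'.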
# origins_tags is handled separately below with is_country=True
col_tags_names = [
    'categories_tags',
    'labels_tags',
    'allergens',
    'traces_tags',
    'additives_tags',
    'ingredients_from_palm_oil_tags',
    'ingredients_that_may_be_from_palm_oil_tags',
    'main_category'
]
for name in col_tags_names:
    print(f'Working with column: {name}')
    df_final[name] = df_import[name].apply(lambda x: clean_tags(x, False))
# origins_tags holds country names, so keep each tag whole instead of splitting and lemmatizing it
print('Working with column: origins_tags')
df_final['origins_tags'] = df_import['origins_tags'].apply(lambda x: clean_tags(x, True))
col_text_names = [
    'ingredients_text',
    'pnns_groups_1',
    'pnns_groups_2'
]
for name in col_text_names:
    print(f'Working with column: {name}')
    df_final[name] = df_import[name].apply(lambda x: split_into_words(x))
# execute later
# tmp = df_final[:10000]
# for name in col_text_names:
#     print(f'Working with column: {name}')
#     tmp[name] = tmp[name].apply(lambda x: split_into_words(x))
list(df_final['origins_tags'].unique())
df_final[(df_final['origins_tags'].str.contains('water'))]
# keep only rows with both a known PNNS group and a known Nutri-Score grade
df_without_unknown = df_final[df_final['pnns_groups_1'] != 'unknown']
df_without_unknown = df_without_unknown[df_without_unknown['nutriscore_grade'] != 'unknown']
df_without_unknown
# define data: share of products per PNNS group (in %)
tmp_by_pnns_groups_1 = df_import.groupby(['pnns_groups_1']).code.count()
tmp_pcts = 100 * tmp_by_pnns_groups_1 / tmp_by_pnns_groups_1.sum()
data = tmp_pcts.values
labels = tmp_pcts.index
# define the Seaborn color palette to use
colors = sns.color_palette('pastel', len(data))
# create pie chart
fig = plt.figure(figsize=(12, 8))
plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
plt.show()
# define data: share of products per Nutri-Score grade (in %)
tmp_by_nutriscore = df_final.groupby(['nutriscore_grade']).code.count()
tmp_pcts_2 = 100 * tmp_by_nutriscore / tmp_by_nutriscore.sum()
# create barplot
fig = plt.figure(figsize=(12, 8))
sns.barplot(x=tmp_pcts_2.index, y=tmp_pcts_2.values)
plt.show()