import os
import time
import math
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from googletrans import Translator
import pycountry
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('stopwords')
nltk.download('wordnet')
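# depending on the NLTK version, nltk.download('omw-1.4') may also be required by the lemmatizer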
df_chunk = pd.read_csv(
    filepath_or_buffer='/datasets/openfoodfact/en.openfoodfacts.org.products.csv',
    sep='\t', encoding='utf-8', chunksize=200000, low_memory=False)
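# With chunksize set, read_csv returns an iterator (TextFileReader) that yields
# 200,000-row DataFrames, so the full dump never has to be loaded into memory at once.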
# general identification columns, cast to string
def generate_general_info(chunk):
    tmp = chunk.loc[:, [
        'code',
        'product_name',
        'generic_name'
    ]]
    tmp['code'] = tmp['code'].astype('str')
    tmp['product_name'] = tmp['product_name'].astype('str')
    tmp['generic_name'] = tmp['generic_name'].astype('str')
    return tmp
# brand, category, label and origin tag columns, cast to string
def generate_tags(chunk):
    tmp = chunk.loc[:, [
        'brands_tags',
        'categories_tags',
        'labels_tags',
        'origins_tags'
    ]]
    tmp['brands_tags'] = tmp['brands_tags'].astype('str')
    tmp['categories_tags'] = tmp['categories_tags'].astype('str')
    tmp['labels_tags'] = tmp['labels_tags'].astype('str')
    tmp['origins_tags'] = tmp['origins_tags'].astype('str')
    return tmp
# ingredient and allergen columns, cast to string
def generate_ingredients(chunk):
    tmp = chunk.loc[:, [
        'ingredients_text',
        'allergens',
        'traces_tags'
    ]]
    tmp['ingredients_text'] = tmp['ingredients_text'].astype('str')
    tmp['allergens'] = tmp['allergens'].astype('str')
    tmp['traces_tags'] = tmp['traces_tags'].astype('str')
    return tmp
# additives, palm-oil flags, scores and grouping columns, cast to string
def generate_misc_data(chunk):
    tmp = chunk.loc[:, [
        'additives_tags',
        'ingredients_from_palm_oil_tags',
        'ingredients_that_may_be_from_palm_oil_tags',
        'nutriscore_grade',
        'pnns_groups_1',
        'pnns_groups_2',
        'brand_owner',
        'ecoscore_grade_fr',
        'main_category'
    ]]
    tmp['additives_tags'] = tmp['additives_tags'].astype('str')
    tmp['ingredients_from_palm_oil_tags'] = tmp['ingredients_from_palm_oil_tags'].astype('str')
    tmp['ingredients_that_may_be_from_palm_oil_tags'] = tmp['ingredients_that_may_be_from_palm_oil_tags'].astype('str')
    tmp['nutriscore_grade'] = tmp['nutriscore_grade'].astype('str')
    tmp['pnns_groups_1'] = tmp['pnns_groups_1'].astype('str')
    tmp['pnns_groups_2'] = tmp['pnns_groups_2'].astype('str')
    tmp['brand_owner'] = tmp['brand_owner'].astype('str')
    tmp['ecoscore_grade_fr'] = tmp['ecoscore_grade_fr'].astype('str')
    tmp['main_category'] = tmp['main_category'].astype('str')
    return tmp
# nutrition facts per 100g, cast to float
def generate_nutrition_fact(chunk):
    tmp = chunk.loc[:, [
        'energy-kcal_100g',
        'proteins_100g',
        'sugars_100g',
        'saturated-fat_100g',
        'omega-3-fat_100g',
        'cholesterol_100g',
        'fiber_100g',
        'sodium_100g',
        'alcohol_100g',
        'vitamin-a_100g',
        'vitamin-d_100g',
        'vitamin-e_100g',
        'vitamin-k_100g',
        'vitamin-c_100g',
        'vitamin-b1_100g',
        'vitamin-b2_100g',
        'vitamin-pp_100g',
        'vitamin-b6_100g',
        'vitamin-b9_100g',
        'vitamin-b12_100g',
        'calcium_100g',
        'magnesium_100g',
        'caffeine_100g',
        'ph_100g',
        'fruits-vegetables-nuts_100g',
    ]]
    for column in tmp.columns:
        tmp[column] = tmp[column].astype('float64')
    return tmp
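# Note: astype('float64') raises if a nutrition column contains a stray non-numeric value.
# A more forgiving variant (a sketch, not what is used above) would coerce bad values to NaN:
#     tmp[column] = pd.to_numeric(tmp[column], errors='coerce')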
def clean_columns(df):
    frames = []
    for chunk in df:
        # keep only products sold in France (countries_tags contains 'en:france')
        french_countries = chunk['countries_tags'].str.contains('en:france', na=False)
        chunk = chunk[french_countries]
        # select and cast the relevant column groups
        general_information = generate_general_info(chunk)
        tags = generate_tags(chunk)
        ingredients = generate_ingredients(chunk)
        misc_data = generate_misc_data(chunk)
        nutrition_fact = generate_nutrition_fact(chunk)
        # reassemble the column groups side by side
        frame = pd.concat([
            general_information,
            tags,
            ingredients,
            misc_data,
            nutrition_fact], axis=1)
        frames.append(frame)
    return pd.concat(frames)
# df_cleaned_columns = clean_columns(df_chunk)
# df_cleaned_columns
# df_mem_size = df_cleaned_columns.memory_usage().sum() / 1000000
# print('Dataframe memory usage:', f'{df_mem_size} MB')
# if not os.path.exists('./dataset'):
#     os.mkdir('./dataset')
# df_cleaned_columns.to_csv('./dataset/openFoodFact.csv', sep=';', encoding='utf-8')
df_dtypes = {
    'code': 'str',
    'product_name': 'str',
    'generic_name': 'str',
    'brands_tags': 'str',
    'categories_tags': 'str',
    'origins_tags': 'str',
    'labels_tags': 'str',
    'ingredients_text': 'str',
    'allergens': 'str',
    'traces_tags': 'str',
    'additives_tags': 'str',
    'ingredients_from_palm_oil_tags': 'str',
    'ingredients_that_may_be_from_palm_oil_tags': 'str',
    'nutriscore_grade': 'str',
    'pnns_groups_1': 'str',
    'pnns_groups_2': 'str',
    'brand_owner': 'str',
    'ecoscore_grade_fr': 'str',
    'main_category': 'str',
    'energy-kcal_100g': 'float64',
    'proteins_100g': 'float64',
    'sugars_100g': 'float64',
    'lactose_100g': 'float64',
    'saturated-fat_100g': 'float64',
    'omega-3-fat_100g': 'float64',
    'cholesterol_100g': 'float64',
    'fiber_100g': 'float64',
    'sodium_100g': 'float64',
    'alcohol_100g': 'float64',
    'vitamin-a_100g': 'float64',
    'vitamin-d_100g': 'float64',
    'vitamin-e_100g': 'float64',
    'vitamin-k_100g': 'float64',
    'vitamin-c_100g': 'float64',
    'vitamin-b1_100g': 'float64',
    'vitamin-b2_100g': 'float64',
    'vitamin-pp_100g': 'float64',
    'vitamin-b6_100g': 'float64',
    'vitamin-b9_100g': 'float64',
    'vitamin-b12_100g': 'float64',
    'calcium_100g': 'float64',
    'magnesium_100g': 'float64',
    'caffeine_100g': 'float64',
    'ph_100g': 'float64',
    'fruits-vegetables-nuts_100g': 'float64'
}
df_import = pd.read_csv(
    './dataset/openFoodFact.csv',
    sep=';',
    encoding='utf-8',
    dtype=df_dtypes,
    low_memory=False
)
df_import = df_import.iloc[:, 1:]  # drop the unnamed index column written by to_csv
df_import
df_final = df_import.copy()
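# Cast the two grade columns to the 'category' dtype, mapping missing values to an explicit 'unknown' level.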
df_final['nutriscore_grade'] = df_import['nutriscore_grade'].fillna('unknown').astype('category')
df_final['ecoscore_grade_fr'] = df_import['ecoscore_grade_fr'].fillna('unknown').astype('category')
translator = Translator()
lemmatizer = WordNetLemmatizer()
regex_tokenizer = RegexpTokenizer(r"[\w']+")  # tokens are runs of word characters and apostrophes
stop_words = set(stopwords.words('english'))
list_country = [x.name.lower() for x in pycountry.countries]
def lemmatize_text(text):
    # lemmatize as a noun first (the default), then as a verb, so both kinds of inflection are reduced
    noun_lemma = lemmatizer.lemmatize(text)
    return lemmatizer.lemmatize(noun_lemma, pos='v')
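# For example, lemmatize_text('berries') returns 'berry' (noun pass) and
# lemmatize_text('running') returns 'run' (verb pass).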
def clean_tags(text, is_country=False):
    if type(text) is not str:
        return ''
    # split string by comma
    text_list = text.split(',')
    # keep only tags that start with 'en:'
    text_list = [x for x in text_list if x.startswith('en:')]
    # remove the 'en:' prefix
    text_list = [x[3:] for x in text_list]
    if not is_country:
        # split each tag on dashes
        text_list = [x.split('-') for x in text_list]
        # flatten the nested lists
        text_list = [x for y in text_list for x in y]
        # remove stop words
        text_list = [x for x in text_list if x not in stop_words]
        # lemmatize words
        text_list = [lemmatize_text(word) for word in text_list]
    # remove duplicates
    text_list = list(set(text_list))
    return ','.join(text_list)
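# For example, clean_tags('en:organic,fr:bio', False) keeps only the English tag and
# returns 'organic', while clean_tags('en:france', True) returns 'france' as a whole tag.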
def translate_to_en(text):
    return translator.translate(text, dest='en').text
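# Note: googletrans calls an external web service, so translating every row is slow and
# may hit rate limits; translation is therefore left commented out below and is only
# practical on small samples.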
def split_into_words(text):
    if type(text) is not str:
        return ''
    # text = translate_to_en(text)  # uncomment to translate first (only practical on a sample, e.g. 100,000 rows)
    text = text.lower()
    tokens = regex_tokenizer.tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatize_text(token) for token in tokens]
    return ','.join(tokens)
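# For example, split_into_words('Sugar, cocoa butter and vanilla') tokenizes, drops the
# stop word 'and', lemmatizes, and returns 'sugar,cocoa,butter,vanilla'.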
# origins_tags is handled separately below with is_country=True
col_tags_names = [
    'categories_tags',
    'labels_tags',
    'allergens',
    'traces_tags',
    'additives_tags',
    'ingredients_from_palm_oil_tags',
    'ingredients_that_may_be_from_palm_oil_tags',
    'main_category'
]
for name in col_tags_names:
    print(f'Working with column: {name}')
    df_final[name] = df_import[name].apply(lambda x: clean_tags(x, False))
# origins_tags holds country names, so keep each tag whole instead of splitting and lemmatizing it
print('Working with column: origins_tags')
df_final['origins_tags'] = df_import['origins_tags'].apply(lambda x: clean_tags(x, True))
col_text_names = [
    'ingredients_text',
    'pnns_groups_1',
    'pnns_groups_2'
]
for name in col_text_names:
    print(f'Working with column: {name}')
    df_final[name] = df_import[name].apply(lambda x: split_into_words(x))
# execute later
# tmp = df_final[:10000]
# for name in col_text_names:
#     print(f'Working with column: {name}')
#     tmp[name] = tmp[name].apply(lambda x: split_into_words(x))
list(df_final['origins_tags'].unique())
df_final[(df_final['origins_tags'].str.contains('water'))]
# keep only rows with both a known PNNS group and a known Nutri-Score grade
df_without_unknown = df_final[df_final['pnns_groups_1'] != 'unknown']
df_without_unknown = df_without_unknown[df_without_unknown['nutriscore_grade'] != 'unknown']
df_without_unknown
# define data: share of products per PNNS group (in %)
tmp_by_pnns_groups_1 = df_import.groupby(['pnns_groups_1']).code.count()
tmp_pcts = 100 * tmp_by_pnns_groups_1 / tmp_by_pnns_groups_1.sum()
data = tmp_pcts.values
labels = tmp_pcts.index
# define the Seaborn color palette to use
colors = sns.color_palette('pastel', len(data))
# create pie chart
fig = plt.figure(figsize=(12, 8))
plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
plt.show()
# define data: share of products per Nutri-Score grade (in %)
tmp_by_nutriscore = df_final.groupby(['nutriscore_grade']).code.count()
tmp_pcts_2 = 100 * tmp_by_nutriscore / tmp_by_nutriscore.sum()
# create barplot
fig = plt.figure(figsize=(12, 8))
sns.barplot(x=tmp_pcts_2.index, y=tmp_pcts_2.values)
plt.show()