# download spacy language model
!python -m spacy download en_core_web_lg
Collecting en_core_web_lg==2.3.1
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz (782.7 MB)
|████████████████████████████████| 782.7 MB 132.5 MB/s
Requirement already satisfied: spacy<2.4.0,>=2.3.0 in /opt/venv/lib/python3.7/site-packages (from en_core_web_lg==2.3.1) (2.3.2)
Requirement already satisfied: blis<0.5.0,>=0.4.0 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (0.4.1)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (2.0.4)
Requirement already satisfied: setuptools in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (50.3.2)
Requirement already satisfied: plac<1.2.0,>=0.9.6 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (1.1.3)
Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (0.8.0)
Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (1.0.0)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (4.52.0)
Requirement already satisfied: numpy>=1.15.0 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (1.19.4)
Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (1.0.4)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (3.0.4)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (2.25.0)
Requirement already satisfied: thinc==7.4.1 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (7.4.1)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/venv/lib/python3.7/site-packages (from spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (1.0.4)
Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in /opt/venv/lib/python3.7/site-packages (from catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (2.0.0)
Requirement already satisfied: chardet<4,>=3.0.2 in /opt/venv/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /opt/venv/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (2020.11.8)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/venv/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (1.26.2)
Requirement already satisfied: idna<3,>=2.5 in /opt/venv/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (2.10)
Requirement already satisfied: zipp>=0.5 in /opt/venv/lib/python3.7/site-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_lg==2.3.1) (3.4.0)
Building wheels for collected packages: en-core-web-lg
Building wheel for en-core-web-lg (setup.py) ... done
Created wheel for en-core-web-lg: filename=en_core_web_lg-2.3.1-py3-none-any.whl size=782936123 sha256=a2476b400043b49b30186fa1890ae79192b1d461ba0bed048e3d10d91b4b2475
Stored in directory: /tmp/pip-ephem-wheel-cache-24xczx7i/wheels/41/75/77/c4a98e18b2c317a2a13931cbbea7e3ca7f3a21efc36adc1d71
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.3.1
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')
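As a quick sanity check (a minimal sketch, not part of the original notebook output), the freshly downloaded model can be loaded and its pipeline inspected before moving on:
# sanity check: load the model and list its pipeline components
import spacy
nlp_check = spacy.load("en_core_web_lg")
print(nlp_check.pipe_names)  # expected for spaCy 2.3.x: ['tagger', 'parser', 'ner']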
# import libraries
import en_core_web_lg
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
import matplotlib.pyplot as plt
# read in the food csv file
food_df = pd.read_csv("food.csv")
# print row and column information
food_df.head()
# print the size
food_df["description"].size
# disqualify foods containing special characters, lowercase the text and extract the results from the "description" column
foods = food_df[food_df["description"].str.contains("[^a-zA-Z ]") == False]["description"].apply(lambda food: food.lower())
# filter out foods with more than 3 words, drop any duplicates
foods = foods[foods.str.split().apply(len) <= 3].drop_duplicates()
# print the remaining size
foods.size
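To make the filter above concrete, here is a minimal sketch (with made-up example strings) of what the "[^a-zA-Z ]" check keeps and discards:
# hypothetical examples: only descriptions made purely of letters and spaces survive the filter
sample = pd.Series(["Cheddar Cheese", "Yogurt, Greek (2%)", "basmati rice"])
kept = sample[sample.str.contains("[^a-zA-Z ]") == False].apply(lambda s: s.lower())
print(kept.tolist())  # ['cheddar cheese', 'basmati rice']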
# find one-worded, two-worded and three-worded foods
one_worded_foods = foods[foods.str.split().apply(len) == 1]
two_worded_foods = foods[foods.str.split().apply(len) == 2]
three_worded_foods = foods[foods.str.split().apply(len) == 3]
# create a bar plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar([1, 2, 3], [one_worded_foods.size, two_worded_foods.size, three_worded_foods.size])
# label the x-axis instances
ax.set_xticks([1, 2, 3])
ax.set_xticklabels(["one", "two", "three"])
# set the title and the xy-axis labels
plt.title("Number of Words in Food Entities")
plt.xlabel("Number of Words")
plt.ylabel("Food Entities")
# display the plot
plt.show()
# derive a target total so that one-worded foods make up roughly 45% of the final food list
total_num_foods = round(one_worded_foods.size / 45 * 100)
# shuffle the 2-worded and 3-worded foods since we'll be slicing them
two_worded_foods = two_worded_foods.sample(frac=1)
three_worded_foods = three_worded_foods.sample(frac=1)
# append the foods together
foods = one_worded_foods.append(two_worded_foods[:round(total_num_foods * 0.30)]).append(three_worded_foods[:round(total_num_foods * 0.25)])
# print the resulting sizes
for i in range(3):
    print(f"{i+1}-worded food entities:", foods[foods.str.split().apply(len) == i + 1].size)
1-worded food entities: 1258
2-worded food entities: 839
3-worded food entities: 699
food_templates = [
"I ate my {}",
"I'm eating a {}",
"I just ate a {}",
"I only ate the {}",
"I'm done eating a {}",
"I've already eaten a {}",
"I just finished my {}",
"When I was having lunch I ate a {}",
"I had a {} and a {} today",
"I ate a {} and a {} for lunch",
"I made a {} and {} for lunch",
"I ate {} and {}",
"today I ate a {} and a {} for lunch",
"I had {} with my husband last night",
"I brought you some {} on my birthday",
"I made {} for yesterday's dinner",
"last night, a {} was sent to me with {}",
"I had {} yesterday and I'd like to eat it anyway",
"I ate a couple of {} last night",
"I had some {} at dinner last night",
"Last night, I ordered some {}",
"I made a {} last night",
"I had a bowl of {} with {} and I wanted to go to the mall today",
"I brought a basket of {} for breakfast this morning",
"I had a bowl of {}",
"I ate a {} with {} in the morning",
"I made a bowl of {} for my breakfast",
"There's {} for breakfast in the bowl this morning",
"This morning, I made a bowl of {}",
"I decided to have some {} as a little bonus",
"I decided to enjoy some {}",
"I've decided to have some {} for dessert",
"I had a {}, a {} and {} at home",
"I took a {}, {} and {} on the weekend",
"I ate a {} with {} and {} just now",
"Last night, I ate an {} with {} and {}",
"I tasted some {}, {} and {} at the office",
"There's a basket of {}, {} and {} that I consumed",
"I devoured a {}, {} and {}",
"I've already had a bag of {}, {} and {} from the fridge"
]
# create dictionaries to store the generated food combinations. Note that one_food != one_worded_food: one_food could be "barbecue sauce" (a single food spanning two words), whereas one_worded_food would be just "sauce"
TRAIN_FOOD_DATA = {
"one_food": [],
"two_foods": [],
"three_foods": []
}
TEST_FOOD_DATA = {
"one_food": [],
"two_foods": [],
"three_foods": []
}
# the one_food, two_foods and three_foods combinations will each be limited to 167 training sentences
FOOD_SENTENCE_LIMIT = 167
# helper function for deciding what dictionary and subsequent array to append the food sentence on to
def get_food_data(count):
    return {
        1: TRAIN_FOOD_DATA["one_food"] if len(TRAIN_FOOD_DATA["one_food"]) < FOOD_SENTENCE_LIMIT else TEST_FOOD_DATA["one_food"],
        2: TRAIN_FOOD_DATA["two_foods"] if len(TRAIN_FOOD_DATA["two_foods"]) < FOOD_SENTENCE_LIMIT else TEST_FOOD_DATA["two_foods"],
        3: TRAIN_FOOD_DATA["three_foods"] if len(TRAIN_FOOD_DATA["three_foods"]) < FOOD_SENTENCE_LIMIT else TEST_FOOD_DATA["three_foods"],
    }[count]
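A short usage sketch (with the buckets still empty at this point) of how the helper routes sentences: while the train bucket for a given food count is under the limit, new sentences land in TRAIN_FOOD_DATA; afterwards they spill into TEST_FOOD_DATA.
# e.g. with empty buckets, a one-food sentence is routed to the training set
target = get_food_data(1)
print(target is TRAIN_FOOD_DATA["one_food"])  # True while the bucket holds fewer than 167 items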
# the pattern to replace from the template sentences
pattern_to_replace = "{}"
# shuffle the data before starting
foods = foods.sample(frac=1)
# the count that helps us decide when to stop the while loop
food_entity_count = foods.size - 1
# start the while loop, ensure we don't get an index out of bounds error
while food_entity_count >= 2:
    entities = []
    # pick a random food template
    sentence = food_templates[random.randint(0, len(food_templates) - 1)]
    # find out how many braces "{}" need to be replaced in the template
    matches = re.findall(pattern_to_replace, sentence)
    # for each brace, replace it with a food entity from the shuffled food data
    for match in matches:
        food = foods.iloc[food_entity_count]
        food_entity_count -= 1
        # replace the pattern, then locate the food entity we just inserted
        sentence = sentence.replace(match, food, 1)
        match_span = re.search(food, sentence).span()
        # use that match to find the index positions of the food entity in the sentence, then append
        entities.append((match_span[0], match_span[1], "FOOD"))
    # append the sentence and the positions of its entities to the correct dictionary and array
    get_food_data(len(matches)).append((sentence, {"entities": entities}))
# print the number of food sentences, as well as an example sentence
for key in TRAIN_FOOD_DATA:
    print("{} {} sentences: {}".format(len(TRAIN_FOOD_DATA[key]), key, TRAIN_FOOD_DATA[key][0]))
167 one_food sentences: ('Last night, I ordered some bueno', {'entities': [(27, 32, 'FOOD')]})
167 two_foods sentences: ('I ate a tokyo style ramen and a bunuelos dessert for lunch', {'entities': [(8, 25, 'FOOD'), (32, 48, 'FOOD')]})
167 three_foods sentences: ('Last night, I ate an edam with strawberry lozenges and springhill strawberry jam', {'entities': [(21, 25, 'FOOD'), (31, 50, 'FOOD'), (55, 80, 'FOOD')]})
for key in TEST_FOOD_DATA:
    print("{} {} items: {}".format(len(TEST_FOOD_DATA[key]), key, TEST_FOOD_DATA[key][0]))
876 one_food items: ("I've already eaten a eggnog", {'entities': [(21, 27, 'FOOD')]})
191 two_foods items: ('I made a deli chicken salad and organic nutrition bar for lunch', {'entities': [(9, 27, 'FOOD'), (32, 53, 'FOOD')]})
178 three_foods items: ("There's a basket of tahinibar, gumballs and smoothies that I consumed", {'entities': [(20, 29, 'FOOD'), (31, 39, 'FOOD'), (44, 53, 'FOOD')]})
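Because spaCy silently skips entity spans that don't align with token boundaries during training, it can be worth validating the generated character offsets. A minimal sketch (not part of the original notebook) using spaCy 2.x's gold utilities and a blank English tokenizer; a "-" tag would flag a misaligned span:
# check that generated character offsets line up with token boundaries
from spacy.gold import biluo_tags_from_offsets
check_nlp = spacy.blank("en")
sample_sentence, sample_annotations = TRAIN_FOOD_DATA["one_food"][0]
doc = check_nlp(sample_sentence)
print(biluo_tags_from_offsets(doc, sample_annotations["entities"]))  # e.g. ['O', ..., 'U-FOOD']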
# read in the revision data (just used a random article dataset from a different course I had taken)
npr_df = pd.read_csv("npr.csv")
# print row and column information
npr_df.head()
# create an nlp object as we'll use this to separate the sentences and identify existing entities
nlp = en_core_web_lg.load()
revision_texts = []
# convert the articles to spaCy Doc objects to better identify the sentences; disable unneeded pipeline components (takes ~4 minutes)
for doc in nlp.pipe(npr_df["Article"][:6000], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        if 40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace between words, so we trim that
            revision_texts.append(" ".join(re.split(r"\s+", sentence.text, flags=re.UNICODE)))
revisions = []
# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))
# print an example of the revision sentence
print(revisions[0][0])
# print an example of the revision data
print(revisions[0][1])
And in that sense, this year shows little sign of ending on Dec. 31.
{'entities': [(19, 28, 'DATE'), (60, 67, 'DATE')]}
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []
# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}
# a soft limit that helps distribute the entities more evenly (e.g. we don't want 1,000 PERSON entities but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100
# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1
random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]
    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1
    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)
TRAIN_ENTITY_COUNTER
TEST_ENTITY_COUNTER
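The two counter dictionaries above are just displayed raw; a small sketch for viewing them sorted by frequency (purely for readability, assuming the counters have been populated by the loop above):
# print entity counts from most to least frequent
for label, count in sorted(TRAIN_ENTITY_COUNTER.items(), key=lambda item: item[1], reverse=True):
    print(f"{label:<12} {count}")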
# combine the food training data
TRAIN_FOOD_DATA_COMBINED = TRAIN_FOOD_DATA["one_food"] + TRAIN_FOOD_DATA["two_foods"] + TRAIN_FOOD_DATA["three_foods"]
# print the length of the food training data
print("FOOD", len(TRAIN_FOOD_DATA_COMBINED))
# print the length of the revision training data
print("REVISION", len(TRAIN_REVISION_DATA))
# join and print the combined length
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_FOOD_DATA_COMBINED
print("COMBINED", len(TRAIN_DATA))
FOOD 501
REVISION 1490
COMBINED 1991
# get the NER component of the pipeline and add the new label to it
ner = nlp.get_pipe("ner")
ner.add_label("FOOD")
# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
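A quick check (a sketch, not from the original notebook) that the new label has been registered on the NER component and which pipes will stay frozen during training:
# confirm the new label is registered and list the pipes we'll disable
print("FOOD" in ner.labels)   # expected: True
print(other_pipes)            # e.g. ['tagger', 'parser'] for this pipeline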
# start the training loop, only training NER
epochs = 30
optimizer = nlp.resume_training()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    # batch up the examples using spaCy's minibatch
    for epoch in range(epochs):
        examples = TRAIN_DATA
        random.shuffle(examples)
        batches = minibatch(examples, size=sizes)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses ({}/{})".format(epoch + 1, epochs), losses)
Losses (1/30) {'ner': 16073.626560282552}
Losses (2/30) {'ner': 14584.542186669307}
Losses (3/30) {'ner': 14548.422105267644}
Losses (4/30) {'ner': 14300.272813861375}
Losses (5/30) {'ner': 14186.099578664172}
Losses (6/30) {'ner': 14101.325425518793}
Losses (7/30) {'ner': 14000.43209932954}
Losses (8/30) {'ner': 13994.203212552093}
Losses (9/30) {'ner': 14022.144056472462}
Losses (10/30) {'ner': 13901.634234594181}
Losses (11/30) {'ner': 13745.580805012956}
Losses (12/30) {'ner': 13903.877741001314}
Losses (13/30) {'ner': 13817.979523293063}
Losses (14/30) {'ner': 13830.568540076929}
Losses (15/30) {'ner': 13860.916276267613}
Losses (16/30) {'ner': 13778.726491773035}
Losses (17/30) {'ner': 13602.03644076937}
Losses (18/30) {'ner': 13822.509093627625}
Losses (19/30) {'ner': 13501.720368203663}
Losses (20/30) {'ner': 13691.972102675238}
Losses (21/30) {'ner': 13686.256363266613}
Losses (22/30) {'ner': 13478.341214498476}
Losses (23/30) {'ner': 13622.263516337844}
Losses (24/30) {'ner': 13655.980778428377}
Losses (25/30) {'ner': 13632.509101368283}
Losses (26/30) {'ner': 13548.80638261576}
Losses (27/30) {'ner': 13608.923688475392}
Losses (28/30) {'ner': 13581.22338909845}
Losses (29/30) {'ner': 13501.752293019905}
Losses (30/30) {'ner': 13605.513101932593}
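The per-epoch losses above flatten out after the first few epochs; to visualise that, one option is to collect losses["ner"] into a list inside the training loop and plot it afterwards. A sketch assuming a hypothetical epoch_losses list populated that way (the placeholder values are copied from the first epochs printed above):
# hypothetical: epoch_losses would be filled with losses["ner"] once per epoch inside the training loop
epoch_losses = [16073.6, 14584.5, 14548.4]  # placeholder values from the output above
plt.plot(range(1, len(epoch_losses) + 1), epoch_losses)
plt.title("NER Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()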
# display sentence involving original entities
spacy.displacy.render(nlp("Apple is looking at buying U.K. startup for $1 billion"), style="ent")
# display sentences involving target entity
spacy.displacy.render(nlp("I had a hamburger and chips for lunch today."), style="ent")
spacy.displacy.render(nlp("I decided to have chocolate ice cream as a little treat for myself."), style="ent")
spacy.displacy.render(nlp("I ordered basmati rice, leaf spinach and cheese from Tesco yesterday"), style="ent")
# dictionary to hold our evaluation data
food_evaluation = {
"one_food": {
"correct": 0,
"total": 0,
},
"two_foods": {
"correct": 0,
"total": 0
},
"three_foods": {
"correct": 0,
"total": 0
}
}
word_evaluation = {
"1_worded_foods": {
"correct": 0,
"total": 0
},
"2_worded_foods": {
"correct": 0,
"total": 0
},
"3_worded_foods": {
"correct": 0,
"total": 0
}
}
# loop over data from our test food set (3 keys in total)
for key in TEST_FOOD_DATA:
    foods = TEST_FOOD_DATA[key]
    for food in foods:
        # extract the sentence and correct food entities according to our test data
        sentence = food[0]
        entities = food[1]["entities"]
        # for each entity, use our updated model to make a prediction on the sentence
        for entity in entities:
            doc = nlp(sentence)
            correct_text = sentence[entity[0]:entity[1]]
            n_worded_food = len(correct_text.split())
            # if we find that there's a match for predicted entity and predicted text, increment correct counters
            for ent in doc.ents:
                if ent.label_ == entity[2] and ent.text == correct_text:
                    food_evaluation[key]["correct"] += 1
                    if n_worded_food > 0:
                        word_evaluation[f"{n_worded_food}_worded_foods"]["correct"] += 1
                    # this break is important, ensures that we're not double counting on a correct match
                    break
            # increment total counters after each entity loop
            food_evaluation[key]["total"] += 1
            if n_worded_food > 0:
                word_evaluation[f"{n_worded_food}_worded_foods"]["total"] += 1
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]
    print(f"{key}: {correct / total * 100:.2f}%")
food_total_sum = 0
food_correct_sum = 0
print("---")
for key in food_evaluation:
    correct = food_evaluation[key]["correct"]
    total = food_evaluation[key]["total"]
    food_total_sum += total
    food_correct_sum += correct
    print(f"{key}: {correct / total * 100:.2f}%")
print(f"\nTotal: {food_correct_sum/food_total_sum * 100:.2f}%")
1_worded_foods: 91.10%
2_worded_foods: 96.69%
3_worded_foods: 96.88%
---
one_food: 91.44%
two_foods: 94.76%
three_foods: 98.13%
Total: 94.14%
# dictionary which will be populated with the entities and result information
entity_evaluation = {}
# helper function to update the entity_evaluation dictionary
def update_results(entity, metric):
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}
    entity_evaluation[entity][metric] += 1
# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]
    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]
        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                update_results(ent.label_, "correct")
                break
        update_results(entity[2], "total")
sum_total = 0
sum_correct = 0
for entity in entity_evaluation:
    total = entity_evaluation[entity]["total"]
    correct = entity_evaluation[entity]["correct"]
    sum_total += total
    sum_correct += correct
    print("{} | {:.2f}%".format(entity, correct / total * 100))
print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))
PERSON | 80.18%
ORG | 50.84%
DATE | 68.61%
GPE | 82.56%
NORP | 83.61%
CARDINAL | 70.11%
QUANTITY | 79.53%
PERCENT | 88.44%
TIME | 50.88%
FAC | 56.58%
LOC | 68.69%
ORDINAL | 94.53%
MONEY | 84.11%
WORK_OF_ART | 58.78%
PRODUCT | 42.86%
EVENT | 63.46%
LANGUAGE | 91.67%
LAW | 75.00%
Overall accuracy: 71.23%
nlp.meta["name"] = "food_entity_extractor_v2"
nlp.to_disk("./models/v2")