Assignment 4: Natural Language Processing
by William Jönsson & Matthieu Moutot - Group 43
import collections
import random
import itertools
(a) Warmup - Word frequency
10 most frequent words in each language:
# Open the file, strip punctuation, and split the text into words
def open_remove_split(file_name):
    raw_text = open(file_name).read()
    t1 = raw_text.replace('.', '')
    t2 = t1.replace(',', '')
    t3 = t2.replace('\u2019', '\'')  # normalize curly apostrophes to plain ASCII
    t4 = t3.replace('"', '')
    text_split = t4.split()
    return text_split
# English (from the sv-en pair; the most frequent words are similar in the other English files)
text_en_sv = open_remove_split('/work/dat410_europarl/europarl-v7.sv-en.lc.en')
print('English: ', collections.Counter(text_en_sv).most_common(10))
# Swedish
text_sv_en = open_remove_split('/work/dat410_europarl/europarl-v7.sv-en.lc.sv')
print('Swedish: ', collections.Counter(text_sv_en).most_common(10))
# French
text_fr_en = open_remove_split('/work/dat410_europarl/europarl-v7.fr-en.lc.fr')
print('French: ', collections.Counter(text_fr_en).most_common(10))
# German
text_de_en = open_remove_split('/work/dat410_europarl/europarl-v7.de-en.lc.de')
print('German: ', collections.Counter(text_de_en).most_common(10))
Probability that a randomly selected word in the text is "speaker" or "zebra":
# Function to get any word's relative frequency (its probability under a unigram model)
def get_word_freq(text, word):
    occ = collections.Counter(text).get(word)
    if occ is not None:
        return occ / len(text)  # count of the word divided by the total number of tokens
    else:
        return 0
p_speaker = get_word_freq(text_en_sv, 'speaker')
p_zebra = get_word_freq(text_en_sv, 'zebra')
print("Probability to get 'speaker': ", p_speaker)
print("Probability to get 'zebra': ", p_zebra)
(b) Bigram Language Model
class LanguageHandler:
    # Load and normalize a text file
    def __init__(self, file_name):
        raw_text = open(file_name).read()
        t1 = raw_text.replace('.', '')
        t2 = t1.replace(',', '')
        t3 = t2.replace('\u2019', '\'')  # normalize curly apostrophes to plain ASCII
        t4 = t3.replace('"', '')
        self.text = t4

    # Unigram probability of a single word: count / total number of tokens
    def get_word_freq(self, text, word):
        if isinstance(text, str):
            text = text.split()
        occ = collections.Counter(text).get(word)
        if occ is not None:
            return occ / len(text)
        else:
            return 0

    # Probability of a merged bigram token; adding occ back to the denominator
    # restores the original corpus length, since each merge removed one token
    def get_word_pair_freq(self, text, word):
        if isinstance(text, str):
            text = text.split()
        occ = collections.Counter(text).get(word)
        if occ is not None:
            return occ / (len(text) + occ)
        else:
            return 0

    # Conditional probability P(word | prev_word) = P(prev_word word) / P(prev_word)
    def get_pair_freq(self, prev_word, word):
        t1 = self.text
        t2 = t1.replace(prev_word + " " + word, prev_word + word)  # merge each occurrence of the pair into a single token
        t3 = t2.split()
        f_pair = self.get_word_pair_freq(t3, prev_word + word)
        f_prev_word = self.get_word_freq(self.text, prev_word)
        if f_prev_word == 0:
            return 0
        else:
            return f_pair / f_prev_word

    # Probability of a word sequence under the bigram model:
    # P(w1) * P(w2 | w1) * ... * P(wn | w(n-1))
    def get_sequence_prob(self, raw_words):
        if isinstance(raw_words, str):
            words = raw_words.split()
        else:
            words = raw_words
        prob = self.get_word_freq(self.text, words[0])
        for i in range(1, len(words)):
            prob *= self.get_pair_freq(words[i - 1], words[i])
        return prob
# Test on an example
t = LanguageHandler('/work/dat410_europarl/europarl-v7.sv-en.lc.en')
p = t.get_sequence_prob('the commission is')
p
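Each call to get_pair_freq rescans and re-counts the whole corpus, so scoring even a short sentence is slow. Below is a minimal sketch, for comparison only, of an essentially equivalent model that precomputes unigram and bigram counts once:
# Precompute counts once; P(w2 | w1) = count(w1 w2) / count(w1)
class FastBigramModel:
    def __init__(self, file_name):
        words = open_remove_split(file_name)
        self.unigrams = collections.Counter(words)
        self.bigrams = collections.Counter(zip(words, words[1:]))
        self.n = len(words)

    def get_sequence_prob(self, sentence):
        words = sentence.split()
        prob = self.unigrams[words[0]] / self.n
        for w1, w2 in zip(words, words[1:]):
            if self.unigrams[w1] == 0:
                return 0
            prob *= self.bigrams[w1, w2] / self.unigrams[w1]
        return prob
m = FastBigramModel('/work/dat410_europarl/europarl-v7.sv-en.lc.en')
m.get_sequence_prob('the commission is')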
(c) Translation Modeling
Print ten most likely translations for the word "european":
# Load and normalize the english and french source texts, split them into sentences
# (same normalization as open_remove_split above, but keeping one sentence per line)
def open_remove_splitlines(file_name):
    raw_text = open(file_name).read()
    t1 = raw_text.replace('.', '')
    t2 = t1.replace(',', '')
    t3 = t2.replace('\u2019', '\'')  # normalize curly apostrophes to plain ASCII
    t4 = t3.replace('"', '')
    return t4.splitlines()
english_sentences = open_remove_splitlines('/work/dat410_europarl/europarl-v7.fr-en.lc.en')
french_sentences = open_remove_splitlines('/work/dat410_europarl/europarl-v7.fr-en.lc.fr')
c1 = dict()  # pseudocounts c(e), indexed by english word
c2 = dict()  # pseudocounts c(e, f), indexed by (english, french) pair
t = dict()   # translation table t(f | e), indexed by (french, english) pair
# Posterior alignment probability for one word pair in a sentence pair:
# delta(i, j) = t(f_i | e_j) / sum_k t(f_i | e_k)
def delta(i, j, f, e):
    total = 0
    for e_word in e:
        total += t[f[i], e_word]
    return t[f[i], e[j]] / total
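A quick sanity check of delta on a toy sentence pair. The probabilities below are made up for illustration, and the entries are overwritten as soon as IBM_model_1 initializes the table:
# delta normalizes t(f_i | e_k) over the words of the english sentence
t['le', 'the'] = 0.7
t['le', 'black'] = 0.1
t['le', 'cat'] = 0.2
print(delta(0, 0, ['le'], ['the', 'black', 'cat']))  # 0.7 / (0.7 + 0.1 + 0.2) = 0.7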
def IBM_model_1(french_sentences, english_sentences):
    # Initialization: random coefficients for the translation table
    for index in range(len(english_sentences)):
        e = english_sentences[index].split()  # english sentence as a list of words
        f = french_sentences[index].split()   # its french counterpart
        for ek in e:
            for fk in f:
                t[fk, ek] = random.uniform(0, 1)
    for T in range(5):  # number of EM epochs (accuracy improves when increasing T)
        print("T:", T)
        # Reset the pseudocounts to 0
        for index in range(len(english_sentences)):
            e = english_sentences[index].split()
            f = french_sentences[index].split()
            for ek in e:
                c1[ek] = 0
                for fk in f:
                    c2[ek, fk] = 0
        # E-step: accumulate pseudocounts over all sentence pairs.
        # Example: f = "le chat noir", e = "the black cat" yields the word pairs
        # (le, the), (le, black), (le, cat), (chat, the), ...
        for k in range(len(english_sentences)):
            if k % 1000 == 0:
                print("k:", k)
            f = french_sentences[k].split()   # french sentence
            e = english_sentences[k].split()  # its english counterpart
            for i in range(len(f)):      # for each french word
                for j in range(len(e)):  # for each english word
                    d = delta(i, j, f, e)
                    c2[e[j], f[i]] += d
                    c1[e[j]] += d
        # M-step: recompute the translation table from the pseudocounts
        for k in range(len(english_sentences)):
            f = french_sentences[k].split()
            e = english_sentences[k].split()
            for i in range(len(f)):
                for j in range(len(e)):
                    t[f[i], e[j]] = c2[e[j], f[i]] / c1[e[j]]
    return t
table = IBM_model_1(french_sentences, english_sentences)
table['la', 'the']
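After the M-step, t(f | e) is a probability distribution over french words for each fixed english word, so the entries should sum to (approximately) 1. A quick sanity check:
# For a fixed english word, t(f | e) should sum to roughly 1.0 over all french words f
print(sum(p for (f, e), p in table.items() if e == 'the'))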
# Find the 10 most likely translations of the word "european"
keys = table.keys()
keys = sorted(keys, key=lambda x: x[1])  # sort by english word so all 'european' entries are contiguous
has_found_european = False
european = dict()
for k in keys:
    if k[1] == 'european':
        has_found_european = True
        european[k[0]] = table[k]
    elif has_found_european:
        break
sorted_tuples = sorted(european.items(), key=lambda item: item[1], reverse=True)
sorted_tuples[:10]
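Sorting every key just to group the 'european' entries costs O(|t| log |t|); a single pass over the table, shown here as an equivalent variant, does the same job:
# Equivalent single pass: collect every t(f | 'european') entry directly
european = {f: p for (f, e), p in table.items() if e == 'european'}
sorted(european.items(), key=lambda item: item[1], reverse=True)[:10]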
(d) Decoding
# Get the three most likely translations for a given french word
def find_highest_prob_translation_word(t_table, word):
    keys = sorted(t_table.keys(), key=lambda x: x[0])  # sort by french word so all entries for `word` are contiguous
    has_found_word = False
    translations = dict()
    for k in keys:  # k = (french, english)
        if k[0] == word:
            has_found_word = True
            translations[k[1]] = t_table[k]  # translations[english] = t(french | english)
        elif has_found_word:
            break
    sorted_tuples = sorted(translations.items(), key=lambda item: item[1], reverse=True)
    return [pair[0] for pair in sorted_tuples[:3]]
First use the EM translation table to propose candidate English words, then use the bigram language model to find the best sequence of those words.
# Get the top three translations for every word in the french sentence
french_to_be_translated = 'semaine dernière'
f_words = french_to_be_translated.split()
translated_words = list()
for word in f_words:
    translated_words.append(find_highest_prob_translation_word(table, word))
print(translated_words)
# Get the best translation out of all possibilities (every word order and word choice)
l = LanguageHandler('/work/dat410_europarl/europarl-v7.fr-en.lc.en')
max_prob = 0
best_translation = ''
for perm_order in itertools.permutations(translated_words):
    for sentence in itertools.product(*perm_order):
        string = ' '.join(sentence)
        p = l.get_sequence_prob(string)
        if p > max_prob:
            max_prob = p
            best_translation = string
print('The best translation of "', french_to_be_translated, '" is: ', best_translation)
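This exhaustive search scores n! * 3^n candidates for an n-word sentence (only 2! * 3^2 = 18 here), so it is infeasible for longer inputs. Below is a minimal greedy sketch for illustration; note that it keeps the french word order, so it would miss reorderings like "last week":
# Greedy left-to-right decoding: at each position keep only the locally best word
def greedy_decode(lm, candidate_lists):
    sentence = []
    for candidates in candidate_lists:
        best_word, best_p = None, -1
        for w in candidates:
            p = lm.get_sequence_prob(sentence + [w])
            if p > best_p:
                best_word, best_p = w, p
        sentence.append(best_word)
    return ' '.join(sentence)
print(greedy_decode(l, translated_words))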
# Persist the translation table to a file so it can be reloaded without retraining
import pickle
with open("data.pkl", "wb") as a_file:
    pickle.dump(table, a_file)
with open("data.pkl", "rb") as a_file:
    output = pickle.load(a_file)
print(output)