Assignment 4: Natural Language Processing
by William Jönsson & Matthieu Moutot - Group 43
import collections
import random
import itertools
(a) Warmup - Word frequency
10 most frequent words in each language:
# Open the file, strip punctuation, and split the text into words
def open_remove_split(file_name):
    raw_text = open(file_name).read()
    t1 = raw_text.replace('.', '')
    t2 = t1.replace(',', '')
    t3 = t2.replace('\u2019', '\'')  # normalize curly apostrophes to plain ASCII
    t4 = t3.replace('"', '')
    text_split = t4.split()
    return text_split
# English (from the sv-en pair; the most frequent words are similar in the other English files)
text_en_sv = open_remove_split('/work/dat410_europarl/europarl-v7.sv-en.lc.en')
print('English: ', collections.Counter(text_en_sv).most_common(10))
# Swedish
text_sv_en = open_remove_split('/work/dat410_europarl/europarl-v7.sv-en.lc.sv')
print('Swedish: ', collections.Counter(text_sv_en).most_common(10))
# French
text_fr_en = open_remove_split('/work/dat410_europarl/europarl-v7.fr-en.lc.fr')
print('French: ', collections.Counter(text_fr_en).most_common(10))
# German
text_de_en = open_remove_split('/work/dat410_europarl/europarl-v7.de-en.lc.de')
print('German: ', collections.Counter(text_de_en).most_common(10))
Probability that a randomly selected word in the text is "speaker" or "zebra":
# Function to get any word's relative frequency (its probability under a unigram model)
def get_word_freq(text, word):
    occ = collections.Counter(text).get(word)
    if occ is not None:
        return occ / len(text)  # count of the word divided by the total number of tokens
    else:
        return 0
p_speaker = get_word_freq(text_en_sv, 'speaker')
p_zebra = get_word_freq(text_en_sv, 'zebra')
print("Probability to get 'speaker': ", p_speaker)
print("Probability to get 'zebra': ", p_zebra)
(b) Bigram Language Model
class LanguageHandler:
    # Load and normalize a text file
    def __init__(self, file_name):
        raw_text = open(file_name).read()
        t1 = raw_text.replace('.', '')
        t2 = t1.replace(',', '')
        t3 = t2.replace('\u2019', '\'')  # normalize curly apostrophes to plain ASCII
        t4 = t3.replace('"', '')
        self.text = t4

    # Unigram probability of a single word: count / total number of tokens
    def get_word_freq(self, text, word):
        if isinstance(text, str):
            text = text.split()
        occ = collections.Counter(text).get(word)
        if occ is not None:
            return occ / len(text)
        else:
            return 0

    # Probability of a merged bigram token; adding occ back to the denominator
    # restores the original corpus length, since each merge removed one token
    def get_word_pair_freq(self, text, word):
        if isinstance(text, str):
            text = text.split()
        occ = collections.Counter(text).get(word)
        if occ is not None:
            return occ / (len(text) + occ)
        else:
            return 0

    # Conditional probability P(word | prev_word) = P(prev_word word) / P(prev_word)
    def get_pair_freq(self, prev_word, word):
        t1 = self.text
        t2 = t1.replace(prev_word + " " + word, prev_word + word)  # merge each occurrence of the pair into a single token
        t3 = t2.split()
        f_pair = self.get_word_pair_freq(t3, prev_word + word)
        f_prev_word = self.get_word_freq(self.text, prev_word)
        if f_prev_word == 0:
            return 0
        else:
            return f_pair / f_prev_word

    # Probability of a word sequence under the bigram model:
    # P(w1) * P(w2 | w1) * ... * P(wn | w(n-1))
    def get_sequence_prob(self, raw_words):
        if isinstance(raw_words, str):
            words = raw_words.split()
        else:
            words = raw_words
        prob = self.get_word_freq(self.text, words[0])
        for i in range(1, len(words)):
            prob *= self.get_pair_freq(words[i - 1], words[i])
        return prob
# Test on an example
t = LanguageHandler('/work/dat410_europarl/europarl-v7.sv-en.lc.en')
p = t.get_sequence_prob('the commission is')
p
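Each call to get_pair_freq rescans and re-counts the whole corpus, so scoring even a short sentence is slow. Below is a minimal sketch, for comparison only, of an essentially equivalent model that precomputes unigram and bigram counts once:
# Precompute counts once; P(w2 | w1) = count(w1 w2) / count(w1)
class FastBigramModel:
    def __init__(self, file_name):
        words = open_remove_split(file_name)
        self.unigrams = collections.Counter(words)
        self.bigrams = collections.Counter(zip(words, words[1:]))
        self.n = len(words)

    def get_sequence_prob(self, sentence):
        words = sentence.split()
        prob = self.unigrams[words[0]] / self.n
        for w1, w2 in zip(words, words[1:]):
            if self.unigrams[w1] == 0:
                return 0
            prob *= self.bigrams[w1, w2] / self.unigrams[w1]
        return prob
m = FastBigramModel('/work/dat410_europarl/europarl-v7.sv-en.lc.en')
m.get_sequence_prob('the commission is')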
(c) Translation Modeling
Print ten most likely translations for the word "european":
# Load and normalize the english and french source texts, split them into sentences
# (same normalization as open_remove_split above, but keeping one sentence per line)
def open_remove_splitlines(file_name):
    raw_text = open(file_name).read()
    t1 = raw_text.replace('.', '')
    t2 = t1.replace(',', '')
    t3 = t2.replace('\u2019', '\'')  # normalize curly apostrophes to plain ASCII
    t4 = t3.replace('"', '')
    return t4.splitlines()
english_sentences = open_remove_splitlines('/work/dat410_europarl/europarl-v7.fr-en.lc.en')
french_sentences = open_remove_splitlines('/work/dat410_europarl/europarl-v7.fr-en.lc.fr')
c1 = dict()  # pseudocounts c(e), indexed by english word
c2 = dict()  # pseudocounts c(e, f), indexed by (english, french) pair
t = dict()   # translation table t(f | e), indexed by (french, english) pair
# Posterior alignment probability for one word pair in a sentence pair:
# delta(i, j) = t(f_i | e_j) / sum_k t(f_i | e_k)
def delta(i, j, f, e):
    total = 0
    for e_word in e:
        total += t[f[i], e_word]
    return t[f[i], e[j]] / total
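A quick sanity check of delta on a toy sentence pair. The probabilities below are made up for illustration, and the entries are overwritten as soon as IBM_model_1 initializes the table:
# delta normalizes t(f_i | e_k) over the words of the english sentence
t['le', 'the'] = 0.7
t['le', 'black'] = 0.1
t['le', 'cat'] = 0.2
print(delta(0, 0, ['le'], ['the', 'black', 'cat']))  # 0.7 / (0.7 + 0.1 + 0.2) = 0.7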
def IBM_model_1(french_sentences, english_sentences):
    # Initialization: random coefficients for the translation table
    for index in range(len(english_sentences)):
        e = english_sentences[index].split()  # english sentence as a list of words
        f = french_sentences[index].split()   # its french counterpart
        for ek in e:
            for fk in f:
                t[fk, ek] = random.uniform(0, 1)
    for T in range(5):  # number of EM epochs (accuracy improves when increasing T)
        print("T:", T)
        # Reset the pseudocounts to 0
        for index in range(len(english_sentences)):
            e = english_sentences[index].split()
            f = french_sentences[index].split()
            for ek in e:
                c1[ek] = 0
                for fk in f:
                    c2[ek, fk] = 0
        # E-step: accumulate pseudocounts over all sentence pairs.
        # Example: f = "le chat noir", e = "the black cat" yields the word pairs
        # (le, the), (le, black), (le, cat), (chat, the), ...
        for k in range(len(english_sentences)):
            if k % 1000 == 0:
                print("k:", k)
            f = french_sentences[k].split()   # french sentence
            e = english_sentences[k].split()  # its english counterpart
            for i in range(len(f)):      # for each french word
                for j in range(len(e)):  # for each english word
                    d = delta(i, j, f, e)
                    c2[e[j], f[i]] += d
                    c1[e[j]] += d
        # M-step: recompute the translation table from the pseudocounts
        for k in range(len(english_sentences)):
            f = french_sentences[k].split()
            e = english_sentences[k].split()
            for i in range(len(f)):
                for j in range(len(e)):
                    t[f[i], e[j]] = c2[e[j], f[i]] / c1[e[j]]
    return t
table = IBM_model_1(french_sentences, english_sentences)
table['la', 'the']
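After the M-step, t(f | e) is a probability distribution over french words for each fixed english word, so the entries should sum to (approximately) 1. A quick sanity check:
# For a fixed english word, t(f | e) should sum to roughly 1.0 over all french words f
print(sum(p for (f, e), p in table.items() if e == 'the'))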
# Find the 10 most likely translations of the word "european"
keys = table.keys()
keys = sorted(keys, key=lambda x: x[1])  # sort by english word so all 'european' entries are contiguous
has_found_european = False
european = dict()
for k in keys:
    if k[1] == 'european':
        has_found_european = True
        european[k[0]] = table[k]
    elif has_found_european:
        break
sorted_tuples = sorted(european.items(), key=lambda item: item[1], reverse=True)
sorted_tuples[:10]
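Sorting every key just to group the 'european' entries costs O(|t| log |t|); a single pass over the table, shown here as an equivalent variant, does the same job:
# Equivalent single pass: collect every t(f | 'european') entry directly
european = {f: p for (f, e), p in table.items() if e == 'european'}
sorted(european.items(), key=lambda item: item[1], reverse=True)[:10]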
(d) Decoding
# Get the three most likely translations for a given french word
def find_highest_prob_translation_word(t_table, word):
    keys = sorted(t_table.keys(), key=lambda x: x[0])  # sort by french word so all entries for `word` are contiguous
    has_found_word = False
    translations = dict()
    for k in keys:  # k = (french, english)
        if k[0] == word:
            has_found_word = True
            translations[k[1]] = t_table[k]  # translations[english] = t(french | english)
        elif has_found_word:
            break
    sorted_tuples = sorted(translations.items(), key=lambda item: item[1], reverse=True)
    return [pair[0] for pair in sorted_tuples[:3]]
First use the EM translation table to propose candidate English words, then use the bigram language model to find the best sequence of those words.
# Get the top three translations for every word in the french sentence
french_to_be_translated = 'semaine dernière'
f_words = french_to_be_translated.split()
translated_words = list()
for word in f_words:
    translated_words.append(find_highest_prob_translation_word(table, word))
print(translated_words)
# Get the best translation out of all possibilities (every word order and word choice)
l = LanguageHandler('/work/dat410_europarl/europarl-v7.fr-en.lc.en')
max_prob = 0
best_translation = ''
for perm_order in itertools.permutations(translated_words):
    for sentence in itertools.product(*perm_order):
        string = ' '.join(sentence)
        p = l.get_sequence_prob(string)
        if p > max_prob:
            max_prob = p
            best_translation = string
print('The best translation of "', french_to_be_translated, '" is: ', best_translation)
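This exhaustive search scores n! * 3^n candidates for an n-word sentence (only 2! * 3^2 = 18 here), so it is infeasible for longer inputs. Below is a minimal greedy sketch for illustration; note that it keeps the french word order, so it would miss reorderings like "last week":
# Greedy left-to-right decoding: at each position keep only the locally best word
def greedy_decode(lm, candidate_lists):
    sentence = []
    for candidates in candidate_lists:
        best_word, best_p = None, -1
        for w in candidates:
            p = lm.get_sequence_prob(sentence + [w])
            if p > best_p:
                best_word, best_p = w, p
        sentence.append(best_word)
    return ' '.join(sentence)
print(greedy_decode(l, translated_words))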
# Persist the translation table to a file so it can be reloaded without retraining
import pickle
with open("data.pkl", "wb") as a_file:
    pickle.dump(table, a_file)
with open("data.pkl", "rb") as a_file:
    output = pickle.load(a_file)
print(output)