import numpy as np
import string
from collections import Counter
import time
import pandas as pd
import random
import pickle
from operator import itemgetter
from itertools import product
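# Load the lowercased Europarl parallel corpora (files assumed to be in the working directory)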
with open('europarl-v7.de-en.lc.de') as f:
lines_de_en_de = f.readlines()
with open('europarl-v7.de-en.lc.en') as f:
lines_de_en_en = f.readlines()
with open('europarl-v7.fr-en.lc.en') as f:
lines_fr_en_en = f.readlines()
with open('europarl-v7.fr-en.lc.fr') as f:
lines_fr_en_fr = f.readlines()
with open('europarl-v7.sv-en.lc.en') as f:
lines_sv_en_en = f.readlines()
with open('europarl-v7.sv-en.lc.sv') as f:
lines_sv_en_sv = f.readlines()
string_de_en_de = ' '.join(lines_de_en_de)
string_de_en_en = ' '.join(lines_de_en_en)
string_fr_en_en = ' '.join(lines_fr_en_en)
string_fr_en_fr = ' '.join(lines_fr_en_fr)
string_sv_en_en = ' '.join(lines_sv_en_en)
string_sv_en_sv = ' '.join(lines_sv_en_sv)
list_de_en_de = string_de_en_de.split(' ')
list_de_en_en = string_de_en_en.split(' ')
list_fr_en_en = string_fr_en_en.split(' ')
list_fr_en_fr = string_fr_en_fr.split(' ')
list_sv_en_en = string_sv_en_en.split(' ')
list_sv_en_sv = string_sv_en_sv.split(' ')
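# Joining the lines with ' ' keeps each line's trailing '\n' attached to its
# last token (e.g. 'session\n'); the filtering below strips those newlines again.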
def filter_tokens(tokens):
    """Drop punctuation, digits, quotes and sentence-final tokens, then strip newlines."""
    unwanted = {"'", '.\n', '!\n', '?\n', '"'}
    kept = [t for t in tokens
            if t not in string.punctuation and not t.isdigit() and t not in unwanted]
    return [t.replace('\n', '') for t in kept]

filtered_list_de_en_de = filter_tokens(list_de_en_de)
filtered_list_de_en_en = filter_tokens(list_de_en_en)
filtered_list_fr_en_en = filter_tokens(list_fr_en_en)
filtered_list_fr_en_fr = filter_tokens(list_fr_en_fr)
filtered_list_sv_en_en = filter_tokens(list_sv_en_en)
filtered_list_sv_en_sv = filter_tokens(list_sv_en_sv)
# Pool the English halves of all three parallel corpora
language = 'en'
word_list = filtered_list_de_en_en + filtered_list_fr_en_en + filtered_list_sv_en_en
cnt = Counter(word_list)
print('Language: ', language)
print(cnt.most_common(10))
c_total = sum(cnt.values())  # total number of tokens
# word_list holds English tokens only, so the non-English variants count 0 here
c_speaker = max([cnt['sprecher'], cnt['sprecherin'], cnt['speaker'], cnt['conférencier'], cnt['högtalare']])
p_speaker = c_speaker / c_total
print('Prob. Speaker: ', p_speaker)
c_zebra = max(cnt['zebra'], cnt['zèbre'])
p_zebra = c_zebra / c_total
print('Prob. Zebra: ', p_zebra, '\n')
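# The estimates above are unigram MLEs: P(w) = count(w) / N, with N the total
# token count. Illustrative numbers only (not from this corpus): with N = 1000
# and count(w) = 2, P(w) = 2 / 1000 = 0.002.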
# Gather all the English data together
string_en = string_de_en_en + string_fr_en_en + string_sv_en_en
list_en = string_en.split(' ')
# Keep '.' this time: the bigram model treats it as a sentence boundary.
filtered_list_en = [t for t in list_en
                    if (t not in string.punctuation or t == '.')
                    and not t.isdigit()
                    and t not in {"'", "'s", '!\n', '?\n', '"'}]
def get_mle_two_following_words(word1, word2, word_list):
"""
Computes des MLE for two following english words, the resulting probability can be used in the bigram model
Input:
word1: first word (string),
word2: following word (string),
word_list: list with all english sentences in the training data, each word seperatly (list of strings)
Output: probability that word1 and word2 follow each other (float)
"""
    counter_first_word = 0
    counter_word_follows = 0
    max_index_word_list = len(word_list) - 1
    if word1 == '<START>':
        for i, word in enumerate(word_list):
            # a sentence starts at position 0 and right after every full stop
            if i == 0:
                counter_first_word += 1
                if word == word2:
                    counter_word_follows += 1
            elif word in ['.', '.\n']:
                counter_first_word += 1
                if i < max_index_word_list and word_list[i + 1] == word2:
                    counter_word_follows += 1
    else:
        for i, word in enumerate(word_list):
            if word == word1:
                counter_first_word += 1
                if i < max_index_word_list and word_list[i + 1] == word2:
                    counter_word_follows += 1
    if counter_first_word == 0:
        mle = 0
    else:
        mle = counter_word_follows / counter_first_word
    return mle
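# Tiny sanity check on a toy token list (illustrative tokens, not Europarl data):
# 'the' occurs twice and is followed by 'cat' once, so P(cat | the) = 1/2.
# There are three counted sentence boundaries (position 0 and the two full
# stops, the final one having no successor), and 'a' starts one of them,
# so P(a | <START>) = 1/3.
_toy = ['the', 'cat', 'sat', '.', 'a', 'cat', 'saw', 'the', 'dog', '.']
assert get_mle_two_following_words('the', 'cat', _toy) == 0.5
assert abs(get_mle_two_following_words('<START>', 'a', _toy) - 1 / 3) < 1e-12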
def get_p_e(sentence, word_list):
"""
Computes P(E) (language model) according to the bigram model
Input:
sentence: English sentence for which the probability should be computed (string),
word_list: Training data with all english sentences seperated by word (list of strings),
Output: The probability P(E) for an english sentence E
"""
# multiply probablities
proba_sentence = 1
split_sentence = ('<START> ' + sentence).split()
    for i in range(0, len(split_sentence) - 1):
        proba_sentence *= get_mle_two_following_words(split_sentence[i], split_sentence[i + 1], word_list)
return proba_sentence
short_sentence = 'i would like your advice about rule concerning inadmissibility'
foreign_sentence = 'i would like your Rat about rule concerning inadmissibility'  # 'Rat' (German for 'advice') never occurs in the English training data
long_sentence = 'what we cannot do commissioner and i would like to end on this note is to give way and give ammunition to those who regard the european institutions as the property of the great and the good of the rich and not of the citizen or the small and medium business which in the worst case scenario which will never become reality because the requisite measures will be applied to prevent it will result in courts which make disparate judgements very late with no real possibility of control apart from what the french call le parcours du combatant that is to say after lord knows how many years when the court in luxembourg passes judgement a court which we know to be inundated at the moment'
proba_short = get_p_e(short_sentence, filtered_list_en)
proba_foreign = get_p_e(foreign_sentence, filtered_list_en)
print("Probablilty of a short sentence:" ,proba_short, '\n')
print("Probablilty of a sentence containing a word that is not in the training data:", proba_foreign, '\n')
start_time = time.time()
proba_long = get_p_e(long_sentence, filtered_list_en)
print("Execution time for calculating the probablilty of a long sentence:", (time.time() - start_time), "s")
print("Probablilty of a long sentence:" ,proba_long)
sentence_list_de = string_de_en_de.split('\n')
sentence_list_en = string_de_en_en.split('\n')
def get_t_dict(sentence_list_en, sentence_list_de):
"""
    Implementation of the Expectation-Maximization algorithm according to the given .pdf file
    Computes the translation probabilities t(f|e) for German words f and English words e
    from the parallel training sentence pairs
    Input:
        sentence_list_en: English sentence list (list of strings)
        sentence_list_de: corresponding German sentence list (list of strings)
    Output: a dict mapping each English word to its German candidates and their translation probabilities t(f|e)
"""
    T = 10  # number of EM iterations
    t_dict = {}
for em_iteration in range(T):
print('EM Iteration:', em_iteration)
# set all counts c(f ,e) and c(e) to 0
c_de_dict = {}
c_e_dict = {}
        for sentence_index in range(len(sentence_list_en) - 1):  # -1 skips the empty string that split('\n') leaves at the end
sentence_en = sentence_list_en[sentence_index]
sentence_de = sentence_list_de[sentence_index]
word_list_en = sentence_en.split(' ')
word_list_de = sentence_de.split(' ')
            unwanted = {"'", '.\n', '!\n', '?\n', '"', '..', '...', '....', '.....'}
            filtered_word_list_en = [t for t in word_list_en
                                     if t not in string.punctuation and not t.isdigit() and t not in unwanted]
            # append the NULL word so that German words can align to "no English word"
            filtered_word_list_en = filtered_word_list_en + ['NULL']
            filtered_word_list_de = [t for t in word_list_de
                                     if t not in string.punctuation and not t.isdigit() and t not in unwanted]
            # E-step: distribute each German word's unit count over the English candidates
for word_de in filtered_word_list_de:
sum_t = 0
for word_en in filtered_word_list_en:
                    if word_en in t_dict:
                        if word_de not in t_dict[word_en]:
                            # initialize unseen pairs with a random t(f|e); EM renormalizes them
                            t_dict[word_en][word_de] = random.uniform(0, 1)
                    else:
                        t_dict[word_en] = {word_de: random.uniform(0, 1)}
                    if word_en not in c_e_dict:
                        c_e_dict[word_en] = 0
                        c_de_dict[word_en] = {word_de: 0}
                    elif word_de not in c_de_dict[word_en]:
                        c_de_dict[word_en][word_de] = 0
t_de_en = t_dict[word_en][word_de]
sum_t += t_de_en
for word_en in filtered_word_list_en:
delta = t_dict[word_en][word_de] / sum_t
c_de_dict[word_en][word_de] += delta
c_e_dict[word_en] += delta
        # M-step: renormalize the expected counts into probabilities
        for word_en in list(t_dict.keys()):
            c_e = c_e_dict[word_en]
            for word_de in list(t_dict[word_en].keys()):
                t_dict[word_en][word_de] = c_de_dict[word_en][word_de] / c_e if c_e > 0 else 0
    with open('t_dict.pkl', 'wb') as output:
        pickle.dump(t_dict, output)
    with open('c_de_dict.pkl', 'wb') as output:
        pickle.dump(c_de_dict, output)
    with open('c_e_dict.pkl', 'wb') as output:
        pickle.dump(c_e_dict, output)
return t_dict
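# Tiny sanity check on a made-up toy corpus (illustrative data only; the
# trailing '' mimics the empty string that split('\n') leaves behind).
# t('haus' | 'house') should come to dominate after the 10 EM iterations;
# exact values depend on the random initialization. Note that this call also
# writes the pickle files, which the real runs below then overwrite.
toy_t = get_t_dict(['the house', 'a house', 'the book', ''],
                   ['das haus', 'ein haus', 'das buch', ''])
print(sorted(toy_t['house'].items(), key=itemgetter(1), reverse=True))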
t_dict = get_t_dict(sentence_list_en, sentence_list_de)
# note: this call overwrites the pickle files written by the run above
t_dict_inversed = get_t_dict(sentence_list_de, sentence_list_en)
word = 'european'
possible_translations = list(t_dict[word].keys())
translation_probas = np.array(list(t_dict[word].values()))
# indices of the 10 largest probabilities, sorted ascending so the loop below
# prints them from 10th to 1st likeliest
ind_max = np.argpartition(translation_probas, -10)[-10:]
ind_max_sorted = ind_max[np.argsort(translation_probas[ind_max])]
for i in range(0,10):
print("Top", 10-i, "likeliest translation:", possible_translations[ind_max_sorted[i]])
print("Estimated likelihood:", translation_probas[ind_max_sorted[i]])
def get_top_n_translations(word_f, n, dict_e):
    """
    Input:
        word_f: German word to be translated (string)
        n: number of desired translations (int)
        dict_e: dictionary containing the english->german translation probabilities t(f|e) (dict)
    Output: dictionary containing the top n translations and the corresponding probabilities t(f|e) (dict)
    Note: candidates are ranked via the global t_dict_inversed, i.e. by t(e|f);
    the reported probability is then looked up in dict_e
    """
    max_probas = {}
    probas = t_dict_inversed[word_f].copy()
    for _ in range(min(n, len(probas))):
        translation = max(probas, key=probas.get)
        proba_translation = dict_e[translation][word_f]
        max_probas[translation] = proba_translation
        probas.pop(translation)
    return max_probas
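# Quick demonstration; 'europäische' is assumed to occur in the lowercased
# German training data (hence the guard).
if 'europäische' in t_dict_inversed:
    print(get_top_n_translations('europäische', 3, t_dict))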
def get_sentence_p_f_e(sentence, dict_e):
    """
    Enumerates candidate English translations for a non-English (German) sentence
    and computes the translation probability P(F|E) for each candidate
    Input:
        sentence: non-English (German) sentence to be translated (string)
        dict_e: dictionary with the english->german translation probabilities t(f|e) (dict)
    Output: dictionary mapping candidate English sentences to P(F|E) (dict)
    """
    split_sentence = sentence.split()
    possible_translations = []
    sentence_p_f_e = {}
    list_of_dicts = []
    for i in range(0, len(split_sentence)):
        possible_translations.append(get_top_n_translations(split_sentence[i], 3, dict_e))
    for i in range(0, len(split_sentence)):
        list_of_dicts.append(possible_translations[i].items())
    # every combination of the top-3 candidates per word forms one candidate sentence
    sentence_combinations = list(product(*list_of_dicts))
    for i in range(0, len(sentence_combinations)):
        candidate = ''
        proba_total = 1
        for word, proba in sentence_combinations[i]:
            candidate = candidate + word + ' '
            proba_total = proba_total * proba
        sentence_p_f_e[candidate] = proba_total
    return sentence_p_f_e
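# Small illustration of the product(...) expansion above with hypothetical
# candidate lists: an n-word input with 3 candidates per word yields 3**n
# candidate sentences.
demo = list(product([('the', 0.5), ('a', 0.3)], [('house', 0.6)]))
print(demo)  # [(('the', 0.5), ('house', 0.6)), (('a', 0.3), ('house', 0.6))]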
def get_total_proba(dict_p_f_e):
"""
    Computes the total probability P(E)P(F|E) for each sentence in the dict
    Input:
        dict_p_f_e: dictionary with possible English translations and the corresponding translation probability P(F|E) (dict)
    Output: dictionary with possible English translations and the corresponding total probability P(E)P(F|E) (dict)
"""
sentence_probas = {}
p_e_dict = {}
sum_p_e = 0
    for sentence_e in list(dict_p_f_e.keys()):
        p_e = get_p_e(sentence_e, filtered_list_en)
        p_e_dict[sentence_e] = p_e
        sum_p_e += p_e
    if sum_p_e == 0:
        # every candidate scored 0 under the language model (unseen bigrams);
        # fall back to ranking by P(F|E) alone
        sentence_probas = dict_p_f_e
    else:
for sentence_e in list(dict_p_f_e.keys()):
total_proba = p_e_dict[sentence_e] * dict_p_f_e[sentence_e]
sentence_probas[sentence_e] = total_proba
return sentence_probas
def translate_german_to_english(german_sentence, t_dict):
    """
    Translates a German sentence to English and prints the resulting probability
    Input:
        german_sentence: German sentence to be translated (string)
        t_dict: dictionary with the english->german translation probabilities t(f|e) (dict)
    """
    sentence_p_f_e = get_sentence_p_f_e(german_sentence, t_dict)
    sentence_probas = get_total_proba(sentence_p_f_e)
    max_prob_sentence = max(sentence_probas, key=sentence_probas.get)
    probability = sentence_probas[max_prob_sentence]
    print('The translation is:', max_prob_sentence)
    print('The achieved probability is:', probability)
test_sentence = 'das ist ein haus'
translate_german_to_english(test_sentence, t_dict)
test_sentence = 'was sind die folgen'
translate_german_to_english(test_sentence, t_dict)
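# Caveat: a German word that never occurs in the training data raises a KeyError
# in get_top_n_translations (the t_dict_inversed lookup); for arbitrary input
# sentences, a guard such as checking `w in t_dict_inversed` for every word
# would be needed before calling translate_german_to_english.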