import numpy as np
import string
from collections import Counter
import time
import pandas as pd
import random
import pickle
from operator import itemgetter
from itertools import product
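# Load the lowercased Europarl parallel corpora (files assumed to be in the working directory)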
with open('europarl-v7.de-en.lc.de') as f:
lines_de_en_de = f.readlines()
with open('europarl-v7.de-en.lc.en') as f:
lines_de_en_en = f.readlines()
with open('europarl-v7.fr-en.lc.en') as f:
lines_fr_en_en = f.readlines()
with open('europarl-v7.fr-en.lc.fr') as f:
lines_fr_en_fr = f.readlines()
with open('europarl-v7.sv-en.lc.en') as f:
lines_sv_en_en = f.readlines()
with open('europarl-v7.sv-en.lc.sv') as f:
lines_sv_en_sv = f.readlines()
string_de_en_de = ' '.join(lines_de_en_de)
string_de_en_en = ' '.join(lines_de_en_en)
string_fr_en_en = ' '.join(lines_fr_en_en)
string_fr_en_fr = ' '.join(lines_fr_en_fr)
string_sv_en_en = ' '.join(lines_sv_en_en)
string_sv_en_sv = ' '.join(lines_sv_en_sv)
list_de_en_de = string_de_en_de.split(' ')
list_de_en_en = string_de_en_en.split(' ')
list_fr_en_en = string_fr_en_en.split(' ')
list_fr_en_fr = string_fr_en_fr.split(' ')
list_sv_en_en = string_sv_en_en.split(' ')
list_sv_en_sv = string_sv_en_sv.split(' ')
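# Joining the lines with ' ' keeps each line's trailing '\n' attached to its
# last token (e.g. 'session\n'); the filtering below strips those newlines again.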
def filter_tokens(tokens):
    """Drop punctuation, digits, quotes and sentence-final tokens, then strip newlines."""
    unwanted = {"'", '.\n', '!\n', '?\n', '"'}
    kept = [t for t in tokens
            if t not in string.punctuation and not t.isdigit() and t not in unwanted]
    return [t.replace('\n', '') for t in kept]

filtered_list_de_en_de = filter_tokens(list_de_en_de)
filtered_list_de_en_en = filter_tokens(list_de_en_en)
filtered_list_fr_en_en = filter_tokens(list_fr_en_en)
filtered_list_fr_en_fr = filter_tokens(list_fr_en_fr)
filtered_list_sv_en_en = filter_tokens(list_sv_en_en)
filtered_list_sv_en_sv = filter_tokens(list_sv_en_sv)
# Pool the English halves of all three parallel corpora
language = 'en'
word_list = filtered_list_de_en_en + filtered_list_fr_en_en + filtered_list_sv_en_en
cnt = Counter(word_list)
print('Language: ', language)
print(cnt.most_common(10))
c_total = sum(cnt.values())  # total number of tokens
# word_list holds English tokens only, so the non-English variants count 0 here
c_speaker = max([cnt['sprecher'], cnt['sprecherin'], cnt['speaker'], cnt['conférencier'], cnt['högtalare']])
p_speaker = c_speaker / c_total
print('Prob. Speaker: ', p_speaker)
c_zebra = max(cnt['zebra'], cnt['zèbre'])
p_zebra = c_zebra / c_total
print('Prob. Zebra: ', p_zebra, '\n')
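# The estimates above are unigram MLEs: P(w) = count(w) / N, with N the total
# token count. Illustrative numbers only (not from this corpus): with N = 1000
# and count(w) = 2, P(w) = 2 / 1000 = 0.002.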
# Gather all the English data together
string_en = string_de_en_en + string_fr_en_en + string_sv_en_en
list_en = string_en.split(' ')
# Keep '.' this time: the bigram model treats it as a sentence boundary.
filtered_list_en = [t for t in list_en
                    if (t not in string.punctuation or t == '.')
                    and not t.isdigit()
                    and t not in {"'", "'s", '!\n', '?\n', '"'}]
def get_mle_two_following_words(word1, word2, word_list):
"""
Computes des MLE for two following english words, the resulting probability can be used in the bigram model
Input:
word1: first word (string),
word2: following word (string),
word_list: list with all english sentences in the training data, each word seperatly (list of strings)
Output: probability that word1 and word2 follow each other (float)
"""
    counter_first_word = 0
    counter_word_follows = 0
    max_index_word_list = len(word_list) - 1
    if word1 == '<START>':
        for i, word in enumerate(word_list):
            # a sentence starts at position 0 and right after every full stop
            if i == 0:
                counter_first_word += 1
                if word == word2:
                    counter_word_follows += 1
            elif word in ['.', '.\n']:
                counter_first_word += 1
                if i < max_index_word_list and word_list[i + 1] == word2:
                    counter_word_follows += 1
    else:
        for i, word in enumerate(word_list):
            if word == word1:
                counter_first_word += 1
                if i < max_index_word_list and word_list[i + 1] == word2:
                    counter_word_follows += 1
    if counter_first_word == 0:
        mle = 0
    else:
        mle = counter_word_follows / counter_first_word
    return mle
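# Tiny sanity check on a toy token list (illustrative tokens, not Europarl data):
# 'the' occurs twice and is followed by 'cat' once, so P(cat | the) = 1/2.
# There are three counted sentence boundaries (position 0 and the two full
# stops, the final one having no successor), and 'a' starts one of them,
# so P(a | <START>) = 1/3.
_toy = ['the', 'cat', 'sat', '.', 'a', 'cat', 'saw', 'the', 'dog', '.']
assert get_mle_two_following_words('the', 'cat', _toy) == 0.5
assert abs(get_mle_two_following_words('<START>', 'a', _toy) - 1 / 3) < 1e-12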
def get_p_e(sentence, word_list):
"""
Computes P(E) (language model) according to the bigram model
Input:
sentence: English sentence for which the probability should be computed (string),
word_list: Training data with all english sentences seperated by word (list of strings),
Output: The probability P(E) for an english sentence E
"""
# multiply probablities
proba_sentence = 1
split_sentence = ('<START> ' + sentence).split()
    for i in range(0, len(split_sentence) - 1):
        proba_sentence *= get_mle_two_following_words(split_sentence[i], split_sentence[i + 1], word_list)
return proba_sentence
short_sentence = 'i would like your advice about rule concerning inadmissibility'
foreign_sentence = 'i would like your Rat about rule concerning inadmissibility'  # 'Rat' (German for 'advice') never occurs in the English training data
long_sentence = 'what we cannot do commissioner and i would like to end on this note is to give way and give ammunition to those who regard the european institutions as the property of the great and the good of the rich and not of the citizen or the small and medium business which in the worst case scenario which will never become reality because the requisite measures will be applied to prevent it will result in courts which make disparate judgements very late with no real possibility of control apart from what the french call le parcours du combatant that is to say after lord knows how many years when the court in luxembourg passes judgement a court which we know to be inundated at the moment'
proba_short = get_p_e(short_sentence, filtered_list_en)
proba_foreign = get_p_e(foreign_sentence, filtered_list_en)
print("Probablilty of a short sentence:" ,proba_short, '\n')
print("Probablilty of a sentence containing a word that is not in the training data:", proba_foreign, '\n')
start_time = time.time()
proba_long = get_p_e(long_sentence, filtered_list_en)
print("Execution time for calculating the probablilty of a long sentence:", (time.time() - start_time), "s")
print("Probablilty of a long sentence:" ,proba_long)
sentence_list_de = string_de_en_de.split('\n')
sentence_list_en = string_de_en_en.split('\n')
def get_t_dict(sentence_list_en, sentence_list_de):
"""
    Implementation of the Expectation-Maximization algorithm according to the given .pdf file
    Computes the translation probabilities t(f|e) for German words f and English words e
    from the parallel training sentence pairs
    Input:
        sentence_list_en: English sentence list (list of strings)
        sentence_list_de: corresponding German sentence list (list of strings)
    Output: a dict mapping each English word to its German candidates and their translation probabilities t(f|e)
"""
    T = 10  # number of EM iterations
    t_dict = {}
for em_iteration in range(T):
print('EM Iteration:', em_iteration)
# set all counts c(f ,e) and c(e) to 0
c_de_dict = {}
c_e_dict = {}
        for sentence_index in range(len(sentence_list_en) - 1):  # -1 skips the empty string that split('\n') leaves at the end
sentence_en = sentence_list_en[sentence_index]
sentence_de = sentence_list_de[sentence_index]
word_list_en = sentence_en.split(' ')
word_list_de = sentence_de.split(' ')
            unwanted = {"'", '.\n', '!\n', '?\n', '"', '..', '...', '....', '.....'}
            filtered_word_list_en = [t for t in word_list_en
                                     if t not in string.punctuation and not t.isdigit() and t not in unwanted]
            # append the NULL word so that German words can align to "no English word"
            filtered_word_list_en = filtered_word_list_en + ['NULL']
            filtered_word_list_de = [t for t in word_list_de
                                     if t not in string.punctuation and not t.isdigit() and t not in unwanted]
            # E-step: distribute each German word's unit count over the English candidates
for word_de in filtered_word_list_de:
sum_t = 0
for word_en in filtered_word_list_en:
                    if word_en in t_dict:
                        if word_de not in t_dict[word_en]:
                            # initialize unseen pairs with a random t(f|e); EM renormalizes them
                            t_dict[word_en][word_de] = random.uniform(0, 1)
                    else:
                        t_dict[word_en] = {word_de: random.uniform(0, 1)}
                    if word_en not in c_e_dict:
                        c_e_dict[word_en] = 0
                        c_de_dict[word_en] = {word_de: 0}
                    elif word_de not in c_de_dict[word_en]:
                        c_de_dict[word_en][word_de] = 0
t_de_en = t_dict[word_en][word_de]
sum_t += t_de_en
for word_en in filtered_word_list_en:
delta = t_dict[word_en][word_de] / sum_t
c_de_dict[word_en][word_de] += delta
c_e_dict[word_en] += delta
        # M-step: renormalize the expected counts into probabilities
        for word_en in list(t_dict.keys()):
            c_e = c_e_dict[word_en]
            for word_de in list(t_dict[word_en].keys()):
                t_dict[word_en][word_de] = c_de_dict[word_en][word_de] / c_e if c_e > 0 else 0
    with open('t_dict.pkl', 'wb') as output:
        pickle.dump(t_dict, output)
    with open('c_de_dict.pkl', 'wb') as output:
        pickle.dump(c_de_dict, output)
    with open('c_e_dict.pkl', 'wb') as output:
        pickle.dump(c_e_dict, output)
return t_dict
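# Tiny sanity check on a made-up toy corpus (illustrative data only; the
# trailing '' mimics the empty string that split('\n') leaves behind).
# t('haus' | 'house') should come to dominate after the 10 EM iterations;
# exact values depend on the random initialization. Note that this call also
# writes the pickle files, which the real runs below then overwrite.
toy_t = get_t_dict(['the house', 'a house', 'the book', ''],
                   ['das haus', 'ein haus', 'das buch', ''])
print(sorted(toy_t['house'].items(), key=itemgetter(1), reverse=True))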
t_dict = get_t_dict(sentence_list_en, sentence_list_de)
# note: this call overwrites the pickle files written by the run above
t_dict_inversed = get_t_dict(sentence_list_de, sentence_list_en)
word = 'european'
possible_translations = list(t_dict[word].keys())
translation_probas = np.array(list(t_dict[word].values()))
# indices of the 10 largest probabilities, sorted ascending so the loop below
# prints them from 10th to 1st likeliest
ind_max = np.argpartition(translation_probas, -10)[-10:]
ind_max_sorted = ind_max[np.argsort(translation_probas[ind_max])]
for i in range(0,10):
print("Top", 10-i, "likeliest translation:", possible_translations[ind_max_sorted[i]])
print("Estimated likelihood:", translation_probas[ind_max_sorted[i]])
def get_top_n_translations(word_f, n, dict_e):
    """
    Input:
        word_f: German word to be translated (string)
        n: number of desired translations (int)
        dict_e: dictionary containing the english->german translation probabilities t(f|e) (dict)
    Output: dictionary containing the top n translations and the corresponding probabilities t(f|e) (dict)
    Note: candidates are ranked via the global t_dict_inversed, i.e. by t(e|f);
    the reported probability is then looked up in dict_e
    """
    max_probas = {}
    probas = t_dict_inversed[word_f].copy()
    for _ in range(min(n, len(probas))):
        translation = max(probas, key=probas.get)
        proba_translation = dict_e[translation][word_f]
        max_probas[translation] = proba_translation
        probas.pop(translation)
    return max_probas
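# Quick demonstration; 'europäische' is assumed to occur in the lowercased
# German training data (hence the guard).
if 'europäische' in t_dict_inversed:
    print(get_top_n_translations('europäische', 3, t_dict))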
def get_sentence_p_f_e(sentence, dict_e):
    """
    Enumerates candidate English translations for a non-English (German) sentence
    and computes the translation probability P(F|E) for each candidate
    Input:
        sentence: non-English (German) sentence to be translated (string)
        dict_e: dictionary with the english->german translation probabilities t(f|e) (dict)
    Output: dictionary mapping candidate English sentences to P(F|E) (dict)
    """
    split_sentence = sentence.split()
    possible_translations = []
    sentence_p_f_e = {}
    list_of_dicts = []
    for i in range(0, len(split_sentence)):
        possible_translations.append(get_top_n_translations(split_sentence[i], 3, dict_e))
    for i in range(0, len(split_sentence)):
        list_of_dicts.append(possible_translations[i].items())
    # every combination of the top-3 candidates per word forms one candidate sentence
    sentence_combinations = list(product(*list_of_dicts))
    for i in range(0, len(sentence_combinations)):
        candidate = ''
        proba_total = 1
        for word, proba in sentence_combinations[i]:
            candidate = candidate + word + ' '
            proba_total = proba_total * proba
        sentence_p_f_e[candidate] = proba_total
    return sentence_p_f_e
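# Small illustration of the product(...) expansion above with hypothetical
# candidate lists: an n-word input with 3 candidates per word yields 3**n
# candidate sentences.
demo = list(product([('the', 0.5), ('a', 0.3)], [('house', 0.6)]))
print(demo)  # [(('the', 0.5), ('house', 0.6)), (('a', 0.3), ('house', 0.6))]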
def get_total_proba(dict_p_f_e):
"""
    Computes the total probability P(E)P(F|E) for each sentence in the dict
    Input:
        dict_p_f_e: dictionary with possible English translations and the corresponding translation probability P(F|E) (dict)
    Output: dictionary with possible English translations and the corresponding total probability P(E)P(F|E) (dict)
"""
sentence_probas = {}
p_e_dict = {}
sum_p_e = 0
    for sentence_e in list(dict_p_f_e.keys()):
        p_e = get_p_e(sentence_e, filtered_list_en)
        p_e_dict[sentence_e] = p_e
        sum_p_e += p_e
    if sum_p_e == 0:
        # every candidate scored 0 under the language model (unseen bigrams);
        # fall back to ranking by P(F|E) alone
        sentence_probas = dict_p_f_e
    else:
for sentence_e in list(dict_p_f_e.keys()):
total_proba = p_e_dict[sentence_e] * dict_p_f_e[sentence_e]
sentence_probas[sentence_e] = total_proba
return sentence_probas
def translate_german_to_english(german_sentence, t_dict):
    """
    Translates a German sentence to English and prints the resulting probability
    Input:
        german_sentence: German sentence to be translated (string)
        t_dict: dictionary with the english->german translation probabilities t(f|e) (dict)
    """
    sentence_p_f_e = get_sentence_p_f_e(german_sentence, t_dict)
    sentence_probas = get_total_proba(sentence_p_f_e)
    max_prob_sentence = max(sentence_probas, key=sentence_probas.get)
    probability = sentence_probas[max_prob_sentence]
    print('The translation is:', max_prob_sentence)
    print('The achieved probability is:', probability)
test_sentence = 'das ist ein haus'
translate_german_to_english(test_sentence, t_dict)
test_sentence = 'was sind die folgen'
translate_german_to_english(test_sentence, t_dict)
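# Caveat: a German word that never occurs in the training data raises a KeyError
# in get_top_n_translations (the t_dict_inversed lookup); for arbitrary input
# sentences, a guard such as checking `w in t_dict_inversed` for every word
# would be needed before calling translate_german_to_english.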