import nltk
import random
from collections import Counter, defaultdict
from nltk.corpus import PlaintextCorpusReader
from nltk import bigrams
from nltk import trigrams
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data] /Users/laustdixenmunck/nltk_data...
[nltk_data] Package punkt is already up-to-date!
nltk.download('gutenberg')
emma_sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
emma_words = nltk.corpus.gutenberg.words('austen-emma.txt')
[nltk_data] Downloading package gutenberg to
[nltk_data] /Users/laustdixenmunck/nltk_data...
[nltk_data] Package gutenberg is already up-to-date!
# All of the models include punctuation tokens like ',' and '.', and they are case-sensitive (e.g. 'The' and 'the' are counted separately)
counts_word = Counter(emma_words)
# counts_sent = Counter(emma_sents)
len(counts_word)
# Checking the most common words
counts_word.most_common(n=10)
# Rescaling each count by the vocabulary size
# NB: dividing by len(counts_word) (the vocabulary size) rather than len(emma_words)
# (the total token count) gives relative values that can exceed 1, as the output shows
for word in counts_word:
    counts_word[word] /= len(counts_word)
print(counts_word.most_common(n=10))
[(',', 1.4663935475611318), ('.', 0.8869542952246832), ('to', 0.6635514018691588), ('the', 0.620151069005249), ('and', 0.5981308411214953), ('of', 0.5478171808987325), ('I', 0.4068621175265651), ('a', 0.3845858404813724), ('was', 0.30533862501600306), ('her', 0.3048265266931251)]
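# The values above are not probabilities (several exceed 1). A minimal sketch of the
# standard normalization, dividing by the total token count instead so the values sum
# to 1 (unigram_probs and total_tokens are new names, kept separate from counts_word):
unigram_probs = Counter(emma_words)
total_tokens = len(emma_words)
for word in unigram_probs:
    unigram_probs[word] /= total_tokens
print(unigram_probs.most_common(n=10))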
# Sampling 100 words from the (rescaled) unigram distribution
text = []
for i in range(100):
    r = random.random()
    accumulator = .0
    for word, freq in counts_word.items():
        accumulator += freq
        if accumulator >= r:
            text.append(word)
            break
print(' '.join(text))
I I I , I Emma I I Emma , by I by by Jane Emma I Emma I I Emma , I , , , , I I by I I Jane Woodhouse I Woodhouse Jane by I , , I I , I I Woodhouse , , Woodhouse I , I , I I I , I , Woodhouse , I I I by , I I Emma , by , , , Emma , I , I , I I , Emma I , , I I I Emma by I Emma Emma , I I ,
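# An equivalent, more compact way to draw the same kind of sample is random.choices,
# which accepts unnormalized weights directly (a sketch, not part of the original loop):
words, weights = zip(*counts_word.items())
print(' '.join(random.choices(words, weights=weights, k=100)))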
# Creating a dictionary mapping each word to the words that come after it, along with their probabilities:
# {(None): { '[' : Probability , ...}, ...}
# An example of this can be seen in the output
b_model = defaultdict(lambda: defaultdict(lambda: 0))
# Building the bigram model by adding 1 each time a pair of words occurs, in the layout described above
for sentence in emma_sents:
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
        b_model[w1][w2] += 1
# Iterating through the model and normalizing the counts into bigram probabilities
for w1 in b_model:
    total_count = float(sum(b_model[w1].values()))
    # total_count is always positive here: w1 only becomes a key once a count is added,
    # so no zero-count guard is needed
    for w2 in b_model[w1]:
        b_model[w1][w2] /= total_count
b_model['VOLUME']
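# Sanity check (a sketch): after normalization, the outgoing probabilities of any
# observed context should sum to (approximately) 1
print(sum(b_model['VOLUME'].values()))
print(sum(b_model[None].values()))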
# Generating a sentence word by word, starting from the left pad (None)
text = [None]
sentence_finished = False
while not sentence_finished:
    r = random.random()
    accumulator = .0
    # Condition on the last token; initially this is the left pad None
    var = b_model[text[-1]]
    for word in var.keys():
        accumulator += var[word]
        if accumulator >= r:
            text.append(word)
            break
    # Sampling the right pad None marks the end of the sentence
    if text[-1:] == [None]:
        sentence_finished = True
print('\nThe generated sentence is: \n', ' '.join([t for t in text if t]))
The generated sentence is:
A few hours without Emma , and there were humble as winter and cheerful -- very well enough to Hartfield , and out and hoped our ball . Woodhouse , and every corner , and sea - betweens , Miss Fairfax ' s sake -- what is a daughter ' s account of an excellent thing . Weston and then , and Harriet , a stronger with a life , and be sure Miss Taylor had ?
# Creating a dictionary mapping each pair of words to the words that come after it, along with their probabilities:
# {(None, None): { '[' : Probability , ...}, ...}
# An example of this can be seen in the output
t_model = defaultdict(lambda: defaultdict(lambda: 0))
# Building the trigram model by adding 1 each time a triple of words occurs, in the layout described above
for sentence in emma_sents:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        t_model[(w1, w2)][w3] += 1
# Iterating through the model and normalizing the counts into trigram probabilities
for w1_w2 in t_model:
    total_count = float(sum(t_model[w1_w2].values()))
    # As for the bigram model, total_count is always positive here
    for w3 in t_model[w1_w2]:
        t_model[w1_w2][w3] /= total_count
t_model['the', 'child']
# Generating a sentence, now conditioning on the last two tokens
text = [None, None]
sentence_finished = False
while not sentence_finished:
    r = random.random()
    accumulator = .0
    for word in t_model[tuple(text[-2:])].keys():
        accumulator += t_model[tuple(text[-2:])][word]
        if accumulator >= r:
            text.append(word)
            break
    # Two right pads in a row mark the end of the sentence
    if text[-2:] == [None, None]:
        sentence_finished = True
print('\nThe generated sentence is: \n', ' '.join([t for t in text if t]))
The generated sentence is:
By doing it , I have not deserved this .
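# The bigram and trigram sampling loops above differ only in how much history they
# condition on. A hedged sketch of one helper covering both; generate_sentence is a
# new name, not part of the original code:
def generate_sentence(model, order):
    # order is 2 for b_model, 3 for t_model; the history starts as the left padding
    pad = [None] * (order - 1)
    text = list(pad)
    while True:
        context = text[-1] if order == 2 else tuple(text[-2:])
        r = random.random()
        accumulator = .0
        for word, prob in model[context].items():
            accumulator += prob
            if accumulator >= r:
                text.append(word)
                break
        # Generation ends once the sampled tokens reach the right padding
        if text[-(order - 1):] == pad:
            return ' '.join(t for t in text if t)

print(generate_sentence(b_model, 2))
print(generate_sentence(t_model, 3))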
def prob_finder(corp_sents, t_l, b_l, u_l):
    prob_lst = []
    for sent in corp_sents:
        t_prob = 1
        b_prob = 1
        u_prob = 1
        # Constructing the n-gram sequences for this sentence
        t_sent_lst = trigrams(sent, pad_right=True, pad_left=True)
        b_sent_lst = bigrams(sent, pad_right=True, pad_left=True)
        u_sent_lst = Counter(sent)
        # Multiplying in the trigram probabilities, skipping unseen trigrams
        for w1, w2, w3 in t_sent_lst:
            if t_model[w1, w2][w3] > 0:
                t_prob *= t_model[w1, w2][w3]
        # Multiplying in the bigram probabilities, skipping unseen bigrams
        for w1, w2 in b_sent_lst:
            if b_model[w1][w2] > 0:
                b_prob *= b_model[w1][w2]
        # Multiplying in the unigram probabilities (relative frequencies within the sentence)
        for w in u_sent_lst:
            u_sent_lst[w] /= len(u_sent_lst)
            if u_sent_lst[w] > 0:
                u_prob *= u_sent_lst[w]
        # Linearly interpolating the three models with the weights t_l, b_l, u_l
        o_prob = (t_prob * t_l) + (b_prob * b_l) + (u_prob * u_l)
        prob_lst.append(o_prob)
    # Calculating the average interpolated probability over all sentences
    avg = sum(prob_lst) / len(prob_lst)
    print('Average for lambdas,', t_l, b_l, u_l, ': ', avg)
prob_finder(emma_sents, 1/3, 1/3, 1/3)
prob_finder(emma_sents, 0.2, 0.4, 0.4)
prob_finder(emma_sents, 0.1, 0.4, 0.6)
prob_finder(emma_sents, 0.1, 0.1, 0.8)
prob_finder(emma_sents, 0.8, 0.1, 0.1)
prob_finder(emma_sents, 0.1, 0.8, 0.1)
prob_finder(emma_sents, 0.2, 0.6, 0.2)
prob_finder(emma_sents, 0.025, 0.025, 0.95)
Average for lambdas, 0.3333333333333333 0.3333333333333333 0.3333333333333333 : 0.0037899369028988666
Average for lambdas, 0.2 0.4 0.4 : 0.004516287200363553
Average for lambdas, 0.1 0.4 0.6 : 0.006717478541791497
Average for lambdas, 0.1 0.1 0.8 : 0.008896515661318913
Average for lambdas, 0.8 0.1 0.1 : 0.0012477108617725385
Average for lambdas, 0.1 0.8 0.1 : 0.0012255841856051869
Average for lambdas, 0.2 0.6 0.2 : 0.002324592493016779
Average for lambdas, 0.025 0.025 0.95 : 0.010537915976525328
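# Note that the weight set (0.1, 0.4, 0.6) above sums to 1.1, so that run is not a
# proper interpolation. A sketch of trying a few more triples that do sum to 1:
for t_l, b_l, u_l in [(0.1, 0.2, 0.7), (0.2, 0.2, 0.6), (0.05, 0.05, 0.9)]:
    prob_finder(emma_sents, t_l, b_l, u_l)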
print(emma_sents[3])
['Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty', '-', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.']
import nltk
import re
import numpy as np
from collections import Counter, defaultdict
from nltk.corpus import PlaintextCorpusReader
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data] /Users/laustdixenmunck/nltk_data...
[nltk_data] Package punkt is already up-to-date!
# Insert the training set for class 1 as a single string
ent = "the actor gives a convincing, charismatic performance as the multifaceted Spielberg gives us a visually spicy and historically accurate real life story His innovative mind entertains us now and will continue to entertain generations to come"
# Insert the number of sentences (Nc) for class 1
ent_nr_sen = 3
# Insert the training set for class 2 as a single string
bor = "Unfortunately, the film has two major flaws, one in the disastrous ending If director actually thought this movie was worth anything His efforts seem fruitless, creates drama where drama shouldn’t be"
# Insert the number of sentences (Nc) for class 2
bor_nr_sen = 3
# Insert the test set
test = 'film is a innovative drama, entertains, but disastrous ending'
# Splitting the sets on spaces
ent_dic = sorted(ent.lower().split(' '))
bor_dic = bor.lower().split(' ')
test_dic = test.lower().split(' ')
print('N ent : ', len(ent_dic),'\n\nN bor : ', len(bor_dic))
# See how many times a given word occurs in each training set (insert the word in x =)
x = 'and'
print("\nEntertaining: the word \"", x, "\" occurs ", ent_dic.count(x))
print("Boring: the word \"", x, "\" occurs ", bor_dic.count(x), '\n')
# Vocabulary: the union of all word types across the two classes
all_dic = ent_dic + bor_dic
all_u = []
for x in all_dic:
    if x not in all_u:
        all_u.append(x)
print('all_dic len : ', len(all_dic),'\nall_u len : ', len(all_u))
tot_nr_sen = ent_nr_sen + bor_nr_sen
N ent : 36
N bor : 31
Entertaining: the word " and " occurs  2
Boring: the word " and " occurs  0
all_dic len : 67
all_u len : 57
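# The membership test in the deduplication loop above is quadratic in the vocabulary
# size; dict.fromkeys gives the same order-preserving result in one line (a sketch,
# all_u_alt is a new name):
all_u_alt = list(dict.fromkeys(all_dic))
print(len(all_u_alt))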
ent_word = nltk.word_tokenize(ent)
bor_word = nltk.word_tokenize(bor)
# Filtering out pure punctuation tokens
nonPunct = re.compile('.*[A-Za-z0-9].*')
ent_fil = [w for w in ent_word if nonPunct.match(w)]
bor_fil = [w for w in bor_word if nonPunct.match(w)]
ent_word = Counter(ent_fil)
bor_word = Counter(bor_fil)
# Intersecting the test tokens with the vocabulary
# NB: because the test string was split on spaces, 'drama,' and 'entertains,' keep their
# commas and never match the vocabulary, so they drop out of the test set
test_set = list(set(all_u) & set(test_dic))
# Add-one style counts: words also present in the class appear twice after concatenation
test_set_ent = list(set(test_set) & set(ent_word))
test_set_ent = test_set_ent + test_set
print(test_set_ent)
test_set_bor = list(set(test_set) & set(bor_word))
test_set_bor = test_set_bor + test_set
print(test_set_bor)
#Count each word:
count_test_set_ent = Counter(test_set_ent)
count_test_set_bor = Counter(test_set_bor)
#Convert to dict
count_test_set_ent_d = dict(count_test_set_ent)
count_test_set_bor_d = dict(count_test_set_bor)
list_ent = list(count_test_set_ent_d.values())
list_bor = list(count_test_set_bor_d.values())
print('\nEntertainment set as dictionary:', count_test_set_ent_d)
print('\nBoring set as dictionary:', count_test_set_bor_d)
print('\n\nProduct of bor: ',np.prod(list_bor))
print('\n\nProduct of ent: ',np.prod(list_ent))
['innovative', 'a', 'innovative', 'disastrous', 'film', 'a', 'ending']
['disastrous', 'ending', 'film', 'innovative', 'disastrous', 'film', 'a', 'ending']
Entertainment set as dictionary: {'innovative': 2, 'a': 2, 'disastrous': 1, 'film': 1, 'ending': 1}
Boring set as dictionary: {'disastrous': 2, 'ending': 2, 'film': 2, 'innovative': 1, 'a': 1}
Product of bor: 8
Product of ent: 4
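# The intersections above implement something close to add-one (Laplace) smoothing:
# each test word w in the vocabulary contributes count(w, class) + 1. A sketch of the
# textbook count-based version (laplace_prob is a hypothetical helper, not from the
# original code); note the set-based version above caps the class count at 1, so e.g.
# 'a', which occurs twice in ent, gets 2 above but would get 3 here:
def laplace_prob(word, class_tokens, vocab_size):
    # (count of word in the class + 1) / (class token count + vocabulary size)
    return (class_tokens.count(word) + 1) / (len(class_tokens) + vocab_size)

print(laplace_prob('disastrous', bor_dic, len(all_u)))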
def TestClass(a, b, c):
    """
    a: the class' sentence count (ent_nr_sen or bor_nr_sen), used for the prior a / tot_nr_sen
    b: the list of smoothed word counts for the class (list_ent or list_bor)
    c: the class' token list (ent_dic or bor_dic); len(c) is the class word count Nc
    """
    return (a / tot_nr_sen) * (np.prod(b) / (len(all_u) + len(c)) ** len(b))
P_ent = TestClass(ent_nr_sen, list_ent, ent_dic)
P_bor = TestClass(bor_nr_sen, list_bor, bor_dic)
print('P(ent) = ', P_ent, '\nP(bor) = ', P_bor,'\n')
if P_ent > P_bor:
    print('The model predicts the class Entertainment for the test sentence with a probability of : ', P_ent)
elif P_ent == P_bor:
    print('The model predicts equal probability that the test sentence belongs to either class')
else:
    print('The model predicts the class Boring for the test sentence with a probability of : ', P_bor)
P(ent) = 2.874850419035171e-10
P(bor) = 7.579605994374452e-10
The model predicts the class Boring for the test sentence with a probability of :  7.579605994374452e-10
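# Hand check (a sketch) of the two scores, plugging the printed counts into
# P(c) * prod(count + 1) / (Nc + |V|) ** n, with |V| = 57 and n = 5 test words:
print((3/6) * 4 / (36 + 57)**5)   # P(ent) = 2.874850419035171e-10
print((3/6) * 8 / (31 + 57)**5)   # P(bor) = 7.579605994374452e-10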