!pip install nltk
# Import relevant packages
import nltk
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
from textblob import TextBlob
# Download corpus
nltk.download('reuters')
nltk.download('punkt')
from nltk.corpus import reuters
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
# Create unigram model
counts = Counter(reuters.words())
total_count = sum(counts.values())  # same total as len(reuters.words()), without re-reading the corpus
# Total number of unigrams
print('Total unigrams in reuters corpus: ', total_count)
# Most common unigrams
print('The 20 most common unigrams are: ')
print(counts.most_common(n=20))
Total unigrams in reuters corpus: 1720901
The 20 most common unigrams are:
[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]
# Compute probabilities
for word in counts:
    counts[word] /= float(total_count)
# Check that probabilities add up to 1
print(sum(counts.values()))
# The most common 20 words are ...
print('The 20 most common unigrams are: ')
print(counts.most_common(n=20))
1.0000000000006808
The 20 most common unigrams are:
[('.', 0.055021758950689205), (',', 0.042047741270415905), ('the', 0.033849129031826936), ('of', 0.02090707135390124), ('to', 0.01977743054365126), ('in', 0.015386126221089999), ('said', 0.014657438167564549), ('and', 0.014552260705293332), ('a', 0.013650988639090802), ('mln', 0.010481137497159917), ('vs', 0.008205004239058494), ('-', 0.007963851494072001), ('for', 0.007429247818439295), ('dlrs', 0.006816196864317006), ("'", 0.0065500572084042025), ('The', 0.0063734055590646994), ('000', 0.0059718717113883945), ('1', 0.00579754442585599), ('s', 0.005402983669600982), ('pct', 0.005283860024487172)]
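# With counts normalised in place, a lookup now reads directly as a
# probability P(word); a couple of spot checks (word choices are arbitrary):
print(counts['economists'])
print(counts['the'] + counts['The'])  # tokenisation is case-sensitive, so these are separate entries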
# Create bigram model
bigram_model = defaultdict(lambda: defaultdict(int))
for sentence in reuters.sents():
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
        bigram_model[w1][w2] += 1
# Compute probabilities
for w1 in bigram_model:
    context_count = float(sum(bigram_model[w1].values()))  # total count of bigrams starting with w1
    for w2 in bigram_model[w1]:
        bigram_model[w1][w2] /= context_count
# Test bigram model
print(bigram_model["the"]["economists"])
print(bigram_model["the"]["nonexistingword"])
print(bigram_model[None]["The"])
0.00013733669808243634
0
0.16154324146501936
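# Quick sanity check: after normalisation, each conditional distribution
# P(. | w1) in the bigram model should sum to approximately 1.
print(sum(bigram_model["the"].values()))   # expect ~1.0
print(sum(bigram_model[None].values()))    # expect ~1.0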
# Create trigram model
trigram_model = defaultdict(lambda: defaultdict(int))
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        trigram_model[(w1, w2)][w3] += 1
# Compute probabilities
for w1_w2 in trigram_model:
    context_count = float(sum(trigram_model[w1_w2].values()))  # total count of trigrams starting with this word pair
    for w3 in trigram_model[w1_w2]:
        trigram_model[w1_w2][w3] /= context_count
# Test trigram model
print(trigram_model["what", "the"]["economists"])
print(trigram_model["what", "the"]["nonexistingword"])
print(trigram_model[None, None]["The"])
0.043478260869565216
0
0.16154324146501936
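# By the chain rule, the probability of an entire sentence under the trigram
# model is the product of its padded trigram probabilities. A minimal sketch
# (trigram_sentence_probability is a new helper, not part of the original code):
def trigram_sentence_probability(sentence):
    # Multiply the conditional probability of each padded trigram;
    # any unseen trigram makes the whole product 0
    prob = 1.0
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        prob *= trigram_model[(w1, w2)][w3]
    return prob

# Any training sentence gets a non-zero probability
print(trigram_sentence_probability(reuters.sents()[0]))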
# Naive approach: 4 sets of values
lambda_set1 = [1/3.0, 1/3.0, 1/3.0]
lambda_set2 = [0.2, 0.4, 0.4]
lambda_set3 = [0.1, 0.1, 0.8]
lambda_set4 = [0.0, 0.0, 1.0]
# Interpolation strategy
def smoothed_trigram_probability(lambdas, trigram):
    # Ensure input is a trigram
    assert len(trigram) == 3
    # Assign lambdas; they should sum to 1 so the result is a valid probability
    lambda1, lambda2, lambda3 = lambdas
    # Extract the words out of the trigram
    u, v, w = trigram
    # Interpolate the unigram, bigram and trigram estimates
    prob = (lambda1 * counts[w]) + \
           (lambda2 * bigram_model[v][w]) + \
           (lambda3 * trigram_model[u, v][w])
    return prob
# Test 4 different lambda values
print(smoothed_trigram_probability(lambda_set1, ["what", "the", "economists"]))
print(smoothed_trigram_probability(lambda_set2, ["what", "the", "economists"]))
print(smoothed_trigram_probability(lambda_set3, ["what", "the", "economists"]))
print(smoothed_trigram_probability(lambda_set4, ["what", "the", "economists"]))
0.014588118950433989
0.017475990883789927
0.03481121829382585
0.043478260869565216
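# The four lambda sets above are guesses. A more principled choice scores each
# candidate by the log-likelihood it assigns to evaluation text and keeps the
# best. Sketch only: it reuses training sentences for brevity, so the
# pure-trigram set will score highest by construction; held-out text is what
# reveals the value of interpolation.
import math

def interpolation_log_likelihood(lambdas, sentences):
    ll = 0.0
    for sentence in sentences:
        for tri in trigrams(sentence, pad_right=True, pad_left=True):
            p = smoothed_trigram_probability(lambdas, tri)
            if p > 0:  # skip zero-probability trigrams to keep the log defined
                ll += math.log(p)
    return ll

sample = reuters.sents()[:100]
for lambdas in [lambda_set1, lambda_set2, lambda_set3, lambda_set4]:
    print(lambdas, interpolation_log_likelihood(lambdas, sample))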
import random
# Generate 100 words by sampling from the unigram distribution
text = []
for i in range(100):
    # Pick a random threshold and walk the cumulative distribution (inverse-CDF sampling)
    r = random.random()
    accumulator = .0
    for word, prob in counts.items():
        accumulator += prob
        # The first word whose cumulative probability crosses the threshold is the sample
        if accumulator >= r:
            text.append(word)
            break
# Join the text list with spaces and print
print(' '.join(text))
JAPAN said North for service Quebec , financial of damages will Upham Shr Leading rise DESPITE reserve ." rains dlrs Bond " debt billion assets the said AG economy ," , 7 resin vs OIL stable year of 28 first combined . , Saudi Industriali . Michigan for profit Net ( Champion and Net city Bankverein , its Commonwealth for by by 10 in dollar that UPDATE England services Broadcasting > removal ; be French mln , removed money deposit 4 they . the the ," WAL tantamount the be will interest 2 difficult ct ' for strongest from filing
# The probability of a text
from operator import mul
from functools import reduce
print('Probability of randomly generated text: ', reduce(mul, [counts[w] for w in text], 1.0))
Probability of randomly generated text: 1.76421564e-316
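# Multiplying 100 small probabilities is already at the edge of float
# underflow (note the e-316 above). The usual fix is to sum log-probabilities
# instead of multiplying raw ones:
import math
print('Log-probability of randomly generated text: ',
      sum(math.log(counts[w]) for w in text))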
# Starting word: the left padding symbol None marks a sentence boundary
text = [None]
sentence_finished = False
while not sentence_finished:
    # Select a random probability threshold
    r = random.random()
    accumulator = .0
    # Walk the cumulative conditional distribution given the latest word
    latest_word = text[-1]
    for word, prob in bigram_model[latest_word].items():
        accumulator += prob
        # The first word whose cumulative probability crosses the threshold is the sample
        if accumulator >= r:
            text.append(word)
            break
    # Sampling the right padding symbol None ends the sentence
    if text[-1] is None:
        sentence_finished = True
# Join the text list with spaces and print
print(' '.join([t for t in text if t]))
Producers Association newsletter said it increased the dollar . 5 pct against & lt ; FINANCIAL CORP 1ST QTR LOSS Oper net purchases at 20 pct in February from 1 mln to enhanced political manuevering to estimate today , it might ignore the past 20
# Starting words: two padding symbols, matching the trigram model's left padding
text = [None, None]
sentence_finished = False
while not sentence_finished:
    # Select a random probability threshold
    r = random.random()
    accumulator = .0
    # Walk the cumulative conditional distribution given the latest two words
    latest_words = tuple(text[-2:])
    for word, prob in trigram_model[latest_words].items():
        accumulator += prob
        # The first word whose cumulative probability crosses the threshold is the sample
        if accumulator >= r:
            text.append(word)
            break
    # Sampling the right padding symbol None ends the sentence
    if text[-1] is None:
        sentence_finished = True
# Join the text list with spaces and print
print(' '.join([t for t in text if t]))
During the year , mainly from Hong Kong ' s fall in international trade houses for renewal of the trading range of questions , Oakley said " I worked out to European Sugar ( France ) and are mostly gold properties in Brazil would not result in retaliation for what the United States Lines ( S . Agriculture Department said deficiency payment savings , the paper was not clear how long the dollar slipped below 150 yen have come down to acceptable levels by the Preferential Trade Area ( PTA ), which is producing within the next repurchase tender to 7 . 3 pct , with exports put between 11 and said the government securities market to arrange two billion in 1986 / 87 03 / 09 / 87 season ( Sept - Aug ), nearly 400 mln stg vs 102 .
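# The bigram and trigram loops above are the same inverse-CDF sampler with
# different context sizes. A generic sketch (generate_sentence is a new
# helper, assuming models keyed exactly as built above: single words for the
# bigram model, 2-tuples for the trigram model):
def generate_sentence(model, order):
    context_size = order - 1
    text = [None] * context_size
    while True:
        r = random.random()
        accumulator = .0
        # The bigram model is keyed by a word, higher orders by a tuple
        context = text[-1] if context_size == 1 else tuple(text[-context_size:])
        for word, prob in model[context].items():
            accumulator += prob
            if accumulator >= r:
                text.append(word)
                break
        # Sampling the right padding symbol None ends the sentence
        if text[-1] is None:
            return ' '.join(t for t in text if t)

print(generate_sentence(bigram_model, 2))
print(generate_sentence(trigram_model, 3))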
e1 = 'the actor gives a convincing, charismatic performance as the multifaceted'
e2 = 'Spielberg gives us a visually spicy and historically accurate real life story'
e3 = 'His innovative mind entertains us now and will continue to entertain generations to come'
b1 = 'Unfortunately, the film has two major flaws, one in the disastrous ending'
b2 = 'If director actually thought this movie was worth anything'
b3 = 'His efforts seem fruitless, creates drama where drama shouldn’t be'
c = [e1, e2, e3, b1, b2, b3]
e = [e1, e2, e3]
b = [b1, b2, b3]
train_e = [[e1, '1'], [e2, '1'], [e3, '1']]
train_b = [[b1, '0'], [b2, '0'], [b3, '0']]
def unique_words(corpus):
    # Collect the distinct tokens across all sentences in the corpus
    vocabulary = set()
    for sentence in corpus:
        vocabulary.update(sentence.split())
    return len(vocabulary)

print("Unique words in whole corpus:", unique_words(c))
print("Unique words in 'entertaining' corpus:", unique_words(e))
print("Unique words in 'boring' corpus:", unique_words(b))
dict_e = unique_words(e)  # vocabulary size of the 'entertaining' corpus
Unique words in whole corpus: 57
Unique words in 'entertaining' corpus: 30
Unique words in 'boring' corpus: 29
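# train_e and train_b above pair each review snippet with a class label, the
# setup for a bag-of-words classifier. A minimal sketch of the natural next
# step, per-class word counts (class_word_counts is a new helper, not part of
# the original code), reusing the same Counter as the unigram model:
def class_word_counts(labelled_corpus):
    # Accumulate word frequencies over every labelled sentence in one class
    word_counts = Counter()
    for sentence, label in labelled_corpus:
        word_counts.update(sentence.split())
    return word_counts

counts_entertaining = class_word_counts(train_e)
counts_boring = class_word_counts(train_b)
print(counts_entertaining.most_common(5))
print(counts_boring.most_common(5))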