NAME = "Kaymin Martin-Burnett"
COLLABORATORS = "Finn Macken, Eric Fairweather"
# Chosen: VERSION 1 - Node + Trie classes
class Node:
"""
This class represents one node of a trie tree.
Parameters
----------
The parameters for the Node class are not predetermined.
However, you will likely need to create one or more of them.
"""
def __init__(self, char):
"""
Parameters
----------
None
"""
self.children = {}
self.word_end = False
self.char = char
class Trie:
"""This class represents the entirety of a trie tree.
Parameters
----------
The parameters for Trie's __init__ are not predetermined.
However, you will likely need one or more of them.
Methods
-------
insert(self, word)
Inserts a word into the trie, creating nodes as required.
lookup(self, word)
Determines whether a given word is present in the trie.
"""
def __init__(self, word_list = None):
"""Creates the Trie instance, inserts initial words if provided.
Parameters
----------
word_list : list
List of strings to be inserted into the trie upon creation.
"""
self.root = Node(None)
# insert each word into tree
for word in word_list:
self.insert(word)
def insert(self, word):
"""Inserts a word into the trie, creating missing nodes on the go.
Parameters
----------
word : str
The word to be inserted into the trie.
"""
word = word.lower()
# only accept clean letters/words
if isinstance(word, str) == False:
return False
node = self.root
# loop through each character of word
for char in word:
# case 1: char already in tree
if char not in node.children:
# add to dictionary
node.children[char] = Node(char)
# case 2: char not in tree so add to tree
node = node.children[char]
# update word_end when all nodes inserted
node.word_end = True
def lookup(self, word):
"""Determines whether a given word is present in the trie.
Parameters
----------
word : str
The word to be looked-up in the trie.
Returns
-------
bool
True if the word is present in trie; False otherwise.
Notes
-----
Your trie should ignore whether a word is capitalized.
E.g. trie.insert('Prague') should lead to trie.lookup('prague') = True
"""
# only accept clean letters/words
if isinstance(word, str) == False:
return False
# transform input to lowercase
word = word.lower()
node = self.root
# loop through each character of word
for char in word:
# checks if char is present in tree
if char in node.children:
node = node.children[char]
else:
return False
# return when last char is reached
return node.word_end
# Here are several tests that have been created for you.
# Remeber that the question asks you to provide several more,
# as well as to justify them.
# This is Namárië, JRRT's elvish poem written in Quenya
wordbank = "Ai! laurië lantar lassi súrinen, yéni unótimë ve rámar aldaron! Yéni ve lintë yuldar avánier mi oromardi lisse-miruvóreva Andúnë pella, Vardo tellumar nu luini yassen tintilar i eleni ómaryo airetári-lírinen. Sí man i yulma nin enquantuva? An sí Tintallë Varda Oiolossëo ve fanyar máryat Elentári ortanë, ar ilyë tier undulávë lumbulë; ar sindanóriello caita mornië i falmalinnar imbë met, ar hísië untúpa Calaciryo míri oialë. Sí vanwa ná, Rómello vanwa, Valimar! Namárië! Nai hiruvalyë Valimar. Nai elyë hiruva. Namárië!".replace("!", "").replace("?", "").replace(".", "").replace(",", "").replace(";", "").split()
trie = Trie(wordbank)
# be careful about capital letters!
assert trie.lookup('oiolossëo') == True
# this is a prefix, but also a word in itself
assert trie.lookup('an') == True
# this is a prefix, but NOT a word
assert trie.lookup('ele') == False
# not in the wordbank
assert trie.lookup('Mithrandir') == False
# Note: There are several ways in which we can condense the text cleaning syntax,
# without repeating the method replace() multiple times,
# but we are leaving it this way for clarity.
# ----- 3 extra tests -----
# test 1 - partial slice of a word
wordbank_2 = "children at the carnival".split()
trie_2 = Trie(wordbank_2)
assert trie_2.lookup('child') == False
assert trie_2.lookup('children') == True
assert trie_2.lookup('carn') == False
assert trie_2.lookup('carnival') == True
# test 2 - special character removal
assert trie.lookup('ai!') == False
assert trie.lookup('ai') == True
# test 3 - blank word
assert trie.lookup('') == False
class Node:
"""
This class represents one node of a trie tree.
Parameters
----------
The parameters for the Node class are not predetermined.
However, you will likely need to create one or more of them.
"""
def __init__(self, char):
"""
Parameters
----------
None
"""
self.children = {}
self.word_end = False
self.char = char
class Trie:
"""This class represents the entirety of a trie tree.
Parameters
----------
The parameters for Trie's __init__ are not predetermined.
However, you will likely need one or more of them.
Methods
-------
insert(self, word)
Inserts a word into the trie, creating nodes as required.
lookup(self, word)
Determines whether a given word is present in the trie.
"""
def __init__(self, word_list = None):
"""Creates the Trie instance, inserts initial words if provided.
Parameters
----------
word_list : list
List of strings to be inserted into the trie upon creation.
"""
self.root = Node('')
self.word_list = word_list
# insert each word into tree
for word in word_list:
self.insert(word)
def insert(self, word):
"""Inserts a word into the trie, creating missing nodes on the go.
Parameters
----------
word : str
The word to be inserted into the trie.
"""
# case 1: root node which is empty; return True
if word == '':
return True
current = self.root
# for eacher character, check if children exist, otherwise return False
for char in word.lower():
if char not in current.children:
current.children[char] = Node(char)
current = current.children[char]
# when no more children exist, must be end of word
current.word_end = True
def lookup(self, word):
"""Determines whether a given word is present in the trie.
Parameters
----------
word : str
The word to be looked-up in the trie.
Returns
-------
bool
True if the word is present in trie; False otherwise.
Notes
-----
Your trie should ignore whether a word is capitalized.
E.g. trie.insert('Prague') should lead to trie.lookup('prague') = True
"""
# only accept clean letters/words
if isinstance(word, str) == False:
return False
# transform input to lowercase
word = word.lower()
node = self.root
# loop through each character of word
for char in word:
# checks if char is present in tree
if char in node.children:
node = node.children[char]
else:
return False
# return when last char is reached
return node.word_end
def alphabetical_list(self):
"""Returns content of tree in alphabetical order
Using a pre_order traversal
Returns
-------
list
list of words inside trie tree sorted alphabetically
"""
# function that builds a sorted list
def sorted_list(obj):
"""Using preorder traversal, builds a sorted list from the leaf node to root
Parameters
----------
obj : dict
dictionary containing words to be traveresed through
Returns
-------
output : array
sorted array of words from trie tree
"""
#initialize a list to store the letters
output = []
# letter node has no children
if obj.children == {}:
return obj.char
# recursive approach - calling sorted_list on each node
for key in obj.children.keys():
output.extend(sorted_list(obj.children[key]))
for i in range(len(output)):
output[i] = obj.char + output[i]
# check when last char of word is reached
if obj.word_end:
output.append(obj.char)
# sort list
return sorted(output)
# recursive call
return sorted_list(self.root)
# intiate the test by uncommenting one of the lines below, depending on your approach
wordbank = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Duis pulvinar. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos hymenaeos. Nunc dapibus tortor vel mi dapibus sollicitudin. Etiam quis quam. Curabitur ligula sapien, pulvinar a vestibulum quis, facilisis vel sapien.".replace(",", "").replace(".", "").split()
trie = Trie(wordbank)
assert trie.alphabetical_list() == ['a','ad','adipiscing','amet','aptent',
'class','consectetuer','conubia',
'curabitur','dapibus','dolor','duis',
'elit','etiam','facilisis','hymenaeos',
'inceptos','ipsum','ligula','litora',
'lorem','mi','nostra','nunc','per',
'pulvinar','quam','quis','sapien',
'sit','sociosqu','sollicitudin','taciti',
'torquent','tortor','vel','vestibulum']
# ----- 3 EXTRA TESTS -----
# Test 1
wordbank_3 = ['child', 'children', 'dren', 'chi']
trie1 = Trie(wordbank_3)
assert trie1.alphabetical_list() == sorted(wordbank_3)
# Test 2
wordbank_3_v2 = [' ']
trie1 = Trie(wordbank_3_v2)
assert trie1.alphabetical_list() == sorted(wordbank_3_v2)
# Test 3
wordbank_3_v3 = ['你','好','吗']
trie1 = Trie(wordbank_3_v2)
assert trie1.alphabetical_list() == sorted(wordbank_3_v2)
import heapq as heapq
class Node:
"""
This class represents one node of a trie tree.
Parameters
----------
The parameters for the Node class are not predetermined.
However, you will likely need to create one or more of them.
"""
def __init__(self, char):
"""
Parameters
----------
None
"""
self.children = {}
self.word_end = False
self.char = char
# initialize counter for dictionary
self.counter = 0
def traversed_word(self):
"""
Returns
-------
word_list : dict
key = word
value = frequency in tree
"""
word_list = {}
# append word to word_list when complete word
if self.word_end == True:
word_list[self.value] = self.counter
# traverse tree to build word from chars
for child in self.children.chars():
child_words = child.traversed_word()
for word in child_words:
word_list[("" if self.value == None else self.value) + word] = child_words[word]
return word_list
class Trie:
"""This class represents the entirety of a trie tree.
Parameters
----------
The parameters for Trie's __init__ are not predetermined.
However, you will likely need one or more of them.
Methods
-------
insert(self, word)
Inserts a word into the trie, creating nodes as required.
lookup(self, word)
Determines whether a given word is present in the trie.
"""
def __init__(self, word_list = None):
"""Creates the Trie instance, inserts initial words if provided.
Parameters
----------
word_list : list
List of strings to be inserted into the trie upon creation.
"""
self.root = Node('')
self.word_list = word_list
#initialize heap
self.heap = []
# insert each word into tree
if word_list:
for word in word_list:
self.insert(word)
def insert(self, word):
"""Inserts a word into the trie, creating missing nodes on the go.
Parameters
----------
word : str
The word to be inserted into the trie.
"""
# case 1: root node which is empty; return True
if word == '':
return True
current = self.root
# for eacher character, check if children exist, otherwise return False
for char in word:
if char not in current.children:
current.children[char] = Node(char)
current = current.children[char]
# when no more children exist, must be end of word
current.word_end = True
# Additional code for heap implementation
# Case 1: In heap = increase counter
in_heap = False
for i in range(len(self.heap)):
if self.heap[i][1] == word:
# max heap
self.heap[i] = (self.heap[i][0]-1, self.heap[i][1])
in_heap = True
# Case 2: not in heap = insert
if not in_heap:
self.heap.append((-1,word))
heapq.heapify(self.heap)
def lookup(self, word):
"""Determines whether a given word is present in the trie.
Parameters
----------
word : str
The word to be looked-up in the trie.
Returns
-------
bool
True if the word is present in trie; False otherwise.
Notes
-----
Your trie should ignore whether a word is capitalized.
E.g. trie.insert('Prague') should lead to trie.lookup('prague') = True
"""
# only accept clean letters/words
if isinstance(word, str) == False:
return False
# transform input to lowercase
word = word.lower()
node = self.root
# loop through each character of word
for char in word:
# checks if char is present in tree
if char in node.children:
node = node.children[char]
else:
return False
# return when last char is reached
return node.word_end
def alphabetical_list(self):
"""Returns content of tree in alphabetical order
Using a pre_order traversal
Returns
-------
list
list of words inside trie tree sorted alphabetically
"""
# helper function that builds a sorted list
def sorted_list(obj):
"""Using preorder traversal, builds a sorted list from the leaf node to root
Parameters
----------
obj : dict
dictionary containing words to be traveresed through
Returns
-------
output : array
sorted array of words from trie tree
"""
# initialize a list to store the letters
output = []
# letter node has no children
if obj.children == {}:
return obj.char
# recursive approach - calling sorted_list on each node
for key in obj.children.keys():
output.extend(sorted_list(obj.children[key]))
for i in range(len(output)):
output[i] = obj.char + output[i]
# check when last char of word is reached
if obj.word_end:
output.append(obj.char)
# sort list
return sorted(output)
# recursive call
return sorted_list(self.root)
def k_most_common(self, k):
"""Finds k words inserted into the trie most often.
You will have to tweak some properties of your existing code,
so that it captures information about repeated insertion.
Parameters
----------
k : int
Number of most common words to be returned.
Returns
----------
list
List of tuples.
Each tuple entry consists of the word and its frequency.
The entries are sorted by frequency.
Example
-------
>>> print(trie.k_most_common(3))
[(‘the’, 154), (‘a’, 122), (‘i’, 122)]
I.e. the word ‘the’ has appeared 154 times in the inserted text.
The second and third most common words both appeared 122 times.
"""
# intialize frequency list
k_frequency = []
# dictionary where key = word and value = frequency
word_list_dict = self.root.traversed_word()
# create tuples where counter is negative because max heap
counter_list = [(-v, k) for k, v in word_list_dict.items()]
heapq.heapify(counter_list)
# pop to list containing frequencies
for i in range(k):
ele = hq.heappop(counter_list
return k_frequency
# Note
# you might have to run 'pip install requests' before running this cell
# since you're downloading data from an online resource
# please note this might take a while to run
# Mehreen Faruqi - Black Lives Matter in Australia: https://bit.ly/CS110-Faruqi
# John F. Kennedy - The decision to go to the Moon: https://bit.ly/CS110-Kennedy
# Martin Luther King Jr. - I have a dream: https://bit.ly/CS110-King
# Greta Thunberg - UN Climate Summit message: https://bit.ly/CS110-Thunberg
# Vaclav Havel - Address to US Congress after the fall of Soviet Union: https://bit.ly/CS110-Havel
from requests import get
speakers = ['Faruqi', 'Kennedy', 'King', 'Thunberg', 'Havel']
bad_chars = [';', ',', '.', '?', '!', '_',
'[', ']', ':', '“', '”', '"', '–', '-']
for speaker in speakers:
# download and clean up the speech from extra characters
speech_full = get(f'https://bit.ly/CS110-{speaker}').text
just_text = ''.join(c for c in speech_full if c not in bad_chars)
without_newlines = ''.join(c if (c not in ['\n', '\r', '\t']) else " " for c in just_text)
just_words = [word for word in without_newlines.split(" ") if word != ""]
trie = Trie(just_words)
if speaker == 'Faruqi':
Faruqi = [('the', 60), ('and', 45), ('to', 39), ('in', 37),
('of', 34), ('is', 25), ('that', 22), ('this', 21),
('a', 20), ('people', 20), ('has', 14), ('are', 13),
('for', 13), ('we', 13), ('have', 12), ('racism', 12),
('black', 11), ('justice', 9), ('lives', 9), ('police', 9)]
assert trie.k_most_common(20) == Faruqi
elif speaker == 'Kennedy':
Kennedy = [('the', 117), ('and', 109), ('of', 93), ('to', 63),
('this', 44), ('in', 43), ('we', 43), ('a', 39),
('be', 30), ('for', 27), ('that', 27), ('as', 26),
('it', 24), ('will', 24), ('new', 22), ('space', 22),
('is', 21), ('all', 15), ('are', 15), ('have', 15), ('our', 15)]
assert trie.k_most_common(21) == Kennedy
elif speaker == 'Havel':
Havel = [('the', 34), ('of', 23), ('and', 20), ('to', 15),
('in', 13), ('a', 12), ('that', 12), ('are', 9),
('we', 9), ('have', 8), ('human', 8), ('is', 8),
('you', 8), ('as', 7), ('for', 7), ('has', 7), ('this', 7),
('be', 6), ('it', 6), ('my', 6), ('our', 6), ('world', 6)]
assert trie.k_most_common(22) == Havel
elif speaker == 'King':
King = [('the', 103), ('of', 99), ('to', 59), ('and', 54), ('a', 37),
('be', 33), ('we', 29), ('will', 27), ('that', 24), ('is', 23),
('in', 22), ('as', 20), ('freedom', 20), ('this', 20),
('from', 18), ('have', 17), ('our', 17), ('with', 16),
('i', 15), ('let', 13), ('negro', 13), ('not', 13), ('one', 13)]
assert trie.k_most_common(23) == King
elif speaker == 'Thunberg':
Thunberg = [('you', 22), ('the', 20), ('and', 16), ('of', 15),
('to', 14), ('are', 10), ('is', 9), ('that', 9),
('be', 8), ('not', 7), ('with', 7), ('i', 6),
('in', 6), ('us', 6), ('a', 5), ('how', 5), ('on', 5),
('we', 5), ('all', 4), ('dare', 4), ('here', 4),
('my', 4), ('people', 4), ('will', 4)]
assert trie.k_most_common(24) == Thunberg
# -----------------------------------------------------------------------
# TODO: I have tried a number of fixes and sought help from TA OH however
# have been unable to fix this error. I know that the dictionary should
# only contain the word and its frequency as the key and value. I have
# tried to find where the dict object has this chars issue but am unable
# -----------------------------------------------------------------------
# depending on your choice of approach,
# add this method to your Node or Trie class
def autocomplete(self, prefix):
"""Finds the most common word with the given prefix.
You might want to reuse some functionality or ideas from Q4.
Parameters
----------
prefix : str
The word part to be “autocompleted”.
Returns
----------
str
The complete, most common word with the given prefix.
Notes
----------
The return char is equal to prefix if there is no valid word in the trie.
The return char is also equal to prefix if prefix is the most common word.
"""
# YOUR CODE HERE
#raise NotImplementedError()
import heapq as hq
class Node:
"""
This class represents one node of a trie tree.
Parameters
----------
The parameters for the Node class are not predetermined.
However, you will likely need to create one or more of them.
"""
def __init__(self, char):
"""
Parameters
----------
None
"""
self.children = {}
self.word_end = False
self.char = char
# initialize counter for dictionary
self.counter = 0
def traversed_word(self):
"""
Returns
-------
word_list : dict
key = word
value = frequency in tree
"""
word_list = {}
# append word to word_list when complete word
if self.word_end == True:
word_list[self.value] = self.counter
# traverse tree to build word from chars
for child in self.children.chars():
child_words = child.traversed_word()
for word in child_words:
word_list[("" if self.value == None else self.value) + word] = child_words[word]
return word_list
class Trie:
"""This class represents the entirety of a trie tree.
Parameters
----------
The parameters for Trie's __init__ are not predetermined.
However, you will likely need one or more of them.
Methods
-------
insert(self, word)
Inserts a word into the trie, creating nodes as required.
lookup(self, word)
Determines whether a given word is present in the trie.
"""
def __init__(self, word_list = None):
"""Creates the Trie instance, inserts initial words if provided.
Parameters
----------
word_list : list
List of strings to be inserted into the trie upon creation.
"""
self.root = Node('')
self.word_list = word_list
#initialize heap
self.heap = []
# insert each word into tree
if word_list:
for word in word_list:
self.insert(word)
def insert(self, word):
"""Inserts a word into the trie, creating missing nodes on the go.
Parameters
----------
word : str
The word to be inserted into the trie.
"""
# case 1: root node which is empty; return True
if word == '':
return True
current = self.root
# for eacher character, check if children exist, otherwise return False
for char in word:
if char not in current.children:
current.children[char] = Node(char)
current = current.children[char]
# when no more children exist, must be end of word
current.word_end = True
# Additional code for heap implementation
# Case 1: In heap = increase counter
in_heap = False
for i in range(len(self.heap)):
if self.heap[i][1] == word:
# max heap
self.heap[i] = (self.heap[i][0]-1, self.heap[i][1])
in_heap = True
# Case 2: not in heap = insert
if not in_heap:
self.heap.append((-1,word))
heapq.heapify(self.heap)
def lookup(self, word):
"""Determines whether a given word is present in the trie.
Parameters
----------
word : str
The word to be looked-up in the trie.
Returns
-------
bool
True if the word is present in trie; False otherwise.
Notes
-----
Your trie should ignore whether a word is capitalized.
E.g. trie.insert('Prague') should lead to trie.lookup('prague') = True
"""
# only accept clean letters/words
if isinstance(word, str) == False:
return False
# transform input to lowercase
word = word.lower()
node = self.root
# loop through each character of word
for char in word:
# checks if char is present in tree
if char in node.children:
node = node.children[char]
else:
return False
# return when last char is reached
return node.word_end
def alphabetical_list(self):
"""Returns content of tree in alphabetical order
Using a pre_order traversal
Returns
-------
list
list of words inside trie tree sorted alphabetically
"""
# helper function that builds a sorted list
def sorted_list(obj):
"""Using preorder traversal, builds a sorted list from the leaf node to root
Parameters
----------
obj : dict
dictionary containing words to be traveresed through
Returns
-------
output : array
sorted array of words from trie tree
"""
# initialize a list to store the letters
output = []
# letter node has no children
if obj.children == {}:
return obj.char
# recursive approach - calling sorted_list on each node
for key in obj.children.keys():
output.extend(sorted_list(obj.children[key]))
for i in range(len(output)):
output[i] = obj.char + output[i]
# check when last char of word is reached
if obj.word_end:
output.append(obj.char)
# sort list
return sorted(output)
# recursive call
return sorted_list(self.root)
def k_most_common(self, k):
"""Finds k words inserted into the trie most often.
You will have to tweak some properties of your existing code,
so that it captures information about repeated insertion.
Parameters
----------
k : int
Number of most common words to be returned.
Returns
----------
list
List of tuples.
Each tuple entry consists of the word and its frequency.
The entries are sorted by frequency.
Example
-------
>>> print(trie.k_most_common(3))
[(‘the’, 154), (‘a’, 122), (‘i’, 122)]
I.e. the word ‘the’ has appeared 154 times in the inserted text.
The second and third most common words both appeared 122 times.
"""
# intialize frequency list
k_frequency = []
# dictionary where key = word and value = frequency
word_list_dict = self.root.traversed_word()
# create tuples where counter is negative because max heap
counter_list = [(-v, k) for k, v in word_list_dict.items()]
heapq.heapify(counter_list)
# pop to list containing freuencies
for i in range(k):
ele = hq.heappop(counter_list
return k_frequency
# depending on your choice of approach, uncomment one of the lines below
# The Complete Works of William Shakespeare is a LARGE book,
# so the code might take a while to run
from requests import get
bad_chars = [';', ',', '.', '?', '!', '1', '2', '3', '4',
'5', '6', '7', '8', '9', '0', '_', '[', ']']
SH_full = get('http://bit.ly/CS110-Shakespeare').text
SH_just_text = ''.join(c for c in SH_full if c not in bad_chars)
SH_without_newlines = ''.join(c if (c not in ['\n', '\r', '\t']) else " " for c in SH_just_text)
SH_just_words = [word for word in SH_without_newlines.split(" ") if word != ""]
SH_trie = Trie(SH_just_words)
assert SH_trie.autocomplete('hist') == 'history'
assert SH_trie.autocomplete('en') == 'enter'
assert SH_trie.autocomplete('cae') == 'caesar'
assert SH_trie.autocomplete('gen') == 'gentleman'
assert SH_trie.autocomplete('pen') == 'pen'
assert SH_trie.autocomplete('tho') == 'thou'
assert SH_trie.autocomplete('pent') == 'pentapolis'
assert SH_trie.autocomplete('petr') == 'petruchio'