# This line downloads the spaCy model to your computer.
# You only need to run it once.
# Once the installation has succeeded, comment this line out.
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
import spacy
from google.colab import files # we will use this module to upload the HP books
import re # the regular expression module; we will use it for pre-processing
import statistics # we will use this to analyze our results (e.g., averaging sentiment scores)
# import en_core_web_lg
# from spacytextblob.spacytextblob import SpacyTextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
nlp = spacy.load("en_core_web_sm")
# Colab only includes the small model.
# For real projects, consider using a larger model with better accuracy (see https://spacy.io/models/en)
nlp.max_length = 1500000 # raise the default length limit so that spaCy accepts a whole book as one text
# initialize the sentiment analyzer; we will use it later
sia = SentimentIntensityAnalyzer()
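# A quick sanity check with an example sentence of our own (not from the books):
# polarity_scores() returns a dict with 'neg', 'neu', 'pos', and 'compound' scores.
print(sia.polarity_scores("The weather was wonderful."))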
uploaded = files.upload()
Saving hp1.txt to hp1.txt
Saving hp2.txt to hp2.txt
Saving hp3.txt to hp3.txt
Saving hp4.txt to hp4.txt
Saving hp5.txt to hp5.txt
Saving hp6.txt to hp6.txt
Saving hp7.txt to hp7.txt
def read_lines_of_file(filename): # you can call this function later using this name
    """
    filename: name of the file we want to read, e.g., 'hp1.txt'
    Open the file and read all of its lines.
    return: list of strings, where each string is one line of the file
    """
    with open(filename) as f:
        content = f.readlines()
    return content
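# A quick way to check that reading works (this assumes hp1.txt has already been uploaded under exactly this name):
# print(read_lines_of_file("hp1.txt")[:5]) # uncomment to show the first five raw lines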
def clean_up_content(content):
    """
    content: list of strings. The strings are the lines of the file.
    The method cleans up the given content. In this case, we remove:
    - the preface,
    - the chapter titles, and
    - all empty lines.
    All other lines are concatenated into one long string (output_text).
    return: a long string containing the content of the book.
    """
    output_text = ""
    for line in content:
        if line.upper() == line:
            # Check whether the line contains only upper-case letters.
            # Such a line is most likely part of the preface or a chapter heading,
            # so we don't want to include it.
            pass # do nothing
        elif re.match(r"^Page | \d+ Harry Potter and .* - J.K. Rowling ", line):
            # With this regular expression we exclude the page footers from our final text.
            # ^ anchors the match at the beginning of the string
            # | means "or", so this pattern matches any line that starts with "Page "
            #   or with the " <number> Harry Potter and ... - J.K. Rowling " footer
            # \d+ stands for a number with at least one but possibly more digits
            # .* stands for any symbols (none or more), including spaces, letters, numbers, etc.
            pass # do nothing
        elif line == "\n": # the line consists of only a line break
            pass # do nothing
        else: # all other lines are the ones we want to keep
            output_text = output_text + line.strip() + " " # concatenate
    return output_text
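# A minimal sketch of what clean_up_content does, using made-up sample lines (not from the real files):
sample_lines = [
    "CHAPTER ONE\n",                                           # all upper case -> dropped
    "Page | 3 Harry Potter and the Sample - J.K. Rowling \n",  # page footer -> dropped
    "\n",                                                      # empty line -> dropped
    "Mr. Dursley was perfectly normal.\n",                     # kept
]
print(clean_up_content(sample_lines)) # prints: Mr. Dursley was perfectly normal.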
def preprocess_data(text, nlp):
    """
    text: long string with the book content
    nlp: spaCy NLP pipeline
    Split the text into sentences and tokens using the spaCy NLP pipeline.
    return: list of sentences, where each sentence is a list of Token objects
    """
    all_sentences = list()
    doc = nlp(text)
    for sentence in doc.sents:
        tokens_of_sentence = list()
        for token in sentence:
            tokens_of_sentence.append(token)
        all_sentences.append(tokens_of_sentence)
    return all_sentences # in the end, we have a list of lists of tokens (not just one long string!)
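# A tiny demonstration with an example text of our own (not from the books):
demo_sentences = preprocess_data("Harry waved. Ron laughed loudly.", nlp)
print(len(demo_sentences))  # number of sentences found
print(demo_sentences[0])    # the first sentence as a list of Token objects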
def read_book(number, nlp): # we pass each book through the spaCy nlp pipeline
    content = read_lines_of_file("hp"+str(number)+".txt") # all our filenames share a common pattern, which makes iterating over the books easier
    text = clean_up_content(content)
    preprocessed_text = nlp(text) # note: this returns a spaCy Doc object, not the list of lists from preprocess_data
    return preprocessed_text
# In this cell, we test our pre-processing on a single book.
# Feel free to play with other books and observe whether the pre-processing works well.
book_content_preprocessed = read_book("1", nlp)
# Looking through the whole book at once can be difficult.
# The colon notation (:) means that we want all tokens from index 0 (the first token) up to, but not including, index 10.
# You can use other numbers instead.
print(book_content_preprocessed[0:10])
# We create a Python dictionary with book numbers as keys and book contents as values.
# Each book is a spaCy Doc: a sequence of sentences, where each sentence is a sequence of tokens.
harry_potter_corpus = dict()
for number in range(1,8): # range excludes its end value, so we write 8 to iterate up to book 7
print("Preprocessing Harry Potter book", number)
harry_potter_corpus[number] = read_book(number, nlp)
Preprocessing Harry Potter book 1
Preprocessing Harry Potter book 2
Preprocessing Harry Potter book 3
Preprocessing Harry Potter book 4
Preprocessing Harry Potter book 5
Preprocessing Harry Potter book 6
Preprocessing Harry Potter book 7
# Don't run this cell -- the output is very long :)
# This line shows the value for key 7, i.e., the whole content of book 7.
# It is a spaCy Doc object.
print(harry_potter_corpus[7])
# This line shows the first sentence of book 7.
# What type of data structure is it?
# You can use ranges with the colon notation to show multiple sentences (cf. the section Combining all methods).
print(list(harry_potter_corpus[7].sents)[0])
The two men appeared out of nowhere, a few yards apart in the narrow, moonlit lane.
# This line shows the first word (as an object) of the first sentence of book 7.
print(list(harry_potter_corpus[7].sents)[0][0])
print(type(list(harry_potter_corpus[7].sents)[0][0]))
The
<class 'spacy.tokens.token.Token'>
# This line shows the text of the first word (as a string) of the first sentence of book 7.
print(list(harry_potter_corpus[7].sents)[0][0].text)
print(type(list(harry_potter_corpus[7].sents)[0][0].text))
The
<class 'str'>
# Let's find a sentence in the second Harry Potter book that includes the word "hated".
# We save the ids to use them in other cells. (If the word occurs several times, the ids of the last occurrence are kept.)
# You can reuse this code to find the ids of whatever you need in a doc produced by spaCy.
for i, sentence in enumerate(harry_potter_corpus[2].sents):
for j, token in enumerate(sentence):
if token.text == "hated":
sentence_id = i
token_id = j
print(sentence_id, token_id)
1568 1
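# The same search wrapped into a reusable function -- a small sketch of our own.
# Unlike the cell above, it returns ALL (sentence_id, token_id) pairs, not only the last one.
def find_token_positions(doc, word):
    positions = list()
    for i, sentence in enumerate(doc.sents):
        for j, token in enumerate(sentence):
            if token.text == word:
                positions.append((i, j))
    return positions
# Example: print(find_token_positions(harry_potter_corpus[2], "hated"))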
print(list(harry_potter_corpus[2].sents)[sentence_id])
Harry hated the disappointment in his voice.
print(list(harry_potter_corpus[2].sents)[sentence_id][token_id])
# What does this line show?
# The right answer is: the token at index 1 (actually, the second token) of the sentence at index 1568 (actually, the 1569th sentence) in book 2
hated
# This line shows the lemma of the word we just found.
print(list(harry_potter_corpus[2].sents)[sentence_id][1].lemma_)
hate
# This line shows the coarse-grained POS tag of the word we just found.
print(list(harry_potter_corpus[2].sents)[sentence_id][1].pos_)
VERB
# This line shows the fine-grained POS tag of the word we just found.
print(list(harry_potter_corpus[2].sents)[sentence_id][1].tag_)
VBD
# This line shows the sentiment of the word we just found.
print(sia.polarity_scores(list(harry_potter_corpus[2].sents)[sentence_id][1].text))
{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.6369}
# If you don't know what happens in this cell, read about len() and dict() in Python.
number_subcorpora = len(harry_potter_corpus)
print(number_subcorpora)
7
# This cell calculates the number of sentences in each book.
# Technically, it goes through the dictionary, takes each value (a spaCy Doc), and counts its sentences.
for name, doc in harry_potter_corpus.items():
number_sentences = len(list(doc.sents))
print("Sentences in Harry Potter", name, number_sentences)
Sentences in Harry Potter 1 7021
Sentences in Harry Potter 2 7158
Sentences in Harry Potter 3 9734
Sentences in Harry Potter 4 15993
Sentences in Harry Potter 5 19777
Sentences in Harry Potter 6 13267
Sentences in Harry Potter 7 16099
# This cell also calculates the number of sentences in each book, but additionally sums them up.
number_sentences_corpus = 0 #This is a numeric (integer) variable
for name, sentences in harry_potter_corpus.items():
number_sentences = len(list(sentences.sents))
number_sentences_corpus += number_sentences
# The previous line is a short notation for:
# number_sentences_corpus = number_sentences_corpus + number_sentences
# It adds the newly calculated number to what is stored in our variable
print("Senteces in all Harry Potter books:", number_sentences_corpus)
Senteces in all Harry Potter books: 89049
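# For comparison, the same total in a single line, using a generator expression and sum():
print(sum(len(list(doc.sents)) for doc in harry_potter_corpus.values()))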
# Do you remember the data structure of our corpus?
# This cell does the same as the previous one, but goes one level deeper.
number_words_corpus = 0
for name, doc in harry_potter_corpus.items():
number_words = len(doc)
number_words_corpus += number_words
print("Number of words in Harry Potter", name, number_words)
print("Number of words in all Harry Potter books", number_words_corpus)
Number of words in Harry Potter 1 99430
Number of words in Harry Potter 2 109569
Number of words in Harry Potter 3 139318
Number of words in Harry Potter 4 248768
Number of words in Harry Potter 5 327350
Number of words in Harry Potter 6 217872
Number of words in Harry Potter 7 253285
Number of words in all Harry Potter books 1395592
# This cell counts the number of types, i.e., distinct token texts, per book and in total.
number_types_corpus = 0
for name, doc in harry_potter_corpus.items():
list_types = list()
for token in doc:
list_types.append(token.text)
# Now we convert our list to a set and each word remains in this set only once.
number_types = len(set(list_types))
number_types_corpus += number_types
print("Number of types of Harry Potter", name, number_types)
print("Number of types of all Harry Potter books", number_types_corpus)
Number of types of Harry Potter 1 6580
Number of types of Harry Potter 2 7785
Number of types of Harry Potter 3 8433
Number of types of Harry Potter 4 11545
Number of types of Harry Potter 5 13783
Number of types of Harry Potter 6 11339
Number of types of Harry Potter 7 12381
Number of types of all Harry Potter books 71846
# Exercise: compute the type-token ratio (TTR) of book 6 by replacing every #something# below.
ttr_subcorpus = harry_potter_corpus[6]
list_of_tokens = list()
for sentence in #something#:
for token in #something#:
#something#.append(#something#)
number_of_tokens = #something#
number_of_types = #something#
ttr = (#something#/#something#)*100
print(number_of_tokens, number_of_types, ttr)
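# One possible solution to the exercise above (try it yourself first!).
# It reuses the pattern from the Number of types cell: collect all token texts,
# count them with len(), and count the distinct ones with set().
list_of_tokens = list()
for sentence in ttr_subcorpus.sents:
    for token in sentence:
        list_of_tokens.append(token.text)
number_of_tokens = len(list_of_tokens)
number_of_types = len(set(list_of_tokens))
ttr = (number_of_types/number_of_tokens)*100
print(number_of_tokens, number_of_types, ttr)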
# If you don't understand what happens in this cell, go back to Number of types
number_lemmas_corpus = 0
for name, doc in harry_potter_corpus.items():
list_lemmas = list()
for token in doc:
        list_lemmas.append(token.lemma) # .lemma is the hash value of the lemma; for counting distinct lemmas it behaves just like the string attribute .lemma_
number_lemmas = len(set(list_lemmas))
number_lemmas_corpus += number_lemmas
print("Number of lemmas in Harry Potter", name, number_lemmas)
print("Number of lemmas in all Harry Potter books", number_lemmas_corpus)
Number of lemmas in Harry Potter 1 4692
Number of lemmas in Harry Potter 2 5622
Number of lemmas in Harry Potter 3 6132
Number of lemmas in Harry Potter 4 8386
Number of lemmas in Harry Potter 5 10064
Number of lemmas in Harry Potter 6 8334
Number of lemmas in Harry Potter 7 8928
Number of lemmas in all Harry Potter books 52158
# This cell counts the number of symbols (characters) per book and in total.
number_symbols_corpus = 0
for name, doc in harry_potter_corpus.items():
number_symbols = 0
for token in doc:
number_symbols += len(token.text)
number_symbols_corpus += number_symbols
print("Number of symbols of Harry Potter", name, number_symbols)
print("Number of symbols of all Harry Potter books", number_symbols_corpus)
Number of symbols of Harry Potter 1 356465
Number of symbols of Harry Potter 2 401519
Number of symbols of Harry Potter 3 510849
Number of symbols of Harry Potter 4 920504
Number of symbols of Harry Potter 5 1218640
Number of symbols of Harry Potter 6 805674
Number of symbols of Harry Potter 7 934376
Number of symbols of all Harry Potter books 5148027
# We want to count the number of several items (several tags).
# For that, a single variable like number_tokens = 0 is not enough.
# A dict is a data structure that can store many item names (keys) and update their values as needed.
number_pos_corpus = dict()
for name, doc in harry_potter_corpus.items():
for token in doc:
if token.pos_ in number_pos_corpus.keys():
# We want to check whether the POS of this token is already in our dict.
            number_pos_corpus[token.pos_] += 1 # If yes, we just increase its value by 1.
        else:
            number_pos_corpus[token.pos_] = 1 # If not, we create the new key and set its value to 1.
# This (kinda magic) line sorts our dict.
# It orders the entries from the highest to the lowest value.
# The result is a list of key-value pairs in that order.
number_pos_corpus_sorted = sorted(number_pos_corpus.items(), key=lambda x:x[1], reverse=True)
print("Number of POS of all Harry Potter books:")
for pos, number in number_pos_corpus_sorted:
print(pos, number)
Number of POS of all Harry Potter books:
PUNCT 272413
VERB 200978
NOUN 154160
DET 122736
PRON 115951
ADP 109330
PROPN 107180
ADV 78405
AUX 60317
ADJ 59387
PART 38415
CCONJ 36426
SCONJ 25698
INTJ 7665
NUM 6259
X 247
SYM 25
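# The same counting can also be done with collections.Counter from the standard library,
# which takes care of the "create key or increase value" logic for us:
from collections import Counter
pos_counter = Counter()
for doc in harry_potter_corpus.values():
    pos_counter.update(token.pos_ for token in doc)
for pos, number in pos_counter.most_common(): # most_common() sorts from highest to lowest count
    print(pos, number)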
# The coarse-grained tags above follow the Universal POS tag set (https://universaldependencies.org/u/pos/).
# The fine-grained tags (.tag_) follow the Penn Treebank tag set: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# The mechanism for counting them is completely identical to the previous cell;
# you just need to extract other information.
# Copy-paste the code from the previous cell into your notebook and adjust it accordingly.
print("Average Sentence Length:", number_words_corpus/number_sentences_corpus)
Average Sentence Length: 15.67218048490157
print("Average Word Length:", number_symbols_corpus/number_words_corpus)
Average Word Length: 3.6887765192119186
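# Note that both averages above count punctuation tokens as "words".
# A small sketch of the same word-length statistic without punctuation, using spaCy's token.is_punct flag:
number_real_words = 0
number_real_symbols = 0
for doc in harry_potter_corpus.values():
    for token in doc:
        if not token.is_punct:
            number_real_words += 1
            number_real_symbols += len(token.text)
print("Average Word Length (without punctuation):", number_real_symbols/number_real_words)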
# This cell runs rather long, but you don't need it for your homework.
number_owls = 0
for i in range(1, 8): # we iterate through all 7 books
    sentences = list(harry_potter_corpus[i].sents) # materialize the sentence list once per book, not in every loop iteration
    for j in range(0, len(sentences)): # once inside the book, we iterate through all sentences
        # To look at the second-to-last word, we first need to make sure that the sentence has at least two words.
        if len(sentences[j]) >= 2:
            # Now we check that the last word is a period and that the second-to-last word is "owl".
            # We have to use the .text attribute to access the string.
            # Otherwise, spaCy returns the so-called <token object>, which will never be equal to the string we compare against.
            if sentences[j][-1].text == "." and sentences[j][-2].text == "owl":
                # print(sentences[j]) # uncomment this line if you want to see which sentences were found
                number_owls += 1
print(number_owls) # 19
# An alternative way to count "owl" followed by a period: walk through each Doc token by token and look at each token together with its right neighbour.
number_owl_punct = 0
for name, doc in harry_potter_corpus.items():
for i,token in enumerate(doc):
if i < len(doc)-1:
token_text = token.text.lower()
token_text_next = doc[i+1].text.lower()
if token_text == "owl" and token_text_next.startswith("."):
number_owl_punct += 1
print(number_owl_punct)
# This cell counts how often the name "Mr. Potter" occurs in the corpus.
number_mr_potter = 0
for name, doc in harry_potter_corpus.items():
for sentence in doc.sents:
        # enumerate() gives us each token together with its index in the sentence.
        # Using the index, we can also access the next token in the sentence.
for i, token in enumerate(sentence):
if i < len(sentence)-1:
token_text = token.text.lower()
token_text_next = sentence[i+1].text.lower()
if token_text == "mr." and token_text_next == "potter":
# if we find the combination mr. and potter, we count this occurrence
number_mr_potter += 1
# print(sentence)
print(number_mr_potter)
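# spaCy also ships a rule-based Matcher for exactly this kind of token-pattern search.
# A minimal sketch of the same "Mr. Potter" search (the pattern name "MR_POTTER" is our own choice).
# The count should be comparable to the loop above; tiny differences are possible because
# the loop only looks at token pairs within one sentence.
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add("MR_POTTER", [[{"LOWER": "mr."}, {"LOWER": "potter"}]])
number_mr_potter_matcher = 0
for doc in harry_potter_corpus.values():
    number_mr_potter_matcher += len(matcher(doc))
print(number_mr_potter_matcher)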
# We search for sentences with quotation marks because we want to find reporting verbs and other words around them.
# In each sentence with a quote, we search for the name of a person.
# We use the NLTK sentiment analyzer to calculate the sentiment of the whole reporting phrase.
# We assign the sentiment of the phrase to the person found in the sentence.
# We repeat this for all sentences.
# In the end, we have a list of sentiment scores per person.
# Attention: this code takes a while to execute!
# Don't be afraid of this long cell!
# Many of the methods should already look familiar,
# and we provided a lot of comments for you.
person_sentiment_dict = dict()
for name, doc in harry_potter_corpus.items():
for sentence in doc.sents:
tokens_with_quotation_mark = []
sentence_without_quote = None
        # if a sentence contains tokens with a quotation mark, we store the positions of all of them in a list
        # we need these indices later to find the positions of the quotation marks again
for i, token in enumerate(sentence):
if token.is_quote:
tokens_with_quotation_mark.append(i)
# we limit our example to sentences with only one quote (two quotation marks)
if len(tokens_with_quotation_mark) == 2:
            # we limit our example to sentences where the quote is at the beginning or at the end of the sentence
if tokens_with_quotation_mark[0] == 0: # the sentence starts with a quote if the quotation mark is the first element of the sentence
                # then we consider only the part without the quote,
                # which starts one token after the second quotation mark
sentence_without_quote = sentence[tokens_with_quotation_mark[1]+1:]
elif tokens_with_quotation_mark[1] == len(sentence)-1: # the sentence ends with a quote if the quotation mark is the last element of the sentence
                # to find the index of the last element, we take the length of the sentence and subtract 1
                # the last index is smaller than the sentence length because index counting starts at 0, not at 1
sentence_without_quote = sentence[:tokens_with_quotation_mark[0]]
                # we consider only the part of the sentence before the quote begins, i.e., up to the position of the first quotation mark
if sentence_without_quote:
            # we could improve our project here by removing all parts of the sentence after a conjunction:
            # we assume that something new is introduced in the sentence after a conjunction.
            # we could find the conjunctions using part-of-speech tags.
sentence_without_quote_and_person = list()
for person in sentence_without_quote.ents:
if person[0].ent_type_ == "PERSON":
person = sentence_without_quote.ents[0]
for token in sentence_without_quote:
                        # we don't want to include the name of the person in the sentiment analysis,
                        # hence we only add those tokens to the relevant sentence part that are not part of the named entity.
                        # we know that the named entity recognizer considers a few adjectives as part of the named entity,
                        # but adjectives are very relevant for the sentiment analysis. Therefore, we only ignore
                        # a token of a named entity if it is a proper noun.
if token in person and (token.tag_ == "NNP" or token.tag_ == "NNPS"):
pass
else:
                            # here we add the lemma of each relevant token to the new sentence part;
                            # we will analyse the sentiment of this sentence part below
sentence_without_quote_and_person.append(token.lemma_)
                    # now we can calculate the sentiment of the sentence part.
                    # we can only calculate the sentiment of a string, so we need to combine all lemmas in the list into one string.
                    # " ".join() helps us: it concatenates all tokens, using a whitespace as separator
sentiment = sia.polarity_scores(" ".join(sentence_without_quote_and_person))
# now we can assign the sentiment to the person named in the sentence
# we store this information in a dictionary
if person.lemma_ in person_sentiment_dict.keys(): # if the person's name is already in the dictionary, we can add the score
person_sentiment_dict[person.lemma_].append(sentiment["compound"])
else:
person_sentiment_dict[person.lemma_] = [sentiment["compound"]] # if not, we create a new list including the first sentiment value
# Some people occur in the text more often than others, so simply summing up all values would give an unbalanced result.
# Therefore, we take the average of the sentiment scores of the sentence parts per person.
person_sentiment_dict_avg = dict()
for person in person_sentiment_dict:
person_sentiment_dict_avg[person] = statistics.mean(person_sentiment_dict[person])
# Here we again have some magic to sort the persons by their sentiment score.
# The two lists contain the same entries, but sorted in opposite orders for convenience.
person_sentiment_dict_avg_heroes = sorted(person_sentiment_dict_avg.items(), key=lambda item: item[1], reverse=True) # this will show the most positive values first
person_sentiment_dict_avg_villains = sorted(person_sentiment_dict_avg.items(), key=lambda item: item[1], reverse=False) # this will show the most negative values first
# Prints the top-n heroes.
# Note that the list is noisy: the named entity recognizer sometimes includes adverbs in a name ('Fred happily'),
# and because we simply take the first entity of the sentence part, non-person entities (numbers, places) can slip in as well.
for elem in person_sentiment_dict_avg_heroes[0:20]:
print(elem)
('Padma', 0.89905)
('Harry know Dumbledore', 0.8141333333333334)
('Elphias Doge', 0.8026500000000001)
('irish', 0.7884)
('Fred happily', 0.765)
('Luna Lovegood', 0.7517)
('Elphias', 0.7158666666666667)
('Hermione keenly', 0.7023333333333334)
('Madam Puddifoot', 0.6914)
('mumble Harry', 0.66378)
('crookshank', 0.6301599999999999)
('Ireland', 0.6249)
('ThomasY', 0.62225)
('Ted Tonks', 0.62225)
('Monsieur Delacour', 0.62225)
('Dennis', 0.5994)
('Riddle - Hermione', 0.5859)
('Fred sourly', 0.5574)
('Fred proudly', 0.5574)
('Wendell', 0.5307999999999999)
# Prints top-n villains
for elem in person_sentiment_dict_avg_villains[0:50]:
print(elem)
('Horcrux', -0.81555)
('the Bulgarian Beaters', -0.8141333333333334)
('Harry know', -0.7619333333333334)
('three', -0.7096)
('dead Harry', -0.6486)
('peeve', -0.631)
('Hermione hopelessly', -0.6249)
('hermoine', -0.6124)
('Muggle - born', -0.6124)
('Galleon', -0.6059)
('Malfoys', -0.5994)
('four', -0.5574)
('Bellatrix Lestrange', -0.5423)
('two second', -0.5307999999999999)
('Cattermole', -0.5307999999999999)
('Archie', -0.5267)
('Phineas', -0.495)
('Harry hopelessly', -0.4939)
('Fred impatiently', -0.4878333333333333)
('Harry miserably', -0.4767)
('Ron miserably', -0.4767)
('the Sorting Hat', -0.4767)
('Sirius bitterly', -0.4588)
('Harry bitterly', -0.4588)
('Pigwidgeon', -0.4404)
('Gryffindor Tower', -0.41135)
('twenty', -0.4019)
('Moody quietly', -0.4019)
('killin', -0.4019)
('Marvolo', -0.3818)
('Morfin', -0.37895)
('Ronan', -0.35386666666666666)
('Harry frantically', -0.34)
('the Hanged', -0.34)
('Hermione helplessly', -0.34)
('Celestina', -0.34)
('Harry grimly', -0.3182)
('Hagrid grimly', -0.3182)
('Moody grimly', -0.3182)
('Sirius grimly', -0.3182)
('Fred grimly', -0.3182)
('Hermione grimly', -0.3182)
('Ron grimly', -0.3182)
('Hagrid gruffly', -0.3049375)
('Harry Potter', -0.27532222222222225)
('Leanne', -0.25)
('don', -0.23835)
('the Fat Lady', -0.2273)
('Seamus Finnigan', -0.22691)
('Greyback', -0.21459333333333333)
# What is the rank of Voldemort in the list of heroes?
hero_rank = 0
for i in range(len(person_sentiment_dict_avg_heroes)):
if "Voldemort" in person_sentiment_dict_avg_heroes[i][0]:
hero_rank = i
print(i, person_sentiment_dict_avg_heroes[i])
307 ('Voldemort', -0.06626296296296295)
# What is the rank of Voldemort in the list of villains?
villain_rank = 0
for i in range(len(person_sentiment_dict_avg_villains)):
if "Voldemort" in person_sentiment_dict_avg_villains[i][0]:
villain_rank = i
print(i, person_sentiment_dict_avg_villains[i])
81 ('Voldemort', -0.06626296296296295)
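# A small helper of our own that generalizes the two cells above:
# it returns the rank of the first entry whose name contains the given string, or None.
def rank_of(name, ranking):
    for i, (person, score) in enumerate(ranking):
        if name in person:
            return i
    return None
print(rank_of("Voldemort", person_sentiment_dict_avg_heroes))
print(rank_of("Voldemort", person_sentiment_dict_avg_villains))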