import pandas as pd
import altair as alt
import nltk # ← new
# downloading some additional packages and corpora
nltk.download('punkt') # necessary for tokenization
nltk.download('wordnet') # necessary for lemmatization
nltk.download('stopwords') # necessary for removal of stop words
nltk.download('averaged_perceptron_tagger') # necessary for POS tagging
nltk.download('maxent_ne_chunker') # necessary for entity extraction
nltk.download('omw-1.4') # necessary for lemmatization
nltk.download('words') # word list needed for entity extraction
# and a small English language model
!python -m spacy download en_core_web_sm
import requests # to load the data by URL
r = requests.get('http://infovis.fh-potsdam.de/tutorials/data/story.txt')
r.encoding = "utf-8" # ensure correct encoding
story = r.text
# display first 500 characters
print(story[:500]+"…")
sentence = "There were plenty of towns, rivers, mountains, forests, and brooks."
words = nltk.word_tokenize(sentence)
words
# keep only alphabetic tokens: no punctuation, numbers, or contractions (such as "isn't")
onlywords = [word for word in words if word.isalpha()]
onlywords[0:20]
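# a brief aside (not one of the original steps): the punkt models downloaded above
# also handle sentence splitting, so the story could be segmented like this
sentences = nltk.sent_tokenize(story)
print(len(sentences), "sentences, the first one being:", sentences[0])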
from nltk.stem import PorterStemmer as stemmer
from nltk.stem import WordNetLemmatizer as lemmatizer
from nltk.corpus import wordnet # for robust lemmatization
word = "drove"
print(stemmer().stem(word))
print(lemmatizer().lemmatize(word, pos = wordnet.VERB))
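# optional sketch: comparing the two for a few more verbs; the example words below
# are chosen arbitrarily and are not taken from the story
for w in ["running", "went", "studies"]:
    print(w, "→", stemmer().stem(w), "vs.", lemmatizer().lemmatize(w, pos = wordnet.VERB))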
# to save us some typing, we import these, so we can call them directly
from nltk import word_tokenize, pos_tag
sentence = "There were plenty of towns, rivers, mountains, forests, and brooks."
# first we tokenize, then we pos_tag
sentence = pos_tag(word_tokenize(sentence))
sentence
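# in case you wonder what a tag such as 'VBD' stands for, NLTK ships a small lookup;
# this is an optional aside and assumes the 'tagsets' resource can be downloaded
nltk.download('tagsets')
nltk.help.upenn_tagset('VBD')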
# same as above: first tokenize, then pos_tag
pos = pos_tag(word_tokenize(story))
# to keep things short & sweet, we define a function for lemmatizing verbs
def lemmatize_verb(word):
    return lemmatizer().lemmatize(word.lower(), pos = wordnet.VERB)
# remember this form? it's a list comprehension again!
# the condition at the end keeps only the verbs, i.e., words whose POS tag starts with a "V"
# word[1][0] refers to the first letter of the tuple's second element (the POS tag)
verbs = [lemmatize_verb(word[0]) for word in pos if word[1][0]=="V"]
# let's look at the first 50 verbs
print(verbs[:50])
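# a small follow-up sketch: NLTK's FreqDist can rank the lemmatized verbs by frequency
from nltk import FreqDist
print(FreqDist(verbs).most_common(10))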
import spacy
nlp = spacy.load("en_core_web_sm")
# retrieve plain text article
r = requests.get('https://infovis.fh-potsdam.de/tutorials/data/article.txt')
r.encoding = "utf-8"
article = r.text
# carry out NLP processing
doc = nlp(article)
# get the text and entity label of all named entities in the article
entities = [ (e.text, e.label_) for e in doc.ents if e.text ]
# see first 20 entities
entities[0:20]
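# as an optional aside, we could focus on entities of a single label, for example
# the people mentioned in the article (PERSON is one of spaCy's entity labels)
people = sorted(set(e.text for e in doc.ents if e.label_ == "PERSON"))
print(people)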
tags = [ent[1] for ent in entities] # extract the tag parts
tags = set(tags) # get only the unique tags
# loop through all tags and explain them
for tag in tags:
    print(tag, spacy.explain(tag))
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True) # show the text with colorful highlights
from nltk.corpus import stopwords as stop
stopwords = stop.words("english")
print(stopwords)
tokens = nltk.word_tokenize(story.lower())
# let's focus on those tokens that contain only letters
lettertokens = [word for word in tokens if word.isalpha()]
# this is a special form called a list comprehension (you've seen it before)
without_stopwords = [word for word in lettertokens if word not in stopwords]
print(without_stopwords[:50])
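# a possible refinement (not part of the original steps): the stopword list is a plain
# Python list, so domain-specific words can be added before filtering;
# the two words below are arbitrary examples
custom_stopwords = stopwords + ["said", "also"]
without_custom = [word for word in lettertokens if word not in custom_stopwords]
print(without_custom[:50])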
tokens = word_tokenize(story.lower())
words = [word for word in tokens if word.isalpha()]
# bag of words as a dictionary data type
bow = {}
# we count the occurrences of each word and save it
for word in words:
bow[word] = words.count(word)
# for later use, we create a sorted list of word-frequency tuples
words_frequency = sorted(bow.items(), key=lambda x: x[1], reverse=True)
print(words_frequency[0:100])
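# note that the loop above calls words.count() for every token, re-scanning the whole
# list each time; a sketch of an equivalent but faster approach uses collections.Counter
from collections import Counter
bow_counter = Counter(words)
print(bow_counter.most_common(10))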
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = story
wc = WordCloud(width=500, height=500, background_color="white").generate(text)
# display the generated image:
my_dpi = 72
plt.figure(figsize = (500/my_dpi, 500/my_dpi), dpi=my_dpi)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
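# the cloud above is generated from the raw text; as an optional variation, it could
# also be built from our own counts, e.g. with the stopwords filtered out first
filtered_counts = {w: c for w, c in bow.items() if w not in stopwords}
wc2 = WordCloud(width=500, height=500, background_color="white").generate_from_frequencies(filtered_counts)
plt.figure(figsize = (500/my_dpi, 500/my_dpi), dpi=my_dpi)
plt.imshow(wc2, interpolation="bilinear")
plt.axis("off")
plt.show()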
# first we create a dataframe from the word frequencies
df = pd.DataFrame(words_frequency, columns=['word', 'count'])
# we want to focus just on the top 20 words
df_top = df[:20]
# draw horizontal barchart
alt.Chart(df_top).mark_bar().encode(
x = 'count:Q',
y = 'word:N'
)
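# a small optional tweak: Altair can sort the bars by their count, so that the most
# frequent word appears at the top (sort='-x' means descending by the x value)
alt.Chart(df_top).mark_bar().encode(
    x = 'count:Q',
    y = alt.Y('word:N', sort='-x')
)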
# first we extract all words and their types (a.k.a. parts-of-speech or POS)
pos = pos_tag(word_tokenize(article))
# we will be collecting words and types in lists of the same length
words = []
types = []
# iterate over all entries in the pos list (generated above)
for p in pos:
# get the word and turn it into lowercase
word = p[0].lower()
# get the word's type
tag = p[1]
# for this analysis we remove entries that contain punctuation or numbers
# and we also ignore the stopwords (sorry: the, and, or, etc!)
if word.isalpha() and word not in stopwords:
# first we add this word to the words list
words.append(word)
# then we add its word type to types list, based on the 1st letter of the pos tag
# note that we access letters in a string, like entries in a list
if (tag[0]=="J"): types.append("Adjective")
elif (tag[0]=="N"): types.append("Noun")
elif (tag[0]=="R"): types.append("Adverb")
elif (tag[0]=="V"): types.append("Verb")
# there are many more word types, we simply subsume them under 'other'
else: types.append("Other")
# with the two lists of the same length, we create a dataframe from a dictionary
# whose keys will become the column labels
df = pd.DataFrame({"word": words, "type": types })
# along the type column, we want to support a filter selection
selection = alt.selection_point(fields=['type'])
# we create a composite chart consisting of two sub-charts
# the base holds it together and acts as the concierge taking care of the data
base = alt.Chart(df)
# this shows the types, note that we rely on Altair's aggregation prowess
chart1 = base.mark_bar().encode(
    x = alt.X('type:N'),
    y = alt.Y('count()'),
    # when a bar is selected, the others are displayed with reduced opacity
    opacity = alt.condition(selection, alt.value(1), alt.value(.25)),
).add_params(selection)
# this chart reacts to the selection made in the left/above chart
chart2 = base.mark_bar(width=5).encode(
x = alt.X('word:N'),
y = alt.Y('count()'),
).transform_filter(selection)
chart1 | chart2
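# the interactive selection needs a live renderer; optionally, the combined chart can
# also be saved as a standalone HTML file (the filename below is arbitrary)
(chart1 | chart2).save("word_types_chart.html")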
import re # regular expressions, we will need them to search through the text
# we replace all line breaks with spaces, so they do not mess up the display (you'll see)
text = story.replace("\n", " ")
# the term to search the text
keyword = "kingdom"
# this is the window of characters displayed on both sides
span = 40 - int(len(keyword)/2)
# find all the start positions of matches in the text
starts = [m.start() for m in re.finditer(keyword, text)]
# if there are no matches, we also tell the user/reader
if (len(starts)==0): print("Sorry, but there are no matches for your query")
# we go through all the start positions
for start in starts:
# determine the end position, based on the keyword's length
end = start+len(keyword)
# we get the string left and right of the match
left = text[max(0, start-span):start]
match = text[start:end]
right = text[end:end+span]
# we print left and right context with the actual match in the middle
print(left+match+right)
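# two possible refinements to this keyword-in-context search (just a sketch):
# re.escape() treats the keyword literally, and re.IGNORECASE also matches "Kingdom"
starts_ci = [m.start() for m in re.finditer(re.escape(keyword), text, re.IGNORECASE)]
print(len(starts_ci), "matches when ignoring case")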