import pandas as pd
import altair as alt
import nltk # ← new
# downloading some additional packages and corpora
nltk.download('punkt') # necessary for tokenization
nltk.download('wordnet') # necessary for lemmatization
nltk.download('stopwords') # necessary for removal of stop words
nltk.download('averaged_perceptron_tagger') # necessary for POS tagging
nltk.download('maxent_ne_chunker') # necessary for entity extraction
nltk.download('omw-1.4') # necessary for lemmatization
nltk.download('words') # also needed for entity extraction
# and a small English language model
!python -m spacy download en_core_web_sm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data] Unzipping corpora/words.zip.
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
import requests # to load the data by URL
r = requests.get('http://infovis.fh-potsdam.de/tutorials/data/story.txt')
r.encoding = "utf-8" # ensure correct encoding
story = r.text
# display first 500 characters
print(story[:500]+"…")
The seventh Sally or how Trurl's own perfection led to no good
By Stanisław Lem, 1965.
Translated by Michael Kandel, 1974.
The Universe is infinite but bounded, and therefore a beam of light, in whatever direction it may travel, will after billions of centuries return - if powerful enough - to the point of its departure; and it is no different with rumor, that flies about from star to star and makes the rounds of every planet. One day Trurl heard distant reports of two mighty constructor-benef…
sentence = "There were plenty of towns, rivers, mountains, forests, and brooks."
words = nltk.word_tokenize(sentence)
words
# keep only alphabetic tokens: no punctuation, numbers, or contraction fragments (such as the "n't" in "isn't")
onlywords = [word for word in words if word.isalpha()]
onlywords[0:20]
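By the way, the contraction does not vanish as a whole: NLTK's tokenizer splits "isn't" into 'is' and "n't", and only the "n't" part fails isalpha(). A quick check (a minimal sketch):
demo = nltk.word_tokenize("This isn't a brook.")
print(demo)                                # ['This', 'is', "n't", 'a', 'brook', '.']
print([t for t in demo if t.isalpha()])    # ['This', 'is', 'a', 'brook']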
from nltk.stem import PorterStemmer as stemmer
from nltk.stem import WordNetLemmatizer as lemmatizer
from nltk.corpus import wordnet # for robust lemmatization
word = "drove"
print(stemmer().stem(word))
print(lemmatizer().lemmatize(word, pos = wordnet.VERB))
drove
drive
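Note that the lemmatizer treats words as nouns unless told otherwise, so without the pos argument verb forms pass through unchanged. A minimal sketch:
print(lemmatizer().lemmatize("drove"))   # drove — treated as a noun by default
print(lemmatizer().lemmatize("mice"))    # mouse — here the noun default is what we want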
# to save us some typing, we import these, so we can call them directly
from nltk import word_tokenize, pos_tag
sentence = "There were plenty of towns, rivers, mountains, forests, and brooks."
# first we tokenize, then we pos_tag
sentence = pos_tag(word_tokenize(sentence))
sentence
# same as above: first tokenize, then pos_tag
pos = pos_tag(word_tokenize(story))
# to keep things short & sweet, we define a function for lemmatizing verbs
def lemmatize_verb(word):
    return lemmatizer().lemmatize(word.lower(), pos = wordnet.VERB)
# remember this form? it's a list comprehension again!
# the condition at the end keeps all verbs, i.e., words whose POS tag starts with "V"
# word[1][0] refers to the second element of the tuple and its first letter
verbs = [lemmatize_verb(word[0]) for word in pos if word[1][0]=="V"]
# let's look at the first 50 verbs
print(verbs[:50])
['lead', 'translate', 'be', 'bound', 'travel', 'return', 'be', 'star', 'make', 'hear', 'accomplish', 'have', 'run', 'explain', 'be', 'have', 'circumnavigate', 'have', 'say', 'be', 'doubt', 'let', 'recall', 'be', 'undertake', 'keep', 'be', 'receive', 'pay', 'be', 'head', 'be', 'have', 'fly', 'pass', 'have', 'obtain', 'come', 'be', 'run', 'wave', 'astonish', 'concern', 'land', 'be', 'approach', 'clang', 'clank', 'introduce', 'have']
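The same pattern works for other word classes. For instance, here is a sketch that collects lemmatized nouns — the noun tags (NN, NNS, NNP, …) all start with "N", and the lemmatizer's noun default means we can omit the pos argument:
nouns = [lemmatizer().lemmatize(word[0].lower()) for word in pos if word[1][0]=="N"]
print(nouns[:20])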
import spacy
nlp = spacy.load("en_core_web_sm")
# retrieve plain text article
r = requests.get('https://infovis.fh-potsdam.de/tutorials/data/article.txt')
r.encoding = "utf-8"
article = r.text
# carry out NLP processing
doc = nlp(article)
# get the text and entity label of all word entities in the article
entities = [ (e.text, e.label_) for e in doc.ents if e.text ]
# see first 20 entities
entities[0:20]
tags = [ent[1] for ent in entities] # extract the tag parts
tags = set(tags) # get only the unique tags
# loop through all tags and explain them
for tag in tags:
print(tag, spacy.explain(tag))
LOC Non-GPE locations, mountain ranges, bodies of water
FAC Buildings, airports, highways, bridges, etc.
TIME Times smaller than a day
PERCENT Percentage, including "%"
DATE Absolute or relative dates or periods
CARDINAL Numerals that do not fall under another type
ORG Companies, agencies, institutions, etc.
LAW Named documents made into laws.
NORP Nationalities or religious or political groups
PRODUCT Objects, vehicles, foods, etc. (not services)
ORDINAL "first", "second", etc.
PERSON People, including fictional
GPE Countries, cities, states
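To see which entity types dominate the article, we can tally the labels — a small sketch using Python's collections.Counter:
from collections import Counter
# count how often each entity label occurs in the article
label_counts = Counter(label for _, label in entities)
label_counts.most_common()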
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True) # show the text with colorful highlights
from nltk.corpus import stopwords as stop
stopwords = stop.words("english")
print(stopwords)
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
tokens = nltk.word_tokenize(story.lower())
# let's focus on those tokens that contain only letters
lettertokens = [word for word in tokens if word.isalpha()]
# this is a special form called a list comprehension (you've seen it before)
without_stopwords = [word for word in lettertokens if word not in stopwords]
print(without_stopwords[:50])
['seventh', 'sally', 'trurl', 'perfection', 'led', 'good', 'stanisław', 'lem', 'translated', 'michael', 'kandel', 'universe', 'infinite', 'bounded', 'therefore', 'beam', 'light', 'whatever', 'direction', 'may', 'travel', 'billions', 'centuries', 'return', 'powerful', 'enough', 'point', 'departure', 'different', 'rumor', 'flies', 'star', 'star', 'makes', 'rounds', 'every', 'planet', 'one', 'day', 'trurl', 'heard', 'distant', 'reports', 'two', 'mighty', 'wise', 'accomplished', 'equal', 'news', 'ran']
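Since the NLTK stopword list is a plain Python list, it is easy to extend with corpus-specific words. A sketch — the added words here are just hypothetical examples:
# append domain-specific words to the standard list
custom_stopwords = stopwords + ["could", "would", "one"]
without_custom = [word for word in lettertokens if word not in custom_stopwords]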
tokens = word_tokenize(story.lower())
words = [word for word in tokens if word.isalpha()]
# bag of words as a dictionary data type
bow = {}
# we count the occurrences of each word and save it
for word in words:
bow[word] = words.count(word)
# for later use, we create a sorted list of word-frequency tuples
words_frequency = sorted(bow.items(), key=lambda x: x[1], reverse=True)
print(words_frequency[0:100])
[('the', 169), ('of', 131), ('and', 127), ('a', 92), ('to', 63), ('that', 58), ('in', 49), ('you', 37), ('trurl', 33), ('it', 31), ('his', 30), ('they', 26), ('he', 25), ('was', 25), ('with', 24), ('for', 23), ('not', 22), ('as', 21), ('by', 19), ('but', 19), ('this', 19), ('do', 19), ('no', 18), ('all', 17), ('i', 17), ('had', 16), ('kingdom', 16), ('have', 16), ('when', 15), ('is', 14), ('one', 13), ('klapaucius', 13), ('him', 13), ('were', 13), ('an', 13), ('what', 13), ('or', 12), ('would', 12), ('box', 12), ('so', 11), ('are', 11), ('excelsius', 11), ('there', 11), ('who', 10), ('which', 10), ('into', 10), ('on', 10), ('king', 10), ('be', 10), ('how', 9), ('only', 9), ('their', 9), ('way', 9), ('if', 8), ('from', 8), ('nothing', 8), ('at', 8), ('subjects', 8), ('like', 8), ('these', 7), ('well', 7), ('our', 7), ('your', 7), ('about', 6), ('planet', 6), ('two', 6), ('those', 6), ('great', 6), ('monarch', 6), ('up', 6), ('also', 6), ('could', 6), ('though', 6), ('said', 6), ('know', 6), ('electrons', 6), ('after', 5), ('its', 5), ('even', 5), ('very', 5), ('without', 5), ('some', 5), ('such', 5), ('over', 5), ('now', 5), ('first', 5), ('death', 5), ('see', 5), ('too', 5), ('out', 5), ('model', 5), ('them', 5), ('doll', 5), ('understand', 5), ('enough', 4), ('space', 4), ('time', 4), ('ship', 4), ('through', 4), ('full', 4)]
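As an aside: words.count(word) rescans the entire list for every single word, which gets slow for longer texts. Python's collections.Counter builds the same tally in one pass — a sketch that yields the same result:
from collections import Counter
bow = Counter(words)                   # one pass over all words
words_frequency = bow.most_common()    # already sorted by descending frequency
print(words_frequency[0:10])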
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = story
wc = WordCloud(width=500, height=500, background_color="white").generate(text)
# display the generated image:
my_dpi = 72
plt.figure(figsize = (500/my_dpi, 500/my_dpi), dpi=my_dpi)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
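WordCloud tokenized and counted the raw text on its own (and applied its built-in stopword list). If we would rather reuse our own stopword-filtered counts from above, the cloud can also be generated directly from a frequency dictionary — a sketch:
# build the cloud from our own stopword-filtered counts
freqs = {word: count for word, count in words_frequency if word not in stopwords}
wc = WordCloud(width=500, height=500, background_color="white").generate_from_frequencies(freqs)
plt.figure(figsize = (500/my_dpi, 500/my_dpi), dpi=my_dpi)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()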
# first we create a dataframe from the word frequencies
df = pd.DataFrame(words_frequency, columns=['word', 'count'])
# we want to focus just on the top 20 words
df_top = df[:20]
# draw horizontal barchart
alt.Chart(df_top).mark_bar().encode(
x = 'count:Q',
y = 'word:N'
)
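By default, Altair sorts the nominal axis alphabetically. To order the bars by frequency instead, we can pass a sort argument to the y-encoding — a small variation on the chart above:
# sort the words by descending count instead of alphabetically
alt.Chart(df_top).mark_bar().encode(
    x = 'count:Q',
    y = alt.Y('word:N', sort='-x')
)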
# first we extract all words and their types (a.k.a. parts-of-speech or POS)
pos = pos_tag(word_tokenize(article))
# we will be collecting words and types in lists of the same length
words = []
types = []
# iterate over all entries in the pos list (generated above)
for p in pos:
# get the word and turn it into lowercase
word = p[0].lower()
# get the word's type
tag = p[1]
# for this analysis we remove entries that contain punctuation or numbers
# and we also ignore the stopwords (sorry: the, and, or, etc!)
if word.isalpha() and word not in stopwords:
# first we add this word to the words list
words.append(word)
# then we add its word type to types list, based on the 1st letter of the pos tag
# note that we access letters in a string, like entries in a list
if (tag[0]=="J"): types.append("Adjective")
elif (tag[0]=="N"): types.append("Noun")
elif (tag[0]=="R"): types.append("Adverb")
elif (tag[0]=="V"): types.append("Verb")
# there are many more word types, we simply subsume them under 'other'
else: types.append("Other")
# with the two lists of the same length, we create a dataframe with a dictionary,
# of which the keys will become the column labels
df = pd.DataFrame({"word": words, "type": types })
# along the type column, we want to support a filter selection
selection = alt.selection_point(fields=['type'])
# we create a composite chart consisting of two sub-charts
# the base holds it together and acts as the concierge taking care of the data
base = alt.Chart(df)
# this shows the types, note that we rely on Altair's aggregation prowess
chart1 = base.mark_bar().encode(
    x = alt.X('type:N'),
    y = alt.Y('count()'),
# when a bar is selected, the others are displayed with reduced opacity
opacity=alt.condition(selection, alt.value(1), alt.value(.25)),
).add_params(selection)
# this chart reacts to the selection made in the left chart
chart2 = base.mark_bar(width=5).encode(
x = alt.X('word:N'),
y = alt.Y('count()'),
).transform_filter(selection)
chart1 | chart2
import re # regular expressions, we will need them to search through the text
# we replace all line breaks with spaces, so they don't mess up the display (you'll see)
text = story.replace("\n", " ")
# the term to search the text
keyword = "kingdom"
# this is the window of characters displayed on both sides
span = 40 - int(len(keyword)/2)
# find all the start positions of matches in the text
starts = [m.start() for m in re.finditer(keyword, text)]
# if there are no matches, we also tell the user/reader
if (len(starts)==0): print("Sorry, but there are no matches for your query")
# we go through all the start positions
for start in starts:
# determine the end position, based on the keyword's length
end = start+len(keyword)
# we get the string left and right of the match
left = text[max(0, start-span):start]
match = text[start:end]
right = text[end:end+span]
# we print left and right context with the actual match in the middle
print(left+match+right)
erora; the inhabitants of both these kingdoms had, in a fit of regicidal madness,
Trurl built the king an entirely new kingdom. There were plenty of towns, rivers,
. Trurl also carefully set into this kingdom a fabulous capital, all in marble an
t ran, and he gave the women of that kingdom beauty, the men - sullen silence and
he input and output of his brand-new kingdom were, and how to program wars, quell
levy. After a year had passed in the kingdom, which amounted to hardly a minute f
nsulted by Trurl's gift, in that the kingdom was too small and very like a child'
er presented a mighty monarch with a kingdom. Excelsius was sensible enough, howe
cepter under his arm, lifted the box kingdom with a grunt, and took it to his hum
Trurl exclaimed. "Really, the whole kingdom fits into a box three feet by two by
dimensions have anyway? In that box kingdom, doesn't a journey from the capital
existence with that of an imitation kingdom locked up in some glass box?!" cried
t. For otherwise the monarch of that kingdom sooner or later would have gotten th
est - you only sought to construct a kingdom as lifelike as possible, so similar
ke as possible, so similar to a real kingdom, that no one, absolutely no one, cou
when we get there?" "I'll take the kingdom away from him!" "And what will you
to undo the entire structure of the kingdom, then assemble from scratch ...""And
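Note that re.finditer matches case-sensitively, so a capitalized "Kingdom" at the start of a sentence would be missed. Passing the re.IGNORECASE flag catches both — a one-line sketch:
# case-insensitive matching also finds capitalized occurrences
starts = [m.start() for m in re.finditer(keyword, text, re.IGNORECASE)]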