import pandas as pd
import altair as alt
import nltk # ← new
# downloading some additional packages and corpora
nltk.download('punkt') # necessary for tokenization
nltk.download('wordnet') # necessary for lemmatization
nltk.download('stopwords') # necessary for removal of stop words
nltk.download('averaged_perceptron_tagger') # necessary for POS tagging
nltk.download('maxent_ne_chunker') # necessary for entity extraction
nltk.download('omw-1.4') # necessary for lemmatization
nltk.download('words') # also needed for entity extraction
# and a small English language model
!python -m spacy download en_core_web_sm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data] Unzipping corpora/words.zip.
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
import requests # to load the data by URL
r = requests.get('http://infovis.fh-potsdam.de/tutorials/data/story.txt')
r.encoding = "utf-8" # ensure correct encoding
story = r.text
# display first 500 characters
print(story[:500]+"…")
The seventh Sally or how Trurl's own perfection led to no good
By Stanisław Lem, 1965.
Translated by Michael Kandel, 1974.
The Universe is infinite but bounded, and therefore a beam of light, in whatever direction it may travel, will after billions of centuries return - if powerful enough - to the point of its departure; and it is no different with rumor, that flies about from star to star and makes the rounds of every planet. One day Trurl heard distant reports of two mighty constructor-benef…
sentence = "There were plenty of towns, rivers, mountains, forests, and brooks."
words = nltk.word_tokenize(sentence)
words
# keep only alphabetic tokens: no punctuation, numbers, or contraction fragments (such as the "n't" in "isn't")
onlywords = [word for word in words if word.isalpha()]
onlywords[0:20]
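By the way, the contraction does not vanish as a whole: NLTK's tokenizer splits "isn't" into 'is' and "n't", and only the "n't" part fails isalpha(). A quick check (a minimal sketch):
demo = nltk.word_tokenize("This isn't a brook.")
print(demo)                                # ['This', 'is', "n't", 'a', 'brook', '.']
print([t for t in demo if t.isalpha()])    # ['This', 'is', 'a', 'brook']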
from nltk.stem import PorterStemmer as stemmer
from nltk.stem import WordNetLemmatizer as lemmatizer
from nltk.corpus import wordnet # for robust lemmatization
word = "drove"
print(stemmer().stem(word))
print(lemmatizer().lemmatize(word, pos = wordnet.VERB))
drove
drive
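Note that the lemmatizer treats words as nouns unless told otherwise, so without the pos argument verb forms pass through unchanged. A minimal sketch:
print(lemmatizer().lemmatize("drove"))   # drove — treated as a noun by default
print(lemmatizer().lemmatize("mice"))    # mouse — here the noun default is what we want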
# to save us some typing, we import these, so we can call them directly
from nltk import word_tokenize, pos_tag
sentence = "There were plenty of towns, rivers, mountains, forests, and brooks."
# first we tokenize, then we pos_tag
sentence = pos_tag(word_tokenize(sentence))
sentence
# same as above: first tokenize, then pos_tag
pos = pos_tag(word_tokenize(story))
# to keep things short & sweet, we define a function for lemmatizing verbs
def lemmatize_verb(word):
    return lemmatizer().lemmatize(word.lower(), pos = wordnet.VERB)
# remember this form? it's a list comprehension again!
# the condition at the end keeps all verbs, i.e., words whose POS tag starts with "V"
# word[1][0] refers to the second element of the tuple and its first letter
verbs = [lemmatize_verb(word[0]) for word in pos if word[1][0]=="V"]
# let's look at the first 50 verbs
print(verbs[:50])
['lead', 'translate', 'be', 'bound', 'travel', 'return', 'be', 'star', 'make', 'hear', 'accomplish', 'have', 'run', 'explain', 'be', 'have', 'circumnavigate', 'have', 'say', 'be', 'doubt', 'let', 'recall', 'be', 'undertake', 'keep', 'be', 'receive', 'pay', 'be', 'head', 'be', 'have', 'fly', 'pass', 'have', 'obtain', 'come', 'be', 'run', 'wave', 'astonish', 'concern', 'land', 'be', 'approach', 'clang', 'clank', 'introduce', 'have']
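The same pattern works for other word classes. For instance, here is a sketch that collects lemmatized nouns — the noun tags (NN, NNS, NNP, …) all start with "N", and the lemmatizer's noun default means we can omit the pos argument:
nouns = [lemmatizer().lemmatize(word[0].lower()) for word in pos if word[1][0]=="N"]
print(nouns[:20])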
import spacy
nlp = spacy.load("en_core_web_sm")
# retrieve plain text article
r = requests.get('https://infovis.fh-potsdam.de/tutorials/data/article.txt')
r.encoding = "utf-8"
article = r.text
# carry out NLP processing
doc = nlp(article)
# get the text and entity label of all word entities in the article
entities = [ (e.text, e.label_) for e in doc.ents if e.text ]
# see first 20 entities
entities[0:20]
tags = [ent[1] for ent in entities] # extract the tag parts
tags = set(tags) # get only the unique tags
# loop through all tags and explain them
for tag in tags:
print(tag, spacy.explain(tag))
LOC Non-GPE locations, mountain ranges, bodies of water
FAC Buildings, airports, highways, bridges, etc.
TIME Times smaller than a day
PERCENT Percentage, including "%"
DATE Absolute or relative dates or periods
CARDINAL Numerals that do not fall under another type
ORG Companies, agencies, institutions, etc.
LAW Named documents made into laws.
NORP Nationalities or religious or political groups
PRODUCT Objects, vehicles, foods, etc. (not services)
ORDINAL "first", "second", etc.
PERSON People, including fictional
GPE Countries, cities, states
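To see which entity types dominate the article, we can tally the labels — a small sketch using Python's collections.Counter:
from collections import Counter
# count how often each entity label occurs in the article
label_counts = Counter(label for _, label in entities)
label_counts.most_common()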
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True) # show the text with colorful highlights
from nltk.corpus import stopwords as stop
stopwords = stop.words("english")
print(stopwords)
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
tokens = nltk.word_tokenize(story.lower())
# let's focus on those tokens that contain only letters
lettertokens = [word for word in tokens if word.isalpha()]
# this is a special form called a list comprehension (you've seen it before)
without_stopwords = [word for word in lettertokens if word not in stopwords]
print(without_stopwords[:50])
['seventh', 'sally', 'trurl', 'perfection', 'led', 'good', 'stanisław', 'lem', 'translated', 'michael', 'kandel', 'universe', 'infinite', 'bounded', 'therefore', 'beam', 'light', 'whatever', 'direction', 'may', 'travel', 'billions', 'centuries', 'return', 'powerful', 'enough', 'point', 'departure', 'different', 'rumor', 'flies', 'star', 'star', 'makes', 'rounds', 'every', 'planet', 'one', 'day', 'trurl', 'heard', 'distant', 'reports', 'two', 'mighty', 'wise', 'accomplished', 'equal', 'news', 'ran']
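Since the NLTK stopword list is a plain Python list, it is easy to extend with corpus-specific words. A sketch — the added words here are just hypothetical examples:
# append domain-specific words to the standard list
custom_stopwords = stopwords + ["could", "would", "one"]
without_custom = [word for word in lettertokens if word not in custom_stopwords]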
tokens = word_tokenize(story.lower())
words = [word for word in tokens if word.isalpha()]
# bag of words as a dictionary data type
bow = {}
# we count the occurrences of each word and save it
for word in words:
bow[word] = words.count(word)
# for later use, we create a sorted list of word-frequency tuples
words_frequency = sorted(bow.items(), key=lambda x: x[1], reverse=True)
print(words_frequency[0:100])
[('the', 169), ('of', 131), ('and', 127), ('a', 92), ('to', 63), ('that', 58), ('in', 49), ('you', 37), ('trurl', 33), ('it', 31), ('his', 30), ('they', 26), ('he', 25), ('was', 25), ('with', 24), ('for', 23), ('not', 22), ('as', 21), ('by', 19), ('but', 19), ('this', 19), ('do', 19), ('no', 18), ('all', 17), ('i', 17), ('had', 16), ('kingdom', 16), ('have', 16), ('when', 15), ('is', 14), ('one', 13), ('klapaucius', 13), ('him', 13), ('were', 13), ('an', 13), ('what', 13), ('or', 12), ('would', 12), ('box', 12), ('so', 11), ('are', 11), ('excelsius', 11), ('there', 11), ('who', 10), ('which', 10), ('into', 10), ('on', 10), ('king', 10), ('be', 10), ('how', 9), ('only', 9), ('their', 9), ('way', 9), ('if', 8), ('from', 8), ('nothing', 8), ('at', 8), ('subjects', 8), ('like', 8), ('these', 7), ('well', 7), ('our', 7), ('your', 7), ('about', 6), ('planet', 6), ('two', 6), ('those', 6), ('great', 6), ('monarch', 6), ('up', 6), ('also', 6), ('could', 6), ('though', 6), ('said', 6), ('know', 6), ('electrons', 6), ('after', 5), ('its', 5), ('even', 5), ('very', 5), ('without', 5), ('some', 5), ('such', 5), ('over', 5), ('now', 5), ('first', 5), ('death', 5), ('see', 5), ('too', 5), ('out', 5), ('model', 5), ('them', 5), ('doll', 5), ('understand', 5), ('enough', 4), ('space', 4), ('time', 4), ('ship', 4), ('through', 4), ('full', 4)]
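As an aside: words.count(word) rescans the entire list for every single word, which gets slow for longer texts. Python's collections.Counter builds the same tally in one pass — a sketch that yields the same result:
from collections import Counter
bow = Counter(words)                   # one pass over all words
words_frequency = bow.most_common()    # already sorted by descending frequency
print(words_frequency[0:10])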
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = story
wc = WordCloud(width=500, height=500, background_color="white").generate(text)
# display the generated image:
my_dpi = 72
plt.figure(figsize = (500/my_dpi, 500/my_dpi), dpi=my_dpi)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
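WordCloud tokenized and counted the raw text on its own (and applied its built-in stopword list). If we would rather reuse our own stopword-filtered counts from above, the cloud can also be generated directly from a frequency dictionary — a sketch:
# build the cloud from our own stopword-filtered counts
freqs = {word: count for word, count in words_frequency if word not in stopwords}
wc = WordCloud(width=500, height=500, background_color="white").generate_from_frequencies(freqs)
plt.figure(figsize = (500/my_dpi, 500/my_dpi), dpi=my_dpi)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()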
# first we create a dataframe from the word frequencies
df = pd.DataFrame(words_frequency, columns=['word', 'count'])
# we want to focus just on the top 20 words
df_top = df[:20]
# draw horizontal barchart
alt.Chart(df_top).mark_bar().encode(
x = 'count:Q',
y = 'word:N'
)
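By default, Altair sorts the nominal axis alphabetically. To order the bars by frequency instead, we can pass a sort argument to the y-encoding — a small variation on the chart above:
# sort the words by descending count instead of alphabetically
alt.Chart(df_top).mark_bar().encode(
    x = 'count:Q',
    y = alt.Y('word:N', sort='-x')
)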
# first we extract all words and their types (a.k.a. parts-of-speech or POS)
pos = pos_tag(word_tokenize(article))
# we will be collecting words and types in lists of the same length
words = []
types = []
# iterate over all entries in the pos list (generated above)
for p in pos:
# get the word and turn it into lowercase
word = p[0].lower()
# get the word's type
tag = p[1]
# for this analysis we remove entries that contain punctuation or numbers
# and we also ignore the stopwords (sorry: the, and, or, etc!)
if word.isalpha() and word not in stopwords:
# first we add this word to the words list
words.append(word)
# then we add its word type to types list, based on the 1st letter of the pos tag
# note that we access letters in a string, like entries in a list
if (tag[0]=="J"): types.append("Adjective")
elif (tag[0]=="N"): types.append("Noun")
elif (tag[0]=="R"): types.append("Adverb")
elif (tag[0]=="V"): types.append("Verb")
# there are many more word types, we simply subsume them under 'other'
else: types.append("Other")
# with the two lists of the same length, we create a dataframe with a dictionary,
# of which the keys will become the column labels
df = pd.DataFrame({"word": words, "type": types })
# along the type column, we want to support a filter selection
selection = alt.selection_point(fields=['type'])
# we create a composite chart consisting of two sub-charts
# the base holds it together and acts as the concierge taking care of the data
base = alt.Chart(df)
# this shows the types, note that we rely on Altair's aggregation prowess
chart1 = base.mark_bar().encode(
    x = alt.X('type:N'),
    y = alt.Y('count()'),
# when a bar is selected, the others are displayed with reduced opacity
opacity=alt.condition(selection, alt.value(1), alt.value(.25)),
).add_params(selection)
# this chart reacts to the selection made in the left chart
chart2 = base.mark_bar(width=5).encode(
x = alt.X('word:N'),
y = alt.Y('count()'),
).transform_filter(selection)
chart1 | chart2
import re # regular expressions, we will need them to search through the text
# we replace all line breaks with spaces, so they don't mess up the display (you'll see)
text = story.replace("\n", " ")
# the term to search the text
keyword = "kingdom"
# this is the window of characters displayed on both sides
span = 40 - int(len(keyword)/2)
# find all the start positions of matches in the text
starts = [m.start() for m in re.finditer(keyword, text)]
# if there are no matches, we also tell the user/reader
if (len(starts)==0): print("Sorry, but there are no matches for your query")
# we go through all the start positions
for start in starts:
# determine the end position, based on the keyword's length
end = start+len(keyword)
# we get the string left and right of the match
left = text[max(0, start-span):start]
match = text[start:end]
right = text[end:end+span]
# we print left and right context with the actual match in the middle
print(left+match+right)
erora; the inhabitants of both these kingdoms had, in a fit of regicidal madness,
Trurl built the king an entirely new kingdom. There were plenty of towns, rivers,
. Trurl also carefully set into this kingdom a fabulous capital, all in marble an
t ran, and he gave the women of that kingdom beauty, the men - sullen silence and
he input and output of his brand-new kingdom were, and how to program wars, quell
levy. After a year had passed in the kingdom, which amounted to hardly a minute f
nsulted by Trurl's gift, in that the kingdom was too small and very like a child'
er presented a mighty monarch with a kingdom. Excelsius was sensible enough, howe
cepter under his arm, lifted the box kingdom with a grunt, and took it to his hum
Trurl exclaimed. "Really, the whole kingdom fits into a box three feet by two by
dimensions have anyway? In that box kingdom, doesn't a journey from the capital
existence with that of an imitation kingdom locked up in some glass box?!" cried
t. For otherwise the monarch of that kingdom sooner or later would have gotten th
est - you only sought to construct a kingdom as lifelike as possible, so similar
ke as possible, so similar to a real kingdom, that no one, absolutely no one, cou
when we get there?" "I'll take the kingdom away from him!" "And what will you
to undo the entire structure of the kingdom, then assemble from scratch ...""And
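Note that re.finditer matches case-sensitively, so a capitalized "Kingdom" at the start of a sentence would be missed. Passing the re.IGNORECASE flag catches both — a one-line sketch:
# case-insensitive matching also finds capitalized occurrences
starts = [m.start() for m in re.finditer(keyword, text, re.IGNORECASE)]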