!pip -q install wordcloud
# Necessary imports
from tqdm import tqdm, trange
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
import random
# Path to the plain-text copy of the book; the later steps re-read this file.
BOOK_PATH = "/work/How to Win Friends and Influence People - Dale Carnegie.txt"

# Read inside a context manager so the handle is closed as soon as the
# preview has been taken (`book` refers to the closed file afterwards).
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(BOOK_PATH, 'r', encoding='utf-8') as book:
    first_lines = book.readlines()[:100]

# First few lines (bare expression: a notebook displays it when it is the
# last expression of a cell)
first_lines
# Print a fixed sample from deeper in the book: the 60 lines at indices
# 6000-6059, skipping lines that are just a newline.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(BOOK_PATH, 'r', encoding='utf-8') as book_file:
    story_chunk = book_file.readlines()[6000:6060]

for line in story_chunk:
    if line != '\n':
        # `line` still carries its own newline, so print() adds a second one.
        print(line)
# Preview the first 100 lines, skipping lines of three characters or fewer
# (blank lines and other near-empty lines).
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(BOOK_PATH, 'r', encoding='utf-8') as book_file:
    head = book_file.readlines()[:100]

for line in head:
    if len(line) > 3:
        # Keep only the text before a trailing ' \n' (space + newline).
        print(line.split(' \n')[0])
# Load every line of the book, closing the file as soon as it has been read.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(BOOK_PATH, 'r', encoding='utf-8') as book_file:
    raw_lines = book_file.readlines()

# Keep only lines longer than three characters (drops blank / near-empty
# lines) and cut each one at its trailing ' \n'.
# tqdm wraps the list directly so it knows the length and can show a total.
lines = [raw.split(' \n')[0] for raw in tqdm(raw_lines) if len(raw) > 3]
# A preview of lines
print("A preview of first few(10) lines:\n" + '='*40 + f"\n{lines[:10]}\n")

# Join the cleaned lines into a single story-like corpus
joined_lines = ' '.join(lines)

# A preview of the first 200 characters of the story-like corpus
print("A preview of the story-like corpus:\n" + '='*40 + f"\n{joined_lines[:200]}\n")

# Naive sentence split on '.'. Fragments are stripped, and empty ones (e.g.
# after the final full stop or inside an ellipsis) are dropped so they are
# not counted as sentences.
# NOTE(review): a bare '.' split also cuts at abbreviations and decimals.
fragments = (fragment.strip() for fragment in joined_lines.split('.'))
sentences = [fragment for fragment in fragments if fragment]

# A preview of the first 10 sentences
print("A preview of the first ten sentences:\n" + '='*40 + f"\n{sentences[:10]}")

# Total number of sentences
len(sentences)
import nltk

# Download the 'punkt' resource (presumably needed by TextBlob's tokenizer
# used below -- verify).
nltk.download('punkt')

# Flatten the per-sentence TextBlob word lists into one list of words.
words = [
    token
    for sentence in tqdm(sentences)
    for token in TextBlob(sentence).words
]

# Spot-check a few words at fixed positions
print(words[23], words[42], words[1000])

# Total number of words
len(words)
# Read all raw lines (blank ones included), closing the file afterwards.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(BOOK_PATH, 'r', encoding='utf-8') as book_file:
    lst = book_file.readlines()

# Split the space-joined text wherever a full stop is followed by the
# ' \n \n ' pattern; the split consumes that full stop.
raw_paragraphs = ' '.join(lst).split('. \n \n ')

# Sample paragraph (index 100) before clean-up
raw_paragraphs[100] + '.'

# Put the full stop back on each paragraph, then collapse every run of
# whitespace (newlines, repeated spaces) into a single space. Splitting on
# ' ' and joining with '' would delete the spaces between words as well.
paragraphs = [' '.join((p + '.').split()) for p in raw_paragraphs]

# The same sample paragraph after clean-up
paragraphs[100]

# Total number of paragraphs
len(paragraphs)

# Get a random paragraph
random.choice(paragraphs)
# Word cloud built from all paragraphs joined into one text.
stopwords = set(STOPWORDS)

# NOTE(review): the contour_* options appear to apply only when a mask is
# supplied -- confirm against the wordcloud documentation.
cloud_options = dict(
    background_color="red",
    max_words=20000,
    stopwords=stopwords,
    contour_width=3,
    contour_color='steelblue',
    relative_scaling=0,
    max_font_size=500,
)
wc = WordCloud(**cloud_options)

cloud_text = ' '.join(paragraphs)
wc.generate(cloud_text)

# Render the cloud on a large figure with the axes hidden.
plt.figure(figsize=(15, 12))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")