from wordcloud import WordCloud
import string
import emoji
import re
import pandas as pd
import spacy as sp
from spacy.language import Language
from spacy_fastlang import LanguageDetector
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from chat_downloader import ChatDownloader
# Video whose chat is downloaded and turned into a word cloud.
url = 'https://www.youtube.com/watch?v=dXTqn5W9R00&t=2445s'
chat = ChatDownloader().get_chat(url)  # generator of chat messages

# Render every message to its formatted text line.
text = [chat.format(message) for message in chat]

# Strip everything up to and including the first ': ' of each line
# (presumably the "timestamp | author: " prefix produced by chat.format --
# confirm against chat_downloader's output format).
clean_text = [re.sub(r'^.*?: ', '', t) for t in text]

# Emoji shortcodes have the form :name:. The pattern is a raw string so that
# `\w` reaches the regex engine instead of being an invalid string escape.
emoji_shortcode = re.compile(r':\w+?:')

# Collect every shortcode, convert it to its Unicode emoji, and keep only the
# ones that really resolved to an emoji (unknown shortcodes come back as-is).
emoji_short_names = emoji_shortcode.findall(" ".join(clean_text))
# `language='alias'` replaces the `use_aliases=True` keyword, which the emoji
# package removed in 2.0; both select the alias shortcode table.
emoji_short_names = [emoji.emojize(e, language='alias') for e in emoji_short_names]
emoji_short_names = [e for e in emoji_short_names if emoji.is_emoji(e)]

# Message text with all shortcodes removed, ready for the NLP pipeline.
clean_text_no_emoji = [emoji_shortcode.sub('', t) for t in clean_text]
@Language.component("clean_text")
def clean_text(doc):
    """Reduce a processed document to a space-separated string of lemmas.

    Reads ``doc._.language``, so the ``language`` extension attribute must
    already be populated when this component runs.

    Args:
        doc: The spaCy document handed over by the pipeline.

    Returns:
        A plain ``str`` (not a ``Doc``): the lemmas of all tokens that are
        neither stop words nor punctuation, joined by single spaces, or the
        empty string when the detected language is not ``'en'``.
    """
    # Anything not detected as English contributes no text at all.
    if doc._.language != 'en':
        return ''

    lemmas = (
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
    )
    return ' '.join(lemmas)
# Load the English pipeline and extend its stop-word set with two extra words
# that should never appear in the cloud.
nlp = sp.load('en_core_web_sm')
nlp.Defaults.stop_words |= {"deepnote", "notebook"}

# Order matters: the language detector runs first because the `clean_text`
# component reads `doc._.language`.
nlp.add_pipe('language_detector', last=True)
nlp.add_pipe('clean_text')

# `clean_text` is the final component and returns strings, so the pipeline
# yields one cleaned string per input message.
docs = [*nlp.pipe(clean_text_no_emoji)]

# Emojis first, then the cleaned message text, as one space-separated string.
final_text = ' '.join([*emoji_short_names, *docs])
# The regex used to detect words is a combination of normal words, ASCII art,
# and emojis.
#
# The punctuation / printable character sets are passed through re.escape
# before being placed inside [...]: unescaped, the `\]` sequence they contain
# is read as an escaped `]`, which silently drops the backslash from the class
# (a lone `\` was then tokenised as an "emoji" and runs containing `\` were
# not recognised as ASCII art).

# Two or more consecutive word characters; apostrophes are allowed after the
# first character (e.g. "it's").
normal_word = r"(?:\w[\w']+)"

# Two or more consecutive ASCII punctuation characters (e.g. ":)").
ascii_art = r"(?:[{punctuation}][{punctuation}]+)".format(
    punctuation=re.escape(string.punctuation)
)

# A single non-whitespace character that is neither a word character nor
# ASCII printable. The look-behind inspects the character just consumed.
emoji_reg = r"(?:[^\s])(?<![\w{ascii_printable}])".format(
    ascii_printable=re.escape(string.printable)
)

# Alternatives are tried left to right: words, then ASCII art, then emojis.
regexp = r"{normal_word}|{ascii_art}|{emoji_reg}".format(
    normal_word=normal_word, ascii_art=ascii_art, emoji_reg=emoji_reg
)
# Pixel data of the mask image, handed to WordCloud as its `mask`.
mask = np.array(Image.open('deepnote_mask.png'))

# All rendering options in one place; `regexp` controls how the text is split
# into words. The font is presumably emoji-capable -- verify the file.
cloud_options = dict(
    font_path='/work/OpenSansEmoji.ttf',
    regexp=regexp,
    background_color='black',
    mask=mask,
    contour_width=5,
    contour_color='#00A9FF',
)
wordcloud = WordCloud(**cloud_options).generate(final_text)

# Show the rendered cloud on a 12x12 figure with the axes hidden.
fig = plt.figure(figsize=(12, 12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()