import nltk
nltk.download("stopwords")
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# ready stopwords
from nltk.corpus import stopwords
# NLTK's English stopword list plus a few extra tokens to filter out,
# assembled in a single expression.
stop_words = stopwords.words("english") + ["from", "subject", "re", "edu", "use"]
# Read the newsgroups JSON file into a DataFrame, then list the distinct
# values found in its target_names column.
newsgroups = pd.read_json(
    "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json"
)
print(newsgroups["target_names"].unique())
['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']
newsgroups.head()  # bare expression: shown as the cell result in a notebook

# Remove extraneous information from every post before tokenizing.
# The regex patterns are raw strings so that \S and \s reach the regex engine
# unchanged instead of being flagged as invalid string escape sequences.
data = newsgroups.content.values.tolist()
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]  # remove email addresses
data = [re.sub(r'\s+', ' ', sent) for sent in data]  # collapse whitespace runs (incl. newlines) to one space
data = [sent.replace("'", "") for sent in data]  # remove single quotes
pprint(data[:1])
['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
'15 I was wondering if anyone out there could enlighten me on this car I saw '
'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
'early 70s. It was called a Bricklin. The doors were really small. In '
'addition, the front bumper was separate from the rest of the body. This is '
'all I know. If anyone can tellme a model name, engine specs, years of '
'production, where this car is made, history, or whatever info you have on '
'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
'your neighborhood Lerxst ---- ']
# tokenize
def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one token list per input item.

    Each item of ``sentences`` is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    for document in sentences:
        tokens = gensim.utils.simple_preprocess(str(document), deacc=True)
        yield tokens
# Materialize the lazy tokenizer output into a list so the tokenized
# documents can be indexed and traversed more than once.
data_words = [*sent_to_words(data)]
print(data_words[0:1])
[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]
# Build the phrase models. The bigram model is trained on the token lists;
# the trigram model is trained on those lists after the bigram model has
# been applied to them. Each trained model is then wrapped in a Phraser.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Show the first document after applying the bigram and then the trigram model.
print(trigram_mod[bigram_mod[data_words[0]]])
['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()``
            and tokenized by ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of kept tokens per input document.
    """
    # Build the lookup set once per call: ``stop_words`` is a list, so testing
    # every token against it directly costs a linear scan per token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding one transformed document per input document.
    """
    transformed = []
    for tokens in texts:
        transformed.append(bigram_mod[tokens])
    return transformed
def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list holding one transformed document per input document.
    """
    transformed = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        transformed.append(trigram_mod[with_bigrams])
    return transformed
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    module-level ``nlp`` pipeline; the ``lemma_`` of every resulting token
    whose ``pos_`` is in ``allowed_postags`` is kept.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Container of POS tag names to keep. The default is
            an immutable tuple so it cannot be mutated across calls.

    Returns:
        A list with one list of lemma strings per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        )
    return texts_out
# Load the spaCy 'en_core_web_sm' model with the parser and NER components
# disabled; lemmatization() reads this module-level ``nlp`` object.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Preprocessing chain: stopword removal -> bigram merging -> lemmatization.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(data_words_bigrams)
print(data_lemmatized[0:1])
[['where', 's', 'thing', 'car', 'nntp_poste', 'host', 'rac_wam', 'umd', 'organization', 'university', 'park', 'line', 'wonder', 'enlighten', 'car', 'saw', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]
# Build the gensim Dictionary from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# ``texts`` is an alias for the lemmatized documents.
texts = data_lemmatized

# Convert every document to its bag-of-words representation.
corpus = list(map(id2word.doc2bow, texts))
print(corpus[0:1])
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)]]
# Train the LDA topic model (15 topics, fixed random_state) on the
# bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Print the model's topics.
pprint(lda_model.print_topics())

# Apply the trained model to the corpus.
doc_lda = lda_model[corpus]
[(0,
'0.530*"ax" + 0.082*"_" + 0.042*"max" + 0.023*"internet" + 0.016*"season" + '
'0.011*"scott" + 0.011*"ei" + 0.011*"cx" + 0.009*"mc" + 0.008*"rlk"'),
(1,
'0.042*"michigan" + 0.023*"relation" + 0.023*"uk" + 0.020*"historical" + '
'0.019*"radius" + 0.018*"roman" + 0.017*"december" + 0.016*"vol" + '
'0.007*"glory" + 0.006*"space_dig"'),
(2,
'0.019*"do" + 0.016*"say" + 0.014*"people" + 0.013*"think" + 0.012*"go" + '
'0.012*"write" + 0.012*"make" + 0.011*"know" + 0.009*"see" + 0.009*"well"'),
(3,
'0.025*"key" + 0.016*"government" + 0.015*"state" + 0.014*"public" + '
'0.014*"law" + 0.011*"system" + 0.011*"provide" + 0.010*"physical" + '
'0.009*"issue" + 0.009*"national"'),
(4,
'0.159*"window" + 0.148*"file" + 0.064*"entry" + 0.053*"screen" + '
'0.035*"program" + 0.022*"display" + 0.016*"rob" + 0.013*"adam" + '
'0.012*"default" + 0.010*"application"'),
(5,
'0.037*"goal" + 0.031*"pen" + 0.026*"islam" + 0.026*"iran" + '
'0.025*"generate" + 0.024*"smith" + 0.024*"muslim" + 0.022*"channel" + '
'0.019*"ltd" + 0.015*"islamic"'),
(6,
'0.072*"israel" + 0.038*"israeli" + 0.035*"pin" + 0.027*"wire" + '
'0.023*"arab" + 0.022*"jew" + 0.022*"jewish" + 0.015*"bomb" + '
'0.014*"processor" + 0.013*"palestinian"'),
(7,
'0.049*"car" + 0.049*"space" + 0.013*"earth" + 0.013*"engine" + '
'0.012*"launch" + 0.011*"moon" + 0.011*"nasa" + 0.011*"air" + '
'0.011*"mission" + 0.011*"telnet"'),
(8,
'0.042*"line" + 0.038*"organization" + 0.017*"nntp_poste" + 0.017*"write" + '
'0.017*"university" + 0.015*"host" + 0.014*"get" + 0.014*"article" + '
'0.010*"reply" + 0.009*"need"'),
(9,
'0.106*"gun" + 0.045*"drug" + 0.034*"police" + 0.033*"firearm" + '
'0.027*"convince" + 0.023*"cop" + 0.022*"fire" + 0.018*"safety" + '
'0.018*"militia" + 0.015*"dream"'),
(10,
'0.050*"team" + 0.049*"game" + 0.035*"win" + 0.032*"play" + 0.029*"year" + '
'0.021*"hockey" + 0.015*"fan" + 0.012*"trade" + 0.012*"nhl" + 0.012*"score"'),
(11,
'0.027*"armenian" + 0.023*"soldier" + 0.022*"war" + 0.021*"greek" + '
'0.020*"attack" + 0.020*"kill" + 0.020*"village" + 0.017*"turk" + '
'0.015*"rsa" + 0.014*"turkish"'),
(12,
'0.049*"league" + 0.028*"pitch" + 0.023*"milwaukee" + 0.017*"era" + '
'0.011*"monday" + 0.006*"innings" + 0.005*"bullpen" + 0.004*"rbi" + '
'0.003*"batting" + 0.001*"brewer"'),
(13,
'0.063*"notice" + 0.041*"md" + 0.031*"zone" + 0.022*"uunet" + '
'0.022*"stephen" + 0.018*"rs" + 0.013*"cd_rom" + 0.010*"organisation" + '
'0.007*"tiff" + 0.005*"asshole"'),
(14,
'0.036*"drive" + 0.033*"system" + 0.019*"card" + 0.017*"bit" + 0.016*"use" + '
'0.015*"problem" + 0.015*"driver" + 0.015*"software" + 0.014*"mac" + '
'0.013*"machine"')]
# Switch pyLDAvis to notebook mode, then prepare the visualization from the
# trained model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis  # bare expression: shown as the cell result in a notebook