News Data Analysis
Libraries
!pip install gensim==4.3.3
import pandas as pd
from pathlib import Path
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')
nltk.download('omw-1.4')
Data preprocessing
Data preprocessing and cleaning is an important step before any text mining task. In this step, we will remove punctuation and stop words and normalize the news text as much as possible. After every preprocessing step, it is good practice to check the most frequent words in the data. Therefore, let’s define a function that plots a bar graph of the n most frequent words in the data.
# Load the raw search-news export and keep a subset of its columns.
search_news_raw = pd.read_csv('/work/news-topic-modeling/data/raw/raw_search_news_dataset.csv')
# Ten-entry boolean column mask: the columns at positions 0, 5 and 8 are dropped.
search_news_raw = search_news_raw.iloc[:, [False, True, True, True, True, False, True, True, False, True]]
# Same treatment for the latest-news export (bare names display the frame in a notebook).
latest_news_raw = pd.read_csv('/work/news-topic-modeling/data/raw/raw_latest_news_dataset.csv')
latest_news_raw
latest_news_raw = latest_news_raw.iloc[:, [False, True, True, True, True, False, True, True, False, True]]
latest_news_raw
# Give the date and source columns shorter names.
latest_news_raw = latest_news_raw.rename(columns={'publishedAt': 'Date', 'source.name': 'Source'})
search_news_raw = search_news_raw.rename(columns={'publishedAt': 'Date', 'source.name': 'Source'})
latest_news_raw['Date']
# Parse timestamps of the form 2024-01-31T12:34:56Z into datetimes.
latest_news_raw['Date'] = pd.to_datetime(latest_news_raw['Date'], format='%Y-%m-%dT%H:%M:%SZ')
search_news_raw['Date'] = pd.to_datetime(search_news_raw['Date'], format='%Y-%m-%dT%H:%M:%SZ')
# Replace unwanted text with a space: a lone non-letter character between two
# spaces, newlines, possessive 's, and the listed punctuation/symbol characters.
# The pattern is a raw string so the regex escapes reach the regex engine
# unchanged, and regex=True is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default (which would leave the text uncleaned).
unwanted_pattern = r" [^a-zA-Z#] |\n|'s|[:@#$&=^*!?~-]|[()]|[/]|\[|\]|\"|'"
search_news_raw['content'] = search_news_raw['content'].str.replace(unwanted_pattern, " ", regex=True)
latest_news_raw['content'] = latest_news_raw['content'].str.replace(unwanted_pattern, " ", regex=True)
latest_news_raw
# Reduce a single word to the stem of its verb lemma.
def lemmatize_stemming(text):
    """Return the English Snowball stem of the verb lemma of *text*.

    *text* is lemmatized with WordNet using ``pos='v'`` and the resulting
    lemma is then stemmed.
    """
    stemmer = SnowballStemmer(language='english')
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
    
def preprocess(text):
    """Tokenize *text* and return the stemmed form of every kept token.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept when
    it is longer than three characters and is not one of gensim's
    ``STOPWORDS``; each kept token is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in stop_words
    ]
# Show one article (label 19) three ways: raw text, whitespace-split words,
# and the output of preprocess().
doc_sample = search_news_raw['content'][19]
print(doc_sample)
print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))
# Apply the same preprocessing to every article; missing content becomes ''
# before tokenizing.
processed_docs = (
    search_news_raw['content']
    .fillna('')
    .astype(str)
    .map(preprocess)
)
processed_docs.head(10)
# Concatenate every latest-news article into one string for corpus-level views.
# Missing content is replaced by '' so the literal text "nan" is not added,
# and articles are separated by a space so the last word of one article is
# not fused with the first word of the next.
l = ' '.join(latest_news_raw['content'].fillna('').astype(str))
print(l)
!pip install wordcloud==1.9.4
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def preprocess(text):
    """Tokenize *text* and return the kept tokens unchanged.

    This redefinition applies no lemmatizing or stemming: tokens from
    ``gensim.utils.simple_preprocess`` are kept as they are when they are
    longer than three characters and not in gensim's ``STOPWORDS``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    kept = []
    for tok in gensim.utils.simple_preprocess(text):
        if tok in stop_words or len(tok) <= 3:
            continue
        kept.append(tok)
    return kept
# Token list for the concatenated latest-news text.
pl = preprocess(l)
Latest news wordcloud
!pip install dash==2.18.2
import plotly.graph_objects as go
from dash import dcc
from wordcloud import STOPWORDS
from wordcloud import WordCloud
    # add search query to list of exclusions
# List copy of wordcloud's STOPWORDS; it is not passed to WordCloud below.
excluded_words = list(STOPWORDS)
# Build the word cloud from the concatenated latest-news text.
cloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
)
wordcloud_image = cloud.generate(str(l))
# Show the cloud as an image trace with axes, margins and hover turned off.
fig = go.Figure(data=[go.Image(z=wordcloud_image)])
fig.update_layout(
    height=600,
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    margin=dict(t=0, b=0, l=0, r=0),
    hovermode=False,
    paper_bgcolor="#F9F9FA",
    plot_bgcolor="#F9F9FA",
)
    
import plotly.graph_objects as go
from dash import dcc
from wordcloud import STOPWORDS
from wordcloud import WordCloud
    # add search query to list of exclusions
# List copy of wordcloud's STOPWORDS; it is not passed to WordCloud below.
excluded_words = list(STOPWORDS)
# Build the word cloud from the filtered latest-news tokens. The tokens are
# joined with spaces: str(pl) would hand WordCloud the Python list repr
# (brackets, quotes and commas) instead of plain text.
wordcloud_image = WordCloud(
    max_font_size=50, max_words=100, background_color="white"
).generate(' '.join(pl))
# Show the cloud as an image trace with axes, margins and hover turned off.
fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image))
fig.update_layout(
    height=600,
    xaxis={"visible": False},
    yaxis={"visible": False},
    margin={"t": 0, "b": 0, "l": 0, "r": 0},
    hovermode=False,
    paper_bgcolor="#F9F9FA",
    plot_bgcolor="#F9F9FA",
)
    
Search news wordcloud
search_news_raw['content'].head()
# Concatenate every search-news article into one string for corpus-level views.
# Missing content is replaced by '' so the literal text "nan" is not added,
# and articles are separated by a space so the last word of one article is
# not fused with the first word of the next.
s = ' '.join(search_news_raw['content'].fillna('').astype(str))
# Print the string that was just built (the original printed `l` here).
print(s)
import os
import plotly.graph_objects as go
from dash import dcc
from wordcloud import STOPWORDS
from wordcloud import WordCloud
    # add search query to list of exclusions
# List copy of wordcloud's STOPWORDS; it is not passed to WordCloud below.
excluded_words = list(STOPWORDS)
# Build the word cloud from the concatenated search-news text.
cloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
)
wordcloud_image = cloud.generate(str(s))
# Show the cloud as an image trace with axes, margins and hover turned off.
fig = go.Figure(data=[go.Image(z=wordcloud_image)])
fig.update_layout(
    height=600,
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    margin=dict(t=0, b=0, l=0, r=0),
    hovermode=False,
    paper_bgcolor="#F9F9FA",
    plot_bgcolor="#F9F9FA",
)
# Export the figure only when no file exists at the target path yet.
word_cloud_png = "/work/news-topic-modeling/reports/figures/word-cloud.png"
if not os.path.exists(word_cloud_png):
    fig.write_image(word_cloud_png)
    
def preprocess(text):
    """Tokenize *text* and return the kept tokens unchanged.

    Tokens from ``gensim.utils.simple_preprocess`` are kept when they are
    longer than three characters and not in gensim's ``STOPWORDS``; no
    lemmatizing or stemming is applied.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        tok
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stop_words and len(tok) > 3
    ]
# Token list for the concatenated search-news text.
ps = preprocess(s)
# Plot the most frequent terms of a token sequence.
def freq_words(x, terms = 30):
    """Show a bar chart of the ``terms`` most frequent items in ``x``.

    Args:
        x: Iterable of tokens to count (passed to ``nltk.FreqDist``).
        terms: Number of most frequent tokens to plot (default 30).

    Returns:
        None. The chart is displayed with ``fig.show()``.
    """
    fdist = nltk.FreqDist(x)
    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})
    # Keep the `terms` rows with the highest counts, then order them by
    # ascending count before plotting.
    d = words_df.nlargest(columns="count", n=terms)
    d = d.sort_values(by='count', ascending=True)
    fig = px.bar(d, x="count", y="word",
                 hover_data=['word', 'count'], color='count',
                 labels={'count': 'frequency of the word'}, height=400)
    fig.show()
freq_words(ps)
# Plot the most frequent terms of a token sequence, with a chart title.
def freq_words(x, terms = 30):
    """Show a titled bar chart of the ``terms`` most frequent items in ``x``.

    Args:
        x: Iterable of tokens to count (passed to ``nltk.FreqDist``).
        terms: Number of most frequent tokens to plot (default 30).

    Returns:
        None. The chart is displayed with ``fig.show()``.
    """
    fdist = nltk.FreqDist(x)
    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})
    # Keep the `terms` rows with the highest counts, then order them by
    # ascending count before plotting.
    d = words_df.nlargest(columns="count", n=terms)
    d = d.sort_values(by='count', ascending=True)
    fig = px.bar(d, x="count", y="word",
                 hover_data=['word', 'count'], color='count',
                 labels={'count': 'frequency of the word'}, title='Top words', height=400)
    fig.show()
freq_words(pl)
import nltk
nltk.download('stopwords')
import nltk
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
# Frequency of single tokens in the latest-news text.
word_fd = nltk.FreqDist(pl)
# Frequency of adjacent token pairs in the search-news text.
bigram_fd = nltk.FreqDist(nltk.bigrams(ps))
bigram_fd.most_common()
# Ask for the top 20 once. Calling most_common() inside a loop re-sorts the
# whole distribution on every iteration, and indexing it with range(20)
# raises IndexError when fewer than 20 distinct bigrams exist.
top_bigrams = bigram_fd.most_common(20)
l1 = [phrase for phrase, _ in top_bigrams]
l2 = [count for _, count in top_bigrams]
df = pd.DataFrame({'two words': l1, 'count': l2})
df['two words'][0]
def convertTuple(tup):
    """Return the strings in *tup* joined into one space-separated string."""
    return ' '.join(tup)
# Turn each bigram tuple into a single "word word" string for plotting.
df['two words'] = df['two words'].apply(convertTuple)
df
# Frequency of adjacent token triples in the search-news text.
trigram_fd = nltk.FreqDist(nltk.trigrams(ps))
trigram_fd.most_common()
# Ask for the top 20 once. Calling most_common() inside a loop re-sorts the
# whole distribution on every iteration, and indexing it with range(20)
# raises IndexError when fewer than 20 distinct trigrams exist.
top_trigrams = trigram_fd.most_common(20)
l1 = [phrase for phrase, _ in top_trigrams]
l2 = [count for _, count in top_trigrams]
df1 = pd.DataFrame({'three words': l1, 'count': l2})
# Turn each trigram tuple into a single "word word word" string for plotting.
df1['three words'] = df1['three words'].apply(convertTuple)
df1
# Plot the two-word phrase counts and export the chart.
def freq_words(df):
    """Show a bar chart of two-word phrase counts and save it as a PNG.

    Args:
        df: DataFrame with a 'two words' column (phrase text) and a 'count'
            column (phrase frequency).

    Returns:
        None. The chart is displayed, and written to
        reports/figures/two-words-chart.png only when no file exists at
        that path yet.
    """
    # Order rows by ascending count before plotting.
    df = df.sort_values(by='count', ascending=True)
    fig = px.bar(df, x="count", y="two words",
                 color='count',
                 labels={'count': 'frequency'}, title='Two word phrases ', height=400)
    fig.show()
    if not os.path.exists("/work/news-topic-modeling/reports/figures/two-words-chart.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/two-words-chart.png")
freq_words(df)
def freq_words(df):
    """Show a bar chart of three-word phrase counts and save it as a PNG.

    Args:
        df: DataFrame with a 'three words' column (phrase text) and a
            'count' column (phrase frequency).

    Returns:
        None. The chart is displayed, and written to
        reports/figures/three-words-chart.png only when no file exists at
        that path yet.
    """
    # Order rows by ascending count before plotting.
    df = df.sort_values(by='count', ascending=True)
    fig = px.bar(df, x="count", y="three words",
                 hover_data=['three words', 'count'], color='count',
                 labels={'count': 'frequency'}, title='Three word phrases', height=400)
    fig.show()
    if not os.path.exists("/work/news-topic-modeling/reports/figures/three-words-chart.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/three-words-chart.png")
freq_words(df1)
search_news_raw
# Pie chart of how many search-news articles each source contributed.
source_counts = search_news_raw['Source'].value_counts()
df = pd.DataFrame({
    'News source': source_counts.index,
    'Count': source_counts.values,
})
fig = px.pie(df, values='Count', names='News source', title='News source distribution')
fig.show()
# Same chart for the latest-news data; `df` and `fig` are rebound here.
source_counts = latest_news_raw['Source'].value_counts()
df = pd.DataFrame({
    'News source': source_counts.index,
    'Count': source_counts.values,
})
fig = px.pie(df, values='Count', names='News source', title='News source distribution')
fig.show()
import os
# Export the latest-news pie chart only when no file exists at the path yet.
pie_chart_png = "/work/news-topic-modeling/reports/figures/pie-chart.png"
if not os.path.exists(pie_chart_png):
    fig.write_image(pie_chart_png)