News Data Analysis
Libraries
!pip install gensim==4.3.3
import pandas as pd
from pathlib import Path
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import plotly.express as px
from nltk.stem import WordNetLemmatizer
import numpy as np
np.random.seed(2018)
import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')
nltk.download('omw-1.4')
Data preprocessing
Data preprocessing and cleaning is an important step before any text-mining task. In this step we remove punctuation and stop words and normalize the text as much as possible. After every preprocessing step it is good practice to check the most frequent words in the data, so further down we define a function that plots a bar graph of the n most frequent words.
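To make the cleaning concrete, here is a minimal sketch (the sentence is made up) of what gensim's simple_preprocess plus stop-word and short-token filtering, used throughout this notebook, does:
sample = "The markets rallied today after the central bank made its announcement!"
tokens = [t for t in simple_preprocess(sample) if t not in STOPWORDS and len(t) > 3]
print(tokens)  # lower-cased tokens; punctuation, stop words and short words removed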
search_news_raw=pd.read_csv('/work/news-topic-modeling/data/raw/raw_search_news_dataset.csv')
# keep only the columns of interest (boolean mask over the raw columns)
search_news_raw = search_news_raw.iloc[:, [False, True, True, True, True, False, True, True, False, True]]
latest_news_raw=pd.read_csv('/work/news-topic-modeling/data/raw/raw_latest_news_dataset.csv')
latest_news_raw
# keep only the columns of interest (boolean mask over the raw columns)
latest_news_raw = latest_news_raw.iloc[:, [False, True, True, True, True, False, True, True, False, True]]
latest_news_raw
latest_news_raw=latest_news_raw.rename(columns={'publishedAt':'Date','source.name':'Source'})
search_news_raw=search_news_raw.rename(columns={'publishedAt':'Date','source.name':'Source'})
latest_news_raw['Date']
latest_news_raw['Date']=pd.to_datetime(latest_news_raw['Date'], format='%Y-%m-%dT%H:%M:%SZ')
search_news_raw['Date']=pd.to_datetime(search_news_raw['Date'], format='%Y-%m-%dT%H:%M:%SZ')
# remove unwanted characters and symbols (regex=True is required for pattern
# replacement in recent pandas, which defaults to literal matching)
pattern = r" [^a-zA-Z#] |\n|'s|[:@#$&=^*!?~-]|[()]|[/]|\[|\]|\"|'"
search_news_raw['content'] = search_news_raw['content'].str.replace(pattern, " ", regex=True)
latest_news_raw['content'] = latest_news_raw['content'].str.replace(pattern, " ", regex=True)
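The pattern can be sanity-checked on a made-up snippet before trusting it on the full corpus:
import re
print(re.sub(pattern, " ", "Apple's Q3 results (2024): profits up!"))
# symbols and possessives collapse to spaces; note that multi-digit numbers like 2024 survive this pattern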
latest_news_raw
# func to get the lemma and stemmed form of a word; the stemmer and lemmatizer
# are instantiated once instead of on every call
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()
def lemmatize_stemming(text):
    # lemmatize as a verb first, then stem the result
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    # tokenize, drop stop words and very short tokens, then lemmatize + stem
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
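A quick sanity check on a couple of hypothetical tokens: 'went' lemmatizes (as a verb) to 'go', which the stemmer leaves unchanged, while 'studies' lemmatizes to 'study' and then stems to 'studi'.
print(lemmatize_stemming('went'))     # 'go'
print(lemmatize_stemming('studies'))  # 'studi'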
print(search_news_raw['content'][19])
doc_sample = search_news_raw['content'][19]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))
processed_docs = search_news_raw['content'].fillna('').astype(str).map(preprocess)
processed_docs.head(10)
# join all latest-news contents into one string (space-separated so words at
# article boundaries don't fuse together; fillna avoids spurious 'nan' tokens)
l = ' '.join(latest_news_raw['content'].fillna('').astype(str))
print(l)
!pip install wordcloud==1.9.4
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# redefine preprocess without stemming so the word cloud shows readable words
def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(token)
    return result
pl=preprocess(l)
Latest news wordcloud
import plotly.graph_objects as go
# alias the wordcloud stop-word list so it doesn't shadow gensim's STOPWORDS
from wordcloud import STOPWORDS as WC_STOPWORDS
from wordcloud import WordCloud
# use the wordcloud package's stop-word list as the exclusion set
excluded_words = list(WC_STOPWORDS)
wordcloud_image = WordCloud(max_font_size=50, max_words=100, background_color="white",
                            stopwords=excluded_words).generate(l)
fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image.to_array()))
fig.update_layout(
    height=600,
    xaxis={"visible": False},
    yaxis={"visible": False},
    margin={"t": 0, "b": 0, "l": 0, "r": 0},
    hovermode=False,
    paper_bgcolor="#F9F9FA",
    plot_bgcolor="#F9F9FA",
)
# same figure for the preprocessed tokens (join the token list back into text;
# passing str(pl) would pollute the cloud with brackets, quotes and commas)
wordcloud_image = WordCloud(max_font_size=50, max_words=100, background_color="white",
                            stopwords=excluded_words).generate(' '.join(pl))
fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image.to_array()))
fig.update_layout(
    height=600,
    xaxis={"visible": False},
    yaxis={"visible": False},
    margin={"t": 0, "b": 0, "l": 0, "r": 0},
    hovermode=False,
    paper_bgcolor="#F9F9FA",
    plot_bgcolor="#F9F9FA",
)
Search news wordcloud
search_news_raw['content'].head()
# join all search-news contents into one string
s = ' '.join(search_news_raw['content'].fillna('').astype(str))
print(s)
import os
# word cloud for the raw search-news text
wordcloud_image = WordCloud(max_font_size=50, max_words=100, background_color="white",
                            stopwords=excluded_words).generate(s)
fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image.to_array()))
fig.update_layout(
    height=600,
    xaxis={"visible": False},
    yaxis={"visible": False},
    margin={"t": 0, "b": 0, "l": 0, "r": 0},
    hovermode=False,
    paper_bgcolor="#F9F9FA",
    plot_bgcolor="#F9F9FA",
)
if not os.path.exists("/work/news-topic-modeling/reports/figures/word-cloud.png"):
    fig.write_image("/work/news-topic-modeling/reports/figures/word-cloud.png")
# reuse the preprocess() defined above (stop-word and short-token removal, no stemming)
ps = preprocess(s)
# function to plot the most frequent terms as a horizontal bar chart
def freq_words(tokens, terms=30, title=None):
    fdist = nltk.FreqDist(tokens)
    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})
    # select the `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    d = d.sort_values(by='count', ascending=True)
    fig = px.bar(d, x="count", y="word",
                 hover_data=['word', 'count'], color='count',
                 labels={'count': 'frequency of the word'}, title=title, height=400)
    fig.show()
freq_words(ps)
freq_words(ps)
# same chart for the latest-news tokens
freq_words(pl, title='Top words')
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
# bigram frequencies over the preprocessed search-news tokens
bigram_fd = nltk.FreqDist(nltk.bigrams(ps))
# take the 20 most common bigrams and tabulate them
top_bigrams = bigram_fd.most_common(20)
df = pd.DataFrame({'two words': [bg for bg, _ in top_bigrams],
                   'count': [c for _, c in top_bigrams]})
df['two words'][0]
# convert each ('word1', 'word2') tuple into a plain string
def convertTuple(tup):
    return ' '.join(tup)
df['two words'] = df['two words'].apply(convertTuple)
df
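The collocation utilities imported above can also rank bigrams by pointwise mutual information rather than raw counts, which surfaces phrases whose words co-occur more often than chance. A minimal sketch (the minimum frequency of 5 is an arbitrary choice):
finder = BigramCollocationFinder.from_words(ps)
finder.apply_freq_filter(5)  # ignore bigrams seen fewer than 5 times
print(finder.nbest(BigramAssocMeasures.pmi, 20))  # top 20 bigrams by PMI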
trigram_fd = nltk.FreqDist(nltk.trigrams(ps))
# take the 20 most common trigrams and tabulate them
top_trigrams = trigram_fd.most_common(20)
df1 = pd.DataFrame({'three words': [tg for tg, _ in top_trigrams],
                    'count': [c for _, c in top_trigrams]})
df1['three words'] = df1['three words'].apply(convertTuple)
df1
# function to plot the most frequent two-word phrases
def freq_words(df):
    df = df.sort_values(by='count', ascending=True)
    fig = px.bar(df, x="count", y="two words",
                 color='count',
                 labels={'count': 'frequency'}, title='Two-word phrases', height=400)
    fig.show()
    if not os.path.exists("/work/news-topic-modeling/reports/figures/two-words-chart.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/two-words-chart.png")
freq_words(df)
# function to plot the most frequent three-word phrases
def freq_words(df):
    df = df.sort_values(by='count', ascending=True)
    fig = px.bar(df, x="count", y="three words",
                 hover_data=['three words', 'count'], color='count',
                 labels={'count': 'frequency'}, title='Three-word phrases', height=400)
    fig.show()
    if not os.path.exists("/work/news-topic-modeling/reports/figures/three-words-chart.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/three-words-chart.png")
freq_words(df1)
search_news_raw
df = pd.DataFrame({'News source': search_news_raw['Source'].value_counts().index,
                   'Count': search_news_raw['Source'].value_counts().values})
fig = px.pie(df, values='Count', names='News source', title='Search news source distribution')
fig.show()
df = pd.DataFrame({'News source': latest_news_raw['Source'].value_counts().index,
                   'Count': latest_news_raw['Source'].value_counts().values})
fig = px.pie(df, values='Count', names='News source', title='Latest news source distribution')
fig.show()
# save the latest-news pie chart (the figure currently bound to `fig`)
if not os.path.exists("/work/news-topic-modeling/reports/figures/pie-chart.png"):
    fig.write_image("/work/news-topic-modeling/reports/figures/pie-chart.png")