# Standard library
import re
import string
from pathlib import Path

# Statistical packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Text
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud,STOPWORDS

#ML model
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import mean_squared_error,f1_score,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score
import joblib

#Preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

# Over sampling
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import Pipeline
# Default figure size for every plot, and a fixed seed for NumPy's global RNG.
plt.rc('figure', figsize=(8, 7.5))
np.random.seed(2021)

# Read the articles dataset; the first CSV column becomes the row index.
df = pd.read_csv('news_articles.csv', index_col=0)
n_rows, n_cols = df.shape
print(f"Number of rows/records: {n_rows}")
print(f"Number of columns/variables: {n_cols}")
df.head()
# Per-column summary of missing values: absolute count and share of all rows.
na_counts = df.isna().sum()
na_percent = [f"{pct:.2f}%" for pct in (na_counts / df.shape[0] * 100).tolist()]
NA = pd.DataFrame(
    data=[na_counts.tolist(), na_percent],
    columns=df.columns,
    index=['NA Count', 'NA Percent'],
)
# Transpose to one row per original column, most incomplete columns first.
NA = NA.T.sort_values(by='NA Count', ascending=False)
NA.style.background_gradient(cmap="summer", subset=['NA Count'])
# Side-by-side bar charts of the ten most frequent sources and authors.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
source_name = df["source_name"].dropna().value_counts().head(10)
author = df["author"].dropna().value_counts().head(10)

sns.barplot(x=source_name, y=source_name.index, palette='summer', ax=ax1)
sns.barplot(x=author, y=author.index, palette='summer', ax=ax2)
sns.despine(bottom=True, left=True)
ax1.set(title='Top 10 Source')
ax2.set(title='Top 10 Author')

# Display names that overwrite the raw author strings on the second chart.
# NOTE(review): the list is hard-coded and applied by position, so it only
# matches the bars while the top-10 author ranking stays as it was when the
# list was written -- confirm against the current data.
yticklabels = [
    'TAP',
    'Reuters Editorial',
    'CBS NEWS',
    'BBC FB',
    'AL Jazeera',
    'The Irish Times',
    'BBC News',
    'CBS/AP',
    'DAN Cancian',
    'AP',
]
ax2.set_yticklabels(yticklabels)
# Donut chart showing the share of articles flagged as "top".
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Top Article', size=20, color="black")
sizes = df["top_article"].dropna().value_counts()
# value_counts() orders by frequency, so derive each slice's label and offset
# from the flag value it actually represents instead of assuming that the
# "not top" class is always the most frequent one.
labels = ["Top" if flag else "Not Top" for flag in sizes.index]
explode = [0.3 if flag else 0.05 for flag in sizes.index]
ax.pie(
    sizes,
    explode=explode,
    colors=sns.color_palette("Set2"),
    startangle=60,
    labels=labels,
    autopct='%1.0f%%',
    pctdistance=0.9,
)
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()
# Box plots of the four engagement counters, one box per counter.
eng = [
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
    'engagement_comment_plugin_count',
]
# melt() reshapes the four columns into (variable, value) pairs for seaborn.
ax = sns.boxplot(x="variable", y="value", data=pd.melt(df[eng]), palette='summer')
ax.set_title('Engagement Boxplots')
plt.xticks(rotation=45)
# Matplotlib scale names are lowercase; 'Symlog' is rejected as an unknown
# scale by current releases.  The symmetric-log scale can display zero counts.
ax.set_yscale('symlog')
plt.show()
# Frequency table of the plugin-comment counter.
# Name the index and the count column explicitly so the result does not depend
# on the default column names value_counts()/reset_index() produce, which
# differ between pandas 1.x and 2.x.
plugin_counts = (
    df["engagement_comment_plugin_count"]
    .value_counts()
    .rename_axis('engagement_comment_plugin')
    .reset_index(name='Counts')
    .astype(int)
)
plugin_counts.style.background_gradient(cmap="summer", subset=['Counts'])

# Preview of the raw headlines before cleaning.
df['title'].head()
def clean_title(x: str) -> str:
    """Normalise a headline for text modelling.

    The text is lower-cased and then stripped, in this order, of
    square-bracketed spans, angle-bracket tags, URLs, punctuation, newlines
    and every token that contains a digit.  Whitespace left behind by removed
    tokens is not collapsed.

    Args:
        x: Raw headline text.

    Returns:
        The cleaned, lower-cased text.
    """
    x = x.lower()
    # All patterns are raw strings so that regex escapes such as \[ and \S
    # are not treated as (invalid) Python string escapes.
    # Square-bracketed spans, e.g. "[video]".
    x = re.sub(r'\[.*?\]', '', x)
    # Angle-bracket tags, e.g. "<b>".
    x = re.sub(r'<.*?>+', '', x)
    # Hyperlinks.
    x = re.sub(r'https?://\S+|www\.\S+', '', x)
    # Punctuation.
    x = re.sub(r'[%s]' % re.escape(string.punctuation), '', x)
    x = re.sub(r'\n', '', x)
    # Words containing digits.
    x = re.sub(r'\w*\d\w*', '', x)
    return x
# Store a cleaned copy of every headline; titles are cast to text first.
df['clean_title'] = df['title'].astype(str).map(clean_title)
df['clean_title'].head()

# Single VADER analyzer instance, reused by compound_score().
analyzer = SentimentIntensityAnalyzer()
def compound_score(txt):
    """Return the ``compound`` entry of the analyzer's polarity scores for *txt*."""
    scores = analyzer.polarity_scores(txt)
    return scores["compound"]
## Sentiments
def sentiment(score, threshold=0.5):
    """Map a compound polarity score to a sentiment label.

    Args:
        score: Compound polarity value to classify.
        threshold: Cut-off applied symmetrically around zero.  Defaults to
            0.5, the value this function has always used.  The VADER README
            suggests 0.05 for its compound score; pass ``threshold=0.05`` to
            follow that convention.

    Returns:
        ``"Positive"`` when ``score >= threshold``, ``"Negative"`` when
        ``score <= -threshold``, otherwise ``"Neutral"`` (this includes NaN,
        for which both comparisons are false).
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"
# Score every cleaned headline, then attach the raw score and its label.
polarity_scores = df["clean_title"].astype("str").map(compound_score)
df["Sentiment_Score"] = polarity_scores
df["Sentiment"] = df["Sentiment_Score"].map(sentiment)
df.head()

# Number of headlines per sentiment label.
sentiment_ax = sns.countplot(data=df, x="Sentiment", palette="summer")
sentiment_ax.set_title('Sentiment Distribution')

# Peek at a few of the stop words that the word clouds will ignore.
np.array(list(STOPWORDS))[:5]
# Word clouds of the most frequent words in titles and in descriptions.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[14, 14], facecolor=None)

wc = WordCloud(width=800, height=800, background_color="white",
               min_font_size=10, repeat=True, stopwords=STOPWORDS)
# Join with a space so the last word of one text is not glued to the first
# word of the next, and drop missing values so the literal string "nan" is
# not counted as a word.
wc.generate(" ".join(df['title'].dropna().astype(str)))
ax1.axis("off")
ax1.imshow(wc, interpolation="bilinear")
ax1.set_title('Common Words Used in Title', fontsize=16)

wc2 = WordCloud(width=800, height=800, background_color="white",
                min_font_size=10, repeat=True, stopwords=STOPWORDS)
wc2.generate(" ".join(df['description'].dropna().astype(str)))
ax2.axis("off")
ax2.imshow(wc2, interpolation="bilinear")
ax2.set_title('Common Words Used in description', fontsize=16)
# Parse the publication timestamp and derive calendar features from it.
df['published_at'] = pd.to_datetime(df['published_at'])
# The vectorised .dt accessor replaces a Python-level lambda call per row.
published = df['published_at'].dt
df['Day_Of_Week'] = published.dayofweek  # Monday == 0
df['Month'] = published.month
df['Year'] = published.year
# Daily mean of each engagement counter, drawn as three lines on one axes.
fig, ax1 = plt.subplots(figsize=[15, 8], facecolor=None)

date_mean = df.copy()
# Truncate timestamps to midnight so that rows group by calendar day.
date_mean['published_at'] = pd.to_datetime(date_mean['published_at']).dt.normalize()
# numeric_only=True keeps text columns out of the aggregation; pandas 2.x
# raises a TypeError instead of silently dropping them.
date_mean = (
    date_mean.groupby(by='published_at')
    .mean(numeric_only=True)
    .reset_index()
)

for column, label in (
    ("engagement_reaction_count", 'reaction'),
    ("engagement_comment_count", 'comment'),
    ("engagement_share_count", 'share'),
):
    sns.lineplot(data=date_mean, x="published_at", y=column, label=label, ax=ax1)

ax1.set_ylabel('Engagement Counts')
ax1.set_title('Engagements over the time', fontsize=16)
# Correlation heatmap of the numeric features.
corr_input = df.drop(columns=['Year', 'engagement_comment_plugin_count'])
# Select numeric/boolean columns explicitly: DataFrame.corr() no longer drops
# text columns silently in pandas 2.x and raises on them instead.
corr_matrix = corr_input.select_dtypes(include=['number', 'bool']).corr()

# Start a fresh figure so the heatmap is not drawn onto whatever axes happen
# to be current when this runs as a plain script.
plt.figure()
dataplot = sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
dataplot.set_title('Coorelation HeatMap', fontsize=16)
# displaying heatmap
plt.show()
# Guard against missing text before vectorising.  Assign the result back
# instead of calling fillna(inplace=True) on a column selection, which is
# deprecated and not guaranteed to update df.
df['clean_title'] = df['clean_title'].fillna('Missing')

# Unigram + bigram TF-IDF features built from the cleaned headlines.
tfidf_vec = TfidfVectorizer(
    ngram_range=(1, 2),
    dtype=np.float32,
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True,
)
train_tfidf = tfidf_vec.fit_transform(df['clean_title'])

# Articles without a top_article flag are treated as not-top (0).
list_labels = df["top_article"].fillna(0)
list_labels.value_counts()

# Balance the classes with SMOTE over-sampling.
# NOTE(review): X, Y are resampled from the full dataset, before any
# train/test split, so a hold-out set drawn from them will contain synthetic
# rows interpolated from training rows -- prefer resampling only the
# training portion.
OS = SMOTE()
X, Y = OS.fit_resample(train_tfidf, list_labels)
# Popularity is log1p of the summed engagement counters.
engagement_total = (
    df['engagement_reaction_count']
    + df['engagement_comment_count']
    + df['engagement_share_count']
    + df['engagement_comment_plugin_count']
)
df['Popularity_Score'] = np.log1p(engagement_total)

sns.kdeplot(df['Popularity_Score'])
plt.title('Popularity Score KDE Plot', fontsize=16)

# Mean popularity per source, highest first.  The score column is selected
# before averaging so text columns are never passed to mean(), which raises
# a TypeError on them in pandas 2.x.
(
    df.groupby(by='source_name')['Popularity_Score']
    .mean()
    .to_frame()
    .sort_values(by='Popularity_Score', ascending=False)
)
# Hold out 10% of the real articles first, stratified on the label.
# Over-sampling is applied afterwards and only to the training portion, so the
# test set contains no synthetic rows and nothing derived from it reaches
# training.
X_train, X_test, y_train, y_test = train_test_split(
    train_tfidf,
    list_labels,
    test_size=0.1,
    stratify=list_labels,
    random_state=40,
)

modeltop = LGBMClassifier(
    verbose=-1,
    learning_rate=0.5,
    max_depth=20,
    num_leaves=50,
    n_estimators=120,
    max_bin=2000,
)

# Cross-validate with SMOTE inside an imblearn Pipeline: the sampler only runs
# during fit, so each fold is resampled from its own training part.
# cross_val_score clones the pipeline, leaving modeltop itself unfitted here.
cv_pipeline = Pipeline([("oversample", SMOTE()), ("classifier", modeltop)])
scores = cross_val_score(
    cv_pipeline, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross Validation F1 Scores : ", scores)

# Final model: fit on the over-sampled training split, score on real data.
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
modeltop.fit(X_train, y_train)
predtop = modeltop.predict(X_test)
print("f1 score : ", round(f1_score(y_test, predtop), 2))
print("accuracy score : ", round(accuracy_score(y_test, predtop), 2))

# joblib.dump does not create missing directories.
Path('Models').mkdir(parents=True, exist_ok=True)
joblib.dump(modeltop, 'Models/lgb_top.pkl')
y_test.value_counts()

# Confusion matrix with rows = actual and columns = predicted, matching the
# axis labels below.  labels=[0, 1] pins the order to (not-top, top), which
# the tick labels follow.
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, predtop, labels=[0, 1])
class_names = ['Not_Top', 'Top']
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor='black',
    linewidth=1,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# Regression target: the popularity score, with missing scores filled with 1.
list_labels = df["Popularity_Score"].fillna(1)
X_train, X_test, y_train, y_test = train_test_split(
    train_tfidf,
    list_labels,
    test_size=0.2,
    random_state=40,
)

model1 = LGBMRegressor(
    verbose=-1,
    learning_rate=0.01,
    max_depth=20,
    num_leaves=50,
    n_estimators=150,
)
scores = cross_val_score(
    model1, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
# The scorer returns negated RMSE, so flip the sign for display.
print("Cross Validation RMSE Scores : ", -scores)

model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)
# Arguments follow scikit-learn's (y_true, y_pred) convention.
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, pred1)), 2))

# joblib.dump does not create missing directories.
Path('Models').mkdir(parents=True, exist_ok=True)
joblib.dump(model1, 'Models/lgb_pop.pkl')
def title_score(title):
    """Print and return both model predictions for a raw headline.

    The headline is cleaned with ``clean_title``, vectorised with the fitted
    ``tfidf_vec`` and passed to ``modeltop`` and ``model1``.

    Args:
        title: Raw headline text.

    Returns:
        A ``(top_cat, pop)`` tuple holding the prediction arrays of
        ``modeltop`` and ``model1`` for this single headline.
    """
    features = tfidf_vec.transform([clean_title(title)])
    top_cat = modeltop.predict(features)
    pop = model1.predict(features)

    predicted_score = pop[0]
    print("Top Article :", top_cat[0].astype(bool))
    print("Popularity Score :", round(predicted_score, 2))
    # expm1 inverts the log1p used to build the popularity score.
    print("Total Engagement :", int(np.expm1(predicted_score)))
    return top_cat, pop
# Look at one headline that is flagged as a top article.
df.loc[df['top_article'] == 1, 'title'].values[10]

# Score a handful of example headlines with both models.
for sample_title in (
    "Here's what Hurricane Dorian is expected to do as it crawls toward the US",
    "Here Are the States With the Lowest COVID-19 Vaccination Rates",
    "Farmworker Found Guilty of Murdering Iowa Student Is Sentenced to Life in Prison",
    "Plea Deal Offered to U.S. Capitol Rioter Accused of Grabbing Officer's Gas Mask",
):
    title_score(sample_title)