# Analyzing and Predicting Consumer Engagement

# Standard library
import os
import re
import string

# Statistical packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Text
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS

# ML model
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import joblib

# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Over sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import NearMiss

plt.rc('figure', figsize=(8, 7.5))
np.random.seed(2021)

# Read the article dataset; the first CSV column is used as the row index.
df = pd.read_csv('news_articles.csv', index_col=0)

n_rows, n_cols = df.shape
print(f"Number of rows/records: {n_rows}")
print(f"Number of columns/variables: {n_cols}")

# Preview the first records (only rendered in an interactive session).
df.head()

# Missing-value audit: per column, the number of NaNs and their share of all rows.
na_counts = df.isna().sum()
na_percent = [f"{pct:.2f}%" for pct in (na_counts / df.shape[0] * 100).tolist()]

# Build the table row-wise (counts, then formatted percentages), flip it so that each
# dataset column becomes a row, and list the columns with the most gaps first.
NA = pd.DataFrame(
    data=[na_counts.tolist(), na_percent],
    columns=df.columns,
    index=['NA Count', 'NA Percent'],
).T.sort_values(by='NA Count', ascending=False)

NA.style.background_gradient(cmap="summer", subset=['NA Count'])

# Ten most frequent sources and authors (rows without a value are ignored).
source_name = df["source_name"].dropna().value_counts().head(10)
author = df["author"].dropna().value_counts().head(10)

# Short display names that replace the raw author strings on the second chart.
# NOTE(review): these are applied purely by position -- confirm they still line up with
# `author.index` whenever the underlying data changes.
yticklabels = [
    'TAP',
    'Reuters Editorial',
    'CBS NEWS',
    'BBC FB',
    'AL Jazeera',
    'The Irish Times',
    'BBC News',
    'CBS/AP',
    'DAN Cancian',
    'AP',
]

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
sns.barplot(x=source_name, y=source_name.index, palette='summer', ax=ax1)
sns.barplot(x=author, y=author.index, palette='summer', ax=ax2)
sns.despine(bottom=True, left=True)

ax1.set(title='Top 10 Source')
ax2.set(title='Top 10 Author')
ax2.set_yticklabels(yticklabels);

# Donut chart of the share of articles flagged as "top article".
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Top Article', size=20, color="black")

sizes = df["top_article"].dropna().value_counts()

# value_counts() orders its result by frequency, so derive each wedge's label and offset
# from the class value it actually represents instead of assuming "Not Top" comes first.
# This also keeps the lists the same length as `sizes` if only one class is present.
is_top = [flag == 1 for flag in sizes.index]
labels = ["Top" if top else "Not Top" for top in is_top]
explode = tuple(0.3 if top else 0.05 for top in is_top)

ax.pie(
    sizes,
    explode=explode,
    colors=sns.color_palette("Set2"),
    startangle=60,
    labels=labels,
    autopct='%1.0f%%',
    pctdistance=0.9,
)
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

# Distribution of the four engagement counters, one box per counter.
eng = [
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
    'engagement_comment_plugin_count',
]

# melt() stacks the counters into long form: "variable" names the counter, "value" holds it.
ax = sns.boxplot(x="variable", y="value", data=df[eng].melt(), palette='summer')
ax.set_title('Engagement Boxplots')
plt.xticks(rotation=45)
# Matplotlib scale names are lowercase; the mixed-case spelling 'Symlog' is rejected by
# current releases.
ax.set_yscale('symlog')
plt.show()

# Frequency table of the comment-plugin counter, shaded by how often each value occurs.
# Naming the index and the count column explicitly (instead of renaming the default
# 'index' / series-name columns after reset_index) yields the same two columns on both
# pandas 1.x and 2.x, whose value_counts() results are named differently.
(
    df["engagement_comment_plugin_count"]
    .value_counts()
    .rename_axis('engagement_comment_plugin')
    .reset_index(name='Counts')
    .astype(int)
    .style.background_gradient(cmap="summer", subset=['Counts'])
)

# Look at the first five raw headlines.
df['title'].head(5)

# Patterns are compiled once. Raw strings keep backslash classes such as ``\S`` and ``\w``
# from being parsed as (invalid) string escape sequences.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_title(x: str) -> str:
    """Normalise a headline for text modelling.

    The steps run in this order: lowercase, drop ``[...]`` spans, drop ``<...>`` tags,
    drop URLs, drop ASCII punctuation, drop newlines, drop every word that contains a
    digit. Removed pieces are replaced by nothing, so the surrounding spaces are kept
    and runs of spaces are not collapsed.

    Args:
        x: Raw headline text.

    Returns:
        The cleaned, lowercase headline.
    """
    x = x.lower()
    x = _BRACKETED_RE.sub('', x)
    x = _HTML_TAG_RE.sub('', x)
    # URLs must go before punctuation, otherwise "://" and "." would already be stripped.
    x = _URL_RE.sub('', x)
    x = _PUNCTUATION_RE.sub('', x)
    x = x.replace('\n', '')
    return _WORD_WITH_DIGIT_RE.sub('', x)

# Run every headline through clean_title. The column is cast to str first, so a missing
# title is cleaned as its string form rather than raising inside the cleaner.
df['clean_title'] = df['title'].astype(str).map(clean_title)

df['clean_title'].head(5)

analyzer = SentimentIntensityAnalyzer()


def compound_score(txt):
    """Return the ``compound`` entry of the analyzer's polarity scores for *txt*."""
    return analyzer.polarity_scores(txt)["compound"]


## Sentiments
def sentiment(score, threshold=0.5):
    """Bucket a compound score into a sentiment label.

    Args:
        score: Compound polarity score, as returned by ``compound_score``.
        threshold: Magnitude a score must reach to count as polarised. The default of
            0.5 is the cut-off this analysis has always used; pass a smaller value for
            a less conservative split.

    Returns:
        ``"Positive"`` if ``score >= threshold``, ``"Negative"`` if
        ``score <= -threshold``, otherwise ``"Neutral"``.
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"

# Score each cleaned headline and store the raw compound value.
polarity_scores = df["clean_title"].astype("str").map(compound_score)
df["Sentiment_Score"] = polarity_scores

# Turn the numeric score into a Positive / Negative / Neutral label.
df["Sentiment"] = df["Sentiment_Score"].map(sentiment)

df.head()

# Number of headlines per sentiment label.
sns.countplot(x="Sentiment", data=df, palette="summer").set_title('Sentiment Distribution');

# Peek at a few of the stop words that the word clouds will ignore.
np.array(list(STOPWORDS))[0:5]

# Side-by-side word clouds for the headline and the description text.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[14, 14], facecolor=None)

# Texts are joined with a space: joining on the empty string would fuse the last word of
# one article with the first word of the next into a single bogus token.
wc = WordCloud(width=800, height=800, background_color="white", min_font_size=10,
               repeat=True, stopwords=STOPWORDS)
wc.generate(" ".join(df['title'].astype(str)))
ax1.axis("off")
ax1.imshow(wc, interpolation="bilinear")
ax1.set_title('Common Words Used in Title', fontsize=16);

wc2 = WordCloud(width=800, height=800, background_color="white", min_font_size=10,
                repeat=True, stopwords=STOPWORDS)
wc2.generate(" ".join(df['description'].astype(str)))
ax2.axis("off")
ax2.imshow(wc2, interpolation="bilinear")
ax2.set_title('Common Words Used in description', fontsize=16);

# Parse the publication timestamp and derive calendar features from it.
df['published_at'] = pd.to_datetime(df['published_at'])

# The vectorised .dt accessor replaces a Python-level lambda per row. Missing timestamps
# still come out as NaN in the derived columns.
df['Day_Of_Week'] = df['published_at'].dt.dayofweek
df['Month'] = df['published_at'].dt.month
df['Year'] = df['published_at'].dt.year

# Daily mean of the reaction / comment / share counters, drawn as three lines.
fig, ax1 = plt.subplots(figsize=[15, 8], facecolor=None)

date_mean = df.copy()
# normalize() drops the time of day so that all articles of one date share a group key.
date_mean['published_at'] = pd.to_datetime(date_mean['published_at']).dt.normalize()
# The frame still holds text columns; numeric_only=True averages just the numeric ones
# instead of raising on the strings (the default behaviour since pandas 2.0).
date_mean = date_mean.groupby(by='published_at').mean(numeric_only=True).reset_index()

for column, label in (
    ("engagement_reaction_count", 'reaction'),
    ("engagement_comment_count", 'comment'),
    ("engagement_share_count", 'share'),
):
    ax1 = sns.lineplot(data=date_mean, x="published_at", y=column, label=label, ax=ax1)

plt.ylabel('Engagement Counts')
plt.title('Engagements over the time', fontsize=16);

# Pairwise correlations between the numeric features.
# corr() is given numeric / boolean columns only: with text columns present it raises on
# pandas >= 2.0 instead of silently skipping them as older releases did.
numeric_features = (
    df.drop(columns=['Year', 'engagement_comment_plugin_count'])
    .select_dtypes(include=['number', 'bool'])
)
dataplot = sns.heatmap(numeric_features.corr(), cmap="YlGnBu", annot=True)
dataplot.set_title('Coorelation HeatMap', fontsize=16)

# displaying heatmap
plt.show()

# Give rows without a headline the placeholder text 'Missing'.
# The cleaning step casts titles with astype(str), so a missing title arrives here as the
# text 'nan' rather than NaN and fillna alone never fires; the raw `title` column is
# therefore used to find those rows. The result is assigned back explicitly because
# fillna(inplace=True) on a column selection is deprecated and may not update `df`.
df['clean_title'] = df['clean_title'].fillna('Missing')
df.loc[df['title'].isna(), 'clean_title'] = 'Missing'

# TF-IDF over unigrams and bigrams of the cleaned headlines.
tfidf_vec = TfidfVectorizer(
    ngram_range=(1, 2),
    dtype=np.float32,
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True,
)
train_tfidf = tfidf_vec.fit_transform(df['clean_title'])

# Classification target: the top-article flag, with unlabeled rows counted as 0.
list_labels = df["top_article"].fillna(0)

# Class balance before resampling.
list_labels.value_counts()

# Oversample with SMOTE to balance the two classes.
# NOTE(review): resampling happens before the train/test split further down, so synthetic
# rows derived from held-out articles can end up in the training set -- confirm this is
# intended, since it may inflate the reported scores.
OS = SMOTE()
X, Y = OS.fit_resample(train_tfidf, list_labels)

# Popularity = log(1 + total engagement), where total engagement adds up the reaction,
# comment, share and comment-plugin counters of an article.
total_engagement = (
    df['engagement_reaction_count']
    + df['engagement_comment_count']
    + df['engagement_share_count']
    + df['engagement_comment_plugin_count']
)
df['Popularity_Score'] = np.log1p(total_engagement)

sns.kdeplot(df['Popularity_Score']).set_title('Popularity Score KDE Plot', fontsize=16);

# Mean popularity score per news source, most popular first.
# The score column is selected before aggregating: averaging every column of the frame
# raises on the text columns under pandas >= 2.0 and does needless work otherwise.
(
    df.groupby(by='source_name')['Popularity_Score']
    .mean()
    .to_frame()
    .sort_values(by='Popularity_Score', ascending=False)
)

# Hold out 10% of the resampled data, keeping the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=40,
)

# Gradient-boosted classifier for the top-article flag.
top_params = dict(
    verbose=-1,
    learning_rate=0.5,
    max_depth=20,
    num_leaves=50,
    n_estimators=120,
    max_bin=2000,
)
modeltop = LGBMClassifier(**top_params)

# 5-fold cross-validation on the training part, scored with macro-averaged F1.
scores = cross_val_score(modeltop, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross Validation F1 Scores : ", scores)

# Fit on the training part and evaluate on the held-out part.
modeltop.fit(X_train, y_train)
predtop = modeltop.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("f1 score : ", round(f1_score(y_test, predtop), 2))
print("accuracy score : ", round(accuracy_score(y_test, predtop), 2))

# Persist the classifier. The target folder is created first because joblib.dump does not
# create missing directories.
os.makedirs('Models', exist_ok=True)
joblib.dump(modeltop, 'Models/lgb_top.pkl')

# Class balance of the held-out set.
y_test.value_counts()

# Confusion matrix of the top-article classifier on the held-out set.
plt.figure(figsize=(10, 10))
# confusion_matrix(y_true, y_pred) puts the actual classes on the rows and the predicted
# classes on the columns, with classes in sorted order (0 = not top, then 1 = top). The
# argument order and tick labels below follow that layout so they agree with the axis
# titles.
cm = confusion_matrix(y_test, predtop)
class_names = ['Not_Top', 'Top']
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor='black',
    linewidth=1,
    annot=True,
    fmt='',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual");

# Regression target: the popularity score, with missing scores replaced by 1.
list_labels = df["Popularity_Score"].fillna(1)
X_train, X_test, y_train, y_test = train_test_split(
    train_tfidf,
    list_labels,
    test_size=0.2,
    random_state=40,
)

# Gradient-boosted regressor for the popularity score.
pop_params = dict(
    verbose=-1,
    learning_rate=0.01,
    max_depth=20,
    num_leaves=50,
    n_estimators=150,
)
model1 = LGBMRegressor(**pop_params)

# 5-fold cross-validation; the scorer returns negated RMSE, so flip the sign for display.
scores = cross_val_score(
    model1,
    X_train,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
print("Cross Validation RMSE Scores : ", -scores)

# Fit the regressor and report its RMSE on the held-out part.
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, pred1)), 2))

# Persist the regressor. The target folder is created first because joblib.dump does not
# create missing directories.
os.makedirs('Models', exist_ok=True)
joblib.dump(model1, 'Models/lgb_pop.pkl')

def title_score(title):
    """Score a single headline with both fitted models and print a short report.

    The headline is cleaned with ``clean_title``, vectorised with the fitted
    ``tfidf_vec`` and passed to ``modeltop`` (top-article classifier) and ``model1``
    (popularity regressor).

    Args:
        title: Raw headline text.

    Returns:
        A ``(top_cat, pop)`` tuple holding the two models' prediction arrays for the
        one-row input.
    """
    features = tfidf_vec.transform([clean_title(title)])
    top_cat = modeltop.predict(features)
    pop = model1.predict(features)

    predicted_score = pop[0]
    print("Top Article :", top_cat[0].astype(bool))
    print("Popularity Score :", round(predicted_score, 2))
    # The popularity score was built with log1p, so expm1 maps it back to a raw count.
    print("Total Engagement :", int(np.expm1(predicted_score)))
    return top_cat, pop

# One real headline that is flagged as a top article, for reference.
df.loc[df['top_article'] == 1, 'title'].values[10]

# Try the scorer on a handful of headlines; each call prints its own report.
for headline in (
    "Here's what Hurricane Dorian is expected to do as it crawls toward the US",
    "Here Are the States With the Lowest COVID-19 Vaccination Rates",
    "Farmworker Found Guilty of Murdering Iowa Student Is Sentenced to Life in Prison",
    "Plea Deal Offered to U.S. Capitol Rioter Accused of Grabbing Officer's Gas Mask",
):
    title_score(headline)