from __future__ import division
import pymongo
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import operator
import os
from bson.son import SON
from collections import Counter
from pymongo import MongoClient
# Establish connection with database
# Open the MongoDB connection; the connection string is read from the environment.
client = MongoClient(os.environ["MONGODB_ATLAS_CLUSTER_CONNECTION_STRING"])
db = client.bda_violence
col = db.tweets_women_slurs
users = db.users_aggr

# Fields requested from the users_aggr collection. Commented-out entries are
# fields that are excluded from this run.
included_fields = [
    'user_id',
    'ratio_originals',
    'ratio_retweets',
    'ratio_replies',
    'ratio_quotes',
    'ratio_cuss_tweets',
    'ratio_threat_tweets',
    # 'ratio_cuss_threat_tweets',
    'ratio_hate_tweets',
    'influencer_ratio',
    # 'geo_location',
    'mean_reaction_time_name',
    # 'mean_reaction_time_ln',
    # 'mean_hashtags',
    # 'mean_urls',
    'mean_user_mentions',
    # 'mean_symbols',
    # 'mean_media',
    'ratio_sex_content',
]

# Build the projection: drop Mongo's _id, keep every listed field.
projection = {'_id': 0}
for field_name in included_fields:
    projection[field_name] = 1

# No filter is applied. Alternative filter kept for reference:
# {'ratio_sex_content': {'$lt': 0.9}}
res = users.find({}, projection)

# One row per user, indexed by user_id; also keep a CSV snapshot on disk.
df = pd.DataFrame.from_records(res).set_index('user_id')
num_users = len(df)
df.to_csv('files/users_aggr_with_retweets.csv')
df.head(6)
# Check sparse columns
# 'geo_location' carries genuine pandas missing values, so count them with isna.
if 'geo_location' in df.columns:
    geo_missing_rows = df[pd.isna(df['geo_location'])]
    print('geo_location', len(geo_missing_rows) / num_users)

# The other columns encode "missing" as a sentinel value instead of NaN.
# Maps column name -> sentinel that stands for "no value".
missing_values = {
    'mean_reaction_time_name': 'none',
    'mean_reaction_time_ln': 0,
    'mean_hashtags': 0,
    'mean_urls': 0,
    'mean_user_mentions': 0,
    'mean_symbols': 0,
    'mean_media': 0,
}

# Print, for every such column that was actually loaded, the share of users
# whose value equals the sentinel.
for column, sentinel in missing_values.items():
    if column not in df.columns:
        continue
    sentinel_rows = df[df[column] == sentinel]
    print(column, len(sentinel_rows) / num_users)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# Columns whose missing values need dedicated handling; everything else is
# copied over unchanged.
columns_with_missing_values = [
    'geo_location',
    'mean_reaction_time_name',
    'mean_reaction_time_ln',
]
columns_without_missing_values = [
    name for name in df.columns if name not in columns_with_missing_values
]
# Take an explicit copy so the column assignments below write to an
# independent frame rather than to a selection of `df`.
df_agg = df.loc[:, columns_without_missing_values].copy()

# Stepwise add the columns with missing values.
# Categorical columns: label missing entries as 'none', then ordinal-encode.
# Only the column being added is filled; NaNs in other columns are left
# untouched instead of being overwritten with the string 'none'.
for name in ('geo_location', 'mean_reaction_time_name'):
    if name in df.columns:
        filled = df[name].fillna('none')
        enc = OrdinalEncoder()
        # The encoder expects a 2-D (n_samples, 1) input; flatten its output
        # back to one value per row before storing it as a column.
        encoded = enc.fit_transform(filled.values.reshape(-1, 1))
        df_agg[name] = encoded.ravel()

# Numeric column: missing entries become 0 (again only in this column).
if 'mean_reaction_time_ln' in df.columns:
    df_agg['mean_reaction_time_ln'] = df['mean_reaction_time_ln'].fillna(0)

# Scale values
std_model = StandardScaler()
X = df_agg.values
x = std_model.fit_transform(X)

# Same column names as df_agg; the row index is a fresh 0..n-1 range.
df_x = pd.DataFrame(x, columns=df_agg.columns)
df_x.to_csv('files/users_aggr_preprocessed.csv')  # Export data for DBSCAN clustering in Google Colab
df_x.head(10)
from sklearn.cluster import KMeans

random_seed = 42

# Elbow method: fit k-means for 1..14 clusters and record the inertia
# (within-cluster sum of squares) of each fit.
k = list(np.arange(1, 15, 1))
wcss = []
for i in k:
    km_model = KMeans(
        n_clusters=i,
        init='k-means++',
        tol=1e-4,
        random_state=random_seed,
    ).fit(x)
    wcss.append(km_model.inertia_)

# Plotting WCSS against K
plt.plot(k, wcss, 'b-')
plt.xlabel('No. of clusters')
plt.ylabel('Within Cluster Sum of Square')
plt.show()
# Final model: fit k-means with the chosen number of clusters on the scaled data.
num_clusters = 4
clustmodel = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    tol=1e-4,
    random_state=random_seed,
).fit(x)

# Attach each row's predicted cluster to a copy of the unscaled features.
segment_labels = clustmodel.predict(x)
df_res = df_agg.copy()
df_res['segment'] = segment_labels
df_res.head(10)
# Make PCA and plot in multiple dimensions
# Make PCA
from sklearn.decomposition import PCA

pca = PCA()
X_pca = pca.fit_transform(x)
pca.explained_variance_ratio_

# Sample just a part from dataset for plotting
fraction = 0.1

# Every pairing of the first four principal components, in plotting order.
component_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

for first, second in component_pairs:
    # 'p0'/'p1' are scratch columns holding the two components being plotted;
    # after the loop they hold the last pair (components 2 and 3).
    df_res['p0'] = X_pca[:, first]
    df_res['p1'] = X_pca[:, second]

    # Seeded sampling keeps the plotted subset reproducible between runs and
    # identical across the plots. Every pair, including the last, is sampled.
    df_plot = df_res.sample(frac=fraction, axis=0, random_state=random_seed)

    # Open a fresh figure per pair so the scatterplots do not pile up on the
    # same axes when this runs as a script, and show each one.
    plt.figure()
    sns.scatterplot(data=df_plot, x="p0", y="p1", hue="segment", palette="colorblind")
    plt.xlabel('Principal component {}'.format(first))
    plt.ylabel('Principal component {}'.format(second))
    plt.show()
# Cluster sizes
# Print the number of rows assigned to each cluster label.
segment_column = df_res['segment']
for c in range(num_clusters):
    in_cluster = segment_column == c
    print("Cluster", c, int(in_cluster.sum()))

# All database work is finished; close the MongoDB client.
client.close()