import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
# Load the Twitter account profiles ("headers") and the scraped tweets.
headers = pd.read_csv('./Data/handles-data.csv')
tweets = pd.read_csv('./Data/tweets-data.csv')
# Display floats with two decimals in pandas output.
pd.options.display.float_format = '{:.2f}'.format
# Quick inspection of the accounts dataframe (notebook-style bare expressions).
headers.shape
headers.head(6)
headers.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246 entries, 0 to 1245
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 1246 non-null int64
1 name 1229 non-null object
2 username 1229 non-null object
3 bio 1116 non-null object
4 url 1009 non-null object
5 join_datetime 1246 non-null object
6 join_date 1229 non-null object
7 join_time 1229 non-null object
8 tweets 1246 non-null int64
9 location 839 non-null object
10 following 1246 non-null int64
11 followers 1246 non-null int64
12 likes 1246 non-null int64
13 media 1246 non-null int64
14 private 1229 non-null object
15 verified 1229 non-null object
16 avatar 1229 non-null object
17 background_image 1118 non-null object
dtypes: int64(6), object(12)
memory usage: 175.3+ KB
# Summary statistics for the numeric columns of the accounts dataframe.
headers.describe()
# Accounts whose tweet count is -1 (presumably a scraper sentinel — TODO confirm).
headers[headers['tweets']==-1]
# Quick inspection of the tweets dataframe.
tweets.shape
tweets.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40581 entries, 0 to 40580
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 40581 non-null int64
1 conversation_id 40581 non-null int64
2 created_at 40581 non-null float64
3 date 40581 non-null object
4 timezone 40581 non-null int64
5 place 34 non-null object
6 tweet 40581 non-null object
7 language 40581 non-null object
8 hashtags 40581 non-null object
9 cashtags 40581 non-null object
10 user_id 40581 non-null int64
11 user_id_str 40581 non-null int64
12 username 40581 non-null object
13 name 40581 non-null object
14 day 40581 non-null int64
15 hour 40581 non-null int64
16 link 40581 non-null object
17 urls 40581 non-null object
18 photos 40581 non-null object
19 video 40581 non-null int64
20 thumbnail 9155 non-null object
21 retweet 40581 non-null bool
22 nlikes 40581 non-null int64
23 nreplies 40581 non-null int64
24 nretweets 40581 non-null int64
25 quote_url 40581 non-null object
26 search 40581 non-null object
27 near 0 non-null float64
28 geo 0 non-null float64
29 source 0 non-null float64
30 user_rt_id 6147 non-null float64
31 user_rt 6147 non-null object
32 retweet_id 6147 non-null float64
33 reply_to 40581 non-null object
34 retweet_date 6147 non-null object
35 translate 0 non-null float64
36 trans_src 0 non-null float64
37 trans_dest 0 non-null float64
dtypes: bool(1), float64(9), int64(11), object(17)
memory usage: 11.5+ MB
tweets.describe()
tweets.head()
# Join account profiles with their tweets on username (pandas default inner join).
union = pd.merge(headers, tweets, left_on='username', right_on='username' )
union.describe()
# Persist the merged dataset, ';'-separated, without the index column.
union.to_csv("./Data/union.csv",sep=";", index=False)
# Compare row counts: union has one row per (account, tweet) match.
tweets.shape
headers.shape
union.shape
print(union.columns.tolist())
['id_x', 'name_x', 'username', 'bio', 'url', 'join_datetime', 'join_date', 'join_time', 'tweets', 'location', 'following', 'followers', 'likes', 'media', 'private', 'verified', 'avatar', 'background_image', 'id_y', 'conversation_id', 'created_at', 'date', 'timezone', 'place', 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str', 'name_y', 'day', 'hour', 'link', 'urls', 'photos', 'video', 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url', 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src', 'trans_dest']
# Reload the merged dataset from disk.
# Fix: low_memory=False makes pandas read the file in a single pass and infer
# one dtype per column, silencing the DtypeWarning about mixed types that this
# call previously emitted for column 23.
union = pd.read_csv("./Data/union.csv", sep=";", low_memory=False)
/home/julianmelero/twitter-ai/twai/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3457: DtypeWarning: Columns (23) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
# Compare per-tweet likes (nlikes) against the account-level total (likes).
union[['nlikes','likes']].describe()
union[['nlikes','likes']].head(10)
union[['nlikes','likes']].corr()
# Sanity check for one account: sum of its tweets' likes vs its profile total.
tweets[tweets['username'] == 'latimes']['nlikes'].sum()
headers[headers['username'] == 'latimes']['likes'].sum()
tweets['nlikes'].head(10)
# Distribution of likes per tweet, with extra upper percentiles for the tail.
union['nlikes'].describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.85, 0.9])
union.boxplot(column =['nlikes'], figsize=(10,10))
# Tweets per account (count of nlikes rows per username, not their sum).
n_likes = union.groupby(['username'])['nlikes'].count().reset_index()
n_likes.sort_values('nlikes',ascending=False)
union[union['username'] == 'latimes'][['join_date','date','tweet','followers','following']].sort_values('date', ascending=False)
union[union['username'] == 'latimes'][['nlikes']]
union[['nlikes', 'nretweets']].corr()
# Limit the y axis to 0-400 before drawing the retweets boxplot on the
# current axes (DataFrame.boxplot draws on the active axes when ax is omitted).
figura = plt.figure()
axes= figura.add_subplot()
axes.set_ylim(0,400)
tweets.boxplot(column =['nretweets'], figsize=(10,10))
# Histogram of the tweets with fewer than 400 likes (the bulk of the data).
nlikes_400 = union[union["nlikes"] < 400]["nlikes"].count()
nlikes_400
nlikes_400_array = union["nlikes"][union["nlikes"] < 400]
nlikes_400_df = pd.DataFrame(data=nlikes_400_array, columns=["nlikes"])
sns.histplot(data=nlikes_400_df, x="nlikes")
# Language distribution and most active accounts.
union['language'].unique()
union.groupby(['language'])['language'].count().sort_values(ascending=False).to_frame()
union.groupby(['tweets','username'])['username'].count().sort_index(ascending=False).head(10)
# headers.boxplot(column =['followers'], figsize=(10,10))
# Followers below the median, to visualise the bulk without the huge outliers.
union["followers"].max()
followers_median = union["followers"].median()
followers_median
followers_array = union["followers"][union["followers"] < followers_median]
followers_df = pd.DataFrame(data=followers_array, columns=["followers"])
sns.boxplot(x=followers_df["followers"])
# Derived boolean features on the merged dataset.
# The list-like columns ('urls', 'photos', 'hashtags') hold stringified lists,
# so "[]" means empty.

# True when the tweet contains at least one URL.
union["have_urls"] = union["urls"] != "[]"
# True when the tweet contains a video.
union["have_video"] = union["video"] != 0
# True when the tweet contains photos.
# Bug fix: this previously tested the 'urls' column (copy-paste from
# have_urls); the comment and the feature name both refer to photos.
union["have_photos"] = union["photos"] != "[]"
# True when the tweet contains hashtags.
union["have_hashtags"] = union["hashtags"] != "[]"
# True when the account bio is 90 characters or longer.
union["large_bio"] = union["bio"].str.len() >= 90
# Inspect correlations and counts of the new features.
union[['have_urls', 'have_video']].corr()
union[['have_hashtags', 'have_urls']].corr()
union["large_bio"].value_counts()
union[union['bio'].str.len() >= 100].count()
# Probe: parse the first tweet timestamp and print its ISO weekday
# (1 = Monday … 7 = Sunday).
tweets['date'][0]
from datetime import datetime
print(datetime.strptime(tweets['date'][0], '%Y-%m-%d %H:%M:%S').isoweekday())
4
# Ensure every tweet has a date before deriving the weekday column.
tweets['date'].isnull().sum()
def calcular_dia(fecha):
    """Return the ISO weekday (1 = Monday … 7 = Sunday) of a timestamp string.

    `fecha` must be formatted as 'YYYY-MM-DD HH:MM:SS'.
    """
    momento = datetime.strptime(fecha, '%Y-%m-%d %H:%M:%S')
    return momento.isoweekday()
# Map each tweet's timestamp to its ISO weekday.
# Fix: Series.aggregate([calcular_dia]) returns a DataFrame (one column per
# function in the list), which is the wrong shape for a column assignment;
# Series.apply returns a Series of the same length, the correct idiom for
# deriving a new column element-wise.
tweets['dia_semana'] = tweets['date'].apply(calcular_dia)
tweets[["dia_semana"]]
# Distribution of tweets per weekday (1 = Monday … 7 = Sunday).
tweets['dia_semana'].value_counts().sort_index()
import matplotlib.pyplot as plt
import seaborn as sns
# 2x2 dashboard: hour/day histograms on top, likes/retweets scatters below.
figura = plt.figure(figsize=(15,8))
figura.suptitle("Día y Hora Tweets")
axes = figura.add_subplot(2,2,1)
axes2 = figura.add_subplot(2,2,2)
axes3 = figura.add_subplot(2,2,3)
axes4 = figura.add_subplot(2,2,4)
axes.set_title("Hist. Hora")
axes.set_xlabel("Hora")
# Histograms of posting hour and posting day with a KDE overlay.
sns.histplot(x=tweets['hour'],ax=axes, kde=True)
sns.histplot(x=tweets['day'],ax=axes2, kde=True)
axes3.set_title("Hora / Nº Likes")
axes3.set_xlabel("Hora")
# Likes (default colour) and retweets (red) vs posting hour.
axes3.scatter(tweets['hour'].values, tweets['nlikes'].values, label="Nº Likes")
axes3.scatter(tweets['hour'].values, tweets['nretweets'].values, color="red", label="Nº Retweets")
axes3.legend()
axes4.set_title("Día / Nº Likes")
# Same comparison vs posting day.
axes4.scatter(tweets['day'].values, tweets['nlikes'].values)
axes4.scatter(tweets['day'].values, tweets['nretweets'].values, color="red")
# Counts of each derived boolean feature.
union[union["have_hashtags"] == False ]["have_hashtags"].count()
union[union["have_urls"] == True ]["have_urls"].count()
union[union["have_photos"] == True ]["have_photos"].count()
union[union["have_video"] == True ]["have_video"].count()
print(tweets.columns.tolist())
['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place', 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str', 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video', 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url', 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src', 'trans_dest', 'dia_semana']
# Feature matrix: posting time, audience size, and URL presence.
datosInteresantes = union[['day','hour','followers','have_urls']]
df_c = datosInteresantes.copy()
# Drop mega-accounts (>= 8M followers) to limit outlier influence.
df_c = df_c[df_c["followers"] < 8000000]
# Target: retweet count, filtered with the same condition so rows stay aligned.
target = union["nretweets"][union["followers"] < 8000000]
print(df_c.count())
print(target.count())
day 26392
hour 26392
followers 26392
have_urls 26392
dtype: int64
26392
# Verify there are no missing values in the selected features.
df_c.isnull().sum()
df_c.dtypes
# One-hot encode the boolean have_urls column (yields two 0/1 columns,
# one per observed category).
enc_ohe = OneHotEncoder()
enc_ohe.fit(df_c[["have_urls"]])
data_encoder = enc_ohe.transform(df_c[['have_urls']]).toarray()
print(data_encoder)
[[1. 0.]
[1. 0.]
[1. 0.]
...
[1. 0.]
[1. 0.]
[1. 0.]]
# Wrap the encoded array in a DataFrame.
# Fix: give the columns explicit string names instead of the default integer
# labels — mixing int and str column names is what triggered sklearn's
# "Feature names only support names that are all strings" FutureWarning when
# fitting the models below.
df_encoder = pd.DataFrame(data_encoder, columns=["have_urls_0", "have_urls_1"])
df_encoder.head()
df_encoder.isnull().sum()
df_encoder.shape
df_encoder.count()
df_c.shape
# Dry run of the concat to inspect the result before committing it to df_c.
# reset_index(drop=True) aligns both frames positionally, since df_c kept the
# original (filtered) index.
merge = pd.concat([df_c.reset_index(drop=True), df_encoder.reset_index(drop=True)],axis=1)
merge.count()
merge
# Replace the raw boolean column with its one-hot encoding.
df_c = pd.concat([df_c.reset_index(drop=True),df_encoder.reset_index(drop=True)],axis=1)
df_c.drop(['have_urls'], axis=1, inplace=True)
df_c.count()
from sklearn.model_selection import train_test_split
df_c
# 70/30 train/test split (no fixed random_state, so the split varies per run).
X_train, X_test, y_train, y_test = train_test_split(df_c, target, test_size=0.3)
print(X_train.shape)
print(X_test.shape)
(18474, 5)
(7918, 5)
import numpy as np
from sklearn.linear_model import LinearRegression
# Seed NumPy so runs start from the same random state.
np.random.seed(5)
# Baseline model: ordinary least-squares linear regression.
model=LinearRegression()
model.fit(X_train, y_train)
/home/julianmelero/twitter-ai/twai/lib/python3.8/site-packages/sklearn/utils/validation.py:1688: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.
warnings.warn(
# Predict on the held-out set and place predictions next to the true values
# for side-by-side inspection.
predictions = model.predict(X_test)
X_tested = X_test.copy()
print(X_tested.shape)
print(predictions.shape)
X_tested['predictions'] = predictions
X_tested['Target_test'] = y_test.values
X_tested.head(25)
(7918, 5)
(7918,)
/home/julianmelero/twitter-ai/twai/lib/python3.8/site-packages/sklearn/utils/validation.py:1688: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.
warnings.warn(
X_train
# Small feed-forward regression network: 5 inputs -> 16 -> 32 -> 16 -> 8 -> 1.
# NOTE(review): rebinding `model` discards the fitted LinearRegression above.
model=Sequential()
model.add(Dense(16, activation='relu',input_dim=5))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
# Linear output for an unbounded regression target.
model.add(Dense(1, activation='linear'))
# MSE loss; no optimizer specified, so Keras uses its default — TODO confirm
# whether an explicit optimizer (e.g. 'adam') was intended.
model.compile(loss='mean_squared_error')
2022-02-05 18:15:26.089367: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-05 18:15:26.089468: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089511: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089543: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089574: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089605: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089636: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089666: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089697: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-02-05 18:15:26.089702: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-02-05 18:15:26.090078: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Seed both NumPy and TensorFlow for reproducible training.
np.random.seed(5)
tf.random.set_seed(5)
# NOTE(review): `hstory` (sic) is referenced later, so the typo must stay.
hstory = model.fit(X_train, y_train,epochs=50)
Epoch 1/50
578/578 [==============================] - 1s 801us/step - loss: 517086048.0000
Epoch 2/50
578/578 [==============================] - 0s 837us/step - loss: 80324136.0000
Epoch 3/50
578/578 [==============================] - 0s 817us/step - loss: 10696057.0000
Epoch 4/50
578/578 [==============================] - 0s 753us/step - loss: 3740269.7500
Epoch 5/50
578/578 [==============================] - 0s 732us/step - loss: 2323977.5000
Epoch 6/50
578/578 [==============================] - 0s 863us/step - loss: 2323263.0000
Epoch 7/50
578/578 [==============================] - 0s 863us/step - loss: 2322441.2500
Epoch 8/50
578/578 [==============================] - 0s 760us/step - loss: 2321422.0000
Epoch 9/50
578/578 [==============================] - 0s 753us/step - loss: 2320318.5000
Epoch 10/50
578/578 [==============================] - 0s 832us/step - loss: 2319134.7500
Epoch 11/50
578/578 [==============================] - 0s 854us/step - loss: 2317790.7500
Epoch 12/50
578/578 [==============================] - 0s 770us/step - loss: 2316344.0000
Epoch 13/50
578/578 [==============================] - 0s 746us/step - loss: 2314858.7500
Epoch 14/50
578/578 [==============================] - 0s 799us/step - loss: 2313346.7500
Epoch 15/50
578/578 [==============================] - 0s 781us/step - loss: 2311902.7500
Epoch 16/50
578/578 [==============================] - 0s 755us/step - loss: 2310433.7500
Epoch 17/50
578/578 [==============================] - 0s 780us/step - loss: 2309074.7500
Epoch 18/50
578/578 [==============================] - 0s 730us/step - loss: 2307510.0000
Epoch 19/50
578/578 [==============================] - 0s 734us/step - loss: 2306309.7500
Epoch 20/50
578/578 [==============================] - 0s 720us/step - loss: 2305130.2500
Epoch 21/50
578/578 [==============================] - 0s 742us/step - loss: 2303995.0000
Epoch 22/50
578/578 [==============================] - 0s 739us/step - loss: 2302943.5000
Epoch 23/50
578/578 [==============================] - 0s 733us/step - loss: 2301904.0000
Epoch 24/50
578/578 [==============================] - 0s 774us/step - loss: 2301051.7500
Epoch 25/50
578/578 [==============================] - 0s 750us/step - loss: 2300297.5000
Epoch 26/50
578/578 [==============================] - 0s 723us/step - loss: 2299543.0000
Epoch 27/50
578/578 [==============================] - 0s 736us/step - loss: 2298955.7500
Epoch 28/50
578/578 [==============================] - 0s 748us/step - loss: 2298402.7500
Epoch 29/50
578/578 [==============================] - 0s 735us/step - loss: 2297905.0000
Epoch 30/50
578/578 [==============================] - 0s 755us/step - loss: 2297524.7500
Epoch 31/50
578/578 [==============================] - 0s 721us/step - loss: 2297173.0000
Epoch 32/50
578/578 [==============================] - 0s 751us/step - loss: 2296898.7500
Epoch 33/50
578/578 [==============================] - 0s 702us/step - loss: 2296618.7500
Epoch 34/50
578/578 [==============================] - 0s 714us/step - loss: 2296387.7500
Epoch 35/50
578/578 [==============================] - 0s 733us/step - loss: 2296148.7500
Epoch 36/50
578/578 [==============================] - 0s 712us/step - loss: 2295933.2500
Epoch 37/50
578/578 [==============================] - 0s 725us/step - loss: 2295766.5000
Epoch 38/50
578/578 [==============================] - 0s 715us/step - loss: 2295620.7500
Epoch 39/50
578/578 [==============================] - 0s 729us/step - loss: 2295514.5000
Epoch 40/50
578/578 [==============================] - 0s 722us/step - loss: 2295378.2500
Epoch 41/50
578/578 [==============================] - 0s 699us/step - loss: 2295296.7500
Epoch 42/50
578/578 [==============================] - 0s 710us/step - loss: 2295225.0000
Epoch 43/50
578/578 [==============================] - 0s 763us/step - loss: 2295152.5000
Epoch 44/50
578/578 [==============================] - 0s 715us/step - loss: 2295072.0000
Epoch 45/50
578/578 [==============================] - 0s 758us/step - loss: 2295009.0000
Epoch 46/50
578/578 [==============================] - 0s 715us/step - loss: 2294984.0000
Epoch 47/50
578/578 [==============================] - 0s 716us/step - loss: 2294954.0000
Epoch 48/50
578/578 [==============================] - 0s 717us/step - loss: 2294892.7500
Epoch 49/50
578/578 [==============================] - 0s 709us/step - loss: 2294858.5000
Epoch 50/50
578/578 [==============================] - 0s 714us/step - loss: 2294820.5000
# Per-epoch training loss values.
hstory.history["loss"]
# Predict on the held-out set; the network returns shape (n, 1), which pandas
# accepts as a single-column assignment below.
predictions = model.predict(X_test)
X_tested = X_test.copy()
print(X_tested.shape)
print(predictions.shape)
X_tested['predictions'] = predictions
X_tested['Target_test'] = y_test.values
X_tested.head(35)
(7918, 5)
(7918, 1)