import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
headers = pd.read_csv('./Data/handles-data.csv')
tweets = pd.read_csv('./Data/tweets-data.csv')
pd.options.display.float_format = '{:.2f}'.format
headers.shape
headers.head(6)
headers.info()
headers.describe()
headers[headers['tweets']==-1]
tweets.shape
tweets.info()
tweets.describe()
tweets.head()
union = pd.merge(headers, tweets, on='username')
union.describe()
union.to_csv("./Data/union.csv",sep=";", index=False)
tweets.shape
headers.shape
union.shape
print(union.columns.tolist())
union = pd.read_csv("./Data/union.csv", sep=";")
union[['nlikes','likes']].describe()
union[['nlikes','likes']].head(10)
union[['nlikes','likes']].corr()
tweets[tweets['username'] == 'latimes']['nlikes'].sum()
headers[headers['username'] == 'latimes']['likes'].sum()
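# A sanity-check sketch (assuming 'likes' in headers is a per-account total):
# sum nlikes per username in tweets and correlate it with the headers' 'likes'
# column to see how closely the two measures track each other.
likes_by_user = tweets.groupby('username')['nlikes'].sum().rename('nlikes_sum').reset_index()
likes_check = headers[['username', 'likes']].merge(likes_by_user, on='username')
likes_check[['likes', 'nlikes_sum']].corr()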
tweets['nlikes'].head(10)
union['nlikes'].describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.85, 0.9])
union.boxplot(column=['nlikes'], figsize=(10, 10))
# Number of tweets per username (a count of rows, not a sum of likes)
n_likes = union.groupby(['username'])['nlikes'].count().reset_index()
n_likes.sort_values('nlikes', ascending=False)
union[union['username'] == 'latimes'][['join_date','date','tweet','followers','following']].sort_values('date', ascending=False)
union[union['username'] == 'latimes'][['nlikes']]
union[['nlikes', 'nretweets']].corr()
figura = plt.figure(figsize=(10, 10))
axes = figura.add_subplot()
axes.set_ylim(0, 400)
# Draw the boxplot on the pre-limited axes so the y-axis zoom actually applies
tweets.boxplot(column=['nretweets'], ax=axes)
nlikes_400 = union[union["nlikes"] < 400]["nlikes"].count()
nlikes_400
nlikes_400_df = union.loc[union["nlikes"] < 400, ["nlikes"]]
sns.histplot(data=nlikes_400_df, x="nlikes")
union['language'].unique()
union['language'].value_counts().to_frame()
union.groupby(['tweets','username'])['username'].count().sort_index(ascending=False).head(10)
# headers.boxplot(column =['followers'], figsize=(10,10))
union["followers"].max()
followers_median = union["followers"].median()
followers_median
followers_df = union.loc[union["followers"] < followers_median, ["followers"]]
sns.boxplot(x=followers_df["followers"])
# Whether the tweet contains URLs
union["have_urls"] = union["urls"] != "[]"
# Whether the tweet contains a video
union["have_video"] = union["video"] != 0
# Whether the tweet contains photos (this must test the 'photos' column,
# not 'urls'; assumes the tweets data carries a 'photos' list column)
union["have_photos"] = union["photos"] != "[]"
# Whether the tweet contains hashtags
union["have_hashtags"] = union["hashtags"] != "[]"
# Whether the bio is 90 characters or longer
union["large_bio"] = union["bio"].str.len() >= 90
union[['have_urls', 'have_video']].corr()
union[['have_hashtags', 'have_urls']].corr()
union["large_bio"].value_counts()
union[union['bio'].str.len() >= 100].count()
tweets['date'][0]
from datetime import datetime
print(datetime.strptime(tweets['date'][0], '%Y-%m-%d %H:%M:%S').isoweekday())
tweets['date'].isnull().sum()
def calcular_dia(fecha):
    return datetime.strptime(fecha, '%Y-%m-%d %H:%M:%S').isoweekday()
# apply() maps the function over each value; aggregate([f]) would return a
# DataFrame rather than a Series
tweets['dia_semana'] = tweets['date'].apply(calcular_dia)
tweets[["dia_semana"]]
tweets['dia_semana'].value_counts().sort_index()
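# An equivalent vectorized sketch: pd.to_datetime plus the .dt accessor avoids
# calling strptime row by row. dayofweek is 0-based on Monday, so adding 1
# reproduces isoweekday()'s 1..7 convention.
tweets['dia_semana'] = pd.to_datetime(tweets['date']).dt.dayofweek + 1
tweets['dia_semana'].value_counts().sort_index()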
figura = plt.figure(figsize=(15, 8))
figura.suptitle("Tweet day and hour")
axes = figura.add_subplot(2, 2, 1)
axes2 = figura.add_subplot(2, 2, 2)
axes3 = figura.add_subplot(2, 2, 3)
axes4 = figura.add_subplot(2, 2, 4)
axes.set_title("Hour histogram")
axes.set_xlabel("Hour")
sns.histplot(x=tweets['hour'], ax=axes, kde=True)
axes2.set_title("Day histogram")
axes2.set_xlabel("Day")
sns.histplot(x=tweets['day'], ax=axes2, kde=True)
axes3.set_title("Hour vs. likes")
axes3.set_xlabel("Hour")
axes3.scatter(tweets['hour'].values, tweets['nlikes'].values, label="Likes")
axes3.scatter(tweets['hour'].values, tweets['nretweets'].values, color="red", label="Retweets")
axes3.legend()
axes4.set_title("Day vs. likes")
axes4.set_xlabel("Day")
axes4.scatter(tweets['day'].values, tweets['nlikes'].values, label="Likes")
axes4.scatter(tweets['day'].values, tweets['nretweets'].values, color="red", label="Retweets")
axes4.legend()
union[union["have_hashtags"] == False ]["have_hashtags"].count()
union[union["have_urls"] == True ]["have_urls"].count()
union[union["have_photos"] == True ]["have_photos"].count()
union[union["have_video"] == True ]["have_video"].count()
print(tweets.columns.tolist())
datosInteresantes = union[['day','hour','followers','have_urls']]
df_c = datosInteresantes.copy()
df_c = df_c[df_c["followers"] < 8000000]
target = union["nretweets"][union["followers"] < 8000000]
print(df_c.count())
print(target.count())
df_c.isnull().sum()
df_c.dtypes
enc_ohe = OneHotEncoder()
enc_ohe.fit(df_c[["have_urls"]])
# toarray() densifies the sparse output; the result has two columns, one per
# category of the boolean have_urls
data_encoder = enc_ohe.transform(df_c[['have_urls']]).toarray()
print(data_encoder)
df_encoder = pd.DataFrame(data_encoder)
df_encoder.head()
df_encoder.isnull().sum()
df_encoder.shape
df_encoder.count()
df_c.shape
merge = pd.concat([df_c.reset_index(drop=True), df_encoder.reset_index(drop=True)], axis=1)
merge.count()
merge
df_c = pd.concat([df_c.reset_index(drop=True), df_encoder.reset_index(drop=True)], axis=1)
df_c.drop(['have_urls'], axis=1, inplace=True)
df_c.count()
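# A simpler alternative sketch: pd.get_dummies builds the same two indicator
# columns with readable names (have_urls_False / have_urls_True) and preserves
# the original index, so no reset_index juggling is needed.
df_dummies = pd.get_dummies(datosInteresantes[datosInteresantes["followers"] < 8000000],
                            columns=['have_urls'])
df_dummies.head()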
from sklearn.model_selection import train_test_split
df_c
X_train, X_test, y_train, y_test = train_test_split(df_c, target, test_size=0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
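# 'followers' is on a vastly larger scale than the hour/day and indicator
# columns, which can destabilise gradient-based training. A hedged sketch:
# standardise the features first (X_train_s / X_test_s are new names
# introduced here, not used by the code below).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)  # reuse the training-set statistics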
import numpy as np
from sklearn.linear_model import LinearRegression
# Set a seed so runs always start from the same state.
np.random.seed(5)
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
X_tested = X_test.copy()
print(X_tested.shape)
print(predictions.shape)
X_tested['predictions'] = predictions
X_tested['Target_test'] = y_test.values
X_tested.head(25)
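# A quick evaluation sketch for the linear baseline: RMSE in the target's own
# units and R² as the share of variance explained.
from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Linear model RMSE: {rmse:.2f}  R²: {r2_score(y_test, predictions):.3f}")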
X_train
model = Sequential()
model.add(Dense(16, activation='relu', input_dim=5))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error')
np.random.seed(5)
tf.random.set_seed(5)
history = model.fit(X_train, y_train, epochs=50)
history.history["loss"]
predictions = model.predict(X_test)
X_tested = X_test.copy()
print(X_tested.shape)
print(predictions.shape)
X_tested['predictions'] = predictions
X_tested['Target_test'] = y_test.values
X_tested.head(35)
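# The same evaluation sketch as for the linear baseline (reusing the metrics
# imported above), so both models are compared on identical held-out data.
nn_rmse = np.sqrt(mean_squared_error(y_test, predictions.ravel()))
print(f"Network RMSE: {nn_rmse:.2f}  R²: {r2_score(y_test, predictions.ravel()):.3f}")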