import pandas as pd

# Load the Outubro Rosa tweet dump. low_memory=False reads each column in a
# single pass so pandas infers one dtype per column, silencing the
# DtypeWarning about mixed types that chunked parsing produces.
df = pd.read_csv('list_OutubroRosa.csv', sep=";", low_memory=False)
# Keep only tweets tagged as Portuguese.
df = df.loc[df['lang'] == 'pt']
# Drop the unused 'month' column and the leftover CSV index column, then
# renumber rows. Reassigning (instead of inplace drops on a filtered view)
# avoids SettingWithCopy warnings.
df = df.drop(columns=['month', 'Unnamed: 0']).reset_index(drop=True)
df
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (0,1,2,3,4,6,7,8) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
# Split the packed 'share data' field into separate count columns.
# Assumes the field looks like "...retweets=X;replies=Y;quotes=Z..."
# (semicolon-separated name=value pairs) — TODO confirm against the raw CSV.
# str.extract pulls each value directly instead of the original
# replace-prefix / split / positional-index chain, which silently returned
# the wrong field for malformed rows; here malformed rows become NaN.
df["retweets"] = df["share data"].str.extract(r'retweets=([^;]*)')
df["replies"] = df["share data"].str.extract(r'replies=([^;]*)')
df["quotes"] = df["share data"].str.extract(r'quotes=([^;]*)')
# The packed column is no longer needed.
df = df.drop(columns=['share data'])
df
# helper to inspect which columns still contain missing values
def check_nulls(df):
    """Summarize the null values of *df*.

    Returns a DataFrame with columns 'coluna' (column name), 'nans'
    (null count) and 'frac_nans' (null fraction), restricted to columns
    that actually contain nulls and sorted by null count, descending.
    """
    # A zero-column frame would make the res.nans lookup below fail;
    # return an empty (but well-formed) summary instead.
    if df.shape[1] == 0:
        return pd.DataFrame(columns=['coluna', 'nans', 'frac_nans'])
    total = df.shape[0]
    rows = []
    for column in df.columns:
        # Count once and reuse (the original computed isnull().sum() twice).
        n_null = df[column].isnull().sum()
        rows.append({'coluna': column, 'nans': n_null, 'frac_nans': n_null / total})
    res = pd.DataFrame(rows)
    return res[res.nans > 0].sort_values('nans', ascending=False)
check_nulls(df)
df_fill_na = df.copy()
# A missing engagement count means the tweet simply had none -> fill with 0;
# missing URLs get a neutral placeholder. One fillna(dict) call replaces the
# five repeated per-column statements.
df_fill_na = df_fill_na.fillna({
    'likes': 0,
    'retweets': 0,
    'replies': 0,
    'quotes': 0,
    'url': 'https://twitter.com/',
})
# Verify nothing relevant is still missing.
check_nulls(df_fill_na)
replaced_df = df_fill_na.copy()
# Drop the attributes that are not used as model features, in a single call
# instead of six separate drop statements.
replaced_df = replaced_df.drop(columns=['url', 'lang', 'date', 'retweets', 'replies', 'quotes'])
replaced_df
# vector of all usernames ordered by frequency (most frequent first)
usernames = replaced_df['username'].value_counts(dropna=False).keys().tolist()
# 1-based integer ids: most frequent username -> 1, next -> 2, ...
# (range replaces the original manual append loop).
usernames_ids = list(range(1, len(usernames) + 1))
replace_dict = dict(zip(usernames, usernames_ids))
df_filled = replaced_df.copy()
# replace the categorical attribute by its numeric id
df_filled['username'] = df_filled['username'].map(replace_dict)
df_filled
#importando NLTK
try:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
except:
!pip install nltk -q
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
#lista de stopwords do portugues
sw = set(stopwords.words('portuguese') + list(punctuation))
sw.update(['http', 'https', 'co', 't', 'lrt'])
# remove stopwords from the tweet texts. Tokens are lowercased BEFORE the
# membership test — the original filtered on the original-cased token but
# emitted the lowercased one, so capitalized stopwords (e.g. "Http") leaked
# through the filter.
df_nosw = df_filled.copy()
df_nosw["text"] = df_nosw["text"].apply(
    lambda words: ' '.join(w for w in (t.lower() for t in words.split()) if w not in sw)
)
df_nosw
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
from tensorflow.keras.preprocessing.text import Tokenizer
tokenized_df = df_nosw.copy()
# Tokenize the sentences: fit builds the vocabulary, texts_to_sequences maps
# each text to its list of integer word indices. The original called
# texts_to_sequences twice and discarded the first result.
tfTokenizer = Tokenizer(num_words=None)
tfTokenizer.fit_on_texts(tokenized_df['text'])
import numpy
tokenized_df['text'] = tfTokenizer.texts_to_sequences(tokenized_df['text'])
tokenized_df
import numpy as np
df_filled_final = tokenized_df.copy()
# Expand the token-id lists in 'text' into one column per position (ragged
# rows are padded with NaN by apply(pd.Series)), keeping username and likes
# alongside. Selecting the columns explicitly avoids the original pop()'s
# hidden mutation of the frame mid-expression.
text_cols = df_filled_final['text'].apply(pd.Series)
df_filled_final = pd.concat(
    [text_cols, df_filled_final['username'], df_filled_final['likes']], axis=1)
# Positions shorter than the longest tweet become NaN; use 0 as padding.
df_filled_final.replace(np.nan, 0, inplace=True)
df_filled_final["username"] = df_filled_final["username"].astype(float)
df_filled_final
from sklearn.model_selection import train_test_split
df_training_final = df_filled_final.copy()
# Features = token columns + username; target = like count.
x, y = df_training_final.drop('likes', axis=1), df_training_final['likes']
p_x_train, p_x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Min-max scale BOTH sets with statistics computed on the training set only.
# The original scaled the test set with its own min/max, which puts the two
# sets on different scales (and leaks test statistics into evaluation).
train_min = p_x_train.min()
train_range = p_x_train.max() - train_min
x_train = (p_x_train - train_min) / train_range
x_test = (p_x_test - train_min) / train_range
# Constant columns yield 0/0 -> NaN; fill in both sets (the original only
# cleaned x_train, leaving NaNs in x_test).
x_train.replace(np.nan, 0, inplace=True)
x_test.replace(np.nan, 0, inplace=True)
x_train
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
# initialize the regression tree model
tree_model = DecisionTreeRegressor(random_state=42)
# train the model with fit
tree_model.fit(x_train, y_train)
# predict on the held-out test set
y_pred = tree_model.predict(x_test)
# squared=False means this is the ROOT mean squared error (RMSE), not MSE;
# the variable and printed label now say so.
rmse = mean_squared_error(y_test, y_pred, squared=False)
rt_cv = cross_validate(tree_model, x_train, y_train, cv=10)
print("RMSE: ", str(rmse))
print("Cross-validation:\n", pd.DataFrame(rt_cv).mean())
Tensorflow's Tokenizer: 96.56659114028972
Cross-validation:
fit_time 6.582565
score_time 0.024795
test_score -38.454986
dtype: float64
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Feed-forward regression network: three sigmoid hidden layers and a single
# linear output unit predicting the like count.
model = keras.Sequential()
model.add(layers.Dense(97, activation='sigmoid', input_shape=[len(x_train.keys())]))
model.add(layers.Dense(100, activation='sigmoid'))
model.add(layers.Dense(70, activation='sigmoid'))
model.add(layers.Dense(1))

# Mean-squared-error loss with RMSprop; track MAE and MSE while training.
model.compile(loss='mse', optimizer=tf.keras.optimizers.RMSprop(0.001), metrics=['mae', 'mse'])

# Train for 20 epochs, then report loss/MAE/MSE on the test split.
model.fit(x_train, y_train, epochs=20, callbacks=[])
print(model.evaluate(x_test, y_test, verbose=2))
Epoch 1/20
4945/4945 [==============================] - 28s 5ms/step - loss: 79954.2330 - mae: 5.6692 - mse: 79954.2330
Epoch 2/20
4945/4945 [==============================] - 23s 5ms/step - loss: 39660.7204 - mae: 5.0726 - mse: 39660.7204
Epoch 3/20
4945/4945 [==============================] - 21s 4ms/step - loss: 17212.6656 - mae: 4.1326 - mse: 17212.6656
Epoch 4/20
4945/4945 [==============================] - 22s 4ms/step - loss: 69220.8634 - mae: 5.1591 - mse: 69220.8634
Epoch 5/20
4945/4945 [==============================] - 23s 5ms/step - loss: 21907.0095 - mae: 4.2106 - mse: 21907.0095
Epoch 6/20
4945/4945 [==============================] - 24s 5ms/step - loss: 35664.6035 - mae: 4.9047 - mse: 35664.6035
Epoch 7/20
4945/4945 [==============================] - 24s 5ms/step - loss: 35049.0121 - mae: 4.3012 - mse: 35049.0121
Epoch 8/20
4945/4945 [==============================] - 23s 5ms/step - loss: 18706.3254 - mae: 4.3423 - mse: 18706.3254
Epoch 9/20
4945/4945 [==============================] - 23s 5ms/step - loss: 7768.6861 - mae: 4.2930 - mse: 7768.6861
Epoch 10/20
4945/4945 [==============================] - 23s 5ms/step - loss: 25984.7842 - mae: 4.6888 - mse: 25984.7842
Epoch 11/20
4945/4945 [==============================] - 24s 5ms/step - loss: 63586.7697 - mae: 4.9788 - mse: 63586.7697
Epoch 12/20
4945/4945 [==============================] - 24s 5ms/step - loss: 77400.8653 - mae: 5.0893 - mse: 77400.8653
Epoch 13/20
4945/4945 [==============================] - 23s 5ms/step - loss: 63408.7130 - mae: 5.0265 - mse: 63408.7130
Epoch 14/20
4945/4945 [==============================] - 23s 5ms/step - loss: 44360.8228 - mae: 5.2489 - mse: 44360.8228
Epoch 15/20
4945/4945 [==============================] - 24s 5ms/step - loss: 3159.9667 - mae: 3.9978 - mse: 3159.9667
Epoch 16/20
4945/4945 [==============================] - 23s 5ms/step - loss: 34093.0335 - mae: 4.9777 - mse: 34093.0335
Epoch 17/20
4945/4945 [==============================] - 24s 5ms/step - loss: 12653.5435 - mae: 4.3828 - mse: 12653.5435
Epoch 18/20
4945/4945 [==============================] - 23s 5ms/step - loss: 24032.7603 - mae: 4.3640 - mse: 24032.7603
Epoch 19/20
4945/4945 [==============================] - 23s 5ms/step - loss: 216118.3651 - mae: 7.2122 - mse: 216118.3651
Epoch 20/20
4945/4945 [==============================] - 23s 5ms/step - loss: 12727.2231 - mae: 4.2698 - mse: 12727.2231
2120/2120 - 5s - loss: 2503.4844 - mae: 3.1081 - mse: 2503.4844
[2503.484375, 3.1081132888793945, 2503.484375]
from sklearn.ensemble import RandomForestRegressor
# random forest regressor with 50 trees
forest_model = RandomForestRegressor(n_estimators=50, random_state=0)
forest_model.fit(x_train, y_train)
# predict on the test set
y_pred_rf = forest_model.predict(x_test)
# squared=False -> this is RMSE; the label used to (wrongly) say MSE
mse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
rf_cv = cross_validate(forest_model, x_train, y_train, cv=10)
print("RMSE com 50 estimadores:", mse_rf)
print("Cross-validation:\n", pd.DataFrame(rf_cv).mean())
MSE com 75 estimadores: 77.01166444255793
Cross-validation:
fit_time 121.310203
score_time 0.232282
test_score -2.779421
dtype: float64
from sklearn.naive_bayes import GaussianNB
# NOTE(review): GaussianNB is a *classifier*; fitting it on the continuous
# like-count target treats each count as a class label. Kept for comparison
# with the regressors, but the results should be read with that in mind.
gnb = GaussianNB()
# fit and predict as two clear steps (the original assigned the estimator
# returned by fit() to y_pred_nb and immediately overwrote it)
gnb.fit(x_train, y_train)
y_pred_nb = gnb.predict(x_test)
# squared=False -> RMSE
mse_nb = mean_squared_error(y_test, y_pred_nb, squared=False)
nb_cv = cross_validate(gnb, x_train, y_train, cv=10)
print("RMSE:", mse_nb)
print("Cross-validation:\n", pd.DataFrame(nb_cv).mean())
MSE: 379.28290712539325
Cross-validation:
fit_time 0.548601
score_time 3.457848
test_score 0.000114
dtype: float64
try:
import xgboost as xgb
from sklearn.model_selection import cross_validate
except:
!pip install xgboost
import xgboost as xgb
from sklearn.model_selection import cross_validate
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 8, alpha = 10, n_estimators = 100)
# treina o modelo utilizando o método fit
xg_reg.fit(x_train, y_train)
# faz as previsões com o conjunto de testes
y_pred_xg = xg_reg.predict(x_test)
# Imprime os resultados alcançados
mse_xg = mean_squared_error(y_test, y_pred_xg, squared=False)
xg_cv = cross_validate(xg_reg, x_train, y_train, cv=10)
print("MSE: ",mse_xg)
print("Cross-validation:\n",pd.DataFrame(xg_cv).mean())
Collecting xgboost
Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
|████████████████████████████████| 173.5 MB 41.3 MB/s
Requirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from xgboost) (1.7.2)
Requirement already satisfied: numpy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from xgboost) (1.19.5)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0
WARNING: You are using pip version 20.1.1; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
MSE: 61.5786523536647
Cross-validation:
fit_time 39.453960
score_time 0.358409
test_score -3.501130
dtype: float64
# Compare the three models' root-mean-squared errors in a bar chart.
# Fixed the "Naive Baiyes" typo and renamed the meaningless series label
# "pies" to "RMSE" so the plot legend reads correctly.
msedata = pd.DataFrame(
    {"RMSE": [mse_rf, mse_nb, mse_xg]},
    index=["Random Forest", "Naive Bayes", "XGBoost"])
# Plot a bar chart
msedata.plot(kind="bar")