import pandas as pd
# Load the "Outubro Rosa" (Pink October) tweet dump and keep only Portuguese rows.
df = pd.read_csv('list_OutubroRosa.csv', sep=";")
df = df.loc[df['lang'] == 'pt'].reset_index(drop=True)
# 'month' and the exported index column 'Unnamed: 0' carry no information here;
# a single drop(columns=...) avoids the inplace-drop-on-a-filtered-frame warning.
df = df.drop(columns=['month', 'Unnamed: 0'])
# 'share data' appears to be ';'-separated with labelled fields, e.g.
# "likes=a;retweets=b;replies=c;quotes=d" — TODO confirm against the CSV.
# Split once (instead of re-splitting per column) and strip each label.
share_parts = df["share data"].str.split(';')
df["retweets"] = share_parts.str[1].str.replace('retweets=', '')
df["replies"] = share_parts.str[2].str.replace('replies=', '')
df["quotes"] = share_parts.str[3].str.replace('quotes=', '')
df = df.drop(columns=['share data'])
# Helper to inspect which columns contain missing values.
def check_nulls(df):
    """Return a DataFrame listing columns with NaNs, most-affected first.

    Columns of the result: 'coluna' (column name), 'nans' (NaN count),
    'frac_nans' (NaN count / number of rows). Only columns with at least
    one NaN are included, sorted by 'nans' descending.
    """
    # One vectorized pass replaces the original per-column Python loop.
    nan_counts = df.isnull().sum()
    res = pd.DataFrame({
        'coluna': nan_counts.index,
        'nans': nan_counts.values,
        'frac_nans': nan_counts.values / df.shape[0],
    })
    return res[res.nans > 0].sort_values('nans', ascending=False)
check_nulls(df)
df_fill_na = df.copy()
# Zero-fill the engagement counters; a missing url gets a placeholder link.
for _counter in ("likes", "retweets", "replies", "quotes"):
    df_fill_na[_counter] = df_fill_na[_counter].fillna(0)
df_fill_na["url"] = df_fill_na["url"].fillna('https://twitter.com/')
check_nulls(df_fill_na)
# Drop the columns that will not be used as model features.
# One drop(columns=...) call replaces six chained drops; drop() already
# returns a new frame, so the extra .copy() was redundant.
replaced_df = df_fill_na.drop(
    columns=['url', 'lang', 'date', 'retweets', 'replies', 'quotes'])
# All usernames ordered by tweet frequency (most frequent first).
usernames = replaced_df['username'].value_counts(dropna=False).index.tolist()
# Map each username to its 1-based frequency rank (1 = most frequent);
# enumerate replaces the original manual index-append loop.
replace_dict = {user: rank for rank, user in enumerate(usernames, start=1)}
df_filled = replaced_df.copy()
# Replace the categorical attribute by its numeric rank.
df_filled['username'] = df_filled['username'].map(replace_dict)
# Import NLTK, installing it on the fly if missing.
try:
    import nltk
except ImportError:
    # "!pip install" is IPython-only syntax and breaks a plain .py file;
    # install programmatically through the current interpreter instead.
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk", "-q"])
    import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
# Portuguese stopwords + punctuation, extended with Twitter-specific noise
# tokens (URL fragments, "lrt" etc.).
sw = set(stopwords.words('portuguese') + list(punctuation))
sw.update(['http', 'https', 'co', 't', 'lrt'])
# Remove stopwords from the tweet texts.
df_nosw = df_filled.copy()

def _strip_stopwords(text):
    # NOTE: membership is tested on the original casing and only the kept
    # tokens are lowercased, matching the previous lambda exactly.
    return ' '.join(token.lower() for token in text.split() if token not in sw)

df_nosw["text"] = df_nosw["text"].apply(_strip_stopwords)
df_nosw
from tensorflow.keras.preprocessing.text import Tokenizer

tokenized_df = df_nosw.copy()
# Fit a word-index vocabulary on the corpus, then encode each tweet as a
# sequence of integer word ids. The original called texts_to_sequences
# twice and discarded the first result; it also had an unused bare
# "import numpy" — both removed.
tfTokenizer = Tokenizer(num_words=None)
tfTokenizer.fit_on_texts(tokenized_df['text'])
tokenized_df['text'] = tfTokenizer.texts_to_sequences(tokenized_df['text'])
import numpy as np

# Expand the variable-length token-id lists into one column per position,
# keeping username and likes alongside; positions past a tweet's length
# become 0.
df_filled_final = tokenized_df.copy()
token_columns = df_filled_final.pop('text').apply(pd.Series)
df_filled_final = pd.concat(
    [token_columns, df_filled_final['username'], df_filled_final['likes']],
    axis=1,
)
df_filled_final = df_filled_final.fillna(0)
df_filled_final["username"] = df_filled_final["username"].astype(float)
from sklearn.model_selection import train_test_split

df_training_final = df_filled_final.copy()
# Features = token columns + username rank; target = number of likes.
x, y = df_training_final.drop('likes', axis=1), df_training_final['likes']
p_x_train, p_x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Min-max scale using statistics from the TRAINING split only. The previous
# code scaled the test set with its own min/max, which leaks test-set
# information and puts the two splits on inconsistent scales.
train_min = p_x_train.min()
train_range = p_x_train.max() - train_min
x_train = (p_x_train - train_min) / train_range
x_test = (p_x_test - train_min) / train_range
# Constant columns yield 0/0 = NaN; fill BOTH splits (before, only x_train
# was filled, leaving NaNs in x_test).
x_train.replace(np.nan, 0, inplace=True)
x_test.replace(np.nan, 0, inplace=True)
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import warnings

# NOTE(review): this silences ALL warnings globally for the rest of the run.
warnings.filterwarnings('ignore')

# Regression-tree baseline.
tree_model = DecisionTreeRegressor(random_state=42)
# Train on the scaled training split.
tree_model.fit(x_train, y_train)
# Predict on the held-out test split.
y_pred = tree_model.predict(x_test)
# squared=False makes this the ROOT mean squared error — the printed label
# said "MSE", which misreported the metric.
mse = mean_squared_error(y_test, y_pred, squared=False)
rt_cv = cross_validate(tree_model, x_train, y_train, cv=10)
print("RMSE: ", str(mse))
print("Cross-validation:\n", pd.DataFrame(rt_cv).mean())
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Small fully-connected regressor: three sigmoid hidden layers and a
# single linear output for the predicted like count.
model = keras.Sequential()
model.add(layers.Dense(97, activation='sigmoid', input_shape=[len(x_train.keys())]))
model.add(layers.Dense(100, activation='sigmoid'))
model.add(layers.Dense(70, activation='sigmoid'))
model.add(layers.Dense(1))

model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.RMSprop(0.001),
    metrics=['mae', 'mse'],
)

model.fit(x_train, y_train, epochs=20, callbacks=[])
print(model.evaluate(x_test, y_test, verbose=2))
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor with 50 trees.
forest_model = RandomForestRegressor(n_estimators=50, random_state=0)
forest_model.fit(x_train, y_train)
# Predict on the held-out test split.
y_pred_rf = forest_model.predict(x_test)
# squared=False → root mean squared error; label corrected from "MSE".
mse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
rf_cv = cross_validate(forest_model, x_train, y_train, cv=10)
print("RMSE com 50 estimadores:", mse_rf)
print("Cross-validation:\n", pd.DataFrame(rf_cv).mean())
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB is a *classifier* — it treats every distinct
# like-count as a discrete class label, which is statistically dubious for
# this regression target. Kept for comparison; a Bayesian regressor
# (e.g. sklearn BayesianRidge) would be the sound choice.
gnb = GaussianNB()
# fit() returns the estimator itself; the original pointlessly bound that
# return value to y_pred_nb and immediately overwrote it.
gnb.fit(x_train, y_train)
y_pred_nb = gnb.predict(x_test)
# squared=False → root mean squared error; label corrected from "MSE".
mse_nb = mean_squared_error(y_test, y_pred_nb, squared=False)
nb_cv = cross_validate(gnb, x_train, y_train, cv=10)
print("RMSE:", mse_nb)
print("Cross-validation:\n", pd.DataFrame(nb_cv).mean())
# Import XGBoost, installing it on the fly if missing.
try:
    import xgboost as xgb
except ImportError:
    # "!pip install" is IPython-only syntax and breaks a plain .py file;
    # install programmatically through the current interpreter instead.
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    import xgboost as xgb
from sklearn.model_selection import cross_validate

xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                          learning_rate=0.1, max_depth=8, alpha=10,
                          n_estimators=100)
# Train on the scaled training split.
xg_reg.fit(x_train, y_train)
# Predict on the held-out test split.
y_pred_xg = xg_reg.predict(x_test)
# squared=False → root mean squared error; label corrected from "MSE".
mse_xg = mean_squared_error(y_test, y_pred_xg, squared=False)
xg_cv = cross_validate(xg_reg, x_train, y_train, cv=10)
print("RMSE: ", mse_xg)
print("Cross-validation:\n", pd.DataFrame(xg_cv).mean())
# Compare the three models' RMSE side by side in a bar chart.
msedata = pd.DataFrame(
    {"RMSE": [mse_rf, mse_nb, mse_xg]},          # legend was mislabelled "pies"
    index=["Random Forest", "Naive Bayes", "XGBoost"])  # fixed "Baiyes" typo
msedata.plot(kind="bar")