import snscrape.modules.twitter as sntwitter
import pandas as pd

# Scrape every tweet carrying the #SOSMarMenor hashtag between 2012-01-01 and 2022-04-20.
# TwitterSearchScraper accepts the since:/until: operators as part of the query string
# (TwitterHashtagScraper prepends its own '#', so passing a full query there breaks it).
tweets_list_hashtag = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#SOSMarMenor since:2012-01-01 until:2022-04-20').get_items()):
    tweets_list_hashtag.append([tweet.id, tweet.date, tweet.content, tweet.user.username, tweet.mentionedUsers, tweet.retweetCount])

tweets_df = pd.DataFrame(tweets_list_hashtag, columns=['ID', 'Datetime', 'Text', 'Username', 'mentionedUsers', 'nretweets'])
tweets_df
tweets_df.to_csv("mar_menor.csv", index=False)
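# Scraping a decade of a hashtag is slow and the connection can drop mid-run.
# A minimal sketch of capping the scrape for a quick test (MAX_TWEETS is an
# arbitrary assumed limit, not part of the original run):
MAX_TWEETS = 10_000
tweets_sample = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#SOSMarMenor since:2012-01-01 until:2022-04-20').get_items()):
    if i >= MAX_TWEETS:
        break
    tweets_sample.append([tweet.id, tweet.date, tweet.content])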
import pandas as pd

# 'display.max_colwidth' is the full option name; the bare 'max_colwidth' alias fails on recent pandas
pd.set_option('display.max_colwidth', None)
tweets_df = pd.read_csv("mar_menor.csv", parse_dates=['Datetime'])
tweets_df.head(5)
import re
from os.path import exists

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import ToktokTokenizer, word_tokenize

stop_words = stopwords.words('spanish')
stemmer = SnowballStemmer("spanish")
def preprocesar_texto__tokenizar():
    """Clean, tokenize and stem the tweets, caching the result in preprocesado.csv."""
    if not exists("./preprocesado.csv"):
        tweets_pre = tweets_df.copy(deep=True)
        # Remove line breaks and URLs
        tweets_pre['Text'] = tweets_pre['Text'].apply(lambda x: re.sub(r'(\n)|(https?:\/\/.*[^\s])', '', x))
        # Remove hashtags
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'(?:#\w*)', '', x))
        # Remove @user mentions
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'(?:@\w*)', '', x))
        # Remove digits
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'\d+', '', x))
        # Remove punctuation, symbols and emojis, and lowercase everything
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'[^\w\s]', '', x).lower())
        # Collapse every run of whitespace into a single space and trim both ends
        # (replacing runs with '' as the original did would glue adjacent words together)
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
        # Tokenize
        tokenizer = ToktokTokenizer()
        tweets_pre["Tokens"] = tweets_pre.Text.apply(tokenizer.tokenize)
        # tweets_pre["Tokens"] = tweets_pre["Text"].apply(lambda x: word_tokenize(x))
        # Drop Spanish stopwords
        tweets_pre["Tokens"] = tweets_pre["Tokens"].apply(lambda x: [word for word in x if word not in stop_words])
        # Stem the remaining tokens
        tweets_pre["Stemming"] = tweets_pre["Tokens"].apply(lambda x: [stemmer.stem(word) for word in x])
        # Keep the @users and hashtags from the raw text in their own column
        tweets_pre["Usuarios_hashtags"] = tweets_df["Text"].apply(lambda x: re.findall(r'(?:@\w*)', x))
        tweets_pre["Usuarios_hashtags"] = tweets_pre["Usuarios_hashtags"] + tweets_df["Text"].apply(lambda x: re.findall(r'(?:#\w*)', x))
        # Cache the result
        tweets_pre.to_csv("preprocesado.csv", index=False)
    return pd.read_csv("preprocesado.csv", parse_dates=['Datetime'])
tweets_pre = preprocesar_texto__tokenizar()
tweets_pre.head(5)
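# Note on the CSV round trip: to_csv stringifies the list columns, so when
# preprocesado.csv is read back, "Tokens" and "Stemming" come back as strings
# like "['palabra', ...]" rather than Python lists. A small sketch to restore
# them if list operations are needed downstream:
import ast

for col in ["Tokens", "Stemming"]:
    tweets_pre[col] = tweets_pre[col].apply(ast.literal_eval)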
import pandas as pd
tweets_df = pd.read_csv("mar_menor.csv")
tweets_df.sort_values(by=['Datetime'], inplace=True)
tweets_df.head(10)
# Convert the Datetime column from object to datetime64
tweets_df["Datetime"] = pd.to_datetime(tweets_df["Datetime"])
tweets_df.dtypes
tweets_df["Mes"] = tweets_df['Datetime'].dt.month
tweets_df["Year"] = tweets_df['Datetime'].dt.year
tweets_df["Day"] = tweets_df['Datetime'].dt.day
tweets_df_grouped_year_month = tweets_df.groupby(["Mes","Year"])
tweets_df_grouped_year_month.size().sort_values(ascending=False)
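# The same counts are easier to scan as a month-by-year table; a small sketch
# pivoting the grouped sizes with unstack (same data, different layout):
tweets_df_grouped_year_month.size().unstack("Year", fill_value=0)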
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={"figure.figsize":(12, 7)})
sns.histplot(data=tweets_df, x="Mes", hue="Year", palette="Set1", kde=True).set(xlabel="Month", ylabel="Number of tweets")
tweets_df.plot(x="Datetime", y="nretweets", figsize=(17,8))
pd.set_option('display.max_colwidth', None)
tweets_df[(tweets_df["Year"] == 2021) & (tweets_df["Mes"] == 8)].sort_values(by=['nretweets'], ascending=False).head(10)
tweets_df[(tweets_df["Year"] == 2019) & (tweets_df["Mes"] == 10)].sort_values(by=['nretweets'], ascending=False).head(10)
tweets_df_grouped_user = tweets_df.groupby("Username")
# Number of distinct users that tweeted the hashtag
tweets_df["Username"].nunique()
tweets_df_grouped_user["Username"].count().sort_values(ascending=False).head(100)
tweets_df_grouped_user["Username"].count().sort_values(ascending=False).head(50).plot(kind='bar').set(xlabel="User", ylabel="Number of tweets")
import snscrape.modules.twitter

# Look up the profile description of the 20 most active users
# (Series.iteritems was removed in pandas 2.0; items() is the current API)
for username, n_tweets in tweets_df_grouped_user["Username"].count().sort_values(ascending=False).head(20).items():
    scraper = snscrape.modules.twitter.TwitterUserScraper(username)
    if scraper.entity:
        print(username, "-->", scraper.entity.description)
tweets_df.sort_values(by=['nretweets'], inplace=False, ascending=False).head(10)
top_10_df = tweets_df[["Username","ID","nretweets"]].sort_values(by=['nretweets'], inplace=False, ascending=False).head(10)
top_10_df
from IPython.display import display, HTML
import requests

# Render a tweet from its author's username and its id, via Twitter's oEmbed endpoint
def show_tweet(user, tweet_id):
    url = 'https://twitter.com/' + user + '/status/' + str(tweet_id)
    url_to_json = 'https://publish.twitter.com/oembed?url=%s' % url
    response = requests.get(url_to_json)
    html = response.json()["html"]
    display(HTML(html))

# Walk through the top_10_df dataframe and embed each tweet
for i in range(0, 10):
    show_tweet(top_10_df.iloc[i]["Username"], top_10_df.iloc[i]["ID"])
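# The oEmbed endpoint answers with an error payload that has no "html" field
# when a tweet has been deleted or made private, so show_tweet can raise a
# KeyError. A more defensive sketch (the fallback message is my own choice):
def show_tweet_safe(user, tweet_id):
    url = 'https://publish.twitter.com/oembed?url=https://twitter.com/%s/status/%s' % (user, tweet_id)
    response = requests.get(url)
    html = response.json().get("html") if response.ok else None
    if html:
        display(HTML(html))
    else:
        print("Tweet", tweet_id, "is no longer available")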
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
tweets_pre = pd.read_csv("preprocesado.csv", parse_dates=['Datetime'])
tweets_pre.head(2)
from nltk.tag import pos_tag
from nltk.corpus import stopwords

stop_words = stopwords.words('spanish')
# Drop rows whose Text ended up empty after cleaning, then verify
tweets_pre.dropna(inplace=True)
tweets_pre["Text"].isnull().sum()
# Slice out the two activity peaks seen above: October 2019 and August 2021
tweets_2019 = tweets_pre[(tweets_pre["Datetime"].dt.year == 2019) & (tweets_pre["Datetime"].dt.month == 10)]
tweets_2021 = tweets_pre[(tweets_pre["Datetime"].dt.year == 2021) & (tweets_pre["Datetime"].dt.month == 8)]
def sustantivos(df):
    """Collect the nouns (NN tags) from every tweet in df."""
    # Caveat: nltk.pos_tag ships an English-trained tagger, so its NN tags on
    # Spanish text are only a rough approximation.
    sustantivos = []
    for _, text in df["Text"].items():
        post_tag_sentences = pos_tag(word_tokenize(text))
        post_tag_sentences = [word for word in post_tag_sentences if word[0] not in stop_words]
        for word in post_tag_sentences:
            if word[1] == "NN":
                sustantivos.append(word[0])
    return sustantivos
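# A Spanish-aware alternative, sketched under the assumption that spaCy and
# its small Spanish model are installed (python -m spacy download
# es_core_news_sm); it is not part of the original pipeline:
import spacy

nlp = spacy.load("es_core_news_sm")

def sustantivos_spacy(df):
    # nlp.pipe streams the texts through the model; keep tokens tagged NOUN
    return [tok.text for doc in nlp.pipe(df["Text"]) for tok in doc if tok.pos_ == "NOUN"]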
sustantivos_2019 = sustantivos(tweets_2019)
sustantivos_2021 = sustantivos(tweets_2021)
# Word cloud of the October 2019 nouns
tweet_tokens = ' '.join(sustantivos_2019)
tweets_wc = WordCloud(width=1200, height=900, background_color="white").generate(tweet_tokens)
plt.figure(figsize=(8, 6), facecolor='k')
plt.imshow(tweets_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Word cloud of the August 2021 nouns
tweet_tokens = ' '.join(sustantivos_2021)
tweets_wc = WordCloud(width=1200, height=900, background_color="white").generate(tweet_tokens)
plt.figure(figsize=(8, 6), facecolor='k')
plt.imshow(tweets_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
tweets_pre = pd.read_csv("preprocesado.csv", parse_dates=['Datetime'])
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.tokenize import ToktokTokenizer

tweets_pre.dropna(inplace=True)
# Re-tokenize the cleaned text and drop stopwords to build the LDA input
tokenizer = ToktokTokenizer()
tweets_pre["Modelo"] = tweets_pre["Text"].apply(tokenizer.tokenize)
tweets_pre["Modelo"] = tweets_pre["Modelo"].apply(lambda x: [word for word in x if word not in stop_words])
tweets_pre["Modelo"]
tweets_pre["Modelo"][5][0:4]
# Map each token to an integer id, then encode every tweet as a bag of words
diccionario = Dictionary(tweets_pre["Modelo"])
len(diccionario)
corpus = [diccionario.doc2bow(token) for token in tweets_pre.Modelo]
lda = LdaModel(corpus=corpus, id2word=diccionario,
               num_topics=20, random_state=42,
               chunksize=200, passes=10, alpha='auto')
topicos = lda.print_topics(num_topics=20, num_words=5)
for topico in topicos:
    print(topico)
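# num_topics=20 is a judgment call; a sketch of sanity-checking it with
# gensim's topic-coherence measure (a higher c_v score usually corresponds to
# more interpretable topics):
from gensim.models import CoherenceModel

coherencia = CoherenceModel(model=lda, texts=list(tweets_pre["Modelo"]),
                            dictionary=diccionario, coherence='c_v')
print(coherencia.get_coherence())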
import random

# Pick a random tweet and inspect its topic distribution
# (randrange avoids the off-by-one of randint(0, len(tweets_pre)))
indice_tweet = random.randrange(len(tweets_pre))
tweet = tweets_pre.iloc[indice_tweet]
print("Tweet: " + tweet.Text)
print(tweet.Modelo)
bow_tweet = corpus[indice_tweet]
distribucion_tweet = lda[bow_tweet]
# Indices of the most significant topics
dist_indices = [topico[0] for topico in distribucion_tweet]
# Contribution of each of those topics
dist_contrib = [topico[1] for topico in distribucion_tweet]
distribucion_topicos = pd.DataFrame({'Topico': dist_indices,
                                     'Contribucion': dist_contrib})
distribucion_topicos.sort_values('Contribucion',
                                 ascending=False, inplace=True)
ax = distribucion_topicos.plot.bar(y='Contribucion', x='Topico',
                                   rot=0, color="orange",
                                   title='Most important topics for tweet ' + str(indice_tweet))
# Word cloud of the 20 most probable words for each of topics 1 to 9
for i in range(1, 10):
    plt.figure()
    plt.imshow(WordCloud(background_color='white', prefer_horizontal=1.0)
               .fit_words(dict(lda.show_topic(i, 20))))
    plt.axis("off")
    plt.title("Topic " + str(i))
    plt.show()
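# For interactive exploration of the fitted model, pyLDAvis can render the
# topics in the notebook. A sketch, assuming pyLDAvis (>= 3.x, which provides
# the gensim_models module) is installed; not part of the original analysis:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda, corpus, diccionario)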