import snscrape.modules.twitter as sntwitter
import pandas as pd

# Scrape every tweet carrying the #SOSMarMenor hashtag between 2012-01-01 and 2022-04-20.
# TwitterSearchScraper accepts the since:/until: operators as part of the query string
# (TwitterHashtagScraper prepends its own '#', so passing a full query there breaks it).
tweets_list_hashtag = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#SOSMarMenor since:2012-01-01 until:2022-04-20').get_items()):
    tweets_list_hashtag.append([tweet.id, tweet.date, tweet.content, tweet.user.username, tweet.mentionedUsers, tweet.retweetCount])

tweets_df = pd.DataFrame(tweets_list_hashtag, columns=['ID', 'Datetime', 'Text', 'Username', 'mentionedUsers', 'nretweets'])
tweets_df
tweets_df.to_csv("mar_menor.csv", index=False)
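# Scraping a decade of a hashtag is slow and the connection can drop mid-run.
# A minimal sketch of capping the scrape for a quick test (MAX_TWEETS is an
# arbitrary assumed limit, not part of the original run):
MAX_TWEETS = 10_000
tweets_sample = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('#SOSMarMenor since:2012-01-01 until:2022-04-20').get_items()):
    if i >= MAX_TWEETS:
        break
    tweets_sample.append([tweet.id, tweet.date, tweet.content])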
import pandas as pd

# 'display.max_colwidth' is the full option name; the bare 'max_colwidth' alias fails on recent pandas
pd.set_option('display.max_colwidth', None)
tweets_df = pd.read_csv("mar_menor.csv", parse_dates=['Datetime'])
tweets_df.head(5)
import re
from os.path import exists

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import ToktokTokenizer, word_tokenize

stop_words = stopwords.words('spanish')
stemmer = SnowballStemmer("spanish")
def preprocesar_texto__tokenizar():
    """Clean, tokenize and stem the tweets, caching the result in preprocesado.csv."""
    if not exists("./preprocesado.csv"):
        tweets_pre = tweets_df.copy(deep=True)
        # Remove line breaks and URLs
        tweets_pre['Text'] = tweets_pre['Text'].apply(lambda x: re.sub(r'(\n)|(https?:\/\/.*[^\s])', '', x))
        # Remove hashtags
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'(?:#\w*)', '', x))
        # Remove @user mentions
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'(?:@\w*)', '', x))
        # Remove digits
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'\d+', '', x))
        # Remove punctuation, symbols and emojis, and lowercase everything
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'[^\w\s]', '', x).lower())
        # Collapse every run of whitespace into a single space and trim both ends
        # (replacing runs with '' as the original did would glue adjacent words together)
        tweets_pre["Text"] = tweets_pre["Text"].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
        # Tokenize
        tokenizer = ToktokTokenizer()
        tweets_pre["Tokens"] = tweets_pre.Text.apply(tokenizer.tokenize)
        # tweets_pre["Tokens"] = tweets_pre["Text"].apply(lambda x: word_tokenize(x))
        # Drop Spanish stopwords
        tweets_pre["Tokens"] = tweets_pre["Tokens"].apply(lambda x: [word for word in x if word not in stop_words])
        # Stem the remaining tokens
        tweets_pre["Stemming"] = tweets_pre["Tokens"].apply(lambda x: [stemmer.stem(word) for word in x])
        # Keep the @users and hashtags from the raw text in their own column
        tweets_pre["Usuarios_hashtags"] = tweets_df["Text"].apply(lambda x: re.findall(r'(?:@\w*)', x))
        tweets_pre["Usuarios_hashtags"] = tweets_pre["Usuarios_hashtags"] + tweets_df["Text"].apply(lambda x: re.findall(r'(?:#\w*)', x))
        # Cache the result
        tweets_pre.to_csv("preprocesado.csv", index=False)
    return pd.read_csv("preprocesado.csv", parse_dates=['Datetime'])
tweets_pre = preprocesar_texto__tokenizar()
tweets_pre.head(5)
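# Note on the CSV round trip: to_csv stringifies the list columns, so when
# preprocesado.csv is read back, "Tokens" and "Stemming" come back as strings
# like "['palabra', ...]" rather than Python lists. A small sketch to restore
# them if list operations are needed downstream:
import ast

for col in ["Tokens", "Stemming"]:
    tweets_pre[col] = tweets_pre[col].apply(ast.literal_eval)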
import pandas as pd
tweets_df = pd.read_csv("mar_menor.csv")
tweets_df.sort_values(by=['Datetime'], inplace=True)
tweets_df.head(10)
# Convert the Datetime column from object to datetime64
tweets_df["Datetime"] = pd.to_datetime(tweets_df["Datetime"])
tweets_df.dtypes
tweets_df["Mes"] = tweets_df['Datetime'].dt.month
tweets_df["Year"] = tweets_df['Datetime'].dt.year
tweets_df["Day"] = tweets_df['Datetime'].dt.day
tweets_df_grouped_year_month = tweets_df.groupby(["Mes","Year"])
tweets_df_grouped_year_month.size().sort_values(ascending=False)
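# The same counts are easier to scan as a month-by-year table; a small sketch
# pivoting the grouped sizes with unstack (same data, different layout):
tweets_df_grouped_year_month.size().unstack("Year", fill_value=0)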
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={"figure.figsize":(12, 7)})
sns.histplot(data=tweets_df, x="Mes", hue="Year", palette="Set1", kde=True).set(xlabel="Month", ylabel="Number of tweets")
tweets_df.plot(x="Datetime", y="nretweets", figsize=(17,8))
pd.set_option('display.max_colwidth', None)
tweets_df[(tweets_df["Year"] == 2021) & (tweets_df["Mes"] == 8)].sort_values(by=['nretweets'], ascending=False).head(10)
tweets_df[(tweets_df["Year"] == 2019) & (tweets_df["Mes"] == 10)].sort_values(by=['nretweets'], ascending=False).head(10)
tweets_df_grouped_user = tweets_df.groupby("Username")
# Number of distinct users that tweeted the hashtag
tweets_df["Username"].nunique()
tweets_df_grouped_user["Username"].count().sort_values(ascending=False).head(100)
tweets_df_grouped_user["Username"].count().sort_values(ascending=False).head(50).plot(kind='bar').set(xlabel="User", ylabel="Number of tweets")
import snscrape.modules.twitter

# Look up the profile description of the 20 most active users
# (Series.iteritems was removed in pandas 2.0; items() is the current API)
for username, n_tweets in tweets_df_grouped_user["Username"].count().sort_values(ascending=False).head(20).items():
    scraper = snscrape.modules.twitter.TwitterUserScraper(username)
    if scraper.entity:
        print(username, "-->", scraper.entity.description)
tweets_df.sort_values(by=['nretweets'], inplace=False, ascending=False).head(10)
top_10_df = tweets_df[["Username","ID","nretweets"]].sort_values(by=['nretweets'], inplace=False, ascending=False).head(10)
top_10_df
from IPython.display import display, HTML
import requests

# Render a tweet from its author's username and its id, via Twitter's oEmbed endpoint
def show_tweet(user, tweet_id):
    url = 'https://twitter.com/' + user + '/status/' + str(tweet_id)
    url_to_json = 'https://publish.twitter.com/oembed?url=%s' % url
    response = requests.get(url_to_json)
    html = response.json()["html"]
    display(HTML(html))

# Walk through the top_10_df dataframe and embed each tweet
for i in range(0, 10):
    show_tweet(top_10_df.iloc[i]["Username"], top_10_df.iloc[i]["ID"])
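# The oEmbed endpoint answers with an error payload that has no "html" field
# when a tweet has been deleted or made private, so show_tweet can raise a
# KeyError. A more defensive sketch (the fallback message is my own choice):
def show_tweet_safe(user, tweet_id):
    url = 'https://publish.twitter.com/oembed?url=https://twitter.com/%s/status/%s' % (user, tweet_id)
    response = requests.get(url)
    html = response.json().get("html") if response.ok else None
    if html:
        display(HTML(html))
    else:
        print("Tweet", tweet_id, "is no longer available")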
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
tweets_pre = pd.read_csv("preprocesado.csv", parse_dates=['Datetime'])
tweets_pre.head(2)
from nltk.tag import pos_tag
from nltk.corpus import stopwords

stop_words = stopwords.words('spanish')
# Drop rows whose Text ended up empty after cleaning, then verify
tweets_pre.dropna(inplace=True)
tweets_pre["Text"].isnull().sum()
# Slice out the two activity peaks seen above: October 2019 and August 2021
tweets_2019 = tweets_pre[(tweets_pre["Datetime"].dt.year == 2019) & (tweets_pre["Datetime"].dt.month == 10)]
tweets_2021 = tweets_pre[(tweets_pre["Datetime"].dt.year == 2021) & (tweets_pre["Datetime"].dt.month == 8)]
def sustantivos(df):
    """Collect the nouns (NN tags) from every tweet in df."""
    # Caveat: nltk.pos_tag ships an English-trained tagger, so its NN tags on
    # Spanish text are only a rough approximation.
    sustantivos = []
    for _, text in df["Text"].items():
        post_tag_sentences = pos_tag(word_tokenize(text))
        post_tag_sentences = [word for word in post_tag_sentences if word[0] not in stop_words]
        for word in post_tag_sentences:
            if word[1] == "NN":
                sustantivos.append(word[0])
    return sustantivos
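# A Spanish-aware alternative, sketched under the assumption that spaCy and
# its small Spanish model are installed (python -m spacy download
# es_core_news_sm); it is not part of the original pipeline:
import spacy

nlp = spacy.load("es_core_news_sm")

def sustantivos_spacy(df):
    # nlp.pipe streams the texts through the model; keep tokens tagged NOUN
    return [tok.text for doc in nlp.pipe(df["Text"]) for tok in doc if tok.pos_ == "NOUN"]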
sustantivos_2019 = sustantivos(tweets_2019)
sustantivos_2021 = sustantivos(tweets_2021)
# Word cloud of the October 2019 nouns
tweet_tokens = ' '.join(sustantivos_2019)
tweets_wc = WordCloud(width=1200, height=900, background_color="white").generate(tweet_tokens)
plt.figure(figsize=(8, 6), facecolor='k')
plt.imshow(tweets_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Word cloud of the August 2021 nouns
tweet_tokens = ' '.join(sustantivos_2021)
tweets_wc = WordCloud(width=1200, height=900, background_color="white").generate(tweet_tokens)
plt.figure(figsize=(8, 6), facecolor='k')
plt.imshow(tweets_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
tweets_pre = pd.read_csv("preprocesado.csv", parse_dates=['Datetime'])
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.tokenize import ToktokTokenizer

tweets_pre.dropna(inplace=True)
# Re-tokenize the cleaned text and drop stopwords to build the LDA input
tokenizer = ToktokTokenizer()
tweets_pre["Modelo"] = tweets_pre["Text"].apply(tokenizer.tokenize)
tweets_pre["Modelo"] = tweets_pre["Modelo"].apply(lambda x: [word for word in x if word not in stop_words])
tweets_pre["Modelo"]
tweets_pre["Modelo"][5][0:4]
# Map each token to an integer id, then encode every tweet as a bag of words
diccionario = Dictionary(tweets_pre["Modelo"])
len(diccionario)
corpus = [diccionario.doc2bow(token) for token in tweets_pre.Modelo]
lda = LdaModel(corpus=corpus, id2word=diccionario,
               num_topics=20, random_state=42,
               chunksize=200, passes=10, alpha='auto')
topicos = lda.print_topics(num_topics=20, num_words=5)
for topico in topicos:
    print(topico)
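# num_topics=20 is a judgment call; a sketch of sanity-checking it with
# gensim's topic-coherence measure (a higher c_v score usually corresponds to
# more interpretable topics):
from gensim.models import CoherenceModel

coherencia = CoherenceModel(model=lda, texts=list(tweets_pre["Modelo"]),
                            dictionary=diccionario, coherence='c_v')
print(coherencia.get_coherence())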
import random

# Pick a random tweet and inspect its topic distribution
# (randrange avoids the off-by-one of randint(0, len(tweets_pre)))
indice_tweet = random.randrange(len(tweets_pre))
tweet = tweets_pre.iloc[indice_tweet]
print("Tweet: " + tweet.Text)
print(tweet.Modelo)
bow_tweet = corpus[indice_tweet]
distribucion_tweet = lda[bow_tweet]
# Indices of the most significant topics
dist_indices = [topico[0] for topico in distribucion_tweet]
# Contribution of each of those topics
dist_contrib = [topico[1] for topico in distribucion_tweet]
distribucion_topicos = pd.DataFrame({'Topico': dist_indices,
                                     'Contribucion': dist_contrib})
distribucion_topicos.sort_values('Contribucion',
                                 ascending=False, inplace=True)
ax = distribucion_topicos.plot.bar(y='Contribucion', x='Topico',
                                   rot=0, color="orange",
                                   title='Most important topics for tweet ' + str(indice_tweet))
# Word cloud of the 20 most probable words for each of topics 1 to 9
for i in range(1, 10):
    plt.figure()
    plt.imshow(WordCloud(background_color='white', prefer_horizontal=1.0)
               .fit_words(dict(lda.show_topic(i, 20))))
    plt.axis("off")
    plt.title("Topic " + str(i))
    plt.show()
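# For interactive exploration of the fitted model, pyLDAvis can render the
# topics in the notebook. A sketch, assuming pyLDAvis (>= 3.x, which provides
# the gensim_models module) is installed; not part of the original analysis:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda, corpus, diccionario)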