M2 Project
Intro
https://www.kaggle.com/crowdflower/twitter-airline-sentiment
#install kaggle package to import Kaggle API
!pip install -qq kaggle
#make folder for api key
!mkdir ~/.kaggle
#copy the api key to the kaggle folder
!cp kaggle.json ~/.kaggle
#set permissions for the key
!chmod 600 ~/.kaggle/kaggle.json
#importing relevant packages
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
#to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')
#Instantiating the English module
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
#Afterwards, we downloaded the Kaggle dataset and unzipped it.
!kaggle datasets download -d crowdflower/twitter-airline-sentiment
#Read the dataset into a DataFrame called "data".
data = pd.read_csv('/work/Projekt/Tweets.csv')
1. Data preparation/EDA
data.head()
#The dataset has 15 columns and 14,640 rows:
data.shape
data.info()
data.describe()
#The next step is to check whether there are any NaN values. As we can see below, several columns contain NaN values.
data.isna().sum()
#Out of the 14,640 rows (see .shape above), negativereason_gold has 14,608 NaN values and tweet_coord has 13,621.
#The tweet_id column is just a unique identifier for each tweet, so we drop it and rely on the row index instead.
data = data.drop(["tweet_id"], axis = 1)
#We removed the columns with a very high number of NaNs.
data = data.drop(["airline_sentiment_gold"], axis = 1)
data = data.drop(["negativereason_gold"], axis = 1)
data = data.drop(["tweet_coord"], axis = 1)
#Furthermore, we removed the columns that we did not use.
data = data.drop(["airline_sentiment_confidence"], axis = 1)
data = data.drop(["negativereason_confidence"], axis = 1)
data = data.drop(["tweet_location"], axis = 1)
#We can see that there are numerous tweets from the same username:
data["name"].value_counts(sort=True)
#This shows that the most active username appears at most 63 times. Based on this, we judge that it is not necessary to remove duplicates.
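As a quick check (a minimal sketch), we can confirm the maximum number of tweets per user and look for exact duplicate tweets:
#Maximum number of tweets from a single username
print(data['name'].value_counts().max())
#Number of exactly duplicated tweet texts
print(data.duplicated(subset='text').sum())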
#To get an overview of the labels, we made a count plot. It shows a clear tendency for tweets about the airlines to be negative.
data['airline_sentiment'].unique()
ax = sns.countplot(x="airline_sentiment",data=data, color= "black")
#Here we plot the number of tweets per airline, which visualises how often each airline is mentioned.
plt.figure(figsize=(8,8))
ax = sns.countplot(x="airline", data=data, color = "black")
#To combine the two figures above, a grouped bar plot is made below to compare the volume of each sentiment across the airlines.
data.groupby('airline').airline_sentiment.value_counts().unstack().plot.bar(figsize=(12, 10))
plt.xlabel("Airlines")
plt.xticks(rotation=0)
2. Natural Language Processing
2.1 Preprocessing tweets
#import spacy
import spacy
#Download the English module for spaCy
!python -m spacy download en_core_web_sm
#instantiating English module
nlp = spacy.load("en_core_web_sm")
#Initiating an empty list:
clean_text = []
for text in nlp.pipe(data['text'], disable=["tagger", "parser", "ner"]): #Disable parts of the pipeline to make it faster
    txt = [token.lemma_.lower() for token in text
           if token.is_alpha        #keeping only alphabetic tokens
           and not token.is_stop    #removing stopwords
           and not token.is_punct]  #removing punctuation
    #Appending the above to 'clean_text'
    clean_text.append(txt)
data['clean_text'] = clean_text
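A quick look at the result (a small sanity check, not part of the pipeline itself):
#Compare the raw tweets with their cleaned token lists
data[['text', 'clean_text']].head(3)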
2.2 Simple frequency-based analysis
#For text preprocessing
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
#Import numpy for matrix operation
import numpy as np
#Importing packages
import itertools
from collections import Counter
data_bow =pd.DataFrame(data[['airline_sentiment', "clean_text"]])
most_common=Counter(itertools.chain(*data_bow.clean_text)).most_common(10)
most_common
!pip install sacremoses
nltk.download('perluniprops')
from sacremoses import MosesDetokenizer
detokenizer = MosesDetokenizer()
data['clean_detoken']=data['clean_text'].apply(lambda x: detokenizer.detokenize(x, return_str=True)) #Detokenize each list of tokens back into a single string
!pip install wordcloud
from wordcloud import WordCloud,STOPWORDS
new_df=data[data['airline_sentiment']=='negative']
words = ' '.join(new_df['clean_detoken'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
new_df=data[data['airline_sentiment']=='positive']
words = ' '.join(new_df['clean_detoken'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
new_df=data[data['airline_sentiment']=='neutral']
words = ' '.join(new_df['clean_detoken'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
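The word-cloud steps above are repeated verbatim for each sentiment; a small helper function (a sketch under the same assumptions as the code above) would avoid the duplication:
def plot_wordcloud(sentiment):
    #Join the detokenized tweets of one sentiment and drop URLs, mentions and RT markers
    words = ' '.join(data.loc[data['airline_sentiment'] == sentiment, 'clean_detoken'])
    cleaned = ' '.join(w for w in words.split()
                       if 'http' not in w and not w.startswith('@') and w != 'RT')
    wc = WordCloud(stopwords=STOPWORDS, background_color='black',
                   width=3000, height=2500).generate(cleaned)
    plt.figure(figsize=(10, 10))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

#Usage: plot_wordcloud('negative'), plot_wordcloud('positive'), plot_wordcloud('neutral')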
2.3 Topic Modeling (LDA)
#Importing the LDAMulticore model from Gensim, which is a fast version of LDA:
!pip install gensim
from gensim.models import LdaMulticore
#Import the dictionary builder. This maps each word in the text data to its
#unique integer ID, which is needed to work with Gensim.
from gensim.corpora.dictionary import Dictionary
#Create a Dictionary from the cleaned tweets: dictionary
dictionary = Dictionary(data['clean_text'])
#Construct corpus using this dictionary
#The corpus is the collection of all the tweets, based on the dictionary
corpus = [dictionary.doc2bow(doc) for doc in data['clean_text']]
# That's how the corpus looks
corpus[3][:10]
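Each entry of a corpus document is a (token id, count) pair; mapping the ids back to words makes it readable (a small illustrative check):
#Translate the first few (id, count) pairs of document 3 back into words
[(dictionary[token_id], count) for token_id, count in corpus[3][:10]]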
# Training the model
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=2, workers = 4, passes=10)
# Check out topics
lda_model.print_topics(-1)
# Which topics does the first tweet belong to?
lda_model[corpus][0]
data['clean_text'][0]
# let's first install this nice visualizer
!pip install pyLDAvis
import pyLDAvis.gensim_models
# Let's try to visualize
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# Let's Visualize
pyLDAvis.display(lda_display)
2.4 Embedding-model based vectorization (Word2Vec)
#Import Word2vec from gensim
from gensim.models import Word2Vec
#Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
#Use the model on the tokenized data
w2v_model = Word2Vec(sentences=data['clean_text'], vector_size=300, window=5, min_count=2, workers=2, epochs=5)
#Import itertools and counter
import itertools
from collections import Counter
#Create a dataframe with word tokens and sentiments
data_bow =pd.DataFrame(data[['airline_sentiment', "clean_text"]])
#find the most used words
most_common=Counter(itertools.chain(*data_bow.clean_text)).most_common(10)
print(most_common[:3])
#Look up some frequent words and the words closest to them
print('"flight" is close to:',w2v_model.wv.similar_by_word('flight')[:3])
print('"thanks" is close to:',w2v_model.wv.similar_by_word('thanks')[:3])
print('"cancelled" is close to:',w2v_model.wv.similar_by_word('cancelled')[:3])
print('"late" is close to:',w2v_model.wv.similar_by_word('late')[:3])
w2v_model.save('w2v_model')
from sklearn.feature_extraction.text import TfidfVectorizer
# function that does absolutely nothing...
# to be able to use TfidfVectorizer on already tokenized text
def dummy_fun(doc):
    return doc
# we turn off any preprocessing and align the vocabulary with the one
# used by our embeddings
# that will allow us to use TFIDF vectors to weight the embeddings
tfidf = TfidfVectorizer(vocabulary=w2v_model.wv.key_to_index.keys(),
                        tokenizer=dummy_fun,
                        preprocessor=dummy_fun,
                        token_pattern=None)
# create TFIDF matrix (we could also just use that one for search)
data_tfidf = tfidf.fit_transform(data['clean_text'])
# how many word-vectors do we have?
len(w2v_model.wv.key_to_index)
# each tfidf vector also has 5360 columns (one per vocabulary word) - because we provided a vocab
data_tfidf[:1,:]
# we can use np.dot or since Python 3 the @ for matrix-multiplication
# let's try
data_tfidf[:1,:] @ w2v_model.wv.vectors
# for the whole matrix
data_w2v_tfidf = data_tfidf @ w2v_model.wv.vectors
data_w2v_tfidf
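As a quick sanity check (a sketch): the result should contain one 300-dimensional vector per tweet, i.e. each tweet's TFIDF-weighted sum of word vectors.
#One row per tweet, one column per embedding dimension
print(data_w2v_tfidf.shape)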
2.5 Sentiment prediction (SML)
We would like to predict the outcome (airline_sentiment), i.e. whether a tweet is labelled "positive", "neutral" or "negative".
2.5.1 Logistic regression
#Import relevant packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
#Define y as the airline_sentiment column
y = data['airline_sentiment']
#Define X as the TFIDF-weighted Word2Vec document vectors
X = data_w2v_tfidf
#Split the dataset into a train and a test set, holding out 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
#Create a logistic regression model and fit it
model = LogisticRegression(multi_class="ovr")
model.fit(X_train, y_train)
#Check the score (mean accuracy) of the model.
model.score(X_test, y_test)
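For context, the accuracy above can be compared with a majority-class baseline (a rough sketch):
#Share of the most frequent sentiment in the test set (the accuracy of always predicting the majority class)
print(y_test.value_counts(normalize=True).max())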
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
#Defining true and predicted classes (reusing the fitted encoder so the mapping stays consistent), and making a crosstab:
true_class = labelencoder_y.transform(y_test)
y_pred = model.predict(X_test)
predicted_class = labelencoder_y.transform(y_pred)
df = pd.DataFrame({'true_class': true_class, 'predicted_class': predicted_class })
pd.crosstab(df.true_class, df.predicted_class)
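To read the crosstab, the encoded integers can be mapped back to the original labels (a small check, assuming the encoder was fitted on all three classes):
#Mapping from encoded class index to sentiment label
print(list(enumerate(labelencoder_y.classes_)))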
print(classification_report(y_test, y_pred))
2.5.2 XGBoost
!brew install libomp
!pip install xgboost
from xgboost import XGBClassifier
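Note: depending on the installed xgboost version, XGBClassifier may reject string class labels. If the fit below fails for that reason, the labels can be encoded first; a minimal sketch reusing the LabelEncoder imported earlier (le, y_train_enc and y_test_enc are hypothetical names):
#Only needed if this xgboost version requires numeric labels (assumption)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
#model_xgb.fit(X_train, y_train_enc) would then replace the fit call below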
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)
#Classification report for the XGBoost model
print(classification_report(y_test, y_pred))
3. Network Analysis
#Download dataset from kaggle
!kaggle datasets download -d stackoverflow/stack-overflow-tag-network
#Unzip the kaggle files
!unzip -o /work/Projekt/stack-overflow-tag-network.zip
3.1 Overall network
#install packages
!pip install networkx
!pip install -qq names
#Importing some packages:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import names
#Load the data into DataFrames
df_nodes = pd.read_csv('/work/Projekt/stack_network_nodes.csv')
df_edges = pd.read_csv('/work/Projekt/stack_network_links.csv')
#Creating an empty graph structure (a “null graph”) with no nodes and no edges.
G = nx.Graph(day="Stackoverflow")
# adding the nodes - the technology tags (e.g. python, r)
for index, row in df_nodes.iterrows():
    G.add_node(row['name'], group=row['group'], nodesize=row['nodesize'])
# adding the edges - links between tags that are used together
for index, row in df_edges.iterrows():
    G.add_weighted_edges_from([(row['source'], row['target'], row['value'])])
#Info about the nodes and edges (note: nx.info was removed in newer networkx releases; print(G) gives a similar summary there)
nx.info(G)
# Defining a function that draws the network:
def draw_graph(G, size):
    nodes = G.nodes()
    color_map = {1:'#f09494', 2:'#eebcbc', 3:'#72bbd0', 4:'#91f0a1', 5:'#629fff', 6:'#bcc2f2',
                 7:'#eebcbc', 8:'#f1f0c0', 9:'#d2ffe7', 10:'#caf3a6', 11:'#ffdf55', 12:'#ef77aa',
                 13:'#d6dcff', 14:'#d2f5f0'}
    node_color = [color_map[d['group']] for n, d in G.nodes(data=True)]
    node_size = [d['nodesize']*10 for n, d in G.nodes(data=True)]
    pos = nx.drawing.spring_layout(G, k=0.70, iterations=60)
    plt.figure(figsize=size)
    nx.draw_networkx(G, pos=pos, node_color=node_color, node_size=node_size, edge_color='#FFDEA2')
    plt.show()
#Drawing the network
draw_graph(G,size=(25,25))
3.2 Community Detection
#Importing the community_louvain package:
! pip3 install python-louvain
import community as community_louvain
#Plotting the communities in the network:
partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, partition, 'partition')
nx.draw_kamada_kawai(G, with_labels = True, node_color=list(partition.values()))
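A quick look at the partition itself (a small check):
#Number of communities found by the Louvain algorithm
print(len(set(partition.values())))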
#Creating a new graph from the network, with radius = 2 (goes two nodes out from 'python')
python = nx.ego_graph(G, 'python', radius=2)
#Creating a new dataframe
python_df = pd.DataFrame.from_dict(dict(python.nodes(data=True)), orient='index')
#plotting the network, with radius=2 from Python:
nx.draw_kamada_kawai(python, with_labels = True, node_color=python_df.partition)
3.3 Global Network
#Network level characteristics
print('Density: ')
print(nx.density(G))
print()
print('Transitivity: ')
print(nx.transitivity(G))
print()
#Note: reciprocity is only defined for directed graphs, so it is not meaningful for this undirected network
#print('Reciprocity: ')
#print(nx.reciprocity(G))
#Create dictionaries with degree, betweenness and eigenvector centrality
centrality_dgr = nx.degree_centrality(G)
centrality_between = nx.betweenness_centrality(G)
centrality_eigen = nx.eigenvector_centrality_numpy(G)
#Sort the three dictionaries of degree, betweenness and eigenvector centrality
cen_dgr_sor = sorted(centrality_dgr.items(), key=lambda item: item[1],reverse=True)
cen_eig_sor = sorted(centrality_eigen.items(), key=lambda item: item[1],reverse=True)
cen_bet_sor = sorted(centrality_between.items(), key=lambda item: item[1],reverse=True)
#Create a DataFrame for each and merge them
centrality = pd.DataFrame(cen_dgr_sor, columns=['Name', 'Centrality Degree'])
df_cen_eig = pd.DataFrame(cen_eig_sor, columns=['Name', 'Centrality Eigen'])
df_cen_bet = pd.DataFrame(cen_bet_sor, columns=['Name', 'Centrality Between'])
centrality = centrality.merge(df_cen_eig, how='left', left_on='Name', right_on='Name')
centrality = centrality.merge(df_cen_bet, how='left', left_on='Name', right_on='Name')
#Create a new value to find the nodes that in general have the highest centralities
centrality['Sum'] = centrality['Centrality Degree']+ centrality['Centrality Eigen']+ centrality['Centrality Between']
#Sort and show the 10 nodes with the highest centrality
centrality.sort_values(by='Sum', ascending=False)[:10]
jquery is the most central node in the network