M2 Project
Intro
https://www.kaggle.com/crowdflower/twitter-airline-sentiment
#install kaggle package to import Kaggle API
!pip install -qq kaggle
#make folder for api key
!mkdir ~/.kaggle
#copy the api key to the kaggle folder
!cp kaggle.json ~/.kaggle
#set permissions for the key
!chmod 600 ~/.kaggle/kaggle.json
#importing relevant packages
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
#to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')
#Instantiating the English module
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
#Afterwards, we downloaded the Kaggle dataset and unzipped it.
!kaggle datasets download -d crowdflower/twitter-airline-sentiment
#Read the dataset into a DataFrame called "data".
data = pd.read_csv('/work/Projekt/Tweets.csv')
1. Data preparation/EDA
data.head()
#The dataset has 15 columns and 14,640 rows:
data.shape
data.info()
data.describe()
#The next step is to check whether there are any NaN values. As we can see below, several columns contain NaN values.
data.isna().sum()
#Out of the 14,640 rows (see .shape above), negativereason_gold has 14,608 NaN values and tweet_coord has 13,621.
#The tweet_id column is just a unique identifier for each tweet, so we drop it and rely on the row index instead.
data = data.drop(["tweet_id"], axis = 1)
#We removed the columns with a very high number of NaNs.
data = data.drop(["airline_sentiment_gold"], axis = 1)
data = data.drop(["negativereason_gold"], axis = 1)
data = data.drop(["tweet_coord"], axis = 1)
#Furthermore, we removed the columns that we did not use.
data = data.drop(["airline_sentiment_confidence"], axis = 1)
data = data.drop(["negativereason_confidence"], axis = 1)
data = data.drop(["tweet_location"], axis = 1)
#We can see that there are numerous tweets from the same username:
data["name"].value_counts(sort=True)
#This shows that the most active username appears at most 63 times. Based on this, we judge that it is not necessary to remove duplicates.
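As a quick check (a minimal sketch), we can confirm the maximum number of tweets per user and look for exact duplicate tweets:
#Maximum number of tweets from a single username
print(data['name'].value_counts().max())
#Number of exactly duplicated tweet texts
print(data.duplicated(subset='text').sum())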
#To get an overview of the labels, we made a count plot. It shows a clear tendency for tweets about the airlines to be negative.
data['airline_sentiment'].unique()
ax = sns.countplot(x="airline_sentiment",data=data, color= "black")
#Here we plot the number of tweets per airline, which visualises how often each airline is mentioned.
plt.figure(figsize=(8,8))
ax = sns.countplot(x="airline", data=data, color = "black")
#To combine the two figures above, a grouped bar plot is made below to compare the volume of each sentiment across the airlines.
data.groupby('airline').airline_sentiment.value_counts().unstack().plot.bar(figsize=(12, 10))
plt.xlabel("Airlines")
plt.xticks(rotation=0)
2. Natural Language Processing
2.1 Preprocessing tweets
#import spacy
import spacy
#Download the English module for spaCy
!python -m spacy download en_core_web_sm
#instantiating English module
nlp = spacy.load("en_core_web_sm")
#Initiating an empty list:
clean_text = []
for text in nlp.pipe(data['text'], disable=["tagger", "parser", "ner"]): #Disable parts of the pipeline to make it faster
    txt = [token.lemma_.lower() for token in text
           if token.is_alpha        #keeping only alphabetic tokens
           and not token.is_stop    #removing stopwords
           and not token.is_punct]  #removing punctuation
    #Appending the above to 'clean_text'
    clean_text.append(txt)
data['clean_text'] = clean_text
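A quick look at the result (a small sanity check, not part of the pipeline itself):
#Compare the raw tweets with their cleaned token lists
data[['text', 'clean_text']].head(3)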
2.2 Simple frequency-based analysis
#For text preprocessing
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
#Import numpy for matrix operation
import numpy as np
#Importing packages
import itertools
from collections import Counter
data_bow =pd.DataFrame(data[['airline_sentiment', "clean_text"]])
most_common=Counter(itertools.chain(*data_bow.clean_text)).most_common(10)
most_common
!pip install sacremoses
nltk.download('perluniprops')
from sacremoses import MosesDetokenizer
detokenizer = MosesDetokenizer()
data['clean_detoken']=data['clean_text'].apply(lambda x: detokenizer.detokenize(x, return_str=True)) #Detokenize each list of tokens back into a single string
!pip install wordcloud
from wordcloud import WordCloud,STOPWORDS
new_df=data[data['airline_sentiment']=='negative']
words = ' '.join(new_df['clean_detoken'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
new_df=data[data['airline_sentiment']=='positive']
words = ' '.join(new_df['clean_detoken'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
new_df=data[data['airline_sentiment']=='neutral']
words = ' '.join(new_df['clean_detoken'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
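The word-cloud steps above are repeated verbatim for each sentiment; a small helper function (a sketch under the same assumptions as the code above) would avoid the duplication:
def plot_wordcloud(sentiment):
    #Join the detokenized tweets of one sentiment and drop URLs, mentions and RT markers
    words = ' '.join(data.loc[data['airline_sentiment'] == sentiment, 'clean_detoken'])
    cleaned = ' '.join(w for w in words.split()
                       if 'http' not in w and not w.startswith('@') and w != 'RT')
    wc = WordCloud(stopwords=STOPWORDS, background_color='black',
                   width=3000, height=2500).generate(cleaned)
    plt.figure(figsize=(10, 10))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

#Usage: plot_wordcloud('negative'), plot_wordcloud('positive'), plot_wordcloud('neutral')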
2.3 Topic Modeling (LDA)
#Importing the LDAMulticore model from Gensim, which is a fast version of LDA:
!pip install gensim
from gensim.models import LdaMulticore
#Import the dictionary builder. This maps each word in the text data to its
#unique integer ID, which is needed to work with Gensim.
from gensim.corpora.dictionary import Dictionary
#Create a Dictionary from the cleaned tweets: dictionary
dictionary = Dictionary(data['clean_text'])
#Construct corpus using this dictionary
#The corpus is the collection of all the tweets, based on the dictionary
corpus = [dictionary.doc2bow(doc) for doc in data['clean_text']]
# That's how the corpus looks
corpus[3][:10]
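Each entry of a corpus document is a (token id, count) pair; mapping the ids back to words makes it readable (a small illustrative check):
#Translate the first few (id, count) pairs of document 3 back into words
[(dictionary[token_id], count) for token_id, count in corpus[3][:10]]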
# Training the model
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=2, workers = 4, passes=10)
# Check out topics
lda_model.print_topics(-1)
# Which topics does the first tweet belong to?
lda_model[corpus][0]
data['clean_text'][0]
# let's first install this nice visualizer
!pip install pyLDAvis
import pyLDAvis.gensim_models
# Let's try to visualize
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# Let's Visualize
pyLDAvis.display(lda_display)
2.4 Embedding-model based vectorization (Word2Vec)
#Import Word2vec from gensim
from gensim.models import Word2Vec
#Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
#Use the model on the tokenized data
w2v_model = Word2Vec(sentences=data['clean_text'], vector_size=300, window=5, min_count=2, workers=2, epochs=5)
#Import itertools and counter
import itertools
from collections import Counter
#Create a dataframe with word tokens and sentiments
data_bow =pd.DataFrame(data[['airline_sentiment', "clean_text"]])
#find the most used words
most_common=Counter(itertools.chain(*data_bow.clean_text)).most_common(10)
print(most_common[:3])
#Look up some frequent words and the words closest to them
print('"flight" is close to:',w2v_model.wv.similar_by_word('flight')[:3])
print('"thanks" is close to:',w2v_model.wv.similar_by_word('thanks')[:3])
print('"cancelled" is close to:',w2v_model.wv.similar_by_word('cancelled')[:3])
print('"late" is close to:',w2v_model.wv.similar_by_word('late')[:3])
w2v_model.save('w2v_model')
from sklearn.feature_extraction.text import TfidfVectorizer
# function that does absolutely nothing...
# to be able to use TfidfVectorizer on already tokenized text
def dummy_fun(doc):
    return doc
# we turn off any preprocessing and align the vocabulary with the one
# used by our embeddings
# that will allow us to use TFIDF vectors to weight the embeddings
tfidf = TfidfVectorizer(vocabulary=w2v_model.wv.key_to_index.keys(),
                        tokenizer=dummy_fun,
                        preprocessor=dummy_fun,
                        token_pattern=None)
# create TFIDF matrix (we could also just use that one for search)
data_tfidf = tfidf.fit_transform(data['clean_text'])
# how many word-vectors do we have?
len(w2v_model.wv.key_to_index)
# each tfidf vector also has 5360 columns (one per vocabulary word) - because we provided a vocab
data_tfidf[:1,:]
# we can use np.dot or since Python 3 the @ for matrix-multiplication
# let's try
data_tfidf[:1,:] @ w2v_model.wv.vectors
# for the whole matrix
data_w2v_tfidf = data_tfidf @ w2v_model.wv.vectors
data_w2v_tfidf
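As a quick sanity check (a sketch): the result should contain one 300-dimensional vector per tweet, i.e. each tweet's TFIDF-weighted sum of word vectors.
#One row per tweet, one column per embedding dimension
print(data_w2v_tfidf.shape)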
2.5 Sentiment prediction (SML)
We would like to predict the outcome (airline_sentiment), i.e. whether a tweet is labelled "positive", "neutral" or "negative".
2.5.1 Logistic regression
#Import relevant packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
#Define y as the airline_sentiment column
y = data['airline_sentiment']
#Define X as the TFIDF-weighted Word2Vec document vectors
X = data_w2v_tfidf
#Split the dataset into a train and a test set, holding out 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
#Create a logistic regression model and fit it
model = LogisticRegression(multi_class="ovr")
model.fit(X_train, y_train)
#Check the score (mean accuracy) of the model.
model.score(X_test, y_test)
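For context, the accuracy above can be compared with a majority-class baseline (a rough sketch):
#Share of the most frequent sentiment in the test set (the accuracy of always predicting the majority class)
print(y_test.value_counts(normalize=True).max())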
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
#Defining true and predicted classes (reusing the fitted encoder so the mapping stays consistent), and making a crosstab:
true_class = labelencoder_y.transform(y_test)
y_pred = model.predict(X_test)
predicted_class = labelencoder_y.transform(y_pred)
df = pd.DataFrame({'true_class': true_class, 'predicted_class': predicted_class })
pd.crosstab(df.true_class, df.predicted_class)
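To read the crosstab, the encoded integers can be mapped back to the original labels (a small check, assuming the encoder was fitted on all three classes):
#Mapping from encoded class index to sentiment label
print(list(enumerate(labelencoder_y.classes_)))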
print(classification_report(y_test, y_pred))
2.5.2 XGBoost
!brew install libomp
!pip install xgboost
from xgboost import XGBClassifier
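Note: depending on the installed xgboost version, XGBClassifier may reject string class labels. If the fit below fails for that reason, the labels can be encoded first; a minimal sketch reusing the LabelEncoder imported earlier (le, y_train_enc and y_test_enc are hypothetical names):
#Only needed if this xgboost version requires numeric labels (assumption)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
#model_xgb.fit(X_train, y_train_enc) would then replace the fit call below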
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)
#Classification report for the XGBoost model
print(classification_report(y_test, y_pred))
3. Network Analysis
#Download dataset from kaggle
!kaggle datasets download -d stackoverflow/stack-overflow-tag-network
#Unzip the kaggle files
!unzip -o /work/Projekt/stack-overflow-tag-network.zip
3.1 Overall network
#install packages
!pip install networkx
!pip install -qq names
#Importing some packages:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import names
#Load the data into DataFrames
df_nodes = pd.read_csv('/work/Projekt/stack_network_nodes.csv')
df_edges = pd.read_csv('/work/Projekt/stack_network_links.csv')
#Creating an empty graph structure (a “null graph”) with no nodes and no edges.
G = nx.Graph(day="Stackoverflow")
# adding the nodes - the technology tags (e.g. python, r)
for index, row in df_nodes.iterrows():
    G.add_node(row['name'], group=row['group'], nodesize=row['nodesize'])
# adding the edges - links between tags that are used together
for index, row in df_edges.iterrows():
    G.add_weighted_edges_from([(row['source'], row['target'], row['value'])])
#Info about the nodes and edges (note: nx.info was removed in newer networkx releases; print(G) gives a similar summary there)
nx.info(G)
# Defining a function that draws the network:
def draw_graph(G, size):
    nodes = G.nodes()
    color_map = {1:'#f09494', 2:'#eebcbc', 3:'#72bbd0', 4:'#91f0a1', 5:'#629fff', 6:'#bcc2f2',
                 7:'#eebcbc', 8:'#f1f0c0', 9:'#d2ffe7', 10:'#caf3a6', 11:'#ffdf55', 12:'#ef77aa',
                 13:'#d6dcff', 14:'#d2f5f0'}
    node_color = [color_map[d['group']] for n, d in G.nodes(data=True)]
    node_size = [d['nodesize']*10 for n, d in G.nodes(data=True)]
    pos = nx.drawing.spring_layout(G, k=0.70, iterations=60)
    plt.figure(figsize=size)
    nx.draw_networkx(G, pos=pos, node_color=node_color, node_size=node_size, edge_color='#FFDEA2')
    plt.show()
#Drawing the network
draw_graph(G,size=(25,25))
3.2 Community Detection
#Importing the community_louvain package:
! pip3 install python-louvain
import community as community_louvain
#Plotting the communities in the network:
partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, partition, 'partition')
nx.draw_kamada_kawai(G, with_labels = True, node_color=list(partition.values()))
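A quick look at the partition itself (a small check):
#Number of communities found by the Louvain algorithm
print(len(set(partition.values())))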
#Creating a new graph from the network, with radius = 2 (goes two nodes out from 'python')
python = nx.ego_graph(G, 'python', radius=2)
#Creating a new dataframe
python_df = pd.DataFrame.from_dict(dict(python.nodes(data=True)), orient='index')
#plotting the network, with radius=2 from Python:
nx.draw_kamada_kawai(python, with_labels = True, node_color=python_df.partition)
3.3 Global Network
#Network level characteristics
print('Density: ')
print(nx.density(G))
print()
print('Transitivity: ')
print(nx.transitivity(G))
print()
#Note: reciprocity is only defined for directed graphs, so it is not meaningful for this undirected network
#print('Reciprocity: ')
#print(nx.reciprocity(G))
#Create dictionaries with degree, betweenness and eigenvector centrality
centrality_dgr = nx.degree_centrality(G)
centrality_between = nx.betweenness_centrality(G)
centrality_eigen = nx.eigenvector_centrality_numpy(G)
#Sort the three dictionaries of degree, betweenness and eigenvector centrality
cen_dgr_sor = sorted(centrality_dgr.items(), key=lambda item: item[1],reverse=True)
cen_eig_sor = sorted(centrality_eigen.items(), key=lambda item: item[1],reverse=True)
cen_bet_sor = sorted(centrality_between.items(), key=lambda item: item[1],reverse=True)
#Create a DataFrame for each and merge them
centrality = pd.DataFrame(cen_dgr_sor, columns=['Name', 'Centrality Degree'])
df_cen_eig = pd.DataFrame(cen_eig_sor, columns=['Name', 'Centrality Eigen'])
df_cen_bet = pd.DataFrame(cen_bet_sor, columns=['Name', 'Centrality Between'])
centrality = centrality.merge(df_cen_eig, how='left', left_on='Name', right_on='Name')
centrality = centrality.merge(df_cen_bet, how='left', left_on='Name', right_on='Name')
#Create a new value to find the nodes that in general have the highest centralities
centrality['Sum'] = centrality['Centrality Degree']+ centrality['Centrality Eigen']+ centrality['Centrality Between']
#Sort and show the 10 nodes with the highest centrality
centrality.sort_values(by='Sum', ascending=False)[:10]
jquery is the most central node in the network