M2 Project
Intro
https://www.kaggle.com/crowdflower/twitter-airline-sentiment
#install kaggle package to import Kaggle API
!pip install -qq kaggle
#make folder for api key
!mkdir ~/.kaggle
#copy the api key to the kaggle folder
!cp kaggle.json ~/.kaggle
#set permissions for the key
!chmod 600 ~/.kaggle/kaggle.json
#importing relevant packages
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
#to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')
#instantiation English module
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
#Afterwards we downloaded the Kaggle dataset and unzipped it.
!kaggle datasets download -d crowdflower/twitter-airline-sentiment
twitter-airline-sentiment.zip: Skipping, found more recently modified local copy (use --force to force download)
#Read the dataset and name it. The dataset will be called "data".
data = pd.read_csv('/work/Projekt/Tweets.csv')
1. Data preparation/EDA
data.head()
#The dataset has 15 columns and 14,640 rows:
data.shape
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tweet_id 14640 non-null int64
1 airline_sentiment 14640 non-null object
2 airline_sentiment_confidence 14640 non-null float64
3 negativereason 9178 non-null object
4 negativereason_confidence 10522 non-null float64
5 airline 14640 non-null object
6 airline_sentiment_gold 40 non-null object
7 name 14640 non-null object
8 negativereason_gold 32 non-null object
9 retweet_count 14640 non-null int64
10 text 14640 non-null object
11 tweet_coord 1019 non-null object
12 tweet_created 14640 non-null object
13 tweet_location 9907 non-null object
14 user_timezone 9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB
data.describe()
#The next step is to check whether there are any NaN values. As we can see below, several columns contain NaN values.
data.isna().sum()
#There are 14640 rows (as we can see from .shape), and 14608 of them are NaN in negativereason_gold.
#Likewise, 13621 of the 14640 rows are NaN in tweet_coord.
# Remove every column that is not needed for the analysis in a single call:
#  - tweet_id is only a unique identifier; the row index serves the same purpose.
#  - airline_sentiment_gold, negativereason_gold and tweet_coord are almost entirely NaN.
#  - the two confidence columns and tweet_location are not used later on.
columns_to_drop = [
    "tweet_id",
    "airline_sentiment_gold",
    "negativereason_gold",
    "tweet_coord",
    "airline_sentiment_confidence",
    "negativereason_confidence",
    "tweet_location",
]
data = data.drop(columns=columns_to_drop)
#We can see in the .describe() that there are numerous tweets from the same username.
data["name"].value_counts(sort=True)
#This shows that the most frequent user name appears 63 times. Based on this we judge that it is not necessary to remove duplicates.
#To get a better overview of the labels we made a count plot. It shows a tendency for tweets about airline companies to be negative.
data['airline_sentiment'].unique()
ax = sns.countplot(x="airline_sentiment",data=data, color= "black")
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Here we have plotted the airlines, which visualises the popularity of the airlines.
plt.figure(figsize=(8,8))
ax = sns.countplot(x="airline", data=data, color = "black")
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#To combine the two figures above, a grouped bar plot is made below to show the volume of each sentiment per airline, so they can be compared with each other
data.groupby('airline').airline_sentiment.value_counts().unstack().plot.bar(figsize=(12, 10))
plt.xlabel("Airlines")
plt.xticks(rotation=0)
2. Natural Language Processing
2.1 Preprocessing tweets
#import spacy
import spacy
#Download the English model for spaCy
!python -m spacy download en_core_web_sm
#instantiating English module
nlp = spacy.load("en_core_web_sm")
# Turn every tweet into a list of lower-cased lemmas.
# Only the parser and the named-entity recogniser are switched off for speed.
# The tagger stays enabled because spaCy's lemmatizer relies on its POS tags;
# with the tagger disabled, tokens such as 'thanks' or 'cancelled' come through
# unlemmatized.
clean_text = []
for doc in nlp.pipe(data['text'], disable=["parser", "ner"]):
    tokens = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha          # keep purely alphabetic tokens only
        and not token.is_stop      # drop stop words
        and not token.is_punct     # drop punctuation
    ]
    # One token list per tweet, in the same order as the rows of `data`
    clean_text.append(tokens)
data['clean_text'] = clean_text
2.2. Simple frequency-based analysis
#For text preprocessing
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
#Import numpy for matrix operation
import numpy as np
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
#Importing packages
import itertools
from collections import Counter
# Keep only the label and the token lists, then count every token over all tweets
data_bow = pd.DataFrame(data[['airline_sentiment', "clean_text"]])
token_counts = Counter(itertools.chain.from_iterable(data_bow.clean_text))
# The ten most frequent tokens as (token, count) pairs
most_common = token_counts.most_common(10)
most_common
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
!pip install sacremoses
nltk.download('perluniprops')
from sacremoses import MosesDetokenizer
detokenizer = MosesDetokenizer()
data['clean_detoken']=data['clean_text'].apply(lambda x: detokenizer.detokenize(x, return_str=True))# Using apply(str) method to lambda function
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data] Package perluniprops is already up-to-date!
!pip install wordcloud
from wordcloud import WordCloud,STOPWORDS
def _wordcloud_text(frame):
    """Join the detokenized tweets of *frame* into one string.

    Words containing 'http', words starting with '@' and the word 'RT' are
    left out, exactly as in the original per-sentiment cells.
    """
    words = ' '.join(frame['clean_detoken'])
    return " ".join(
        word for word in words.split()
        if 'http' not in word
        and not word.startswith('@')
        and word != 'RT'
    )


def _show_wordcloud(text):
    """Render *text* as a word cloud, display it and return the WordCloud object."""
    cloud = WordCloud(stopwords=STOPWORDS,
                      background_color='black',
                      width=3000,
                      height=2500
                      ).generate(text)
    plt.figure(1, figsize=(10, 10))
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
    return cloud


# One word cloud per sentiment class, in the same order as the original cells.
# The same global names are assigned as before, so after the loop they hold the
# values for the last class ('neutral').
for sentiment in ('negative', 'positive', 'neutral'):
    new_df = data[data['airline_sentiment'] == sentiment]
    cleaned_word = _wordcloud_text(new_df)
    wordcloud = _show_wordcloud(cleaned_word)
2.3. Topic Modeling (LDA)
#Importing the LDAMulticore model from Gensim, which is a fast version of LDA:
!pip install gensim
from gensim.models import LdaMulticore
#Import the dictionary builder. This maps the text-data (each word) into its-
#Unique integer ID. This is needed to work with Gensim.
from gensim.corpora.dictionary import Dictionary
#Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data['clean_text'])
#Construct corpus using this dictionary
#The corpus is the collection of all the tweets, based on the dictionary
corpus = [dictionary.doc2bow(doc) for doc in data['clean_text']]
# That's how the corpus looks
corpus[3][:10]
# Training the model
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=2, workers = 4, passes=10)
# Check out topics
lda_model.print_topics(-1)
# Where does a text belong to?
lda_model[corpus][0]
data['clean_text'][1]
# let's first install this nice visualizer
!pip install pyLDAvis
import pyLDAvis.gensim_models
# Let's try to visualize
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# Let's Visualize
pyLDAvis.display(lda_display)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
2.4 Embedding-model based vectorization (Word2Vec)
#Import Word2vec from gensim
from gensim.models import Word2Vec
#Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Train the Word2Vec model on the tokenized data
w2v_model = Word2Vec(sentences=data['clean_text'], vector_size=300, window=5, min_count=2, workers=2, epochs=5)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Import itertools and counter
import itertools
from collections import Counter
#Create a dataframe with word tokens and sentiments
data_bow =pd.DataFrame(data[['airline_sentiment', "clean_text"]])
#find the most used words
most_common=Counter(itertools.chain(*data_bow.clean_text)).most_common(10)
print(most_common[:3])
[('flight', 3923), ('thanks', 1078), ('cancelled', 1065)]
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Look up the three most common words and 'late', and show the words closest to each
print('"flight" is close to:',w2v_model.wv.similar_by_word('flight')[:3])
print('"thanks" is close to:',w2v_model.wv.similar_by_word('thanks')[:3])
print('"cancelled" is close to:',w2v_model.wv.similar_by_word('cancelled')[:3])
print('"late" is close to:',w2v_model.wv.similar_by_word('late')[:3])
"flight" is close to: [('tomorrow', 0.9902928471565247), ('flighted', 0.9864764213562012), ('flighting', 0.9857304692268372)]
"thanks" is close to: [('thank', 0.9998644590377808), ('appreciate', 0.9998199939727783), ('twitter', 0.9997870326042175)]
"cancelled" is close to: [('flight', 0.9740378260612488), ('flighted', 0.9602243900299072), ('tomorrow', 0.9554851651191711)]
"late" is close to: [('delayed', 0.9959874749183655), ('hours', 0.9955582022666931), ('reschedule', 0.9938867688179016)]
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
w2v_model.save('w2v_model')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
from sklearn.feature_extraction.text import TfidfVectorizer
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# Identity function, needed so that TfidfVectorizer can be used on text that
# is already tokenized.
def dummy_fun(doc):
    """Return *doc* unchanged.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so the
    token lists are used as they are.
    """
    return doc
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# we turn off any preprocessing and align vocabulary with the one
# used by our embeddings
# that will allow us to use TFIDF vectors to weight the embeddings
tfidf = TfidfVectorizer(vocabulary=w2v_model.wv.key_to_index.keys(),
tokenizer=dummy_fun,
preprocessor=dummy_fun,
token_pattern=None)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# create TFIDF matrix (we could also just use that one for search)
data_tfidf = tfidf.fit_transform(data['clean_text'])
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# how many word-vectors do we have?
len(w2v_model.wv.key_to_index)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# one tfidf vector has also 5360 columns - because we provided a vocab
data_tfidf[:1,:]
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# we can use np.dot or since Python 3 the @ for matrix-multiplication
# let's try
data_tfidf[:1,:] @ w2v_model.wv.vectors
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# for the whole matrix
data_w2v_tfidf = data_tfidf @ w2v_model.wv.vectors
data_w2v_tfidf
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
2.5 Sentiment prediction (SML)
We would like to predict the outcome (airline_sentiment), i.e. whether each tweet will be labelled "positive", "neutral" or "negative".
2.5.1 Logistic regression
#Import relevant packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Define y as the coloumn Airline Sentiment
y = data['airline_sentiment']
#Define X as the TF-IDF-weighted Word2Vec document vectors
X= data_w2v_tfidf
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Split the dataset into a train set and a test set, with 20% of the rows held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Create a model, that uses the logisticregression and then fit it
model = LogisticRegression(multi_class="ovr")
model.fit(X_train, y_train)
#Check the score of the model.
model.score(X_test, y_test)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Defining true- and predicted classes, and making a crosstab:
y_pred = model.predict(X_test)
# Fit the encoder ONCE on every label that occurs in either vector. Calling
# fit_transform separately on y_test and y_pred would give the two vectors
# different integer codes whenever a class is never predicted, which would
# silently scramble the crosstab.
labelencoder_y.fit(np.union1d(y_test, y_pred))
true_class = labelencoder_y.transform(y_test)
predicted_class = labelencoder_y.transform(y_pred)
# Rows are the true classes, columns the predicted classes (shared integer codes)
df = pd.DataFrame({'true_class': true_class, 'predicted_class': predicted_class})
pd.crosstab(df.true_class, df.predicted_class)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
print(classification_report(y_test, y_pred))
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
precision recall f1-score support
negative 0.70 0.96 0.81 1889
neutral 0.47 0.22 0.30 580
positive 0.90 0.12 0.22 459
accuracy 0.68 2928
macro avg 0.69 0.43 0.44 2928
weighted avg 0.68 0.68 0.61 2928
2.5.2 XGBoost
!brew install libomp
!pip install xgboost
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
[12:13:25] WARNING: ../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
#XGBoost
print(classification_report(y_test, y_pred))
precision recall f1-score support
negative 0.76 0.90 0.83 1889
neutral 0.55 0.39 0.46 580
positive 0.65 0.40 0.50 459
accuracy 0.72 2928
macro avg 0.65 0.57 0.59 2928
weighted avg 0.70 0.72 0.70 2928
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
3. Network Analysis
#Download dataset from kaggle
!kaggle datasets download -d stackoverflow/stack-overflow-tag-network
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Unzip the kaggle files
!unzip -y /work/Projekt/stack-overflow-tag-network.zip
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
3.1 Overall network
#install packages
!pip install networkx
!pip install -qq names
#Importing some packages:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import names
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Load the data into DataFrames
df_nodes = pd.read_csv('/work/Projekt/stack_network_nodes.csv')
df_edges = pd.read_csv('/work/Projekt/stack_network_links.csv')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# Create an empty undirected graph; the keyword argument is stored as a
# graph-level attribute (key "day", kept as in the original).
G = nx.Graph(day="Stackoverflow")
# Add one node per row of df_nodes, carrying its group and node size as attributes.
for name, group, nodesize in zip(df_nodes['name'], df_nodes['group'], df_nodes['nodesize']):
    G.add_node(name, group=group, nodesize=nodesize)
# Add one weighted edge per row of df_edges; the 'value' column becomes the edge weight.
G.add_weighted_edges_from(
    zip(df_edges['source'], df_edges['target'], df_edges['value'])
)
# Summary of nodes and edges. nx.info() was removed in NetworkX 3.0, so the
# counts are reported directly.
f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges"
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# Defining the network:
def draw_graph(G, size):
    """Draw *G* with a spring layout and show the figure.

    Nodes are coloured by their ``group`` attribute and sized by ten times
    their ``nodesize`` attribute.

    Args:
        G: NetworkX graph whose nodes carry ``group`` and ``nodesize`` attributes.
        size: Figure size, passed to ``plt.figure(figsize=...)``.

    Raises:
        KeyError: If a node's ``group`` is not a key of the colour map (1-14)
            or a node lacks one of the two attributes.
    """
    # NOTE(review): groups 2 and 7 share the same colour ('#eebcbc') - confirm
    # whether that is intended.
    color_map = {1:'#f09494', 2:'#eebcbc', 3:'#72bbd0', 4:'#91f0a1', 5:'#629fff', 6:'#bcc2f2',
                 7:'#eebcbc', 8:'#f1f0c0', 9:'#d2ffe7', 10:'#caf3a6', 11:'#ffdf55', 12:'#ef77aa',
                 13:'#d6dcff', 14:'#d2f5f0'}
    # Read the attribute dicts once, in the graph's node order, so colours and
    # sizes line up with the nodes that are drawn.
    node_attrs = [attrs for _, attrs in G.nodes(data=True)]
    node_color = [color_map[attrs['group']] for attrs in node_attrs]
    node_size = [attrs['nodesize']*10 for attrs in node_attrs]
    pos = nx.drawing.spring_layout(G, k=0.70, iterations=60)
    plt.figure(figsize=size)
    nx.draw_networkx(G, pos=pos, node_color=node_color, node_size=node_size, edge_color='#FFDEA2')
    plt.show()
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Drawing the network
draw_graph(G,size=(25,25))
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
3.2 Community Detection
#Importing the community_louvain package:
! pip3 install python-louvain
import community as community_louvain
#Plotting the communities in the network:
# best_partition returns a dict mapping each node to its community id
partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, partition, 'partition')
# Look the community up per node, in G's own node order, instead of relying on
# the dict's value order happening to match the order in which nodes are drawn.
community_colors = [partition[node] for node in G.nodes()]
nx.draw_kamada_kawai(G, with_labels=True, node_color=community_colors)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
#Creating a new graph from the network, with radius = 2 (goes two nodes out from 'python')
python = nx.ego_graph(G, 'python', radius=2)
#Creating a new dataframe
python_df = pd.DataFrame.from_dict(dict(python.nodes(data=True)), orient='index')
#plotting the network, with radius=2 from Python:
nx.draw_kamada_kawai(python, with_labels = True, node_color=python_df.partition)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
- 3.3 Global Network
#Network level characteristics
print('Density: ')
print(nx.density(G))
print()
print('Transitivity: ')
print(nx.transitivity(G))
print()
print('Reciprocity: ')
# Reciprocity is the share of directed edges that are returned in the opposite
# direction. For an undirected graph the computation always yields 0.0, which
# says nothing about the network, so it is only reported for directed graphs.
if G.is_directed():
    print(nx.reciprocity(G))
else:
    print('not defined for an undirected graph')
Density:
0.03737604881769641
Transitivity:
0.48709239130434784
Reciprocity:
0.0
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
# Compute degree, betweenness and eigenvector centrality for every node
centrality_dgr = nx.degree_centrality(G)
centrality_between = nx.betweenness_centrality(G)
centrality_eigen = nx.eigenvector_centrality_numpy(G)


def _by_score_desc(scores):
    """Return the (node, score) pairs of *scores*, highest score first."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


# Rank each measure from most to least central
cen_dgr_sor = _by_score_desc(centrality_dgr)
cen_eig_sor = _by_score_desc(centrality_eigen)
cen_bet_sor = _by_score_desc(centrality_between)

# One frame per measure, joined on the node name (degree ranking is the left side)
centrality = pd.DataFrame(cen_dgr_sor, columns=['Name', 'Centrality Degree'])
df_cen_eig = pd.DataFrame(cen_eig_sor, columns=['Name', 'Centrality Eigen'])
df_cen_bet = pd.DataFrame(cen_bet_sor, columns=['Name', 'Centrality Between'])
centrality = (
    centrality
    .merge(df_cen_eig, how='left', on='Name')
    .merge(df_cen_bet, how='left', on='Name')
)

# Combined score: the plain sum of the three centralities per node
centrality['Sum'] = (
    centrality['Centrality Degree']
    + centrality['Centrality Eigen']
    + centrality['Centrality Between']
)

# Show the ten nodes with the highest combined score
centrality.sort_values(by='Sum', ascending=False).head(10)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
and should_run_async(code)
jquery is the most central node in the network