ASSIGNMENT 1 - NETWORK ANALYSIS
1. Create a network
# Basic packaging for network exploration
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import community as community_louvain
sns.set()
#import attributs
attributes = pd.read_csv('https://raw.githubusercontent.com/SDS-AAU/SDS-master/master/00_data/network_krackhard/Krack-High-Tec-Attributes.csv')
#lowering the heads
attributes.columns = [w.lower() for w in attributes.columns]
attributes = attributes.set_index('id').to_dict('index')
attributes
#import edges as df
edge_list = ['source', 'target', 'weight']
df_edge_advice =pd.read_csv("/work/Krack-High-Tec-edgelist-Advice.txt", sep='\s+', header=None, names=edge_list)
df_edge_friend =pd.read_csv("/work/Krack-High-Tec-edgelist-Friendship.txt", sep='\s+', header=None, names=edge_list)
df_edge_report =pd.read_csv("/work/Krack-High-Tec-edgelist-ReportsTo.txt", sep='\s+', header=None, names=edge_list)
#Remove ids without an edge
df_edge_advice = df_edge_advice[df_edge_advice.weight != 0]
df_edge_friend = df_edge_friend[df_edge_friend.weight != 0]
df_edge_report = df_edge_report[df_edge_report.weight != 0]
#Create networks for each of the 3 networks (Advice, Friendship, ReportsTo)
G_a = nx.from_pandas_edgelist(df_edge_advice, source='source', target='target', edge_attr='weight', create_using=nx.Graph())
G_f = nx.from_pandas_edgelist(df_edge_friend, source='source', target='target', edge_attr='weight', create_using=nx.Graph())
G_r = nx.from_pandas_edgelist(df_edge_report, source='source', target='target', edge_attr='weight', create_using=nx.Graph())
#Add the attributes to the nodes of each network
nx.set_node_attributes(G_a, attributes)
nx.set_node_attributes(G_f, attributes)
nx.set_node_attributes(G_r, attributes)
#test the number of notes for each network
len(G_a.nodes()), len(G_f.nodes()), len(G_r.nodes())
#test the number of edges for each network
len(G_a.edges), len(G_f.edges), len(G_r.edges)
2. Analysis
#create a overall network T of all the other networks
T = nx.compose_all([G_a, G_f, G_r])
#Print the density, transitivity and reciprocity of the overall network T
print('Density is',nx.density(T))
print('Transitivity is', nx.transitivity(T))
print('Recisporcity is', nx.reciprocity(T))
Are Relationships like friendship and advice giving usually reciprocal?
edge_advice = df_edge_advice.set_index('source').to_dict()
edge_friend = df_edge_friend.set_index('source').to_dict()
edge_report = df_edge_report.set_index('source').to_dict()
ties = ['edge_advice', 'edge_friend', 'edge_report']
for tie in ties:
T_sub = T.edge_subgraph([(u,v) for u,v,d in T.edges(data=True) if tie in d.keys()])
print('Tie Type:' + tie)
#find how many frinds of friends each person have, that are friend with the person
sum(nx.triangles(G_f).values())/3
#all the people have a total 102 friends, that are friends with the first persons too.
#find the mean of how many people each node is connected to
print('Advice-seeking:',pd.Series([*nx.degree_centrality(G_a).values()]).mean())
print('Friendships:', pd.Series([*nx.degree_centrality(G_f).values()]).mean())
#employees are more likely to have a advice-seeking relationship
B: Node level characteristics: Likewise, find out:
deg_cent_a = nx.degree_centrality(G_a)
deg_cent_f = nx.degree_centrality(G_f)
deg_cent_r = nx.degree_centrality(G_r)
#Most polular advice giver
max_dc_a = max(deg_cent_a.values())
max_dc_f = max(deg_cent_f.values())
max_dc_r = max(deg_cent_r.values())
# Find the user(s) that have collaborated the most: prolific_collaborators
prolific_collaborators_a = [n for n, dc in deg_cent_a.items() if dc == max_dc_a]
prolific_collaborators_f = [n for n, dc in deg_cent_f.items() if dc == max_dc_f]
prolific_collaborators_r = [n for n, dc in deg_cent_r.items() if dc == max_dc_r]
# Print the most prolific collaborator(s)
print('Most polular advice giver: id',prolific_collaborators_a)
print('Most polular friend giver: id',prolific_collaborators_f)
print('Most polular Reported to: id',prolific_collaborators_r)
#Create a sub graph of managers and non managers
managers = (
node
for node, data
in G_f.nodes(data=True)
if data.get("level") == 3)
G_f_m = G_f.subgraph(managers)
non_managers = (
node
for node, data
in G_f.nodes(data=True)
if data.get("level") != 3)
G_f_nm = G_f.subgraph(non_managers)
print('Non_mangaers:',pd.Series([*nx.degree_centrality(G_f_nm).values()]).mean())
print('Managers:',pd.Series([*nx.degree_centrality(G_f_m).values()]).mean())
print('Managers are more popular as friends')
C: Relational Characteristics: Answer the following questions:
#Create a sub graph of managers in advice givers
G_a_m = G_a.subgraph(managers)
print('Age')
print('Friendships:', nx.numeric_assortativity_coefficient(G_f, "age"))
print('Advice givers:', nx.numeric_assortativity_coefficient(G_a, "age"))
print('Managers of the same age are more likly to become friends')
print('')
print('Tenure')
print('Friendships:', nx.numeric_assortativity_coefficient(G_f, "tenure"))
print('Advice givers:', nx.numeric_assortativity_coefficient(G_a, "tenure"))
print('Managers with the same tenure are more likly to become advice givers')
print('')
print('Level')
print('Friendships:', nx.numeric_assortativity_coefficient(G_f, "level"))
print('Advice givers:', nx.numeric_assortativity_coefficient(G_a, "level"))
print('Managers on the same level are more likly to become friends')
print('')
print('Deptartment')
print('Friendships:', nx.numeric_assortativity_coefficient(G_f, "dept"))
print('Advice givers:', nx.numeric_assortativity_coefficient(G_a, "dept"))
print('Managers from the same department are more likly to become friends')
df_edge_friend['combined'] = df_edge_friend[edge_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
df_edge_advice['combined'] = df_edge_advice[edge_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
#make a coloum to compare
df_edge_friend['combined'] = df_edge_friend[edge_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
df_edge_advice['combined'] = df_edge_advice[edge_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
# make a new dataframe with a boolean that compare if friends also gives advice
compare = df_edge_friend['combined'].isin(df_edge_advice['combined'])
compare.value_counts()
# Friends are more likely to also give advice
3. Visualization
# Here we can calculate different centrality indicators as well as partition (community detection)
centrality_dgr = nx.degree_centrality(G_a)
centrality_eig = nx.eigenvector_centrality_numpy(G_a)
partition = community_louvain.best_partition(G_a) #that will take some time...
degree = G_a.degree()
# All these indicators can now be set as attribute of the Graph
nx.set_node_attributes(G_a, centrality_dgr, 'dgr')
nx.set_node_attributes(G_a, centrality_eig, 'eig')
nx.set_node_attributes(G_a, partition, 'partition')
nx.set_node_attributes(G_a, dict(degree), 'degree_basic')
nx.draw(G_a,pos=nx.circular_layout(G_a), with_labels=True, node_size=[v * 5000 for v in centrality_eig.values()])
nx.draw(G_f,pos=nx.circular_layout(G_f), with_labels=True, node_size=[v * 5000 for v in centrality_eig.values()])
nx.draw(G_r,pos=nx.circular_layout(G_r), with_labels=True, node_size=[v * 5000 for v in centrality_eig.values()])
nx.draw(T,pos=nx.circular_layout(T), with_labels = True, node_size=[v * 5000 for v in centrality_eig.values()])