复杂网络的统计特征

# 导入库 import networkx as nx import matplotlib.pyplot as plt

常见的度分布：泊松分布与幂律分布

泊松分布：以ER随机网络为例

# 创建一个ER随机网络 n = 10000 p = 0.001 ER = nx.erdos_renyi_graph(n, p)

# 获取单个节点的度:G.degree(i) ER.degree(1)

# 获取平均度 d = dict(nx.degree(ER)) print("平均度为：", sum(d.values())/len(ER.nodes))

# 获取所有可能的度值对应的概率 x = list(range(max(d.values())+1)) y = [i/n for i in nx.degree_histogram(ER)]

# 绘制度分布 plt.plot(x, y, 'ro-') plt.xlabel("$k$") plt.ylabel("$p_k$")

幂律分布：以BA无标度网络为例

m = 3 BA = nx.barabasi_albert_graph(n, m)

# 获取平均度 d = dict(nx.degree(BA)) print("平均度为：", sum(d.values())/len(BA.nodes))

# 获取所有可能的度值对应的概率 x = list(range(max(d.values())+1)) y = [i/n for i in nx.degree_histogram(BA)] # 绘制度分布 plt.plot(x, y, 'ro-') plt.xlabel("$k$") plt.ylabel("$p_k$")

# 在双对数坐标轴下显示 plt.plot(x, y, 'ro-') plt.xscale("log") plt.yscale("log") plt.xlabel("$k$") plt.ylabel("$p_k$")

# 在双对数坐标轴下要把横坐标和纵坐标的0值排除掉 new_x = [] new_y = [] for i in range(len(x)): if y[i] != 0: new_x.append(x[i]) new_y.append(y[i]) plt.plot(new_x, new_y, 'ro-') plt.xscale("log") plt.yscale("log") plt.xlabel("$k$") plt.ylabel("$p_k$")

有向网络的入度、出度和总度值

DG = nx.DiGraph() DG.add_nodes_from([1,2,3,4]) DG.add_edges_from([(1,2),(1,4),(2,3),(4,3),(4,1)]) As = nx.adjacency_matrix(DG) # 转化成二维数组形式的矩阵 A = As.todense() print(A)

nx.draw(DG, node_size=500, with_labels=True)

for i in DG.nodes(): # print(DG.in_degree(i)) # 入度 # print(DG.out_degree(i)) # 出度 print(DG.degree(i)) # 总度=入度+出度

网络的直径、效率和平均距离

G1 = nx.barabasi_albert_graph(1000, 3) print("网络的直径为：", nx.diameter(G1))

# 指定节点对i和j之间的效率：前提是这两个节点之家要有路径，即从i到j是可达的 print(nx.efficiency(G1, 1, 5)) print(nx.shortest_path_length(G1, 1, 5))

# 局部效率 print(nx.local_efficiency(G1))

# 全局效率：直接调用下列函数的前提是网络G1要是连通的 print(nx.global_efficiency(G1))

# 求整个网络的平均距离：直接调用下列函数的前提是网络G1要是连通的 print(nx.average_shortest_path_length(G1))

在大规模网络的计算下，networkx库的效率上会有劣势。

igraph是更加高效的库，在大规模网络的场景下，应选择igraph库。

import igraph as ig import time print(ig.__version__)

# 对比两个库计算网络直径 N, L = 5000, 25000 GNX = nx.gnm_random_graph(N, L) # 确保生成的网络是连通的 while not nx.is_connected(GNX): GNX = nx.gnm_random_graph(N, L) gig = ig.Graph.from_networkx(GNX) # # 比较networkx和igraph的效率 t1 = time.perf_counter() spl_nx = nx.average_shortest_path_length(GNX) t2 = time.perf_counter() print("networkx计算所需时间为：", t2-t1) t3 = time.perf_counter() spl_ig = gig.average_path_length() t4 = time.perf_counter() print("igraph计算所需时间为：", t4-t3)

# 维纳指数的计算 nx.wiener_index(G1)

无向图和有向图的距离矩阵

UG = nx.Graph([(1,2),(2,3),(2,6),(3,4),(3,5),(3,6),(4,5),(4,7),(5,6)]) print(dict(nx.shortest_path_length(UG))) nx.draw(UG, node_size=500, with_labels=True)

import numpy as np D = np.array([nx.shortest_path_length(UG, i, j) for i in range(1,8) for j in range(1,8)]).reshape(7,7) D

DG = nx.DiGraph([(1,2),(2,1),(2,3),(3,5),(4,3),(4,7),(5,4),(5,6),(6,2),(6,3),(6,5),(7,4)]) print(dict(nx.shortest_path_length(DG))) nx.draw(DG, node_size=500, with_labels=True)

D2 = np.array([nx.shortest_path_length(DG, i, j) for i in range(1,8) for j in range(1,8)]).reshape(7,7) D2

# 两个节点间的最短路径数 print(list(nx.all_shortest_paths(UG, 1, 5))) print(len(list(nx.all_shortest_paths(UG, 1, 5))))

节点的集聚系数和平均集聚系数

集聚系数

print(nx.clustering(UG)) # 以字典的形式打印所有节点的局部集聚系数

平均集聚系数

print(nx.average_clustering(UG))

全局集聚系数

print(nx.transitivity(UG))

有向网络的互惠性

print(nx.number_of_edges(DG))

rec = {i: nx.reciprocity(DG, i) for i in DG.nodes()} print(rec)

print(nx.reciprocity(DG)) # 6/12

介数与核度

G = nx.Graph([(1,2),(1,3),(2,4),(2,5),(3,4),(4,5),(4,6),(5,6),(6,7)]) bc = nx.betweenness_centrality(G, normalized=True) print(bc) nx.draw(G, node_size=500, with_labels=True)

print(nx.closeness_centrality(G))

边介数

ebc = nx.edge_betweenness_centrality(G) print(ebc)

核度

ks = nx.core_number(G) print(ks)

团和k-分量

print(list(nx.enumerate_all_cliques(G))) # 返回无向图中的所有团

# 计算k分量 from networkx.algorithms import approximation as apxa k_components = apxa.k_components(G) k_components

网络密度

真实世界中的图具有稀疏性。

nx.density(G)

富人俱乐部系数

G = nx.barabasi_albert_graph(100, 3) nx.rich_club_coefficient(G, normalized=True)

路径和循环（或圈）

G = nx.Graph([(1,2),(1,3),(2,5),(3,4),(4,6),(4,7),(5,7),(5,8),(6,7),(7,8)]) G.add_nodes_from(list(range(1,10))) nx.draw(G, with_labels=True) print(nx.cycle_basis(G))

加权网络的静态特征

# 创建一个无向加权网络 WG = nx.Graph() WG.add_nodes_from([1,2,3,4,5,6]) WG.add_weighted_edges_from([(1,2,3),(1,3,1),(2,4,4),(2,5,1.5),(3,5,2),(3,6,3.5),(4,5,2.5),(4,6,0.5)]) w = [WG[e[0]][e[1]]['weight'] for e in WG.edges()] nx.draw(WG,node_size=500,width=w,with_labels=True) w

点权

nx.degree(WG, weight='weight')

无权集聚系数

nx.clustering(WG)

加权集聚系数

nx.clustering(WG, weight='weight')

几种常用的中心性指标

# 分别生成ER和BA无标度网络，节点数设定为N=100 GER = nx.erdos_renyi_graph(100,0.08) GBA = nx.barabasi_albert_graph(100,4)

# 度中心性 dc1 = nx.degree_centrality(GER) dc2 = nx.degree_centrality(GBA) # 介数中心性 bc1 = nx.betweenness_centrality(GER) bc2 = nx.betweenness_centrality(GBA) # 接近度中心性 cc1 = nx.closeness_centrality(GER) cc2 = nx.closeness_centrality(GBA) # 特征向量中心性 ec1 = nx.eigenvector_centrality(GER) ec2 = nx.eigenvector_centrality(GBA)

# 绘图比较 plt.figure(figsize=(10,10)) plt.subplot(221) plt.plot(dc1.keys(), dc1.values(), 'ro', label='ER') plt.plot(dc2.keys(), dc2.values(), 'b--', label='BA') plt.legend(loc=0) plt.xlabel("node label") plt.ylabel("dc") plt.title("degree_centrality") plt.subplot(222) plt.plot(bc1.keys(), bc1.values(), 'ro', label='ER') plt.plot(bc2.keys(), bc2.values(), 'b--', label='BA') plt.legend(loc=0) plt.xlabel("node label") plt.ylabel("bc") plt.title("betweenness_centrality") plt.subplot(223) plt.plot(cc1.keys(), cc1.values(), 'ro', label='ER') plt.plot(cc2.keys(), cc2.values(), 'b--', label='BA') plt.legend(loc=0) plt.xlabel("node label") plt.ylabel("cc") plt.title("closeness_centrality") plt.subplot(224) plt.plot(ec1.keys(), ec1.values(), 'ro', label='ER') plt.plot(ec2.keys(), ec2.values(), 'b--', label='BA') plt.legend(loc=0) plt.xlabel("node label") plt.ylabel("ec") plt.title("eigenvector_centrality")

度-度相关性

基于最近邻平均度值的度-度相关性

import pandas as pd # 加载三个真实数据集 # 1. 科学合作网络 df1 = pd.read_excel("citation.xlsx") G1 = nx.from_pandas_edgelist(df1, 'source', 'target', create_using = nx.Graph()) # 2. 电网 df2 = pd.read_excel("power.xlsx") G2 = nx.from_pandas_edgelist(df2, 'source', 'target', create_using = nx.Graph()) # 3. 代谢网络 df3 = pd.read_excel("celegans_metabolic.xlsx") G3 = nx.from_pandas_edgelist(df3, 'source', 'target', create_using = nx.Graph())

# 定义求最近邻平均度的函数 def average_nearest_neighbor_degree(G): k = set([G.degree(i) for i in G.nodes()]) # 获取所有可能的度值 sorted_k = sorted(k) knni = nx.average_neighbor_degree(G) k_nn_k = [] for ki in sorted_k: if ki == 0: k_nn_k.append(0.0) else: c = 0 s = 0 for i in G.nodes(): if G.degree(i) == ki: s += knni[i] c += 1 k_nn_k.append(s/c) return sorted_k, k_nn_k

x1, y1 = average_nearest_neighbor_degree(G1) x2, y2 = average_nearest_neighbor_degree(G2) x3, y3 = average_nearest_neighbor_degree(G3)

print(y1)

基于Pearson相关系数的度-度相关性

r1 = nx.degree_assortativity_coefficient(G1) r2 = nx.degree_assortativity_coefficient(G2) r3 = nx.degree_assortativity_coefficient(G3) print(r1) print(r2) print(r3)

绘图

plt.figure(figsize=(12,4)) plt.subplot(131) plt.plot(x1, y1, 'ro', label='r = '+'%.4f'%r1) plt.legend(loc=0) plt.xlabel("$k$") plt.ylabel("$k_{nn}(k)$") plt.xscale("log") plt.yscale("log") plt.title('citation') plt.ylim([1,100]) plt.subplot(132) plt.plot(x2, y2, 'bs', label='r = '+'%.4f'%r2) plt.legend(loc=0) plt.xlabel("$k$") plt.ylabel("$k_{nn}(k)$") plt.xscale("log") plt.yscale("log") plt.title('power') plt.ylim([1,10]) plt.subplot(133) plt.plot(x3, y3, 'gv', label='r = '+'%.4f'%r3) plt.legend(loc=0) plt.xlabel("$k$") plt.ylabel("$k_{nn}(k)$") plt.xscale("log") plt.yscale("log") plt.title('celegans_metabolic') plt.ylim([1,100]) plt.tight_layout()

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}常见的度分布：泊松分布与幂律分布