import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
# Load the three DBIS node tables. The author and conference files are
# tab-separated (id, name) pairs; the author file is decoded as latin-1.
authors = pd.read_csv(
    '/work/data/dbis/id_author.txt',
    sep='\t',
    names=['id', 'name'],
    encoding='latin1',
)
confs = pd.read_csv('/work/data/dbis/id_conf.txt', sep='\t', names=['id', 'name'])

# Each paper row is read as one string column and split at its first space:
# the leading token is the integer id, everything after it is the title.
papers = pd.read_csv('/work/data/dbis/paper.txt', sep='\t', names=['col'])
papers['id'] = papers['col'].map(lambda line: int(line.partition(' ')[0]))
papers['name'] = papers['col'].map(lambda line: line.partition(' ')[2])
papers.drop(columns=['col'], inplace=True)
papers.head()
idint64
nameobject
0
21525
Methods and Tools for Data Value Re-Engineering.
1
21526
Active Database Technology Supports Cancer Clustering.
2
21527
Dynamic Maps as Composite Views of Varied Geographic Database Servers.
3
21528
An Intelligent Database System Application: The Design of EMS.
4
21529
Text / Relational Database Management Systems: Harmonizing SQL and SGML.
# Notebook display: preview the first rows of the conference table.
confs.head()
idint64
nameobject
0
4789
vVLDBJ.
1
3258
vISWCWorkshoponTrust,Security,andReputationontheSemanticWeb
2
4149
vIWRIDL
3
3252
vSWDB
4
3257
vSWSWPC
# Notebook display: preview the first rows of the author table.
authors.head()
idint64
nameobject
0
438659
aMichaelJ.Rothman
1
287141
aShien-ChingHwang
2
116662
aStéphaneAyache
3
370250
aJavierMartínez-Viademonte
4
89376
aRuiJiang
# Build (node_key, attributes) pairs for every node table. The key is the
# whitespace-stripped name for ALL three tables: build_edges() strips both
# endpoint names, so an unstripped key here would make add_edges_from()
# create a second node that has no 'type' attribute.
# NOTE(review): nodes are keyed by name, not id, so rows sharing a name
# collapse into a single node -- confirm this is intended.
author_nodes = [
    (author.name.strip(), {'type': 'author'}) for author in authors.itertuples()
]
paper_nodes = [
    (paper.name.strip(), {'type': 'paper'}) for paper in papers.itertuples()
]
conf_nodes = [
    (conf.name.strip(), {'type': 'conf'}) for conf in confs.itertuples()
]

graph = nx.Graph()
graph.add_nodes_from(author_nodes)
graph.add_nodes_from(conf_nodes)
graph.add_nodes_from(paper_nodes)
print(f"number of nodes: {graph.number_of_nodes()}")
number of nodes: 132535
# Load the two edge lists. Both files are tab-separated pairs of ids:
# `src` is a paper id, `dst` is an author id or a conference id.
edge_columns = ['src', 'dst']
paper_author_edges = pd.read_csv(
    '/work/data/dbis/paper_author.txt', sep='\t', names=edge_columns
)
paper_conf_edges = pd.read_csv(
    '/work/data/dbis/paper_conf.txt', sep='\t', names=edge_columns
)
paper_author_edges.head()
srcint64
dstint64
0
21525
33467
1
21525
33468
2
21526
33469
3
21526
33470
4
21526
33471
# Notebook display: preview the first rows of the paper-conference edge list.
paper_conf_edges.head()
srcint64
dstint64
0
21525
122
1
21526
122
2
21527
122
3
21528
122
4
21529
122
def build_edges(edges_df, src_df, dst_df):
    """Translate an id-based edge list into (src_name, dst_name) pairs.

    Args:
        edges_df: DataFrame with ``src`` and ``dst`` columns holding node ids.
        src_df: DataFrame with ``id`` and ``name`` columns for source nodes.
        dst_df: DataFrame with ``id`` and ``name`` columns for target nodes.

    Returns:
        A list with one ``(src_name, dst_name)`` tuple per row of
        ``edges_df``, in row order, with surrounding whitespace stripped
        from both names.

    Raises:
        IndexError: if an edge references an id missing from its node table
            (the same exception type the previous positional lookup raised).
    """

    def _first_name_by_id(nodes_df):
        # One pass builds an id -> name map, so each edge costs a dict lookup
        # instead of a full boolean-mask scan of the node table.
        # setdefault keeps the FIRST name seen for a duplicated id.
        names = {}
        for node_id, name in zip(nodes_df["id"], nodes_df["name"]):
            names.setdefault(node_id, name)
        return names

    src_names = _first_name_by_id(src_df)
    dst_names = _first_name_by_id(dst_df)

    edges = []
    for edge in edges_df.itertuples():
        try:
            src = src_names[edge.src]
            dst = dst_names[edge.dst]
        except KeyError as err:
            raise IndexError(
                f"edge references unknown node id {err.args[0]!r}"
            ) from None
        edges.append((src.strip(), dst.strip()))
    return edges
# Attach paper-author edges first, then paper-conference edges; papers are
# the source side of both edge lists.
for edge_table, target_table in (
    (paper_author_edges, authors),
    (paper_conf_edges, confs),
):
    graph.add_edges_from(build_edges(edge_table, papers, target_table))
print(f"number of edges: {graph.number_of_edges()}")
number of edges: 262590
def generate_metapath(graph, metapath):
    """Sample one random walk whose node types follow ``metapath``.

    The walk starts at a uniformly chosen node of type ``metapath[0]``. At
    each later position it moves to a uniformly chosen neighbour of the
    current node whose ``'type'`` attribute equals the next metapath entry.

    Args:
        graph: graph whose nodes each carry a ``'type'`` attribute; it must
            support ``graph.nodes(data=True)``, ``graph.nodes[n]`` and
            ``graph.neighbors(n)``.
        metapath: sequence of node-type names. It is not modified.

    Returns:
        The list of visited nodes. It is shorter than ``metapath`` when the
        walk reaches a node with no neighbour of the required type, and
        empty when ``metapath`` is empty.

    Raises:
        ValueError: if the graph has no node of type ``metapath[0]``.
    """
    if not metapath:
        return []

    first_type = metapath[0]
    candidates = [
        node for node, attr in graph.nodes(data=True) if attr['type'] == first_type
    ]
    if not candidates:
        # np.random.choice would also raise ValueError here, but with a
        # message that does not say which type was missing.
        raise ValueError(f"graph has no nodes of type {first_type!r}")

    current = np.random.choice(candidates)
    walk = [current]
    for wanted_type in metapath[1:]:
        matching = [
            neighbor
            for neighbor in graph.neighbors(current)
            if graph.nodes[neighbor]["type"] == wanted_type
        ]
        if not matching:
            # Dead end: no neighbour of the required type, so stop early.
            break
        current = np.random.choice(matching)
        walk.append(current)
    return walk
def generate_metapaths(graph, walk_length, num_walks, metapath):
    """Sample ``num_walks`` metapath-guided random walks.

    ``metapath`` is lengthened by repeatedly appending ``metapath[1:]``
    until it holds at least ``walk_length`` entries. Because whole fragments
    are appended, the expanded metapath can overshoot ``walk_length`` by up
    to ``len(metapath) - 2`` entries. A metapath that is already long enough
    is used as given.

    Args:
        graph: graph passed through to ``generate_metapath``.
        walk_length: minimum number of entries in the expanded metapath.
        num_walks: number of walks to sample.
        metapath: list of node-type names. The caller's list is left
            unmodified.

    Returns:
        A list of ``num_walks`` walks, each as returned by
        ``generate_metapath``.

    Raises:
        ValueError: if ``metapath`` has fewer than two entries yet is shorter
            than ``walk_length``. There is then nothing to repeat, and the
            expansion loop would never terminate.
    """
    if num_walks <= 0:
        return []

    # Work on a copy so the caller's list is never extended in place.
    expanded = list(metapath)
    fragment = expanded[1:]
    if len(expanded) < walk_length:
        if not fragment:
            raise ValueError(
                "metapath needs at least two node types to be extended "
                f"to walk_length={walk_length}"
            )
        while len(expanded) < walk_length:
            expanded.extend(fragment)

    # The expansion is identical for every walk, so it is done once above.
    return [generate_metapath(graph, expanded) for _ in range(num_walks)]
# Sample author-paper-author walks and use them as the "sentences" of a
# skip-gram style embedding model; every node in a walk acts as a token.
walks = generate_metapaths(
    graph,
    metapath=['author', 'paper', 'author'],
    walk_length=80,
    num_walks=10,
)
word2vec = Word2Vec(
    sentences=walks,
    vector_size=128,
    window=5,
    min_count=0,
    negative=10,
)

# Lookup tables in both directions between node keys and embedding rows.
k2i = word2vec.wv.key_to_index
i2k = dict(enumerate(word2vec.wv.index_to_key))
vectors = word2vec.wv.vectors
vectors.shape
# Project the embedding vectors onto two principal components for plotting.
pca = PCA(n_components=2)
# Notebook display of the estimator's repr.
pca
# Fit on the embedding matrix and return its 2-D coordinates, one row per vector.
features = pca.fit_transform(vectors)
# Notebook display of the projected array's shape.
features.shape
# One colour per embedding row: orange for keys starting with 'a', blue for
# everything else.
# NOTE(review): this relies on author names carrying an 'a' prefix in the
# data files; a paper title beginning with a lowercase 'a' would also be
# drawn orange -- confirm the prefix convention holds.
colors = ['' for _ in range(features.shape[0])]
for key, index in k2i.items():
    # startswith() is safe for an empty key, where key[0] would raise.
    colors[index] = 'orange' if key.startswith('a') else 'blue'

# Plot at most the first 15 embedding rows; clamping to the number of
# available rows keeps the annotation loop from indexing past the end when
# the vocabulary is smaller than that.
max_points = 15
shown = min(max_points, features.shape[0])

fig, ax = plt.subplots(figsize=(18, 10))
ax.scatter(features[:shown, 0], features[:shown, 1], s=50, c=colors[:shown])
for i in range(shown):
    ax.annotate(i2k[i], (features[i][0], features[i][1]))
plt.show()