import tensorflow as tf
import json
import csv
BASE_PATH = "/work/"
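# Expected input layout (inferred from how each file is parsed below):
#   node_features_text.json  - {"<node_id>": ["word", "word", ...], ...}
#   isolated_nodes.csv       - header row, then one node id per row
#   node_classification.csv  - header row, then node_id,page_type rows
#   training_graph.csv       - header row, then first,second edge rows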
f_node_features = json.load(open(BASE_PATH + "node_features_text.json", "r"))
node_features = {int(k): v for (k, v) in f_node_features.items()}
f_isolated_nodes = csv.reader(open(BASE_PATH + "isolated_nodes.csv", "r"))
next(f_isolated_nodes)
isolated_nodes = [int(node[0]) for node in f_isolated_nodes]
f_node_classification = csv.reader(open(BASE_PATH + "node_classification.csv", "r"))
next(f_node_classification)
node_classification = {}
for (node_id, page_type) in f_node_classification:
    node_classification[int(node_id)] = int(page_type)
f_training_graph = csv.reader(open(BASE_PATH + "training_graph.csv", "r"))
next(f_training_graph)
training_graph = {(int(first), int(second)) for (first, second) in f_training_graph}
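# Optional sanity check on what was loaded (sizes only):
print(f"{len(node_features)} nodes, {len(isolated_nodes)} isolated, "
      f"{len(training_graph)} training edges")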
from random import Random
nodes_set = set(node_features.keys())
for node in isolated_nodes:
    nodes_set.remove(node)
all_nodes = sorted(list(nodes_set))
RAND = Random(42)
RAND.shuffle(all_nodes)
total_len = len(all_nodes)
train_nodes = set(all_nodes[:int(total_len * 0.7)])
val_nodes = set(all_nodes[int(total_len * 0.7):int(total_len * 0.85)])
try_nodes = set(all_nodes[int(total_len * 0.85):])
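# 70/15/15 train/validation/test split of the non-isolated nodes; the seeded
# shuffle keeps the split reproducible. "try" is the third, held-out split.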
train_node_features = {k: v for (k, v) in node_features.items() if k in train_nodes}
try_node_features = {k: v for (k, v) in node_features.items() if k in try_nodes}
import networkx as nx
graph = nx.Graph()
graph.add_nodes_from(all_nodes)
graph.add_edges_from(training_graph)
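# Undirected NetworkX graph over all non-isolated nodes with the known
# training edges; generate_ds() below takes per-split subgraphs from it.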
words = {}
for d in node_features.values():
    for word in d:
        if word not in words:
            words[word] = 0
        words[word] += 1
we_map = []
for (word, freq) in words.items():
    if freq >= 2:
        we_map.append(word)
we_map = {w: i + 1 for (i, w) in enumerate(we_map)}
#assert(len(we_map) == 4096)
we_map_size = len(we_map)
print(len(we_map))
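# Vocabulary: words appearing at least twice get indices 1..we_map_size;
# index 0 is reserved for unknown / padded positions (see get_node_features).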
def is_connected(a, b):
    return (a, b) in training_graph or (b, a) in training_graph
import numpy as np
import random
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers
features_size = 24  # number of word tokens kept per node (shorter lists are zero-padded)
DISC_BIAS_RATIO = 1  # sampled (negative) pairs per real edge in generate_ds()
def get_node_features(a):
    # Map a node id to (page type, fixed-length vector of vocabulary indices);
    # positions past the word list and out-of-vocabulary words stay 0.
    cla = node_classification[a]
    nfa = node_features[a]
    # zp = []
    # for (i, w) in enumerate(nfa):
    #     if w in we_map:
    #         zp.append(we_map[w])
    #     else:
    #         zp.append(0)
    # zp = np.array(zp, dtype=np.uint16)
    zp = np.zeros(features_size, dtype=np.uint16)
    for (i, w) in enumerate(nfa):
        if i >= features_size:
            break
        if w in we_map:
            zp[i] = we_map[w]
    return cla, zp
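# Illustrative sanity check (assumes train_nodes is non-empty; the variable
# names below exist only for this example):
example_node = next(iter(train_nodes))
example_class, example_tokens = get_node_features(example_node)
print(example_class, example_tokens.shape)  # -> page type, (features_size,)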
def generate_ds(anodes):
    sg = graph.subgraph(anodes)
    ylst = []
    total_nodes = len(anodes)
    def rand_index():
        return anodes[random.randrange(total_nodes)]
    def generate_disconnected(rand_index):
        # Sample a (probably disconnected) pair of nodes that share the same
        # page type; the true label is still taken from the edge set below.
        first = rand_index()
        fc = node_classification[first]
        while True:
            second = rand_index()
            sc = node_classification[second]
            if sc == fc:
                return (first, second)
    # Pair list: all real edges plus DISC_BIAS_RATIO * |edges| sampled pairs
    # (half same-class, half fully random).
    el = (list(sg.edges) +
          [generate_disconnected(rand_index) for i in range(int(sg.number_of_edges() * DISC_BIAS_RATIO / 2))] +
          [(rand_index(), rand_index()) for i in range(int(sg.number_of_edges() * DISC_BIAS_RATIO / 2))])
    rander = random.Random(42)
    rander.shuffle(el)
    xlstca = []
    xlstcb = []
    xlstna = []
    xlstnb = []
    for (a, b) in tqdm(el):
        ca, na = get_node_features(a)
        xlstca.append(ca)
        xlstna.append(na)
        cb, nb = get_node_features(b)
        xlstcb.append(cb)
        xlstnb.append(nb)
        ylst.append(0 if is_connected(a, b) else 1)
    yarr = np.array(ylst, dtype=np.float16)
    xdict = {"ac": np.array(xlstca, dtype=np.uint8), "an": np.array(xlstna), "bc": np.array(xlstcb, dtype=np.uint8), "bn": np.array(xlstnb)}
    disconnected = np.count_nonzero(yarr)  # label 1 == disconnected
    print(f"Connected: {len(yarr) - disconnected}, Disconnected: {disconnected}.")
    ds = tf.data.Dataset.from_tensor_slices((xdict, yarr))
    # def gen():
    #     for (a, b) in el:
    #         ca, na = get_node_features(a)
    #         cb, nb = get_node_features(b)
    #         yv = 0 if is_connected(a, b) else 1
    #         yield ({"ac": ca, "an": na, "bc": cb, "bn": nb}, yv)
    # ds = tf.data.Dataset.from_generator(gen, output_signature=(
    #     {"ac": tf.TensorSpec(shape=(), dtype=np.uint8),
    #      "an": tf.TensorSpec(shape=(None,), dtype=np.uint16),
    #      "bc": tf.TensorSpec(shape=(), dtype=np.uint8),
    #      "bn": tf.TensorSpec(shape=(None,), dtype=np.uint16)},
    #     tf.TensorSpec(shape=(), dtype=np.float16)
    # ))
    return ds.batch(128)
train_ds = generate_ds(list(train_nodes))
val_ds = generate_ds(list(val_nodes))
try_ds = generate_ds(list(try_nodes))
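# Each dataset yields batches of ({"ac": class_a, "an": tokens_a,
# "bc": class_b, "bn": tokens_b}, label) with label 0 = connected,
# 1 = disconnected.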
embedding_size = 256
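# Siamese setup: one shared encoder turns each node into a vector (averaged
# word embeddings plus the page-type scalar, fed through an MLP); the cosine
# similarity of the two encodings goes through a single sigmoid unit, trained
# against the 0/1 (dis)connected label.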
def make_model():
    c_input_a = keras.Input(shape=(), name="ac")
    w_input_a = keras.Input(shape=(features_size,), name="an")
    c_input_b = keras.Input(shape=(), name="bc")
    w_input_b = keras.Input(shape=(features_size,), name="bn")
    def make_submodel():
        # Shared encoder: averaged word embeddings concatenated with the
        # page-type scalar, followed by a small MLP.
        c_input = keras.Input(shape=())
        w_input = keras.Input(shape=(features_size,))
        embedding_layer = layers.Embedding(we_map_size + 1, embedding_size, name="embedding")(w_input)
        x = layers.GlobalAveragePooling1D()(embedding_layer)
        concat_layer = layers.Concatenate(axis=-1)([tf.expand_dims(c_input, -1), x])
        x = layers.Dense(512, activation='relu')(concat_layer)
        x = layers.Dense(384, activation='relu')(x)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(128, activation='relu')(x)
        x = layers.Dropout(0.25)(x)
        sm = keras.Model([c_input, w_input], x)
        sm.summary()
        return sm
    submodel = make_submodel()
    # The same encoder weights are applied to both nodes of a pair.
    encoded_a = submodel([c_input_a, w_input_a])
    encoded_b = submodel([c_input_b, w_input_b])
    # L1_Distance = tf.losses.CosineSimilarity()(encoded_a, encoded_b)
    # layers.Lambda(tf.compat.v1.losses.cosine_distance, (1,))([encoded_a, encoded_b])
    dif_layer = layers.Lambda(lambda tensors: tf.expand_dims(tf.keras.losses.cosine_similarity(tensors[0], tensors[1], axis=-1), axis=-1))
    distance = dif_layer([encoded_a, encoded_b])
    x = layers.Dense(1, activation='sigmoid')(distance)
    model = keras.Model([c_input_a, w_input_a, c_input_b, w_input_b], x)
    model.summary()
    return model
model = make_model()
#keras.utils.plot_model(model, BASE_PATH + "model.png")
epochs = 32
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_ds, epochs=epochs, validation_data=val_ds)
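# Sketch: evaluate on the held-out split once training finishes (assuming
# try_ds is intended as the test set; metric names follow compile() above).
test_loss, test_acc = model.evaluate(try_ds)
print(f"test loss={test_loss:.4f}, accuracy={test_acc:.4f}")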