### BEGIN IMPORTS - DO NOT TOUCH!
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import sys
sys.path.append('..')
#!{sys.executable} -m pip install matplotlib
#!{sys.executable} -m pip install networkx
#!{sys.executable} -m pip install torchvision
import random
import scipy.io as sio
import time
import networkx as nx
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from itertools import count
import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from utilities.load_data import load_mnist
import utilities.email as email
from utilities.mnist import *
from utilities.make_graphs import read_edge_list, read_list, load_data
### END IMPORTS - DO NOT TOUCH!
import math
from numpy import inf
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import normalized_mutual_info_score
G = nx.Graph()
G.add_edges_from([(1,2),(2,3), (2,4), (3,4), (1,3)])
nx.draw(G, with_labels=True)
#A) YOUR CODE HERE
pagerank = np.array([1/4, 1/4, 1/4, 1/4])
vnorm = np.linalg.norm(pagerank)  # initially the norm of the start vector, so the loop runs at least once
# Row i holds the transition probabilities out of node i (uniform over its neighbors).
transition_matrix = np.array([[0, 1/2, 1/2, 0],
                              [1/3, 0, 1/3, 1/3],
                              [1/3, 1/3, 0, 1/3],
                              [0, 1/2, 1/2, 0]])
eps = 10**(-16)
norm_square_difference_list = list()
while vnorm >= eps:
    # Power iteration: v <- P^T v
    new_pagerank = np.dot(np.transpose(transition_matrix), pagerank)
    new_vnorm = np.linalg.norm(new_pagerank - pagerank)
    norm_square_difference_list.append(vnorm - new_vnorm)
    pagerank = new_pagerank
    vnorm = new_vnorm
print("PageRank:", pagerank)
We build `norm_square_difference_list` in exercise A so that exercise B only needs to plot it.
#B) YOUR CODE HERE
plt.plot(norm_square_difference_list)
plt.show()
#D) YOUR CODE HERE
# Repeated multiplication computes a high power of the transition matrix;
# every row of T^k converges to the stationary (PageRank) distribution.
temp_matrix = transition_matrix
for j in range(100):
    temp_matrix = np.dot(temp_matrix, transition_matrix)
print("PageRank:", temp_matrix[0])
#Load Data
from utilities.make_graphs import read_edge_list, read_list, load_data
import numpy as np
X, Y = load_data()
def L2Distance(x, y):
    # Euclidean distance between two 2-D points
    return np.sqrt((x[0]-y[0])**2 + (x[1]-y[1])**2)
print("X", X)
print("Y", Y)
#YOUR CODE HERE
# Be sure that your constructed graph does not
# contain self-loops (edges from i to i for some node i)
def nn_graph(data, eps, remove_self=True, directed=False):
    n = len(data)
    G = nx.DiGraph() if directed else nx.Graph()
    ### YOUR CODE STARTS HERE
    G.add_nodes_from(range(n))
    # Connect every pair of points at distance at most eps.
    for i in range(n):
        for j in range(n):
            if remove_self and i == j:
                continue  # skip self-loops
            if L2Distance(data[i], data[j]) <= eps:
                G.add_edge(i, j)
    ### YOUR CODE ENDS HERE
    return G
### Run the code below
eps_values = [0.01, 0.05, 0.1, 0.2, 0.4]
#eps_values = [0.01, 0.01, 0.01, 0.01, 0.01]
for eps in eps_values:
    ax = plt.subplot()
    G = nn_graph(X, eps)
    nx.draw_networkx_edges(G, pos=X)
    nx.draw_networkx_nodes(G, pos=X, node_color=Y, node_size=20, cmap=plt.get_cmap('tab10'))
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)
    plt.show()
def weighted_nn_graph(data, eps=20, t=0.1):
    n = len(data)
    G = nx.Graph()
    ### YOUR CODE STARTS HERE
    G.add_nodes_from(range(n))
    for i in range(n):
        for j in range(n):
            distance = L2Distance(data[i], data[j])
            if i != j and distance <= eps:
                # Gaussian (heat-kernel) weight: closer points get weight nearer 1
                G.add_edge(i, j, weight=math.exp(-(distance**2)/t))
    ### YOUR CODE ENDS HERE
    return G
ts = [10, 0.2, 0.07, 0.000001]
fig, ax = plt.subplots(1, 4, figsize=(20, 10))
for col, t in enumerate(ts):
    G = weighted_nn_graph(X, eps=60, t=t)
    ys = [d['weight'] for _, _, d in G.edges.data()]
    ax[col].hist(ys, bins=100)
    ax[col].set_title("t: " + str(t))
plt.tight_layout()
edgelist = read_edge_list('./data/edges.txt')
n = np.max(edgelist)+1
G = nx.Graph()
for i in range(n):
    G.add_node(i)
for edge in edgelist:
    G.add_edge(edge[0], edge[1])
nx.draw_networkx_edges(G, pos=X)
nx.draw_networkx_nodes(G, pos=X, node_color=Y, node_size=20, cmap=plt.get_cmap('tab10'))
plt.show()
#A) YOUR CODE HERE
#G = nn_graph(X,0.09)
def differentEdges(G, real):
    # Normalize edges to sorted tuples so (i, j) and (j, i) compare equal
    g_edges = {tuple(sorted(e)) for e in G.edges}
    real_edges = {tuple(sorted(e)) for e in real}
    # Count edges present in exactly one of the two edge sets
    return len(g_edges ^ real_edges)
#B) YOUR CODE HERE
templist = []
diffs = []
for eps in np.arange(0.01, 0.11, 0.005):
    G = nn_graph(X, eps)
    templist.append(eps)
    diffs.append(differentEdges(G, edgelist))
plt.xlabel("eps")
plt.ylabel("number of differing edges")
plt.plot(templist, diffs)
def graph_eig(L):
    """
    Takes a graph Laplacian and returns the eigenvalues and eigenvectors,
    sorted by ascending eigenvalue.
    """
    lambdas, eigenvectors = np.linalg.eig(L)
    lambdas = np.real(lambdas)
    eigenvectors = np.real(eigenvectors)
    order = np.argsort(lambdas)
    lambdas = lambdas[order]
    eigenvectors = eigenvectors[:, order]
    return lambdas, eigenvectors
L_norm = None
L_rw = None
### YOUR CODE STARTS HERE
G = nn_graph(X, 0.07)
nodecount = G.number_of_nodes()
degrees = np.zeros((nodecount, nodecount))
for i in range(nodecount):
    degrees[i, i] = G.degree[i]
adjacency = nx.adjacency_matrix(G).toarray()
print("ADJ", adjacency.sum())
laplace = degrees - adjacency
# float_power sends the zero entries (off-diagonal, and isolated nodes) to inf,
# so we zero those out to obtain D^(-1/2).
temparr = np.float_power(degrees, -0.5)
temparr[temparr == inf] = 0
L_norm = temparr @ laplace @ temparr   # L_sym = D^(-1/2) L D^(-1/2)
temparr = np.float_power(degrees, -1)  # D^(-1)
temparr[temparr == inf] = 0
L_rw = temparr @ laplace               # L_rw = D^(-1) L
### YOUR CODE ENDS HERE
eigval_norm, eigvec_norm = graph_eig(L_norm)
eigval_rw, eigvec_rw = graph_eig(L_rw)
plt.figure(0)
plt.plot(eigval_norm, 'b-o', label='Spectrum of Normalized Laplacian', )
plt.legend()
plt.figure(1)
plt.plot(eigval_rw, 'b-o', label='Spectrum of the Random Walk Laplacian')
plt.legend()
from sklearn.cluster import k_means
def spect_cluster(G, eig_type="normal", k=5, d=5):
    ### YOUR CODE STARTS HERE
    nodecount = G.number_of_nodes()
    degrees = np.zeros((nodecount, nodecount))
    for i in range(nodecount):
        degrees[i, i] = G.degree[i]
    adjacency = nx.adjacency_matrix(G).toarray()
    laplace = degrees - adjacency
    if eig_type == "normal":
        temparr = np.float_power(degrees, -0.5)
        temparr[temparr == inf] = 0
        eigvals, eigvecs = graph_eig(temparr @ laplace @ temparr)
    elif eig_type == "random":
        temparr = np.float_power(degrees, -1)
        temparr[temparr == inf] = 0
        eigvals, eigvecs = graph_eig(temparr @ laplace)
    else:
        raise ValueError("eig_type must be 'normal' or 'random'")
    # graph_eig already returns the eigenvectors sorted by ascending eigenvalue,
    # so we can cluster the rows of the d leading eigenvectors directly.
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(eigvecs[:, 0:d])
    y_clust = kmeans.labels_
    ### YOUR CODE ENDS HERE
    return y_clust
def plot_graph(G, clusters):
    plt.figure(1, figsize=(30, 15))
    nodes = G.nodes()
    ec = nx.draw_networkx_edges(G, X, alpha=0.2)
    nc = nx.draw_networkx_nodes(G, X, nodelist=nodes, node_color=clusters, node_size=100, cmap=plt.cm.jet)
    plt.axis('off')
    plt.show()
your_clusters = spect_cluster(G, k=6)
plot_graph(G, your_clusters)
for method in ['normal', 'random']:
    for k in np.arange(2, 8):
        your_clusters = spect_cluster(G, eig_type=method, k=k)
        plot_graph(G, your_clusters)
def modularity(G, clustering):
    modularity = 0
    ### YOUR CODE STARTS HERE
    adjacency = nx.adjacency_matrix(G).toarray()
    nodecount = G.number_of_nodes()
    degrees = np.array([G.degree[i] for i in range(nodecount)])
    two_m = degrees.sum()  # the sum of all degrees equals 2|E|
    length = len(clustering)
    runningsum = 0.0
    # Iterate over clusters; note the +1 so the largest cluster label is included.
    for c in range(max(clustering) + 1):
        members = [i for i in range(length) if clustering[i] == c]
        for i in members:
            for j in members:
                runningsum += adjacency[i, j] - (degrees[i] * degrees[j]) / two_m
    modularity = runningsum / two_m
    ### YOUR CODE ENDS HERE
    return modularity
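As a quick sanity check of the implementation (a sketch on a toy graph): two disjoint triangles, clustered by connected component, should score a modularity of 0.5.
G_test = nx.Graph([(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5)])
print(modularity(G_test, [0, 0, 0, 1, 1, 1]))  # expected: 0.5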
mods = []
ks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
for k in ks:
    print(k)
    clusters = spect_cluster(G, k=k) ### NOTE: If you do not use your implementation substitute with a call to the sklearn one.
    mods.append(modularity(G, clusters))
# You may want to use plt.plot to plot the modularity for different values of k
plt.plot(ks, mods)
print(mods)
import random
def approx_personalized_pagerank(G, node, alpha=0.85, iterations=1000):
    ppr = np.zeros(G.number_of_nodes())
    ### YOUR CODE STARTS HERE
    # Simulate a random walk with restarts; the PPR score of a node is
    # estimated as the fraction of steps the walk spends there.
    current_node = node
    for i in range(iterations):
        ppr[current_node] += 1/iterations
        # With probability alpha follow a random edge, otherwise restart
        chance = random.random()
        if chance < alpha:
            neibrs = [n for n in G[current_node]]
            current_node = random.choice(neibrs)
        else:
            current_node = node
    ### YOUR CODE ENDS HERE
    return ppr
edgelist = read_edge_list('./data/edges.txt')
n = np.max(edgelist)+1
G = nx.Graph()
for i in range(n):
    G.add_node(i)
for edge in edgelist:
    G.add_edge(edge[0], edge[1])
pr = nx.pagerank(G)
starting_node = max(pr, key=pr.get)  # node with the highest PageRank score
for i, iterations in enumerate([10, G.number_of_nodes(), G.number_of_nodes()*2, G.number_of_nodes()*4, G.number_of_nodes()*100, G.number_of_nodes()*1000]):
    r = approx_personalized_pagerank(G, starting_node, iterations=iterations)
    r[starting_node] = 0
    r_sorted = np.argsort(r)[::-1]
    r_values = np.sort(r)[::-1]
    print(f'Iterations {iterations}: top-10 r={r_sorted[:10]}\n top-10 values={r_values[:10]}\n')
import operator
rr = nx.pagerank(G, alpha=0.85, personalization = {starting_node: 1})
rr[starting_node] = 0
r=np.zeros(len(rr))
for k in rr: r[k] = rr[k]
r_sorted = np.argsort(r)[::-1]
r_values = np.sort(r)[::-1]
print(f'top-10 r={r_sorted[:10]}\n top-10 values={r_values[:10]}\n')
k = 5
ppr_nx = nx.pagerank(G, alpha=0.85, personalization={starting_node: 1})
r_nx = [0 for _ in range(G.number_of_nodes())]
for node, v in ppr_nx.items():
    r_nx[node] = v
r_est = approx_personalized_pagerank(G, starting_node, alpha=0.85)
topk_nx = np.argsort(r_nx)[-k:]
topk_est = np.argsort(r_est)[-k:]
print(topk_nx, topk_est)
for iterations in [10, G.number_of_nodes(), G.number_of_nodes()*2, G.number_of_nodes()*4, G.number_of_nodes()*100, G.number_of_nodes()*1000]:
    print(f'Number of iterations {iterations}')
    ppr_nx = nx.pagerank(G, alpha=0.85, personalization={starting_node: 1})
    r_nx = [0 for _ in range(G.number_of_nodes())]
    for node, v in ppr_nx.items():
        r_nx[node] = v
    r_est = approx_personalized_pagerank(G, starting_node, iterations=iterations, alpha=0.85)
    print(f'Approximate PPR: {r_est[:10]}')
    print(f'Real PPR: {r_nx[:10]}')
    topk_nx = np.argsort(r_nx)[-5:]
    topk_est = np.argsort(r_est)[-5:]
    print(f"Topk of nx.pagerank: {topk_nx}, Topk of our estimation {topk_est}, Size of intersection: {len(set(topk_nx).intersection(set(topk_est)))}")
for iterations in [10, G.number_of_nodes(), G.number_of_nodes()*2, G.number_of_nodes()*4, G.number_of_nodes()*100]:
    ppr_nx = nx.pagerank(G, alpha=0.1, personalization={starting_node: 1})
    r_nx = [0 for _ in range(G.number_of_nodes())]
    for node, v in ppr_nx.items():
        r_nx[node] = v
    r_est = approx_personalized_pagerank(G, starting_node, iterations=iterations, alpha=0.1)
    topk_nx = np.argsort(r_nx)[-5:]
    topk_est = np.argsort(r_est)[-5:]
    print(f"Topk of nx.pagerank: {topk_nx}, Topk of our estimation {topk_est}, Size of intersection: {len(set(topk_nx).intersection(set(topk_est)))}")
edgelist = read_edge_list('./data/edges.txt')
n = np.max(edgelist)+1
G2 = nx.Graph()
for i in range(n):
    G2.add_node(i)
for edge in edgelist:
    G2.add_edge(edge[0], edge[1])
G = G2.copy()
#CODE HERE IF YOU NEED THAT
def trusted_pagerank(G, trusted_indices, iterations=500, alpha=0.5):
    r = None
    ### YOUR CODE STARTS HERE
    ### YOUR CODE ENDS HERE
    return r
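One possible approach, sketched by analogy with `approx_personalized_pagerank` above (an assumption, not necessarily the intended solution): a random walk whose restarts jump to a uniformly random trusted node.
def trusted_pagerank(G, trusted_indices, iterations=500, alpha=0.5):
    r = np.zeros(G.number_of_nodes())
    current = random.choice(trusted_indices)
    for _ in range(iterations):
        r[current] += 1/iterations
        neighbors = list(G[current])
        if random.random() < alpha and len(neighbors) > 0:
            current = random.choice(neighbors)  # continue the walk
        else:
            current = random.choice(trusted_indices)  # restart at a trusted node
    return r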
### YOUR CODE HERE
G = email.S_dir.copy()
import numpy as np
import networkx as nx
def sigmoid(x):
    ''' Return the sigmoid function of x
    x: the input vector
    '''
    return 1 / (1 + np.exp(-x))
def pagerank_matrix(G, alpha=0.85):
    ''' Return the Personalized PageRank matrix of a graph
    Args:
        G: the input graph
        alpha: the damping factor of PageRank
    :return The nxn PageRank matrix P, whose row v is the PPR vector of node v
    '''
    adjacency = nx.adjacency_matrix(G).toarray()
    row_sums = adjacency.sum(axis=1)  # out-degrees
    row_sums[row_sums == 0] = 1  # guard against division by zero for sink nodes
    M = adjacency / row_sums[:, None]  # row-stochastic transition matrix
    N = M.shape[0]
    # Closed form of PPR: row v solves p_v = (1 - alpha) e_v + alpha p_v M,
    # so P = (1 - alpha) (I - alpha M)^(-1)
    P = (1 - alpha) * np.linalg.inv(np.eye(N) - alpha * M)
    return P
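A quick smoke test (a sketch on a built-in graph): every row of the resulting PPR matrix should be a probability distribution.
P_test = pagerank_matrix(nx.karate_club_graph(), alpha=0.85)
assert np.allclose(P_test.sum(axis=1), 1.0)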
def update(u, v, Z, C, step_size):
    '''Update the matrix Z using row-wise gradients of the loss function
    Args:
        u : the first node
        v : the second node
        Z : the embedding matrix
        C : the classification variable used in Noise Contrastive Estimation indicating whether the sample is positive or negative
        step_size: step size for gradient descent
    :return nothing, just update rows Z[v,:] and Z[u,:]
    '''
    score = (C - sigmoid(Z[v, :] @ Z[u, :])) * step_size
    # Copy the old row so both updates use the pre-update values
    zv_old = Z[v, :].copy()
    Z[v, :] += score * Z[u, :]
    Z[u, :] += score * zv_old
def verse(G, S, d, k=3, step_size=0.0025, steps=10000):
    ''' Return the sampled version of VERSE
    Args:
        G: the input Graph
        S: the PageRank similarity matrix
        d: dimension of the embedding space
        k: number of negative samples
        step_size: step size for gradient descent
        steps: number of iterations
    :return the embedding matrix nxd
    '''
    n = G.number_of_nodes()
    Z = 1/d*np.random.rand(n, d)
    ### YOUR CODE STARTS HERE
    for i in range(steps):
        v = np.random.choice(n)
        neighbors = list(G.neighbors(v))
        if len(neighbors) > 0:
            # Positive sample: a random neighbor of v
            u = np.random.choice(neighbors)
            update(u, v, Z, 1, step_size)
            # Negative samples: drawn from v's similarity row,
            # excluding v itself and its neighbors
            p = S[v, :].copy()  # copy, so we do not mutate S in place
            p[neighbors] = 0
            p[v] = 0
            p /= p.sum()
            neg_nodes = np.random.choice(n, size=k, replace=False, p=p)
            for u in neg_nodes:
                update(u, v, Z, 0, step_size)
    ### YOUR CODE ENDS HERE
    return Z
# This code runs the `verse` algorithm above on G and stores the embeddings to 'verse.npy'
P = pagerank_matrix(G)
emb = verse(G, P, 128, step_size=0.0025, steps=10_000)
np.save('verse.npy', emb)
### YOUR CODE STARTS HERE
perfsK = []
perfsS = []
for i in range(2, 8):
    perfKmeans = 0
    perfSpec = 0
    for _ in range(100):
        kmeanslbl = KMeans(n_clusters=i).fit(emb).labels_
        perfKmeans += normalized_mutual_info_score(email.communities, kmeanslbl)
        speclbl = SpectralClustering(n_clusters=i).fit(emb).labels_
        perfSpec += normalized_mutual_info_score(email.communities, speclbl)
    perfsK.append(perfKmeans/100)
    perfsS.append(perfSpec/100)
### YOUR CODE ENDS HERE
### YOUR CODE STARTS HERE
perfsB = []
for i in range(2, 8):
    perfSpec = 0
    for _ in range(10):
        kmeanslbl = KMeans(n_clusters=i).fit(emb).labels_
        speclbl = SpectralClustering(n_clusters=i).fit(emb).labels_
        perfSpec += normalized_mutual_info_score(kmeanslbl, speclbl)
    perfsB.append(perfSpec/10)
plt.bar(range(2, 8), perfsS, label='spectral vs. ground truth')
plt.bar(range(2, 8), perfsK, label='k-means vs. ground truth')
plt.bar(range(2, 8), perfsB, label='k-means vs. spectral')
plt.legend()
import pykeen
# Adjacency matrix
G = email.S_undir.copy()
A = np.array(nx.adjacency_matrix(G, weight=None).todense())  # nx.adj_matrix is deprecated
I = np.eye(A.shape[0])
A = A + I # Add self loop
# Degree matrix
### YOUR CODE HERE
# Normalized Laplacian
# Create input vectors
### TODO your code here
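# A possible completion (a sketch, assuming one-hot identity features as input):
D = np.diag(A.sum(axis=1))  # degree matrix of A (self-loops included)
# Symmetrically normalized propagation matrix D^(-1/2) A D^(-1/2)
D_inv_sqrt = np.diag(1 / np.sqrt(A.sum(axis=1)))
L = D_inv_sqrt @ A @ D_inv_sqrt
X = np.eye(A.shape[0])  # one-hot input vector per node (an assumption)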
X = torch.tensor(X, dtype=torch.float, requires_grad=True) # Indicate to pytorch that we need gradients for this variable
As = torch.tensor(A, dtype=torch.float)
L = torch.tensor(L, dtype=torch.float) # We don't need to learn this so no grad required.
# Define a GCN
class GCNLayer(nn.Module):
    def __init__(self, L, input_features, output_features, activation=F.relu):
        """
        Inputs:
            L: The "Laplacian" of the graph, as defined above
            input_features: The size of the input embedding
            output_features: The size of the output embedding
            activation: Activation function sigma
        """
        super().__init__()
        ### TODO Your code here
        ### TODO Your code here
    def forward(self, X):
        ### TODO Your code here
        ### TODO Your code here
        return X
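One possible layer implementation, as a sketch of the propagation rule sigma(L X W); not necessarily the intended solution.
class GCNLayer(nn.Module):
    def __init__(self, L, input_features, output_features, activation=F.relu):
        super().__init__()
        self.L = L  # fixed propagation matrix, not a learned parameter
        self.linear = nn.Linear(input_features, output_features)
        self.activation = activation
    def forward(self, X):
        X = self.linear(self.L @ X)  # aggregate over neighbors, then transform
        if self.activation is not None:
            X = self.activation(X)
        return X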
def modularity_matrix(A):
    B = None
    ### YOUR CODE HERE
    ### YOUR CODE HERE
    return torch.tensor(B, dtype=torch.float)
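A sketch of one standard choice, Newman's modularity matrix B = A - d d^T / (2m), where d is the degree vector and 2m the total degree:
def modularity_matrix(A):
    d = A.sum(axis=1)  # degree vector
    B = A - np.outer(d, d) / d.sum()  # d.sum() equals 2m
    return torch.tensor(B, dtype=torch.float)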
def modularity_loss(C, B, l=0.01):
    ''' Return the modularity loss
    Args:
        C: the node-community affinity matrix
        B: the modularity matrix
        l: the regularization factor
    :return the modularity loss as described at the beginning of the exercise
    '''
    loss = 0
    ### YOUR CODE HERE
    ### YOUR CODE HERE
    return loss
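Since the exact formula is given earlier in the exercise, the following is only a hedged sketch of a DMoN-style objective: the negative (unnormalized) modularity trace term plus a collapse regularizer weighted by l; the intended loss may differ.
def modularity_loss(C, B, l=0.01):
    n, k = C.shape
    loss = -torch.trace(C.T @ B @ C)  # maximize modularity = minimize its negative
    # Collapse regularizer: discourages assigning every node to one community
    loss = loss + l * (torch.norm(C.sum(dim=0)) * (k ** 0.5) / n - 1)
    return loss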
### Compute labels from communities
labels = None
### YOUR CODE HERE
### YOUR CODE HERE
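A sketch, assuming the ground-truth communities exposed by the email utility (already used for the NMI scores above) serve as node labels:
labels = email.communities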
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
### Encode the labels with one-hot encoding
def to_categorical(y):
    """ 1-hot encodes a tensor """
    num_classes = np.unique(y).size
    return np.eye(num_classes, dtype='uint8')[y]
def encode_label(labels):
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    labels = to_categorical(labels)
    return labels, label_encoder.classes_
y, classes = encode_label(labels)
y = torch.tensor(y)
# Define convolutional network
in_features, out_features = X.shape[1], classes.size # output features as many as the number of classes
hidden_dim = 16
# Stack two GCN layers as our model
# nn.Sequential is an implicit nn.Module, which uses the layers in given order as the forward pass
gcn = nn.Sequential(
    GCNLayer(L, in_features, hidden_dim),
    GCNLayer(L, hidden_dim, out_features, None),
    nn.Softmax(dim=1)
)
gcn.to(device)
l = 100
epochs = 2000
def train_model(model, optimizer, X, B, epochs=100, print_every=10):
    for epoch in range(epochs+1):
        y_pred = model(X)
        loss = modularity_loss(y_pred, B, l=l)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch % print_every == 0:
            print(f'Epoch {epoch:2d}, loss={loss.item():.5f}')
B = modularity_matrix(A)
optimizer = torch.optim.Adam(gcn.parameters(), lr=0.01)
train_model(gcn, optimizer, X, B, epochs=epochs, print_every=100)
from sklearn.metrics.cluster import normalized_mutual_info_score
def plot_graph(G, y_pred):
    plt.figure(1, figsize=(15, 5))
    pos = nx.spring_layout(G)
    ec = nx.draw_networkx_edges(G, pos, alpha=0.2)
    nc = nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_color=y_pred, node_size=100, cmap=plt.cm.jet)
    plt.axis('off')
    plt.show()
### YOUR CODE STARTS HERE
### YOUR CODE ENDS HERE
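A sketch of one possible evaluation: read off each node's community as the argmax of its soft assignment, score the result against the ground-truth labels with NMI, and plot it.
y_pred = gcn(X).argmax(dim=1).detach().cpu().numpy()
print("NMI:", normalized_mutual_info_score(labels, y_pred))
plot_graph(G, y_pred)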