import numpy as np
import pandas as pd
import scipy.linalg as linalg
import rdkit.Chem as Chem
from rdkit.Chem import AllChem
import torch
import matplotlib.pyplot as plt
def generate_graph(smiles):
    def get_atom_hash(atomic_number):
        """A helper function to quickly encode atomic number into a one-hot vector"""
        atomic_number_list = [6, 7, 8, 9, 15, 16, 17, 35, 53]
        if atomic_number in atomic_number_list:
            return atomic_number_list.index(atomic_number)
        else:
            return len(atomic_number_list)

    def encode_atom(atom):
        """Generates the vector of features for an atom in a molecule"""
        atom_matrix = np.zeros(10)
        atom_matrix[get_atom_hash(atom.GetAtomicNum())] = 1
        return atom_matrix

    m = AllChem.MolFromSmiles(smiles)
    m = AllChem.AddHs(m)
    AllChem.ComputeGasteigerCharges(m)
    atom_matrix = np.array(list(map(encode_atom, m.GetAtoms())))
    adj_matrix = AllChem.GetAdjacencyMatrix(m) + np.identity(m.GetNumAtoms())  # augmented adjacency matrix (A + I)
    return atom_matrix, adj_matrix, m.GetNumAtoms()
esol_data = pd.read_csv('delaney-processed.csv', sep=',')
amigdalin = esol_data['smiles'][0]
print(amigdalin)
print(generate_graph(amigdalin)[2])
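# Sanity check (illustrative, throwaway names): the feature matrix should have
# one row per atom and 10 columns (9 listed elements + 1 "other" slot, which
# also catches the added hydrogens), and the augmented adjacency matrix should
# be square with the same number of rows.
fet_check, adj_check, n_check = generate_graph(amigdalin)
print(fet_check.shape)  # (n_check, 10)
print(adj_check.shape)  # (n_check, n_check)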
n_samples = 10
feature_matrices = []
adj_matrices = []
molecule_membership = []
for i in range(n_samples):
    feature_matrix, adj_matrix, n_atoms = generate_graph(esol_data['smiles'][i])
    feature_matrices.append(feature_matrix)
    adj_matrices.append(adj_matrix)
    molecule_membership = np.concatenate((molecule_membership, [i] * n_atoms))
big_feature_matrix = np.concatenate(feature_matrices, axis = 0)
print(np.shape(big_feature_matrix))
big_adj_matrix = linalg.block_diag(*adj_matrices)
print(np.shape(big_adj_matrix))
masks = torch.tensor(np.array([np.where(molecule_membership == i, 1, 0) for i in range(n_samples)])).double()
print(np.shape(masks))
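# Quick check (illustrative): each mask row selects the atoms of one molecule,
# so its row sum should equal that molecule's atom count; masks @ features then
# pools node features per molecule.
print(masks.sum(dim=1))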
from torch.nn.parameter import Parameter
### Optional Preprocessing Here ###
degrees = np.sum(big_adj_matrix, axis=1)
D_mat = np.diag(degrees ** -0.5)  # D^{-1/2}; every augmented degree is >= 1, so no division by zero
# Symmetric normalization D^{-1/2} (A + I) D^{-1/2} (the "normalized Laplacian" used below)
laplacian_mat = D_mat @ big_adj_matrix @ D_mat
normalized_laplacian = torch.tensor(laplacian_mat).double()
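# Optional check (illustrative): the symmetrically normalized matrix should be
# symmetric, and its entries should lie in [0, 1] since the augmented adjacency
# has 0/1 entries and every augmented degree is at least 1.
print(np.allclose(laplacian_mat, laplacian_mat.T))
print(laplacian_mat.min() >= 0, laplacian_mat.max() <= 1)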
class SolNet(torch.nn.Module):
    def __init__(self, laplacian, molecule_masks, input_size=10, internal_size=10, output_size=1,
                 num_conv_layers=3, activation_function=torch.nn.functional.relu):
        # num_conv_layers is accepted for the exercise, but the three layers are hard-coded below
        super().__init__()
        self.W0 = Parameter(torch.rand(input_size, internal_size), requires_grad=True)
        self.W1 = Parameter(torch.rand(internal_size, internal_size), requires_grad=True)
        self.W2 = Parameter(torch.rand(internal_size, output_size), requires_grad=True)
        self.laplacian = laplacian
        self.molecule_masks = molecule_masks
        self.activation_function = activation_function
        self.double()

    def forward(self, x):
        """Computes the prediction of the neural network"""
        ### Your Code Here ###
        x = self.activation_function(self.laplacian @ x @ self.W0)
        x = self.activation_function(self.laplacian @ x @ self.W1)
        x = self.laplacian @ x @ self.W2  # no activation on the output layer, so predictions can be negative
        y = self.molecule_masks @ x  # pool node outputs into one prediction per molecule
        return y
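# Shape check (illustrative, with a throwaway instance): a forward pass should
# map the stacked node features (total_atoms, 10) to one prediction per
# molecule, i.e. shape (n_samples, 1).
g_check = SolNet(normalized_laplacian, masks)
print(g_check(torch.tensor(big_feature_matrix)).shape)  # torch.Size([10, 1])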
## Create Targets
targets = torch.tensor(esol_data['measured log solubility in mols per litre'][:n_samples].values).double()
# Only train on 2/3 of points
labelled_indices = np.random.choice(np.arange(n_samples), int(2*n_samples/3), replace = False)
unlabelled_indices = np.setdiff1d(np.arange(n_samples), labelled_indices)
### Train your model here (You can use a very similar approach to the one used in previous practicals) ###
g = SolNet(normalized_laplacian, masks)
x_train = esol_data.iloc[labelled_indices]
y_train = targets[labelled_indices]
x_test = esol_data.iloc[unlabelled_indices]
y_test = targets[unlabelled_indices]
learning_rate = .005 #### YOUR CODE HERE ####
num_epochs = 500
opt = torch.optim.SGD(g.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()
all_train_loss = []
all_test_loss = []
features = torch.tensor(big_feature_matrix)
for epoch in range(num_epochs):
    # Training step on the labelled molecules
    pred = g(features)
    loss = loss_fn(pred.squeeze()[labelled_indices], y_train)
    opt.zero_grad()
    loss.backward()
    opt.step()
    all_train_loss.append(loss.item())
    # Evaluation on the held-out molecules
    with torch.no_grad():
        pred = g(features)
        test_loss = loss_fn(pred.squeeze()[unlabelled_indices], y_test)
        all_test_loss.append(test_loss.item())
plt.plot(all_train_loss, label="Train")
plt.plot(all_test_loss, label="Test")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.show()
class MPNet(torch.nn.Module):
    def __init__(self, n_features, activation_function=torch.nn.functional.relu):
        super().__init__()
        self.HL1 = torch.nn.Linear(n_features, 10)
        self.HL2 = torch.nn.Linear(10, 1)
        self.activation_function = activation_function
        self.float()

    def forward(self, feats, adj):
        """Computes the prediction of the neural network"""
        # Message passing: multiplying by the augmented adjacency matrix gives
        # each atom the sum of its own features and its neighbours' features
        aggregated_feats = adj @ feats
        y_HL1 = self.activation_function(self.HL1(aggregated_feats))
        y_HL2 = self.HL2(y_HL1)  # no activation on the output, so predictions can be negative
        # Pool the atom-level outputs into a single molecule-level prediction
        y = torch.sum(y_HL2, dim=0)
        return y
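# Quick check (illustrative, throwaway names): a forward pass on a single
# molecule's graph should return one scalar prediction regardless of how many
# atoms the molecule has.
fet_c, adj_c, _ = generate_graph(esol_data['smiles'][0])
m_check = MPNet(10)
print(m_check(torch.tensor(fet_c).float(), torch.tensor(adj_c).float()).shape)  # torch.Size([1])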
### Train your model here (You can use a very similar approach to the one used in previous practicals) ###
# Note that training here can be slow if your module is not sufficiently vectorized. It might
# be helpful to reduce n_samples to make sure your architecture is working properly before trying to apply
# to larger data.
m = MPNet(10)
x_train = esol_data.iloc[labelled_indices]
y_train = targets[labelled_indices]
x_test = esol_data.iloc[unlabelled_indices]
y_test = targets[unlabelled_indices]
learning_rate = .005 #### YOUR CODE HERE ####
num_epochs = 500
opt = torch.optim.SGD(m.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()
all_train_loss = []
all_test_loss = []
# Precompute the test graphs once so evaluation inside the loop stays cheap
test_graphs = [generate_graph(s) for s in x_test["smiles"]]
for epoch in range(num_epochs):
    # Training step on one randomly chosen labelled molecule
    random_index = np.random.randint(len(x_train))
    random_smiles = x_train.reset_index().iloc[random_index]["smiles"]
    fet, adj, _ = generate_graph(random_smiles)
    pred = m(torch.tensor(fet).float(), torch.tensor(adj).float())
    loss = loss_fn(pred.squeeze(), y_train[random_index].float())
    opt.zero_grad()
    loss.backward()
    opt.step()
    all_train_loss.append(loss.item())
    # Evaluation of the message-passing model on the held-out molecules
    epoch_test_loss = 0
    with torch.no_grad():
        for j, (fet, adj, _) in enumerate(test_graphs):
            pred = m(torch.tensor(fet).float(), torch.tensor(adj).float())
            epoch_test_loss += loss_fn(pred.squeeze(), y_test[j].float()).item()
    all_test_loss.append(epoch_test_loss / len(test_graphs))
plt.plot(all_train_loss, label="Train")
plt.plot(all_test_loss, label="Test")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.show()