import numpy as np
import pandas as pd
import scipy.linalg as linalg
import rdkit.Chem as Chem
from rdkit.Chem import AllChem
import torch
import matplotlib.pyplot as plt
def generate_graph(smiles):
    def get_atom_hash(atomic_number):
        """A helper function to quickly encode atomic number into a one-hot vector"""
        atomic_number_list = [6, 7, 8, 9, 15, 16, 17, 35, 53]
        if atomic_number in atomic_number_list:
            return atomic_number_list.index(atomic_number)
        else:
            return len(atomic_number_list)

    def encode_atom(atom):
        """Generates the vector of features for an atom in a molecule"""
        atom_matrix = np.zeros(10)
        atom_matrix[get_atom_hash(atom.GetAtomicNum())] = 1
        return atom_matrix

    m = AllChem.MolFromSmiles(smiles)
    m = AllChem.AddHs(m)
    AllChem.ComputeGasteigerCharges(m)
    atom_matrix = np.array(list(map(encode_atom, m.GetAtoms())))
    adj_matrix = AllChem.GetAdjacencyMatrix(m) + np.identity(m.GetNumAtoms())  # augmented adjacency matrix (A + I)
    return atom_matrix, adj_matrix, m.GetNumAtoms()
esol_data = pd.read_csv('delaney-processed.csv', sep=',')
amigdalin = esol_data['smiles'][0]
print(amigdalin)
print(generate_graph(amigdalin)[2])
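# Sanity check (illustrative, throwaway names): the feature matrix should have
# one row per atom and 10 columns (9 listed elements + 1 "other" slot, which
# also catches the added hydrogens), and the augmented adjacency matrix should
# be square with the same number of rows.
fet_check, adj_check, n_check = generate_graph(amigdalin)
print(fet_check.shape)  # (n_check, 10)
print(adj_check.shape)  # (n_check, n_check)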
n_samples = 10
feature_matrices = []
adj_matrices = []
molecule_membership = []
for i in range(n_samples):
    feature_matrix, adj_matrix, n_atoms = generate_graph(esol_data['smiles'][i])
    feature_matrices.append(feature_matrix)
    adj_matrices.append(adj_matrix)
    molecule_membership = np.concatenate((molecule_membership, [i] * n_atoms))
big_feature_matrix = np.concatenate(feature_matrices, axis = 0)
print(np.shape(big_feature_matrix))
big_adj_matrix = linalg.block_diag(*adj_matrices)
print(np.shape(big_adj_matrix))
masks = torch.tensor(np.array([np.where(molecule_membership == i, 1, 0) for i in range(n_samples)])).double()
print(np.shape(masks))
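# Quick check (illustrative): each mask row selects the atoms of one molecule,
# so its row sum should equal that molecule's atom count; masks @ features then
# pools node features per molecule.
print(masks.sum(dim=1))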
from torch.nn.parameter import Parameter
### Optional Preprocessing Here ###
degrees = np.sum(big_adj_matrix, axis=1)
D_mat = np.diag(degrees ** -0.5)  # D^{-1/2}; every augmented degree is >= 1, so no division by zero
# Symmetric normalization D^{-1/2} (A + I) D^{-1/2} (the "normalized Laplacian" used below)
laplacian_mat = D_mat @ big_adj_matrix @ D_mat
normalized_laplacian = torch.tensor(laplacian_mat).double()
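# Optional check (illustrative): the symmetrically normalized matrix should be
# symmetric, and its entries should lie in [0, 1] since the augmented adjacency
# has 0/1 entries and every augmented degree is at least 1.
print(np.allclose(laplacian_mat, laplacian_mat.T))
print(laplacian_mat.min() >= 0, laplacian_mat.max() <= 1)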
class SolNet(torch.nn.Module):
    def __init__(self, laplacian, molecule_masks, input_size=10, internal_size=10, output_size=1,
                 num_conv_layers=3, activation_function=torch.nn.functional.relu):
        # num_conv_layers is accepted for the exercise, but the three layers are hard-coded below
        super().__init__()
        self.W0 = Parameter(torch.rand(input_size, internal_size), requires_grad=True)
        self.W1 = Parameter(torch.rand(internal_size, internal_size), requires_grad=True)
        self.W2 = Parameter(torch.rand(internal_size, output_size), requires_grad=True)
        self.laplacian = laplacian
        self.molecule_masks = molecule_masks
        self.activation_function = activation_function
        self.double()

    def forward(self, x):
        """Computes the prediction of the neural network"""
        ### Your Code Here ###
        x = self.activation_function(self.laplacian @ x @ self.W0)
        x = self.activation_function(self.laplacian @ x @ self.W1)
        x = self.laplacian @ x @ self.W2  # no activation on the output layer, so predictions can be negative
        y = self.molecule_masks @ x  # pool node outputs into one prediction per molecule
        return y
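# Shape check (illustrative, with a throwaway instance): a forward pass should
# map the stacked node features (total_atoms, 10) to one prediction per
# molecule, i.e. shape (n_samples, 1).
g_check = SolNet(normalized_laplacian, masks)
print(g_check(torch.tensor(big_feature_matrix)).shape)  # torch.Size([10, 1])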
## Create Targets
targets = torch.tensor(esol_data['measured log solubility in mols per litre'][:n_samples].values).double()
# Only train on 2/3 of points
labelled_indices = np.random.choice(np.arange(n_samples), int(2*n_samples/3), replace = False)
unlabelled_indices = np.setdiff1d(np.arange(n_samples), labelled_indices)
### Train your model here (You can use a very similar approach to the one used in previous practicals) ###
g = SolNet(normalized_laplacian, masks)
x_train = esol_data.iloc[labelled_indices]
y_train = targets[labelled_indices]
x_test = esol_data.iloc[unlabelled_indices]
y_test = targets[unlabelled_indices]
learning_rate = .005 #### YOUR CODE HERE ####
num_epochs = 500
opt = torch.optim.SGD(g.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()
all_train_loss = []
all_test_loss = []
features = torch.tensor(big_feature_matrix)
for epoch in range(num_epochs):
    # Training step on the labelled molecules
    pred = g(features)
    loss = loss_fn(pred.squeeze()[labelled_indices], y_train)
    opt.zero_grad()
    loss.backward()
    opt.step()
    all_train_loss.append(loss.item())
    # Evaluation on the held-out molecules
    with torch.no_grad():
        pred = g(features)
        test_loss = loss_fn(pred.squeeze()[unlabelled_indices], y_test)
        all_test_loss.append(test_loss.item())
plt.plot(all_train_loss, label="Train")
plt.plot(all_test_loss, label="Test")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.show()
class MPNet(torch.nn.Module):
    def __init__(self, n_features, activation_function=torch.nn.functional.relu):
        super().__init__()
        self.HL1 = torch.nn.Linear(n_features, 10)
        self.HL2 = torch.nn.Linear(10, 1)
        self.activation_function = activation_function
        self.float()

    def forward(self, feats, adj):
        """Computes the prediction of the neural network"""
        # Message passing: multiplying by the augmented adjacency matrix gives
        # each atom the sum of its own features and its neighbours' features
        aggregated_feats = adj @ feats
        y_HL1 = self.activation_function(self.HL1(aggregated_feats))
        y_HL2 = self.HL2(y_HL1)  # no activation on the output, so predictions can be negative
        # Pool the atom-level outputs into a single molecule-level prediction
        y = torch.sum(y_HL2, dim=0)
        return y
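# Quick check (illustrative, throwaway names): a forward pass on a single
# molecule's graph should return one scalar prediction regardless of how many
# atoms the molecule has.
fet_c, adj_c, _ = generate_graph(esol_data['smiles'][0])
m_check = MPNet(10)
print(m_check(torch.tensor(fet_c).float(), torch.tensor(adj_c).float()).shape)  # torch.Size([1])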
### Train your model here (You can use a very similar approach to the one used in previous practicals) ###
# Note that training here can be slow if your module is not sufficiently vectorized. It might
# be helpful to reduce n_samples to make sure your architecture is working properly before trying to apply
# to larger data.
m = MPNet(10)
x_train = esol_data.iloc[labelled_indices]
y_train = targets[labelled_indices]
x_test = esol_data.iloc[unlabelled_indices]
y_test = targets[unlabelled_indices]
learning_rate = .005 #### YOUR CODE HERE ####
num_epochs = 500
opt = torch.optim.SGD(m.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()
all_train_loss = []
all_test_loss = []
# Precompute the test graphs once so evaluation inside the loop stays cheap
test_graphs = [generate_graph(s) for s in x_test["smiles"]]
for epoch in range(num_epochs):
    # Training step on one randomly chosen labelled molecule
    random_index = np.random.randint(len(x_train))
    random_smiles = x_train.reset_index().iloc[random_index]["smiles"]
    fet, adj, _ = generate_graph(random_smiles)
    pred = m(torch.tensor(fet).float(), torch.tensor(adj).float())
    loss = loss_fn(pred.squeeze(), y_train[random_index].float())
    opt.zero_grad()
    loss.backward()
    opt.step()
    all_train_loss.append(loss.item())
    # Evaluation of the message-passing model on the held-out molecules
    epoch_test_loss = 0
    with torch.no_grad():
        for j, (fet, adj, _) in enumerate(test_graphs):
            pred = m(torch.tensor(fet).float(), torch.tensor(adj).float())
            epoch_test_loss += loss_fn(pred.squeeze(), y_test[j].float()).item()
    all_test_loss.append(epoch_test_loss / len(test_graphs))
plt.plot(all_train_loss, label="Train")
plt.plot(all_test_loss, label="Test")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.show()