4G5 molecular modelling student computer projects

import numpy as np from matplotlib.pyplot import * # target function def f(x): return np.sin(x)*np.exp(x/5) xx = np.linspace(0,10, 100) yy = f(xx) plot(xx, yy)

# gather some data points which we will fit to x = np.random.random(10)*10 y = f(x) plot(xx, yy, ':') plot(x, y, 'ro')

# define a Gaussian kernel sig = 1.0 # length scale def kernel(x1, x2): return np.exp(-(x1-x2)**2/(2*sig**2)) N = len(x) K = np.zeros((N,N)) # Now apply the formulas from the lecture to fill in the kernel matrix and # compute fitting coefficients. Use the function numpy.linalg.lstsq() to solve the linear problem K@c=y # (look up its documentation, and don't forget that it returns 4 things, but you only need the solution vector) lam = 0.1# strength of regulariser # now use your coefficients to predict the function on the xx array

# you should get something like the picture below: plot(xx, yy, ':') plot(x, y, 'ro') plot(xx, ypred, 'k')

import pandas

sol = pandas.read_csv("curated-solubility-dataset.csv")

sol

sol.columns

sol['Solubility'][0] # here is how to access a given property, here the solubility of the first molecule

hist(sol['Solubility'], bins=50)

# we can plot the solubility against any given property. Clearly there are relationships, but the solubility is not determined well # by any single property. Note how it's better to work with the log of the molecular weight (MolWt) scatter(np.log(sol['MolWt']), sol['Solubility'], s=2)

scatter((sol['NumHAcceptors']+sol['NumHDonors'])/sol['MolWt'], sol['Solubility'], s=2)

Y = np.array(sol['Solubility']); proplist = ['HeavyAtomCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings', 'NumAliphaticRings', 'RingCount'] X = np.array([list(sol[prop]/sol['MolWt']) for prop in proplist]) # Many properties are extensive, so we divide by the molecular weight X = np.insert(X, 0, list(np.log(sol['MolWt'])), axis=0) # add the log MolWt as well

X.shape # we have 11 properties

c,_,_,_ = np.linalg.lstsq(X.T, Y) # here we fit a LINEAR MODEL , solving the equation X.T @ c = Y

c # these are the resulting fitting coefficients

# now we make a scatter plot of target solubility versus predicted solubility scatter(Y, X.T @ c) plot([-50,50], [-50,50], 'k--') xlim(-12,6) ylim(-12,6) ylabel('predicted') xlabel('target') gca().set_aspect('equal')

# We can calculate the mean squared error of our fit. The solubility values are such that a method with prediction error > 1 is not that useful. np.sqrt(sum((Y- X.T @ c)**2)/len(Y))