import numpy as np
from matplotlib.pyplot import *
# target function
def f(x):
return np.sin(x)*np.exp(x/5)
xx = np.linspace(0,10, 100)
yy = f(xx)
plot(xx, yy)
# gather some data points which we will fit to
x = np.random.random(10)*10
y = f(x)
plot(xx, yy, ':')
plot(x, y, 'ro')
# define a Gaussian kernel
sig = 1.0 # length scale
def kernel(x1, x2):
return np.exp(-(x1-x2)**2/(2*sig**2))
N = len(x)
K = np.zeros((N,N))
# Now apply the formulas from the lecture to fill in the kernel matrix and
# compute fitting coefficients. Use the function numpy.linalg.lstsq() to solve the linear problem K@c=y
# (look up its documentation, and don't forget that it returns 4 things, but you only need the solution vector)
lam = 0.1# strength of regulariser
# now use your coefficients to predict the function on the xx array
# you should get something like the picture below:
plot(xx, yy, ':')
plot(x, y, 'ro')
plot(xx, ypred, 'k')
import pandas
sol = pandas.read_csv("curated-solubility-dataset.csv")
sol
sol.columns
sol['Solubility'][0] # here is how to access a given property, here the solubility of the first molecule
hist(sol['Solubility'], bins=50)
# we can plot the solubility against any given property. Clearly there are relationships, but the solubility is not determined well
# by any single property. Note how it's better to work with the log of the molecular weight (MolWt)
scatter(np.log(sol['MolWt']), sol['Solubility'], s=2)
scatter((sol['NumHAcceptors']+sol['NumHDonors'])/sol['MolWt'], sol['Solubility'], s=2)
Y = np.array(sol['Solubility']);
proplist = ['HeavyAtomCount',
'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings',
'NumAliphaticRings', 'RingCount']
X = np.array([list(sol[prop]/sol['MolWt']) for prop in proplist]) # Many properties are extensive, so we divide by the molecular weight
X = np.insert(X, 0, list(np.log(sol['MolWt'])), axis=0) # add the log MolWt as well
X.shape # we have 11 properties
c,_,_,_ = np.linalg.lstsq(X.T, Y) # here we fit a LINEAR MODEL , solving the equation X.T @ c = Y
c # these are the resulting fitting coefficients
# now we make a scatter plot of target solubility versus predicted solubility
scatter(Y, X.T @ c)
plot([-50,50], [-50,50], 'k--')
xlim(-12,6)
ylim(-12,6)
ylabel('predicted')
xlabel('target')
gca().set_aspect('equal')
# We can calculate the mean squared error of our fit. The solubility values are such that a method with prediction error > 1 is not that useful.
np.sqrt(sum((Y- X.T @ c)**2)/len(Y))