import numpy as np
import matplotlib.pyplot as plt
# target function
def f(x):
    return np.sin(x) * np.exp(x / 5)
xx = np.linspace(0, 10, 100)
yy = f(xx)
plt.plot(xx, yy)
# gather some data points which we will fit to
x = np.random.random(10) * 10  # 10 random sample locations in [0, 10)
y = f(x)
plt.plot(xx, yy, ':')
plt.plot(x, y, 'ro')
# define a Gaussian kernel
sig = 0.75  # length scale
def kernel(x1, x2):
    return np.exp(-(x1 - x2)**2 / (2 * sig**2))
N = len(x)
K = np.zeros((N,N))
# https://mccormickml.com/2014/02/26/kernel-regression/
# Now apply the formulas from the lecture to fill in the kernel matrix and
# compute fitting coefficients. Use the function numpy.linalg.lstsq() to solve the linear problem K@c=y
# (look up its documentation, and don't forget that it returns 4 things, but you only need the solution vector)
for i in range(N):
    for j in range(N):
        # fill the kernel matrix over all pairs of data points; entries are
        # near 1 for nearby points and decay towards 0 with distance
        K[i, j] = kernel(x[i], x[j])
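# The double loop can also be written as a single broadcasted call, since
# kernel() works elementwise on arrays (a sketch; the result is identical):
K_vec = kernel(x[:, None], x[None, :])
assert np.allclose(K, K_vec)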
# lstsq solves a @ x = b; for us that is K @ c = y
c = np.linalg.lstsq(K, y, rcond=None)  # rcond=None avoids the FutureWarning
cpred = c[0]  # keep just the solution vector out of the 4 returned items
ypred = np.zeros(len(xx))  # predictions on the dense grid
for i in range(len(xx)):  # loop over all x-values we want predictions at
    summing = 0  # accumulates the kernel expansion for this x-value
    for j in range(N):  # loop over the basis functions (one per data point)
        summing += kernel(xx[i], x[j]) * cpred[j]
    ypred[i] = summing
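# Broadcasting collapses this double loop into one matrix product
# (a sketch; it reproduces ypred exactly):
ypred_vec = kernel(xx[:, None], x[None, :]) @ cpred
assert np.allclose(ypred, ypred_vec)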
lam = 1  # strength of the regulariser
# Same fit, now with regularisation: solve (K + lam*I) @ c = y
c_reg = np.linalg.lstsq(K + lam * np.identity(N), y, rcond=None)
cpred_reg = c_reg[0]
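# Since K + lam*I is square and, for lam > 0, well conditioned, a plain
# linear solve gives the same coefficients (a sketch of the alternative):
cpred_reg_solve = np.linalg.solve(K + lam * np.identity(N), y)
assert np.allclose(cpred_reg, cpred_reg_solve)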
ypred_reg = np.zeros(len(xx))
for i in range(len(xx)):
    summing_reg = 0
    for j in range(N):
        summing_reg += kernel(xx[i], x[j]) * cpred_reg[j]
    ypred_reg[i] = summing_reg
# now use your coefficients to predict the function on the xx array
# you should get something like the picture below:
plt.plot(xx, yy, ':')
plt.plot(x, y, 'ro')
plt.plot(xx, ypred, 'b')
plt.plot(xx, ypred_reg, 'c')
plt.title("lam = 1")
import pandas
sol = pandas.read_csv("curated-solubility-dataset.csv")
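# The loops below use X (an 11 x n feature matrix), Y (targets), Y2 (training
# targets) and sigma (per-feature length scales), none of which are built
# anywhere above. A minimal sketch, assuming 11 numeric descriptor columns of
# the curated solubility dataset (this column choice is an assumption), the
# 'Solubility' column as target, and 9982 rows total:
props = ['MolWt', 'MolLogP', 'MolMR', 'HeavyAtomCount', 'NumHAcceptors',
         'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
         'NumValenceElectrons', 'NumAromaticRings', 'RingCount']
X = sol[props].to_numpy().T          # shape (11, 9982)
Y = sol['Solubility'].to_numpy()     # log-solubility targets
Y2 = Y[:8000]                        # first 8000 samples for training
sigma = X[:, :8000].std(axis=1)      # one length scale per feature (an assumption)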
K2 = np.zeros((8000, 8000))  # kernel matrix over the 8000 training samples
lam = 0.3  # strength of the regulariser
# split the kernel so per-feature exponents can be summed before exponentiating
def kernelinner(x1, x2, sig):
    return (x1 - x2)**2 / (2 * sig**2)
def kernelouter(inner):
    return np.exp(-inner)
for i in range(8000):
    for j in range(8000):
        multisum = 0  # sum of per-feature exponents for this pair of samples
        for k in range(11):
            multisum += kernelinner(X[k, i], X[k, j], sigma[k])
        # a product of per-feature Gaussians is the exp of the summed exponents
        K2[i, j] = kernelouter(multisum)
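# The pure-Python triple loop above needs ~7e8 kernel evaluations and is very
# slow. A vectorized sketch of the same matrix (assuming X is (11, n) and
# sigma is (11,)):
Xs = X / sigma[:, None]               # scale each feature by its length scale
sq = np.sum(Xs[:, :8000]**2, axis=0)  # squared norms of the training columns
D = sq[:, None] + sq[None, :] - 2 * Xs[:, :8000].T @ Xs[:, :8000]
K2_vec = np.exp(-0.5 * np.maximum(D, 0))  # matches the K2 built by the loop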
c2 = np.linalg.lstsq(K2 + lam * np.identity(8000), Y2, rcond=None)
c2_pred = c2[0]
Ypred = np.zeros(1982)  # predictions for the 1982 held-out test samples
index = 0
for i in range(8000, 9982):  # loop over the test samples
    summingkern = 0  # accumulates the kernel expansion for test sample i
    for j in range(8000):  # loop over the training set
        summing2 = 0
        for k in range(11):
            summing2 += kernelinner(X[k, i], X[k, j], sigma[k])
        # note: do not name this variable `kernel`, or it shadows the
        # kernel() function defined earlier
        summingkern += kernelouter(summing2) * c2_pred[j]
    Ypred[index] = summingkern
    index += 1
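# Vectorized sketch of the same predictions, reusing Xs and sq from the
# earlier sketch: cross kernel between test and training columns, then one
# matrix product with the coefficients:
sq_test = np.sum(Xs[:, 8000:]**2, axis=0)
D_test = sq_test[:, None] + sq[None, :] - 2 * Xs[:, 8000:].T @ Xs[:, :8000]
Ypred_vec = np.exp(-0.5 * np.maximum(D_test, 0)) @ c2_pred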
plt.scatter(Y[8000:], Ypred)
plt.plot([-50, 50], [-50, 50], 'k--')  # parity line
plt.xlim(-12, 6)
plt.ylim(-12, 6)
plt.ylabel('predicted')
plt.xlabel('target')
plt.gca().set_aspect('equal')
# root-mean-square error on the held-out test set
rmse = np.sqrt(np.sum((Y[8000:] - Ypred)**2) / len(Ypred))
print("test RMSE:", rmse)