# To embed plots in the notebooks
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np # numpy library
import scipy.linalg as lng # linear algebra from scipy library
from sklearn import preprocessing as preproc # load the preprocessing module
# seaborn can be used to "prettify" default matplotlib plots by importing and setting as default
import seaborn as sns
sns.set() # Set seaborn as default
prostatePath = 'Prostate.txt'
T = np.loadtxt(prostatePath, delimiter = ' ', skiprows = 1, usecols=[1,2,3,4,5,6,7,8,9])
y = T[:, 0]
X = T[:,1:]
[n, p] = np.shape(X)
# Our chosen normalization centers the variables of the data matrix and scales them to unit length.
# We can use sklearn's Normalizer for this, but we must transpose the matrix so it acts on the variables instead of the samples.
X_pre = X - np.mean(X,axis=0)
y_pre = y - np.mean(y,axis=0)
normalizer = preproc.Normalizer().fit(X_pre.T)
X_pre = normalizer.transform(X_pre.T).T
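# Quick sanity check (optional, not part of the original preprocessing): every column
# (variable) of X_pre should now have unit length
print(np.linalg.norm(X_pre, axis=0))  # expect a vector of ones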
def ridgeMulti(X, _lambda, p, y):
    """Closed-form ridge estimate: beta = (X'X + lambda*I)^(-1) X'y."""
    inner_prod = np.linalg.inv(np.matmul(X.T, X) + _lambda * np.eye(p, p))
    outer_prod = np.matmul(X.T, y)
    betas = np.matmul(inner_prod, outer_prod)
    return betas
k = 100  # number of lambda values to try
lambdas = np.logspace(-4, 4, k)
betas = np.zeros((p,k))
for i in range(k):
    betas[:, i] = ridgeMulti(X_pre, lambdas[i], p, y_pre)  # fit on the preprocessed data
plt.figure()
plt.semilogx(lambdas, betas.T )
plt.xlabel("Lambdas")
plt.ylabel("Betas")
plt.suptitle("Regularized beta estimates", fontsize=20)
plt.show()
def centerData(data):
    mu = np.mean(data, axis=0)
    data = data - mu
    return data, mu
K = 10
#lambdas = np.logspace(-4, 4, k)
# Create a vector that contains equal amounts of the numbers 1 to K, one fold label per sample
# (if K does not divide n, the last n - K*int(n/K) samples get no fold label)
x = list(range(1, K + 1))
array = np.repeat(x, int(n / K))
pp = np.random.permutation(array)
p1 = pp == 1
print(pp[p1])
[1 1 1 1 1 1 1 1 1]
K = 10
lambdas = np.logspace(-4, 4, k)
MSE = np.zeros((K, k))
# Create a vector that contains equal amounts of the numbers 1 to K, one fold label per sample
x = list(range(1, K + 1))
array = np.repeat(x, int(n / K))
#Permute that vector.
permute = np.random.permutation(array)
# For each chunk of data: run ridge for each lambda and calculate the mean squared error
# (a sketch of one possible implementation follows below), then average the mean squared
# error across chunks for the same lambda values to find the optimal lambda.
# Remember the exact solution depends on a random indexing, so results may vary.
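# A minimal sketch of one possible CV implementation, assuming each fold is centered with its
# own training means and that samples without a fold label (when K does not divide n) are unused:
for i in range(1, K + 1):
    test_idx = np.where(permute == i)[0]   # held-out chunk
    train_idx = np.where(permute != i)[0]  # remaining chunks
    X_tr, mu_X = centerData(X[train_idx])
    y_tr, mu_y = centerData(y[train_idx])
    for j in range(k):
        beta = ridgeMulti(X_tr, lambdas[j], p, y_tr)
        y_hat = (X[test_idx] - mu_X) @ beta + mu_y  # predict on the held-out chunk
        MSE[i - 1, j] = np.mean((y[test_idx] - y_hat) ** 2)
meanMSE = np.mean(MSE, axis=0)  # average MSE across chunks for each lambda
jOpt = np.argmin(meanMSE)
lambda_OP = lambdas[jOpt]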
# I reuse the plot with all the betas from 1 a) and add a line for the optimal value of lambda
plt.figure()
plt.semilogx(lambdas, betas.T )
plt.xlabel("Lambdas")
plt.ylabel("Betas")
plt.suptitle(f"Optimal lambda: {lambda_OP}", fontsize=20)
plt.semilogx([lambda_OP, lambda_OP], [np.min(betas), np.max(betas)], marker = ".")
plt.show()
# Calculate the standard error of the CV estimate at the best lambda, and find the largest lambda
# whose average MSE is within one standard error of the optimum (one common implementation):
stdErr = np.std(MSE[:, jOpt]) / np.sqrt(K)
Lambda_CV_1StdRule = np.max(lambdas[meanMSE <= meanMSE[jOpt] + stdErr])
print("CV lambda with 1-std-rule %0.2f" % Lambda_CV_1StdRule)
D = np.zeros(k)    # effective degrees of freedom for each lambda
AIC = np.zeros(k)
BIC = np.zeros(k)
# Calculate the AIC and BIC for ridge regression for the different lambdas (sketch below),
# then pick the lambda for the models with the lowest AIC and BIC
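# A sketch of one way to compute these, assuming the formulas from ESL Ch. 7:
# d(lambda) = tr(X (X'X + lambda*I)^{-1} X'), AIC = err + 2*(d/n)*sigma2 and
# BIC = (n/sigma2) * (err + log(n)*(d/n)*sigma2), where err is the training MSE
# and sigma2 is estimated from the ordinary least squares fit (lambda = 0):
sigma2 = np.sum((y_pre - X_pre @ ridgeMulti(X_pre, 0, p, y_pre)) ** 2) / (n - p)
for j in range(k):
    S = X_pre @ np.linalg.inv(X_pre.T @ X_pre + lambdas[j] * np.eye(p)) @ X_pre.T
    D[j] = np.trace(S)                       # effective degrees of freedom
    err = np.mean((y_pre - S @ y_pre) ** 2)  # training MSE of the ridge fit
    AIC[j] = err + 2 * (D[j] / n) * sigma2
    BIC[j] = (n / sigma2) * (err + np.log(n) * (D[j] / n) * sigma2)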
jAIC = np.argmin(AIC)
jBIC = np.argmin(BIC)
print("AIC at %0.2f" % lambdas[jAIC])
print("BIC at %0.2f" % lambdas[jBIC])
# Plot the different model selection criteria (AIC, BIC, CV); BIC lives on a much larger
# scale (the n/sigma2 factor), so it is rescaled below purely for display
fig, axs = plt.subplots(1, 2, figsize=(15,5))
axs[0].set_title('Information criteria')
_ = axs[0].semilogx(lambdas,AIC,'-r',label='AIC')
axs[0].semilogx(lambdas,BIC/300,'-b',label='BIC')
axs[0].semilogx(lambdas,meanMSE,'-k',label='CV')
axs[0].semilogx(lambdas[jAIC],np.min(AIC),'*r',label='AIC')
axs[0].semilogx(lambdas[jBIC],np.min(BIC)/300,'*b',label='BIC')
axs[0].semilogx(lambdas[jOpt],np.min(meanMSE),'*k',label='CV')
axs[0].legend()
axs[0].set_xlabel('Lambda')
axs[0].set_ylabel('Criteria')
#plot the degree of freedom
axs[1].semilogx(lambdas,D,'-k');
axs[1].set_title('Degrees of freedom');
axs[1].set_xlabel('Lambda');
axs[1].set_ylabel('d');
NBoot = 100
Beta = np.zeros((p, len(lambdas), NBoot))
# Run bootstrap on ridge for the same range of lambdas as before.
# As with cross-validation, you run the bootstrap NBoot times; each time you sample n rows with
# replacement, and ridge is then run on that sample for each lambda value (sketch below).
# Plot the standard deviation of the betas for each lambda value.
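# A minimal sketch of the bootstrap, assuming each resample is centered before fitting:
for b in range(NBoot):
    idx = np.random.randint(0, n, n)  # draw n row indices with replacement
    Xb, mu_Xb = centerData(X[idx])
    yb, mu_yb = centerData(y[idx])
    for j in range(len(lambdas)):
        Beta[:, j, b] = ridgeMulti(Xb, lambdas[j], p, yb)
stdBeta = np.std(Beta, axis=2)  # spread of each beta estimate across bootstrap samples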
plt.figure()
for i in range(p):
    plt.semilogx(lambdas, stdBeta[i, :])
plt.title("Bootstrapped variance")
plt.ylabel("beta")
plt.xlabel("lambda")
plt.show()