import sklearn
import numpy as np
from numpy.linalg import inv
from numpy.linalg import pinv
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
import random
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet
import itertools
import time
import statsmodels.api as sm
import math
from sklearn.metrics import mean_squared_error as mse
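# NOTE (added): this notebook targets scikit-learn < 1.2; load_boston and the
# `normalize` argument used below were both removed in scikit-learn 1.2.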
# Describing data
boston_dataset = load_boston()
print(boston_dataset.keys())
print(boston_dataset.DESCR)
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston['MEDV'] = boston_dataset.target
boston
boston.isnull().sum()
sns.set(rc={'figure.figsize':(11.7,8.27)})
# distplot is deprecated in newer seaborn; histplot/displot are the replacements
sns.distplot(boston['MEDV'], bins=30)
plt.show()
correlation_matrix = boston.corr().round(2)
# annot=True prints the correlation values inside each square
sns.heatmap(data=correlation_matrix, annot=True, cmap="YlGnBu")
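# Added sanity check: LSTAT and RM are singled out below because they show the
# strongest linear association with MEDV in the matrix above.
print(correlation_matrix['MEDV'].sort_values(ascending=False))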
plt.figure(figsize=(20, 5))
features = ['LSTAT', 'RM']
target = boston['MEDV']
for i, col in enumerate(features):
    plt.subplot(1, len(features), i+1)
    x = boston[col]
    y = target
    plt.scatter(x, y, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('MEDV')
boston_dataset = load_boston()
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston['MEDV'] = boston_dataset.target
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# This function is to be used for best subsets and step-wise approaches.
def processSubset(features):
    # Fit a model on the given features and calculate its RSS.
    # Note: sm.OLS does not add an intercept unless one is supplied explicitly
    # (e.g. via sm.add_constant), so these fits are through the origin.
    selected = sm.OLS(y, X[list(features)])
    regr = selected.fit()
    RSS = ((regr.predict(X[list(features)]) - y) ** 2).sum()
    return {"Selected": regr, "RSS": RSS}
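# Quick added illustration: the RSS of a single-feature model, e.g. LSTAT alone.
print(processSubset(('LSTAT',))["RSS"])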
def bestSubsets(k):
    results = []
    for combo in itertools.combinations(X.columns, k):
        results.append(processSubset(combo))
    # Put in dataframe
    results = pd.DataFrame(results)
    # Choose the model with the lowest RSS (idxmin returns a label, which is
    # what .loc expects; argmin returns a position)
    best_result = results.loc[results['RSS'].idxmin()]
    # Return the best model
    return best_result
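# Added note on cost: bestSubsets(k) fits all C(13, k) candidate models, so the
# loop below fits 2**13 - 1 = 8191 models in total; the forward/backward
# stepwise procedures further down fit only on the order of p**2 models.
print(sum(math.comb(13, k) for k in range(1, 14)))  # 8191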
# We see the best subsets up to 13 predictors
models_best = pd.DataFrame(columns=["RSS", "Selected"])
for i in range(1, len(X.columns)+1):
    t0 = time.time()
    models_best.loc[i] = bestSubsets(i)
    t1 = time.time()
    print(f"Finished {i} features in {t1-t0} seconds.")
print(models_best.loc[3, "Selected"].summary())
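# Added visual: RSS of the best model at each subset size, showing the usual
# monotone decrease as predictors are added.
plt.figure(figsize=(10, 5))
plt.plot(models_best.index, models_best["RSS"].astype(float), marker='o')
plt.xlabel('number of predictors')
plt.ylabel('RSS')
plt.show()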
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
def forward(predictors):
    # Pull out predictors we still need to process
    remaining = [p for p in X.columns if p not in predictors]
    results = []
    for p in remaining:
        results.append(processSubset(predictors + [p]))
    # Wrap everything up in a nice dataframe
    results = pd.DataFrame(results)
    # Choose the model with the lowest RSS
    best_result = results.loc[results['RSS'].idxmin()]
    # Return the best result
    return best_result
models_fwd = pd.DataFrame(columns=["RSS", "Selected"])
predictors = []
for i in range(1, len(X.columns)+1):
    t0 = time.time()
    models_fwd.loc[i] = forward(predictors)
    predictors = models_fwd.loc[i]["Selected"].model.exog_names
    t1 = time.time()
    print(f"Finished {i} predictors in {t1-t0} seconds.")
print(models_fwd.loc[3, "Selected"].summary())
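# Added comparison: forward selection is greedy, so its 3-variable model need
# not match the exhaustive best-subsets choice; print both feature sets to check.
print(models_best.loc[3, "Selected"].model.exog_names)
print(models_fwd.loc[3, "Selected"].model.exog_names)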
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
def backward(predictors):
    results = []
    for combo in itertools.combinations(predictors, len(predictors)-1):
        results.append(processSubset(combo))
    # Put results in a dataframe
    results = pd.DataFrame(results)
    # Choose the model with the lowest RSS
    best_result = results.loc[results['RSS'].idxmin()]
    # Return the best result
    return best_result
models_bwd = pd.DataFrame(columns=["RSS", "Selected"], index = range(1,len(X.columns)))
predictors = X.columns
while len(predictors) > 1:
    t0 = time.time()
    models_bwd.loc[len(predictors)-1] = backward(predictors)
    predictors = models_bwd.loc[len(predictors)-1]["Selected"].model.exog_names
    t1 = time.time()
    print(f"Down to {len(predictors)} predictors in {t1-t0} seconds.")
print(models_bwd.loc[3, "Selected"].summary())
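# Added note: backward elimination starts from the full 13-predictor model, so
# it needs more observations than predictors (here n = 506 > p = 13), whereas
# forward selection has no such requirement.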
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
lasso = Lasso(alpha=0.1, normalize=True) # sets all but 3 features to 0
lasso.fit(X, y)
lasso_coeffs = lasso.coef_
# Put in dataframe
lasso_coeffs = pd.DataFrame(lasso_coeffs, index=X.columns)
lasso_coeffs = lasso_coeffs.transpose()
# Drop zeros
lasso_coeffs.loc[:, (lasso_coeffs != 0).any(axis=0)]
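# Added sketch: on scikit-learn >= 1.2, where `normalize=True` no longer exists,
# the closest idiom is to standardize explicitly first. StandardScaler divides
# by the standard deviation rather than the l2 norm, so the surviving features
# may differ slightly from the normalize=True fit above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
lasso_std = make_pipeline(StandardScaler(), Lasso(alpha=0.1)).fit(X, y)
print(lasso_std.named_steps['lasso'].coef_)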
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
enet = ElasticNet(alpha=0.4, l1_ratio=0.5, normalize=True) # set all but 3 features to 0
enet.fit(X, y)
enet_coeffs = enet.coef_
# Put in dataframe
enet_coeffs = pd.DataFrame(enet_coeffs, index=X.columns)
enet_coeffs = enet_coeffs.transpose()
# Drop zeros
enet_coeffs.loc[:, (enet_coeffs != 0).any(axis=0)]
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-5,2, n_alphas)
coefs = []
for a in alphas:
    # Fit a lasso model at each regularization strength
    lasso = Lasso(alpha=a)
    lasso.fit(X, y)
    coefs.append(lasso.coef_)
# #############################################################################
# Display results
plt.figure(figsize=[15,10])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-5,2, n_alphas)
for l1 in [0.2]:
    coefs = []
    for a in alphas:
        # Fit an elastic net model at each regularization strength
        enet = ElasticNet(alpha=a, l1_ratio=l1)
        enet.fit(X, y)
        coefs.append(enet.coef_)
# #############################################################################
# Display results
plt.figure(figsize=[15,10])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title(f'Elastic net coefficients as a function of the regularization l1_ratio = {l1}')
plt.axis('tight')
plt.show()
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-5,2, n_alphas)
for l1 in [0.8]:
    coefs = []
    for a in alphas:
        # Fit an elastic net model at each regularization strength
        enet = ElasticNet(alpha=a, l1_ratio=l1)
        enet.fit(X, y)
        coefs.append(enet.coef_)
# #############################################################################
# Display results
plt.figure(figsize=[15,10])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title(f'Elastic net coefficients as a function of the regularization l1_ratio = {l1}')
plt.axis('tight')
plt.show()
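# Added note: with l1_ratio=0.8 the penalty is mostly l1, so coefficients reach
# exactly zero earlier along the path than with l1_ratio=0.2, whose larger l2
# component behaves more like ridge and mostly shrinks without zeroing.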
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-3,7, n_alphas)
coefs = []
for a in alphas:
    # Fit a ridge model at each regularization strength
    ridge = Ridge(alpha=a)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
# #############################################################################
# Display results
plt.figure(figsize=[15,10])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
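# Added note: in contrast to the lasso path, ridge shrinks coefficients
# smoothly toward zero but never sets any of them exactly to zero, so it
# performs no variable selection along this path.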
'''
2) a)
'''
boston_dataset = load_boston()
X = boston_dataset.data.copy()
y = boston_dataset.target.copy()
lr_intercept = LinearRegression()
lr_intercept.fit(X,y)
print(lr_intercept.coef_)
mean_X = X.mean(axis = 0)
mean_y = y.mean()
X_centered = boston_dataset.data.copy()
y_centered = boston_dataset.target.copy()
for i in range(X.shape[0]):
    X_centered[i] = X_centered[i] - mean_X
for i in range(y.shape[0]):
    y_centered[i] = y_centered[i] - mean_y
lr_centered = LinearRegression(fit_intercept=False)
lr_centered.fit(X_centered, y_centered)
print(lr_centered.coef_)
col1 = np.ones((X.shape[0], 1))  # explicit column of ones for the intercept
X_1 = np.concatenate((col1, X), 1)
y_1 = y.copy()
lr_1 = LinearRegression(fit_intercept=False)
lr_1.fit(X_1, y_1)
print(lr_1.coef_[1:])
# COMPARE THEM (exact float equality can fail spuriously; use a tolerance)
print(np.allclose(lr_intercept.coef_, lr_centered.coef_))
print([math.isclose(lr_centered.coef_[x], lr_1.coef_[x+1]) for x in range(len(lr_centered.coef_))])
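# Added explanation: centering removes the intercept from the least-squares
# problem. With Xc = X - mean_X and yc = y - mean_y, the normal equations give
# the same slope vector as the intercept model, and the intercept itself is
# recovered as mean_y - mean_X @ beta, which we can verify:
print(math.isclose(lr_intercept.intercept_, mean_y - mean_X @ lr_intercept.coef_))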
'''
2.b)
'''
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# Keep only the first 10 rows, so there are fewer observations (10) than
# predictors (13)
assert 10 < len(X.columns)
y = boston["MEDV"].head(10)
X = boston.drop(["MEDV"], axis=1).head(10)
lreg = LinearRegression()
lreg.fit(X,y)
print(lreg.score(X,y))
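# Added note: with 10 rows and 13 columns the design matrix has more predictors
# than observations, so least squares can interpolate the training targets
# exactly and the in-sample R^2 printed above is 1.0, a symptom of overfitting
# rather than of a good model.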
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
'''
2.c)
'''
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# Split data 50-50 train and test
split = X.shape[0] // 2
# Use iloc for a clean positional split; .loc slicing is inclusive on both
# ends, so .loc[:split] / .loc[split:] would put row `split` in both halves
X_train = X.iloc[:split, :]
y_train = y.iloc[:split]
X_test = X.iloc[split:, :]
y_test = y.iloc[split:]
lreg = LinearRegression()
lreg.fit(X_train, y_train)
mse_0 = mse(lreg.predict(X_test), y_test)
print(f"Linear regression MSE is {mse_0}")
alphas = [0.1, 1, 10, 100, 1000, 10000, 10000000]
found = False
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    mse_1 = mse(ridge.predict(X_test), y_test)
    if mse_1 < mse_0:
        print(f"Lambda is {a}, mse is {mse_1} for ridge")
        found = True
print("Found better ridge?", found)