import sklearn
import numpy as np
from numpy.linalg import inv
from numpy.linalg import pinv
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
import random
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet
import itertools
import time
import statsmodels.api as sm
import math
from sklearn.metrics import mean_squared_error as mse
# Describing data
boston_dataset = load_boston()  # note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2
print(boston_dataset.keys())
print(boston_dataset.DESCR)
dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
.. _boston_dataset:
Boston house prices dataset
---------------------------
**Data Set Characteristics:**
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of black people by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression
problems.
.. topic:: References
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston['MEDV'] = boston_dataset.target
boston
boston.isnull().sum()
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.histplot(boston['MEDV'], bins=30, kde=True)  # distplot is deprecated; histplot is the current axes-level equivalent
plt.show()
correlation_matrix = boston.corr().round(2)
# annot = True to print the values inside the square
sns.heatmap(data=correlation_matrix, annot=True, cmap="YlGnBu")
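# To read the heatmap faster, the correlations with the target can be pulled
# out and sorted by strength (a small sketch reusing correlation_matrix):
corr_with_target = correlation_matrix['MEDV'].drop('MEDV')
print(corr_with_target.reindex(corr_with_target.abs().sort_values(ascending=False).index))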
plt.figure(figsize=(20, 5))
features = ['LSTAT', 'RM']
target = boston['MEDV']
for i, col in enumerate(features):
    plt.subplot(1, len(features), i+1)
    x = boston[col]
    y = target
    plt.scatter(x, y, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('MEDV')
boston_dataset = load_boston()
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston['MEDV'] = boston_dataset.target
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# This function is to be used for best subsets and step-wise approaches.
def processSubset(features):
    # Fit OLS on the given feature subset (no intercept term) and compute RSS
    selected = sm.OLS(y, X[list(features)])
    regr = selected.fit()
    RSS = ((regr.predict(X[list(features)]) - y) ** 2).sum()
    return {"Selected": regr, "RSS": RSS}
def bestSubsets(k):
    results = []
    for combo in itertools.combinations(X.columns, k):
        results.append(processSubset(combo))
    # Put in dataframe
    results = pd.DataFrame(results)
    # Choose the model with the lowest RSS (idxmin returns the row label)
    best_result = results.loc[results['RSS'].idxmin()]
    # Return the best model
    return best_result
# We see the best subsets up to 13 predictors
models_best = pd.DataFrame(columns=["RSS", "Selected"])
for i in range(1, len(X.columns)+1):
    t0 = time.time()
    models_best.loc[i] = bestSubsets(i)
    t1 = time.time()
    print(f"Finished {i} features in {t1-t0} seconds.")
Finished 1 features in 0.03358316421508789 seconds.
Finished 2 features in 0.1840522289276123 seconds.
Finished 3 features in 0.660210132598877 seconds.
Finished 4 features in 1.6938869953155518 seconds.
Finished 5 features in 3.255265712738037 seconds.
Finished 6 features in 4.09630012512207 seconds.
Finished 7 features in 4.436066627502441 seconds.
Finished 8 features in 3.403841495513916 seconds.
Finished 9 features in 1.9705407619476318 seconds.
Finished 10 features in 0.7486200332641602 seconds.
Finished 11 features in 0.2093794345855713 seconds.
Finished 12 features in 0.036870718002319336 seconds.
Finished 13 features in 0.005837917327880859 seconds.
print(models_best.loc[3, "Selected"].summary())
OLS Regression Results
=======================================================================================
Dep. Variable: MEDV R-squared (uncentered): 0.952
Model: OLS Adj. R-squared (uncentered): 0.952
Method: Least Squares F-statistic: 3335.
Date: Fri, 24 Sep 2021 Prob (F-statistic): 0.00
Time: 01:48:13 Log-Likelihood: -1564.1
No. Observations: 506 AIC: 3134.
Df Residuals: 503 BIC: 3147.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
RM 6.2433 0.225 27.692 0.000 5.800 6.686
PTRATIO -0.5714 0.092 -6.215 0.000 -0.752 -0.391
LSTAT -0.4919 0.040 -12.439 0.000 -0.570 -0.414
==============================================================================
Omnibus: 196.409 Durbin-Watson: 0.888
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1133.974
Skew: 1.593 Prob(JB): 5.76e-247
Kurtosis: 9.606 Cond. No. 24.3
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
def forward(predictors):
    # Pull out predictors we still need to process
    remaining = [p for p in X.columns if p not in predictors]
    results = []
    for p in remaining:
        results.append(processSubset(predictors+[p]))
    # Wrap everything up in a nice dataframe
    results = pd.DataFrame(results)
    # Choose the model with the lowest RSS
    best_result = results.loc[results['RSS'].idxmin()]
    # Return the best result
    return best_result
models_fwd = pd.DataFrame(columns=["RSS", "Selected"])
predictors = []
for i in range(1, len(X.columns)+1):
    t0 = time.time()
    models_fwd.loc[i] = forward(predictors)
    predictors = models_fwd.loc[i]["Selected"].model.exog_names
    t1 = time.time()
    print(f"Finished {i} predictors in {t1-t0} seconds.")
Finished 1 predictors in 0.05340433120727539 seconds.
Finished 2 predictors in 0.03305315971374512 seconds.
Finished 3 predictors in 0.03634214401245117 seconds.
Finished 4 predictors in 0.04172348976135254 seconds.
Finished 5 predictors in 0.026268959045410156 seconds.
Finished 6 predictors in 0.02853107452392578 seconds.
Finished 7 predictors in 0.03122878074645996 seconds.
Finished 8 predictors in 0.02082991600036621 seconds.
Finished 9 predictors in 0.01615595817565918 seconds.
Finished 10 predictors in 0.013617753982543945 seconds.
Finished 11 predictors in 0.017235755920410156 seconds.
Finished 12 predictors in 0.013709306716918945 seconds.
Finished 13 predictors in 0.00966954231262207 seconds.
print(models_fwd.loc[3, "Selected"].summary())
OLS Regression Results
=======================================================================================
Dep. Variable: MEDV R-squared (uncentered): 0.952
Model: OLS Adj. R-squared (uncentered): 0.952
Method: Least Squares F-statistic: 3335.
Date: Fri, 24 Sep 2021 Prob (F-statistic): 0.00
Time: 01:54:09 Log-Likelihood: -1564.1
No. Observations: 506 AIC: 3134.
Df Residuals: 503 BIC: 3147.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
RM 6.2433 0.225 27.692 0.000 5.800 6.686
LSTAT -0.4919 0.040 -12.439 0.000 -0.570 -0.414
PTRATIO -0.5714 0.092 -6.215 0.000 -0.752 -0.391
==============================================================================
Omnibus: 196.409 Durbin-Watson: 0.888
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1133.974
Skew: 1.593 Prob(JB): 5.76e-247
Kurtosis: 9.606 Cond. No. 24.3
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
def backward(predictors):
    results = []
    for combo in itertools.combinations(predictors, len(predictors)-1):
        results.append(processSubset(combo))
    # Put results in a dataframe
    results = pd.DataFrame(results)
    # Choose the model with the lowest RSS
    best_result = results.loc[results['RSS'].idxmin()]
    # Return the best result
    return best_result
models_bwd = pd.DataFrame(columns=["RSS", "Selected"], index=range(1, len(X.columns)))
predictors = X.columns
while len(predictors) > 1:
    t0 = time.time()
    models_bwd.loc[len(predictors)-1] = backward(predictors)
    predictors = models_bwd.loc[len(predictors)-1]["Selected"].model.exog_names
    t1 = time.time()
    print(f"Down to {len(predictors)} predictors in {t1-t0} seconds.")
Down to 12 predictors in 0.054945945739746094 seconds.
Down to 11 predictors in 0.04611992835998535 seconds.
Down to 10 predictors in 0.03251457214355469 seconds.
Down to 9 predictors in 0.027847766876220703 seconds.
Down to 8 predictors in 0.024013996124267578 seconds.
Down to 7 predictors in 0.024118423461914062 seconds.
Down to 6 predictors in 0.01862645149230957 seconds.
Down to 5 predictors in 0.01583719253540039 seconds.
Down to 4 predictors in 0.012738943099975586 seconds.
Down to 3 predictors in 0.012104272842407227 seconds.
Down to 2 predictors in 0.008622169494628906 seconds.
Down to 1 predictors in 0.006352424621582031 seconds.
print(models_bwd.loc[3, "Selected"].summary())
OLS Regression Results
=======================================================================================
Dep. Variable: MEDV R-squared (uncentered): 0.952
Model: OLS Adj. R-squared (uncentered): 0.952
Method: Least Squares F-statistic: 3335.
Date: Fri, 24 Sep 2021 Prob (F-statistic): 0.00
Time: 01:58:32 Log-Likelihood: -1564.1
No. Observations: 506 AIC: 3134.
Df Residuals: 503 BIC: 3147.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
RM 6.2433 0.225 27.692 0.000 5.800 6.686
PTRATIO -0.5714 0.092 -6.215 0.000 -0.752 -0.391
LSTAT -0.4919 0.040 -12.439 0.000 -0.570 -0.414
==============================================================================
Omnibus: 196.409 Durbin-Watson: 0.888
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1133.974
Skew: 1.593 Prob(JB): 5.76e-247
Kurtosis: 9.606 Cond. No. 24.3
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
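# All three procedures arrive at the same 3-predictor model (RM, PTRATIO,
# LSTAT). A quick consistency check across the three result frames (a sketch
# reusing models_best, models_fwd, and models_bwd from above):
for name, frame in [("best", models_best), ("fwd", models_fwd), ("bwd", models_bwd)]:
    print(name, sorted(frame.loc[3, "Selected"].model.exog_names), frame.loc[3, "RSS"])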
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
lasso = Lasso(alpha=0.1, normalize=True)  # zeroes all but 3 features; normalize is removed in scikit-learn >= 1.2, see the sketch below
lasso.fit(X, y)
lasso_coeffs = lasso.coef_
# Put in dataframe
lasso_coeffs = pd.DataFrame(lasso_coeffs, index=X.columns)
lasso_coeffs = lasso_coeffs.transpose()
# Drop zeros
lasso_coeffs.loc[:, (lasso_coeffs != 0).any(axis=0)]
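# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2. On
# newer versions the usual replacement is explicit scaling in a Pipeline (a
# sketch; note normalize=True divided by the l2 norm rather than the standard
# deviation, so the coefficients below will not match exactly):
from sklearn.preprocessing import StandardScaler
lasso_pipe = Pipeline([("scale", StandardScaler()), ("lasso", Lasso(alpha=0.1))])
lasso_pipe.fit(X, y)
print(lasso_pipe.named_steps["lasso"].coef_)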
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
enet = ElasticNet(alpha=0.4, l1_ratio=0.5, normalize=True)  # zeroes all but 3 features; see the normalize note above
enet.fit(X, y)
enet_coeffs = enet.coef_
# Put in dataframe
enet_coeffs = pd.DataFrame(enet_coeffs, index=X.columns)
enet_coeffs = enet_coeffs.transpose()
# Drop zeros
enet_coeffs.loc[:, (enet_coeffs != 0).any(axis=0)]
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-5,2, n_alphas)
coefs = []
for a in alphas:
    # Fit a lasso model for each value of the regularization strength
    lasso = Lasso(alpha=a)
    lasso.fit(X, y)
    coefs.append(lasso.coef_)
# #############################################################################
# Display results
plt.figure(figsize=[15,10])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
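# scikit-learn can also compute the entire path in one call, reusing warm
# starts instead of refitting from scratch at each alpha (a sketch):
from sklearn.linear_model import lasso_path
path_alphas, path_coefs, _ = lasso_path(X, y, alphas=alphas)
# path_coefs has shape (n_features, n_alphas); the path is returned over
# decreasing alphas.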
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-5,2, n_alphas)
for l1 in [0.2]:
    coefs = []
    for a in alphas:
        # Fit an elastic net for each value of the regularization strength
        enet = ElasticNet(alpha=a, l1_ratio=l1)
        enet.fit(X, y)
        coefs.append(enet.coef_)
    # #############################################################################
    # Display results
    plt.figure(figsize=[15, 10])
    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('lambda')
    plt.ylabel('weights')
    plt.title(f'Elastic net coefficients as a function of the regularization l1_ratio = {l1}')
    plt.axis('tight')
    plt.show()
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-5,2, n_alphas)
for l1 in [0.8]:
    coefs = []
    for a in alphas:
        # Fit an elastic net for each value of the regularization strength
        enet = ElasticNet(alpha=a, l1_ratio=l1)
        enet.fit(X, y)
        coefs.append(enet.coef_)
    # #############################################################################
    # Display results
    plt.figure(figsize=[15, 10])
    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('lambda')
    plt.ylabel('weights')
    plt.title(f'Elastic net coefficients as a function of the regularization l1_ratio = {l1}')
    plt.axis('tight')
    plt.show()
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-3,7, n_alphas)
coefs = []
for a in alphas:
    # Fit a ridge model for each value of the regularization strength
    ridge = Ridge(alpha=a)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
# #############################################################################
# Display results
plt.figure(figsize=[15,10])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
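# Rather than eyeballing the path, RidgeCV selects alpha by (generalized
# leave-one-out) cross-validation (a sketch reusing the alphas grid above):
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=alphas).fit(X, y)
print(f"CV-selected alpha: {ridge_cv.alpha_}")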
'''
2) a)
'''
boston_dataset = load_boston()
X = boston_dataset.data.copy()
y = boston_dataset.target.copy()
lr_intercept = LinearRegression()
lr_intercept.fit(X,y)
print(lr_intercept.coef_)
mean_X = X.mean(axis = 0)
mean_y = y.mean()
X_centered = boston_dataset.data.copy()
y_centered = boston_dataset.target.copy()
for i in range(X.shape[0]):
    X_centered[i] = X_centered[i] - mean_X
for i in range(y.shape[0]):
    y_centered[i] = y_centered[i] - mean_y
lr_centered = LinearRegression(fit_intercept=False)
lr_centered.fit(X_centered, y_centered)
print(lr_centered.coef_)
col1 = np.ones(X.shape[0])[np.newaxis].T
X_1 = np.concatenate((col1, X), 1)
y_1 = y.copy()
lr_1 = LinearRegression(fit_intercept=False)
lr_1.fit(X_1, y_1)
print(lr_1.coef_[1:])
# COMPARE THEM
print(lr_intercept.coef_ == lr_centered.coef_)
print([math.isclose(lr_centered.coef_[x], lr_1.coef_[x+1]) for x in range(len(lr_centered.coef_))])
[-1.08011358e-01 4.64204584e-02 2.05586264e-02 2.68673382e+00
-1.77666112e+01 3.80986521e+00 6.92224640e-04 -1.47556685e+00
3.06049479e-01 -1.23345939e-02 -9.52747232e-01 9.31168327e-03
-5.24758378e-01]
[-1.08011358e-01 4.64204584e-02 2.05586264e-02 2.68673382e+00
-1.77666112e+01 3.80986521e+00 6.92224640e-04 -1.47556685e+00
3.06049479e-01 -1.23345939e-02 -9.52747232e-01 9.31168327e-03
-5.24758378e-01]
[-1.08011358e-01 4.64204584e-02 2.05586264e-02 2.68673382e+00
-1.77666112e+01 3.80986521e+00 6.92224640e-04 -1.47556685e+00
3.06049479e-01 -1.23345939e-02 -9.52747232e-01 9.31168327e-03
-5.24758378e-01]
[ True True True True True True True True True True True True
True]
[True, True, True, True, True, True, True, True, True, True, True, True, True]
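# Sanity check via the normal equations, using the pinv imported at the top:
# the least-squares solution is beta_hat = pinv(X_1) @ y_1, whose slope
# entries should match lr_1.coef_[1:] (a sketch).
beta_hat = pinv(X_1) @ y_1
print(np.allclose(beta_hat[1:], lr_1.coef_[1:]))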
'''
2.b)
'''
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
assert 10 < len(X.columns)  # keep fewer rows than predictors so OLS can interpolate
y = boston["MEDV"].head(10)
X = boston.drop(["MEDV"], axis = 1).head(10)
lreg = LinearRegression()
lreg.fit(X,y)
print(lreg.score(X,y))
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
1.0
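# Why the training R^2 above is exactly 1.0: with 10 rows and 13 predictors
# (plus an intercept) the system is underdetermined, so OLS can interpolate
# the training data. A quick rank check (sketch):
X10 = boston.drop(["MEDV"], axis=1).head(10)
print(np.linalg.matrix_rank(X10.values), "<=", X10.shape[0])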
'''
2.c)
'''
y = boston["MEDV"]
X = boston.drop(["MEDV"], axis = 1)
# Split data 50-50 train and test (iloc keeps the halves disjoint;
# .loc[:split] and .loc[split:] would both include row `split`)
split = X.shape[0]//2
X_train = X.iloc[:split, :]
y_train = y.iloc[:split]
X_test = X.iloc[split:, :]
y_test = y.iloc[split:]
lreg = LinearRegression()
lreg.fit(X_train, y_train)
mse_0 = mse(lreg.predict(X_test), y_test)
print(f"Linear regression MSE is {mse_0}")
alphas = [0.1, 1, 10, 100, 1000, 10000, 10000000]
found = False
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    mse_1 = mse(ridge.predict(X_test), y_test)
    if mse_1 < mse_0:
        print(f"Lambda is {a}, mse is {mse_1} for ridge")
        found = True
print("Found better ridge?", found)
Linear regression MSE is 306.09761402149604
Lambda is 0.1, mse is 276.0028192436867 for ridge
Lambda is 1, mse is 191.6326338467738 for ridge
Lambda is 10, mse is 128.70180266077497 for ridge
Lambda is 100, mse is 55.671261898827595 for ridge
Lambda is 1000, mse is 37.85018012343591 for ridge
Lambda is 10000, mse is 48.45322012837898 for ridge
Lambda is 10000000, mse is 94.46550856897723 for ridge
Found better ridge? True
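# The coarse grid above suggests the best lambda sits near 1000; a finer
# sweep around that region narrows it down (a sketch reusing the same split):
best_a, best_mse = min(
    ((a, mse(Ridge(alpha=a).fit(X_train, y_train).predict(X_test), y_test))
     for a in np.logspace(2, 4, 50)),
    key=lambda t: t[1])
print(f"Refined: lambda={best_a:.1f}, test MSE={best_mse:.2f}")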