from sklearn.linear_model import LinearRegression
from statistics import mean
import numpy as np
import numpy.random as rd
from sklearn.datasets import make_regression
import sklearn as sk
import sklearn.model_selection  # make sure the model_selection submodule is loaded for sk.model_selection.train_test_split below
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
# load the data (note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell assumes an older scikit-learn version)
boston_data = sk.datasets.load_boston()
indices = np.array(range(boston_data.data.shape[0]))
x_train, x_test, y_train, y_test, idx_train, idx_test = sk.model_selection.train_test_split(boston_data.data, boston_data.target, indices, test_size=.20, random_state=0)
# run linear regression with an intercept
reg = LinearRegression(fit_intercept=True)
reg.fit(x_train, y_train)
print("Regression Results w Intercept:")
print("training score:", reg.score(x_train, y_train))
print("test score", reg.score(x_test, y_test))
print("learned coefs:\n", reg.coef_)
print("intercept:", reg.intercept_)
Regression Results w Intercept:
training score: 0.7730135569264233
test score 0.5892223849182512
learned coefs:
[-1.19443447e-01 4.47799511e-02 5.48526168e-03 2.34080361e+00
-1.61236043e+01 3.70870901e+00 -3.12108178e-03 -1.38639737e+00
2.44178327e-01 -1.09896366e-02 -1.04592119e+00 8.11010693e-03
-4.92792725e-01]
intercept: 38.09169492630284
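# The R^2 scores above are unitless; since mean_squared_error and math are already imported,
# here is a quick hedged sanity check of the fit in the target's units (variable names are illustrative):
y_pred_test = reg.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print("test MSE:", test_mse, "test RMSE:", math.sqrt(test_mse))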
# i) Center the Y vector and center the X matrix along the columns
y_train_centered = y_train - np.mean(y_train)
x_train_centered = x_train - np.mean(x_train, axis=0)
# note: the test set is centered with its own means here; centering it with the
# training-set means instead would reproduce the original test score exactly,
# which explains the small difference in the test score below
y_test_centered = y_test - np.mean(y_test)
x_test_centered = x_test - np.mean(x_test, axis=0)
# linear regression without an intercept
reg_centered = LinearRegression(fit_intercept=False)
reg_centered.fit(x_train_centered, y_train_centered)
print("Regression Results centered w/o Intercept:")
print("training score:", reg_centered.score(x_train_centered, y_train_centered))
print("test score", reg_centered.score(x_test_centered, y_test_centered))
print("learned coefs:\n", reg_centered.coef_)
print("intercept:", reg_centered.intercept_)
# check if the regression coefficients are the same within a rounding error
atol = 1e-14
print("True or False: the coefficients are the same?", np.all(np.isclose(reg_centered.coef_, reg.coef_, atol=atol)))
Regression Results centered w/o Intercept:
training score: 0.7730135569264234
test score 0.5900539045756942
learned coefs:
[-1.19443447e-01 4.47799511e-02 5.48526168e-03 2.34080361e+00
-1.61236043e+01 3.70870901e+00 -3.12108178e-03 -1.38639737e+00
2.44178327e-01 -1.09896366e-02 -1.04592119e+00 8.11010693e-03
-4.92792725e-01]
intercept: 0.0
True or False: the coefficients are the same? True
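# Hedged sketch of why centering removes the intercept: for OLS with an intercept,
# intercept = mean(y) - mean(X) @ coef, so once X and y are centered the fitted intercept
# is forced to zero (assumes reg, x_train, y_train from above):
implied_intercept = np.mean(y_train) - np.mean(x_train, axis=0) @ reg.coef_
print("implied intercept:", implied_intercept, "fitted intercept:", reg.intercept_)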
# ii) Linear regression with a column of ones added to X
x_train1 = np.concatenate((x_train, np.ones((x_train.shape[0],1))), axis=1)
x_test1 = np.concatenate((x_test, np.ones((x_test.shape[0],1))),axis=1)
reg1 = LinearRegression(fit_intercept=False)
reg1.fit(x_train1, y_train)
print("Regression Results with an added column of ones to X:")
print("training score:", reg1.score(x_train1, y_train))
print("test score", reg1.score(x_test1, y_test))
print("learned coefs:\n", reg1.coef_)
print("intercept:", reg1.intercept_)
# check that the regression coefficients are the same within a rounding error
# (reg1 has one extra coefficient, for the added ones column, which plays the role of the
# intercept; it is excluded from the coefficient comparison and compared to reg.intercept_ below)
atol = 1e-14
print("True or False: the coefficients are the same?", np.all(np.isclose(reg1.coef_[:x_train.shape[1]], reg.coef_, atol=atol)))
print("Linear Regression Intercept: {}\nOnes-vector regression coef: {}".format(reg.intercept_, reg1.coef_[-1]))
Regression Results with an added column of ones to X:
training score: 0.7730135569264234
test score 0.5892223849182491
learned coefs:
[-1.19443447e-01 4.47799511e-02 5.48526168e-03 2.34080361e+00
-1.61236043e+01 3.70870901e+00 -3.12108178e-03 -1.38639737e+00
2.44178327e-01 -1.09896366e-02 -1.04592119e+00 8.11010693e-03
-4.92792725e-01 3.80916949e+01]
intercept: 0.0
True or False: the coefficients are the same? True
Linear Regression Intercept: 38.09169492630284
Ones-vector regression coef: 38.091694926302644
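# Hedged sketch: the ones-column formulation is just ordinary least squares on the augmented
# matrix, so numpy's closed-form solver should reproduce reg1.coef_ up to rounding
# (assumes x_train1 and y_train from above):
beta_ls, *_ = np.linalg.lstsq(x_train1, y_train, rcond=None)
print("max |difference| vs. sklearn coefs:", np.max(np.abs(beta_ls - reg1.coef_)))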
def show_training_error(data=[x_train, y_train], name="boston"):
    """
    Fits an OLS linear regression on the first i training samples for increasing i,
    recording and plotting the training error at each step.
    The loop stops 20 iterations after the training error first rises above ~0
    (i.e., shortly after i exceeds the number of free parameters).
    Params:
        data: list of training features and labels
        name: string name of the dataset (used in the plot title)
    Returns:
        train_errs: array of training errors as a function of the number of samples used
    """
    xtrain, ytrain = data
    n, p = xtrain.shape
    train_errs = np.zeros((n, 1))
    first0 = 0  # first index at which the training error rises above (numerical) zero
    stop = n    # index at which the loop stopped
    for i in range(1, n):
        print(i, "out of", n)
        temp_reg = LinearRegression()
        temp_reg.fit(xtrain[:i], ytrain[:i])
        ypred = temp_reg.predict(xtrain[:i])
        train_errs[i] = mean_squared_error(ytrain[:i], ypred)
        # stop the loop 20 iterations after the error becomes non-zero
        if train_errs[i] > 1e-9:
            if first0 == 0:
                first0 = i
            if i - first0 > 20:
                stop = i
                break
    fig, ax = plt.subplots()
    ax.plot(range(stop), train_errs[:stop])
    ax.set(xlabel='number of training samples (n)', ylabel='training error (MSE)',
           title='Training error as a function of n on {} dataset (p={})'.format(name, p))
    ymax = max(.1, float(train_errs[:stop + 1].max()))
    ax.vlines(p, 0, ymax, colors='r', linestyles='dashed')
    ax.legend(["training error", "p"], loc=2)
    plt.ylim([0, ymax])
    return train_errs
show_training_error(data=[x_train, y_train], name="Boston");
[progress printout: 1 out of 404 … 36 out of 404; training-error plot for the Boston dataset]
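# Hedged note on the zero-error region above: with an intercept the model has p+1 = 14 free
# parameters, so for n <= p+1 samples the system is (generically) exactly solvable and the
# training MSE is ~0; it only becomes positive once n exceeds the number of parameters.
# A minimal synthetic illustration (make_regression is already imported; the sizes are arbitrary):
Xs, ys = make_regression(n_samples=5, n_features=10, noise=1.0, random_state=0)
demo = LinearRegression().fit(Xs, ys)
print("training MSE with n < p:", mean_squared_error(ys, demo.predict(Xs)))  # expect ~0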
iris = sk.datasets.load_iris()
show_training_error(data=[iris.data, iris.target], name="Iris");
[progress printout: 1 out of 150 … 72 out of 150; training-error plot for the Iris dataset]
diabetes = sk.datasets.load_diabetes()
show_training_error(data=[diabetes.data, diabetes.target], name="Diabetes");
[progress printout: 1 out of 442 … 33 out of 442; training-error plot for the Diabetes dataset]
wine = sk.datasets.load_wine()
show_training_error(data=[wine.data, wine.target], name="Wine");
[progress printout: 1 out of 178 … 81 out of 178; training-error plot for the Wine dataset]
cali = sk.datasets.fetch_california_housing()
show_training_error(data=[cali.data, cali.target], name="California housing");
[progress printout: 1 out of 20640 … 31 out of 20640; training-error plot for the California housing dataset]