from sklearn.linear_model import LinearRegression
from statistics import mean
import numpy as np
import numpy.random as rd
from sklearn.datasets import make_regression
import sklearn as sk
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
%autosave 10
# load the data
boston_data = sk.datasets.load_boston()
indices = np.array(range(boston_data.data.shape[0]))
x_train, x_test, y_train, y_test, idx_train, idx_test = sk.model_selection.train_test_split(boston_data.data, boston_data.target, indices, test_size=.20, random_state=0)
# run linear regression with an intercept
reg = LinearRegression(fit_intercept=True)
reg.fit(x_train, y_train)
print("Regression Results w Intercept:")
print("training score:", reg.score(x_train, y_train))
print("test score", reg.score(x_test, y_test))
print("learned coefs:\n", reg.coef_)
print("intercept:", reg.intercept_)
# i) Center the Y vector and center the X matrix along the columns
y_train_centered = y_train-np.mean(y_train)
y_test_centered = y_test - np.mean(y_test)
x_train_centered = x_train - np.mean(x_train, axis=0)
x_test_centered = x_test - np.mean(x_test, axis=0)
# linear regression without an intercept
reg_centered = LinearRegression(fit_intercept=False)
reg_centered.fit(x_train_centered, y_train_centered)
print("Regression Results centered w/o Intercept:")
print("training score:", reg_centered.score(x_train_centered, y_train_centered))
print("test score", reg_centered.score(x_test_centered, y_test_centered))
print("learned coefs:\n", reg_centered.coef_)
print("intercept:", reg_centered.intercept_)
# check if the regression coefficients are the same within a rounding error
atol = 1e-14
print("True or False: the coefficients are the same?", np.all(np.isclose(reg_centered.coef_, reg.coef_, atol=atol)))
# ii) Linear regression with a column of ones added to X
x_train1 = np.concatenate((x_train, np.ones((x_train.shape[0],1))), axis=1)
x_test1 = np.concatenate((x_test, np.ones((x_test.shape[0],1))),axis=1)
reg1 = LinearRegression(fit_intercept=False)
reg1.fit(x_train1, y_train)
print("Regression Results with an added column of ones to X:")
print("training score:", reg1.score(x_train1, y_train))
print("test score", reg1.score(x_test1, y_test))
print("learned coefs:\n", reg1.coef_)
print("intercept:", reg1.intercept_)
# check if the regression coefficients are the same within a rounding error
# note that there is one more coefficient, which is not included in the analysis
atol = 1e-14
print("True or False: the coefficients are the same?", np.all(np.isclose(reg1.coef_[:x_train.shape[1]], reg.coef_, atol=atol)))
print("Linear Regression Intercept: {}\nOnes-vector regression coef: {}".format(reg.intercept_, reg1.coef_[-1]))
def show_training_error(data=[x_train, y_train], name="boston"):
"""
Given a data set, returns and plots the training error of an OLS linear regression model.
stops once error rises above 0
Params:
data: list of training features and labels
name: string of the name of the dataset
Returns:
train_errs: array of training errors as a function of n
"""
xtrain = data[0]
ytrain = data[1]
n, p = xtrain.shape
train_errs = np.empty((n,1))
first0 = 0 # index when training error rises above 0
for i in range(1,n):
print(i, "out of", n)
temp_reg = LinearRegression()
temp_reg.fit(xtrain[:i], ytrain[:i])
ypred = temp_reg.predict(xtrain[:i])
train_errs[i] = mean_squared_error(ypred, ytrain[:i])
# stop regression once error is greater than 0
if mean_squared_error(ypred, ytrain[:i]) > 1e-9:
if first0 ==0:
first0 = i
if i - first0 > 20:
stop = i
break
fig, ax = plt.subplots()
ax.plot(range(stop), train_errs[:stop])
ax.set(xlabel='number of features', ylabel='training error (MSE)',
title='Training error as a function of n on {} dataset (p={})'.format(name, p))
ax.vlines(p, 0, max(.1,train_errs[stop]), colors='r', linestyles='dashed')
ax.legend({"training error", "p"}, loc=2)
plt.ylim([0, max(.1, train_errs[stop])])
return train_errs
show_training_error(data=[x_train, y_train], name="Boston");
iris = sk.datasets.load_iris()
show_training_error(data=[iris.data, iris.target], name="Iris");
diabetes = sk.datasets.load_diabetes()
show_training_error(data=[diabetes.data, diabetes.target], name="Diabetes");
wine = sk.datasets.load_wine()
show_training_error(data=[wine.data, wine.target], name="Wine");
cali = sk.datasets.fetch_california_housing()
show_training_error(data=[cali.data, cali.target], name="California housing")