elec578_hw1_phillips

from sklearn.linear_model import LinearRegression from statistics import mean import numpy as np import numpy.random as rd from sklearn.datasets import make_regression import sklearn as sk import matplotlib.pyplot as plt import math from sklearn.metrics import mean_squared_error from sklearn.linear_model import Ridge

%autosave 10

# load the data boston_data = sk.datasets.load_boston() indices = np.array(range(boston_data.data.shape[0])) x_train, x_test, y_train, y_test, idx_train, idx_test = sk.model_selection.train_test_split(boston_data.data, boston_data.target, indices, test_size=.20, random_state=0)

# run linear regression with an intercept reg = LinearRegression(fit_intercept=True) reg.fit(x_train, y_train) print("Regression Results w Intercept:") print("training score:", reg.score(x_train, y_train)) print("test score", reg.score(x_test, y_test)) print("learned coefs:\n", reg.coef_) print("intercept:", reg.intercept_)

# i) Center the Y vector and center the X matrix along the columns y_train_centered = y_train-np.mean(y_train) y_test_centered = y_test - np.mean(y_test) x_train_centered = x_train - np.mean(x_train, axis=0) x_test_centered = x_test - np.mean(x_test, axis=0) # linear regression without an intercept reg_centered = LinearRegression(fit_intercept=False) reg_centered.fit(x_train_centered, y_train_centered) print("Regression Results centered w/o Intercept:") print("training score:", reg_centered.score(x_train_centered, y_train_centered)) print("test score", reg_centered.score(x_test_centered, y_test_centered)) print("learned coefs:\n", reg_centered.coef_) print("intercept:", reg_centered.intercept_) # check if the regression coefficients are the same within a rounding error atol = 1e-14 print("True or False: the coefficients are the same?", np.all(np.isclose(reg_centered.coef_, reg.coef_, atol=atol)))

# ii) Linear regression with a column of ones added to X x_train1 = np.concatenate((x_train, np.ones((x_train.shape[0],1))), axis=1) x_test1 = np.concatenate((x_test, np.ones((x_test.shape[0],1))),axis=1) reg1 = LinearRegression(fit_intercept=False) reg1.fit(x_train1, y_train) print("Regression Results with an added column of ones to X:") print("training score:", reg1.score(x_train1, y_train)) print("test score", reg1.score(x_test1, y_test)) print("learned coefs:\n", reg1.coef_) print("intercept:", reg1.intercept_) # check if the regression coefficients are the same within a rounding error # note that there is one more coefficient, which is not included in the analysis atol = 1e-14 print("True or False: the coefficients are the same?", np.all(np.isclose(reg1.coef_[:x_train.shape[1]], reg.coef_, atol=atol))) print("Linear Regression Intercept: {}\nOnes-vector regression coef: {}".format(reg.intercept_, reg1.coef_[-1]))

def show_training_error(data=[x_train, y_train], name="boston"): """ Given a data set, returns and plots the training error of an OLS linear regression model. stops once error rises above 0 Params: data: list of training features and labels name: string of the name of the dataset Returns: train_errs: array of training errors as a function of n """ xtrain = data[0] ytrain = data[1] n, p = xtrain.shape train_errs = np.empty((n,1)) first0 = 0 # index when training error rises above 0 for i in range(1,n): print(i, "out of", n) temp_reg = LinearRegression() temp_reg.fit(xtrain[:i], ytrain[:i]) ypred = temp_reg.predict(xtrain[:i]) train_errs[i] = mean_squared_error(ypred, ytrain[:i]) # stop regression once error is greater than 0 if mean_squared_error(ypred, ytrain[:i]) > 1e-9: if first0 ==0: first0 = i if i - first0 > 20: stop = i break fig, ax = plt.subplots() ax.plot(range(stop), train_errs[:stop]) ax.set(xlabel='number of features', ylabel='training error (MSE)', title='Training error as a function of n on {} dataset (p={})'.format(name, p)) ax.vlines(p, 0, max(.1,train_errs[stop]), colors='r', linestyles='dashed') ax.legend({"training error", "p"}, loc=2) plt.ylim([0, max(.1, train_errs[stop])]) return train_errs

show_training_error(data=[x_train, y_train], name="Boston");

iris = sk.datasets.load_iris() show_training_error(data=[iris.data, iris.target], name="Iris");

diabetes = sk.datasets.load_diabetes() show_training_error(data=[diabetes.data, diabetes.target], name="Diabetes");

wine = sk.datasets.load_wine() show_training_error(data=[wine.data, wine.target], name="Wine");

cali = sk.datasets.fetch_california_housing() show_training_error(data=[cali.data, cali.target], name="California housing")