import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
%matplotlib inline
# Load the wage data set: column 0 = age (years), column 1 = yearly wage (k$)
# -- assumed from the axis labels below; confirm against the .npy producer.
data = np.load('Wages.npy')
print(np.shape(data))
data_x, data_y = data[:, 0], data[:, 1]

# Quick visual check of the raw data before any modelling.
plt.scatter(data_x, data_y, facecolor='None', edgecolor='k', alpha=0.3)
plt.xlabel('age (years)', fontsize=16)
plt.ylabel('yearly wage (k $)', fontsize=16)
plt.show()
# Hold out a third of the data so model quality can later be judged on
# points the fit never saw; fixed random_state keeps the split reproducible.
trainData_x, testData_x, trainData_y, testData_y = train_test_split(
    data_x, data_y, test_size=0.33, random_state=1)

# Fit a polynomial of each order to the TRAINING data and overlay the
# predictions on the training scatter, one subplot per order.
polynomialOrder = [1, 3, 6, 8, 16, 32]
fig = plt.figure(figsize=(10, 20))
for i, order in enumerate(polynomialOrder):
    # Least-squares polynomial fit of the requested order.
    coeffs = np.polyfit(trainData_x, trainData_y, order)
    yPredict = np.polyval(coeffs, trainData_x)

    ax0 = fig.add_subplot(6, 2, i + 1)
    ax0.set_title('Data-set for regression with polynomial order %s' % order)
    ax0.set_xlabel('age (years)')
    ax0.set_ylabel('yearly wage (k $)')
    ax0.scatter(trainData_x, trainData_y,
                facecolor='None', edgecolor='k', alpha=0.3)
    ax0.plot(trainData_x, yPredict, "o")
fig.tight_layout()  # optional: spaces the subplots out (hoisted out of the loop)
plt.show()
# Evaluate each polynomial order on both subsets.
# BUG FIX: the coefficients must be fitted on the TRAINING data only, and the
# test MSE computed by applying those same coefficients to the held-out test
# points. The previous version refitted np.polyfit on the test data, which
# measures in-sample fit of a test-trained model, not generalization.
polynomialOrder = [1, 3, 6, 8, 16, 32]
meanSquaredError_train = []
meanSquaredError_test = []
for p in polynomialOrder:
    # Fit on the training subset only.
    coeffs = np.polyfit(trainData_x, trainData_y, p)
    # In-sample error on the training points.
    meanSquaredError_train.append(
        mean_squared_error(trainData_y, np.polyval(coeffs, trainData_x)))
    # Generalization error: same train-fitted model applied to unseen points.
    meanSquaredError_test.append(
        mean_squared_error(testData_y, np.polyval(coeffs, testData_x)))

print("MSE for the training data subset:")
print(meanSquaredError_train)
print("\n")
print("MSE for the test data subset:")
print(meanSquaredError_test)
# Compare train vs test error as model flexibility grows: a widening gap at
# high polynomial orders is the classic signature of overfitting.
for errors, fmt, name in ((meanSquaredError_train, "r:o", "Train"),
                          (meanSquaredError_test, "b:o", "Test")):
    plt.plot(polynomialOrder, errors, fmt, label=name)
plt.legend()
plt.show()