# Display plots inline
%matplotlib inline
# Data libraries
import pandas as pd
import numpy as np
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Plotting defaults
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['figure.dpi'] = 80
# sklearn modules
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
d = pd.read_csv("gp.csv")
n = d.shape[0] # number of rows
sns.scatterplot(x='x', y='y', data=d, color="black")
from sklearn.model_selection import train_test_split
X = np.c_[d.x]
y = d.y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
print("orig sizes :", X.shape, y.shape)
print("train sizes:", X_train.shape, y_train.shape)
print("test sizes :", X_test.shape, y_test.shape)
orig sizes : (100, 1) (100,)
train sizes: (60, 1) (60,)
test sizes : (40, 1) (40,)
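# np.c_[d.x] turns the 1-D column d.x into the 2-D (n, 1) array that sklearn
# estimators expect; a quick sketch confirming it matches an explicit reshape:
print(np.array_equal(X, d.x.to_numpy().reshape(-1, 1)))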
degree = []
train_rmse = []
test_rmse = []
M = 30
for i in np.arange(1, M+1):
    m = make_pipeline(
        PolynomialFeatures(degree=i),
        LinearRegression(fit_intercept=False)
    ).fit(X_train, y_train)

    degree.append(i)
    # squared=False makes mean_squared_error return the RMSE rather than the MSE
    train_rmse.append(mean_squared_error(y_train, m.predict(X_train), squared=False))
    test_rmse.append(mean_squared_error(y_test, m.predict(X_test), squared=False))
fit = pd.DataFrame(data = {"degree": degree, "train_rmse": train_rmse, "test_rmse": test_rmse})
sns.lineplot(x="degree", y="value", hue="variable", data = pd.melt(fit,id_vars=["degree"]))
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.split(X, y)
[test for train, test in kf.split(X, y)]
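# With shuffle=False (the default) each fold's test set is a contiguous block
# of row indices, which is a problem when the rows are ordered; a sketch of
# the shuffled alternative used below:
kf_shuffled = KFold(n_splits=5, shuffle=True, random_state=0)
[test for train, test in kf_shuffled.split(X, y)]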
from sklearn.model_selection import cross_val_score
model = make_pipeline(
    PolynomialFeatures(degree=1),
    LinearRegression(fit_intercept=False)
)
# Use shuffle to avoid the issue seen w/ Ex. 3 & 4
# random_state again sets a random seed so we get the same results each time this cell is run
kf = KFold(n_splits=5, shuffle=True, random_state=0)
cross_val_score(model, X, y, cv=kf, scoring="neg_root_mean_squared_error")
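# By sklearn's convention the scorer returns the *negated* RMSE (so that
# higher is always better); flip the sign to report RMSE per fold and overall:
scores = -cross_val_score(model, X, y, cv=kf, scoring="neg_root_mean_squared_error")
print(scores, "| mean:", scores.mean())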
degree = []
test_mean_rmse = []
test_rmse = []
M = 30
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for i in np.arange(1, M+1):
    model = make_pipeline(
        PolynomialFeatures(degree=i),
        LinearRegression(fit_intercept=False)
    )
    cv = -1 * cross_val_score(model, X, y, cv=kf, scoring="neg_root_mean_squared_error")

    degree.append(i)
    test_mean_rmse.append(np.mean(cv))
    test_rmse.append(cv)
cv = pd.DataFrame(
    data = np.c_[degree, test_mean_rmse, test_rmse],
    columns = ["degree", "mean_rmse"] + ["fold" + str(i) for i in range(1, 6)]
)
cv.head(n=15)
sns.lineplot(x="degree", y="mean_rmse", data = cv, color="black")
sns.scatterplot(x="degree", y="value", hue="variable", data = pd.melt(cv,id_vars=["degree", "mean_rmse"]))
g = sns.lineplot(x="degree", y="mean_rmse", data = cv, color="black")
g = sns.scatterplot(x="degree", y="value", hue="variable", data = pd.melt(cv,id_vars=["degree", "mean_rmse"]))
g.set_yscale("log")
from sklearn.model_selection import GridSearchCV
m = make_pipeline(
    PolynomialFeatures(),
    LinearRegression(fit_intercept=False)
)
parameters = {
    'polynomialfeatures__degree': np.arange(1, 31, 1)
}
kf = KFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(m, parameters, cv=kf, scoring="neg_root_mean_squared_error").fit(X, y)
print("best index: ", grid_search.best_index_)
print("best param: ", grid_search.best_params_)
print("best score: ", grid_search.best_score_)
best index: 12
best param: {'polynomialfeatures__degree': 13}
best score: -0.19616621662547432
grid_search.cv_results_["mean_test_score"]
grid_search.cv_results_["split0_test_score"]
grid_search.best_estimator_
grid_search.best_estimator_.named_steps['linearregression'].coef_
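# To pair each coefficient with its polynomial term, the feature names can be
# pulled from the PolynomialFeatures step (a sketch; assumes a sklearn version
# that provides get_feature_names_out):
best = grid_search.best_estimator_
pd.Series(
    best.named_steps['linearregression'].coef_,
    index=best.named_steps['polynomialfeatures'].get_feature_names_out(['x'])
)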
sns.scatterplot(x='x', y='y', data=d, color="black")
sns.lineplot(
    x=d.x,
    y=grid_search.best_estimator_.predict(X)
)
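# Predicting on an evenly spaced grid gives a smoother curve than reusing the
# observed x values (a sketch):
x_grid = np.linspace(d.x.min(), d.x.max(), 200)
sns.scatterplot(x='x', y='y', data=d, color="black")
sns.lineplot(x=x_grid, y=grid_search.best_estimator_.predict(np.c_[x_grid]))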
np.random.seed(1234)
n = 500
f = lambda x: 1.2 * x + 1.1
g = lambda x: 2.5 * x**2 - 0.9 * x - 3.2
h = lambda x: 2 * x**3 + 0.4 * x**2 - 5.2 * x + 2.7
ex2 = pd.DataFrame({
    "x1": np.random.rand(n),
    "x2": np.random.rand(n),
    "x3": np.random.rand(n)
}).assign(
    y = lambda df: f(df.x1) + g(df.x2) + h(df.x3) + 0.25*np.random.randn(n)  # epsilon (noise)
)
print(ex2)
x1 x2 x3 y
0 0.191519 0.883951 0.401106 0.207226
1 0.622109 0.741361 0.930614 -0.966704
2 0.437728 0.515711 0.515336 -0.899870
3 0.785359 0.135252 0.809582 -1.917369
4 0.779976 0.039884 0.881772 -1.179076
.. ... ... ... ...
495 0.267568 0.995838 0.202188 1.562021
496 0.932827 0.944205 0.372405 1.223071
497 0.826145 0.131144 0.173656 0.741909
498 0.145443 0.518355 0.345234 -0.633376
499 0.647004 0.477245 0.567738 -0.833677
[500 rows x 4 columns]
sns.pairplot(ex2)
X = ex2.drop(columns=['y'])  # keep X as a DataFrame (not an ndarray) so column names are available later
y = ex2.y
m = make_pipeline(
    PolynomialFeatures(degree=3),
    LinearRegression(fit_intercept=False)
)
fit = m.fit(X, y)
print( fit.named_steps['linearregression'].coef_ )
[ 0.28199407 2.32656568 0.20117018 -4.55915257 -1.10824194 -1.16081456
-0.98352703 0.83509629 -0.48174047 -0.32581289 0.32341883 0.46312716
0.55649515 0.53131633 0.12888933 0.34799729 0.66614765 0.52534563
0.03664093 2.2074557 ]
print( fit.named_steps['polynomialfeatures'].powers_ )
[[0 0 0]
[1 0 0]
[0 1 0]
[0 0 1]
[2 0 0]
[1 1 0]
[1 0 1]
[0 2 0]
[0 1 1]
[0 0 2]
[3 0 0]
[2 1 0]
[2 0 1]
[1 2 0]
[1 1 1]
[1 0 2]
[0 3 0]
[0 2 1]
[0 1 2]
[0 0 3]]
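# Each row of powers_ gives the exponents of (x1, x2, x3) for one feature, so
# [1 1 0] is the interaction x1*x2 and [0 0 3] is x3**3; a sketch rendering
# the terms as strings:
terms = ["*".join(f"x{j+1}^{p}" for j, p in enumerate(row) if p > 0) or "1"
         for row in fit.named_steps['polynomialfeatures'].powers_]
print(terms)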
kf = KFold(n_splits=5, shuffle=True, random_state=0)
cv = cross_val_score(m, X, y, cv=kf, scoring="neg_root_mean_squared_error")
print(cv)
print(cv.mean())
[-0.26798009 -0.25624857 -0.23095929 -0.24516022 -0.267326 ]
-0.253534831926424
from sklearn.compose import ColumnTransformer, make_column_transformer
ind_poly = make_column_transformer(
    (PolynomialFeatures(degree=3, include_bias=False), ['x1']),
    (PolynomialFeatures(degree=3, include_bias=False), ['x2']),
    (PolynomialFeatures(degree=3, include_bias=False), ['x3']),
)
trans = ind_poly.fit_transform(X, y)
pd.DataFrame(trans) # printing as a DataFrame makes the array more readable
pd.concat([X, pd.DataFrame(trans)], axis=1)
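# The transformer emits three blocks of three columns (x1, x1^2, x1^3, then
# likewise for x2 and x3); a sketch making that explicit via the generated
# feature names (assumes a sklearn version with get_feature_names_out):
pd.DataFrame(trans, columns=ind_poly.get_feature_names_out())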
m2 = make_pipeline(
    make_column_transformer(
        (PolynomialFeatures(degree=3, include_bias=False), ['x1']),
        (PolynomialFeatures(degree=3, include_bias=False), ['x2']),
        (PolynomialFeatures(degree=3, include_bias=False), ['x3']),
    ),
    LinearRegression(fit_intercept=True)
)
fit = m2.fit(X, y)
print(fit.named_steps['linearregression'].coef_)
print(fit.named_steps['linearregression'].intercept_)
cv = cross_val_score(m2, X, y, cv=5, scoring="neg_root_mean_squared_error")
print(cv)
print(cv.mean())
[-0.26494573 -0.27297603 -0.23030777 -0.22129478 -0.2645753 ]
-0.25081992382431384
m2.named_steps['columntransformer'].named_transformers_
m2.get_params().keys()
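# The tunable degree parameters for the grid below can be picked out of
# get_params() by suffix (a quick sketch):
print([k for k in m2.get_params() if k.endswith("degree")])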
parameters = {
    'columntransformer__polynomialfeatures-1__degree': np.arange(1, 5, 1),
    'columntransformer__polynomialfeatures-2__degree': np.arange(1, 5, 1),
    'columntransformer__polynomialfeatures-3__degree': np.arange(1, 5, 1),
}
kf = KFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(m2, parameters, cv=kf, scoring="neg_root_mean_squared_error").fit(X, y)
print("best index: ", grid_search.best_index_)
print("best param: ", grid_search.best_params_)
print("best score: ", grid_search.best_score_)
best index: 10
best param: {'columntransformer__polynomialfeatures-1__degree': 1, 'columntransformer__polynomialfeatures-2__degree': 3, 'columntransformer__polynomialfeatures-3__degree': 3}
best score: -0.24827845229249973
print(grid_search.best_estimator_.named_steps["linearregression"].intercept_)
print(grid_search.best_estimator_.named_steps["linearregression"].coef_)
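# The selected degrees (1, 3, 3) match how the data were generated, so the
# coefficients can be read against f, g, and h term by term; a sketch pairing
# each coefficient with its transformed column name (assumes a sklearn version
# where ColumnTransformer provides get_feature_names_out):
best = grid_search.best_estimator_
pd.Series(
    best.named_steps["linearregression"].coef_,
    index=best.named_steps["columntransformer"].get_feature_names_out()
)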
!jupyter nbconvert --to pdf mlp-week04.ipynb
[NbConvertApp] Converting notebook mlp-week04.ipynb to pdf
[NbConvertApp] Support files will be in mlp-week04_files/
[NbConvertApp] Making directory ./mlp-week04_files
[NbConvertApp] Writing 95777 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] WARNING | bibtex had problems, most likely because there were no citations
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 441720 bytes to mlp-week04.pdf