import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
nba_data = pd.read_csv('NBA.csv')
nba_data.head()
   season  poss
0    1987  4847
1    2017  5582
2    2016  4976
3    2021  1178
4    1988  4534
# single-feature model: predicting minutes played (mp) from win rating
X = nba_data[['win_rating']]
y = nba_data['mp']
model = LinearRegression()
cv_results = cross_validate(model, X, y, cv=5)
cv_score = cv_results['test_score'].mean()
cv_score
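# Optional sanity check (not part of the original run): cross_validate scores LinearRegression
# with R^2 by default, so the individual fold scores can be inspected before averaging.
print("Per-fold R2 scores:", cv_results['test_score'])
print("Mean R2:", cv_results['test_score'].mean())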
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
# training-set sizes from 100 to 3200 observations, in 32 steps
train_sizes = np.linspace(100, 3200, 32, dtype='int')
train_sizes, train_scores, test_scores = learning_curve(estimator=LinearRegression(),
                                                        X=X,
                                                        y=y,
                                                        train_sizes=train_sizes,
                                                        cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.figure(figsize=(10,6))
plt.plot(train_sizes, train_scores_mean, label='training score')
plt.plot(train_sizes, test_scores_mean, label='test score')
plt.ylabel('r2 score')
plt.xlabel('training set size')
plt.legend();
# model with additional features: predicting win rating from mp, poss, do_ratio, and pacing
X = nba_data[['mp', 'poss', 'do_ratio', 'pacing']]
y = nba_data['win_rating']
model = LinearRegression()
cross_val = cross_validate(model, X, y, cv=5)
score_added_features = cross_val['test_score'].mean()
score_added_features
train_sizes = np.linspace(100, 3200, 32, dtype='int')
train_sizes, train_scores, test_scores = learning_curve(estimator=LinearRegression(),
                                                        X=X,
                                                        y=y,
                                                        train_sizes=train_sizes,
                                                        cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.figure(figsize=(10,6))
plt.plot(train_sizes, train_scores_mean, label='training score')
plt.plot(train_sizes, test_scores_mean, label='test score')
plt.ylabel('r2 score')
plt.xlabel('training set size')
plt.legend();
import seaborn as sns
sns.scatterplot(data=nba_data, x='mp', y='win_rating', alpha=0.5);
from sklearn.model_selection import train_test_split
# training the model
model = LinearRegression()
X = nba_data[['mp']]
y = nba_data['win_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model.fit(X_train, y_train)
# scoring the model
lin_reg_score = model.score(X_test, y_test)
print("Model R2:", lin_reg_score)
# extracting the coefficients and regression function
regression = model.coef_[0] * nba_data['mp'] + model.intercept_
# plotting the data and learned regression function
sns.scatterplot(data=nba_data, x='mp', y='win_rating', alpha=0.5)
plt.plot(nba_data['mp'], regression, color='red', linewidth=3);
Model R2: 0.5441185829799919
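# For reference, the fitted relationship can be written out explicitly; the exact numbers below
# depend on the train/test split and are not taken from the original run.
print(f"win_rating ~ {model.coef_[0]:.4f} * mp + {model.intercept_:.2f}")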
from sklearn.preprocessing import PolynomialFeatures
polynomial_features = PolynomialFeatures(degree=2, include_bias=False) # we don't want to add a column of 1's
X_poly = polynomial_features.fit_transform(X)
X_poly = pd.DataFrame(X_poly) # turning it back into a DataFrame for easier manipulation
X_poly.head()
        0          1
0  2409.0  5803281.0
1  2708.0  7333264.0
2  2407.0  5793649.0
3   585.0   342225.0
4  2056.0  4227136.0
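# To make the transformed columns self-describing, the generated feature names can be attached
# (this assumes scikit-learn >= 1.0, where get_feature_names_out is available).
X_poly.columns = polynomial_features.get_feature_names_out()
X_poly.head()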
# sorting by mp so the fitted curve plots cleanly from left to right
sorted_df = nba_data.sort_values('mp')
X = sorted_df[['mp']]
y = sorted_df['win_rating']
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
model.fit(X_poly, y)
predictions = model.predict(X_poly)
sns.scatterplot(x=X['mp'], y=y, alpha=0.5)
plt.plot(X['mp'], predictions, linewidth=3, color='r');
# degree-2 polynomial features on all four predictors, evaluated with cross-validation
X = nba_data[['mp', 'poss', 'do_ratio', 'pacing']]
y = nba_data['win_rating']
polynomial_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = polynomial_features.fit_transform(X)
model = LinearRegression()
cv_results = cross_validate(model, X_poly, y, cv=5)
cv_results['test_score'].mean()
# comparing polynomial degrees 1 through 10 by mean cross-validated R2
X = nba_data[['mp', 'poss', 'do_ratio', 'pacing']]
y = nba_data['win_rating']
degrees = list(range(1, 11))
scores = []
for i in degrees:
    polynomial_features = PolynomialFeatures(degree=i, include_bias=False)
    X_poly = polynomial_features.fit_transform(X)
    model = LinearRegression()
    cv_results = cross_validate(model, X_poly, y, cv=5)
    scores.append(cv_results['test_score'].mean())
scores
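# Plotting the mean cross-validated R2 against the polynomial degree makes the comparison easier
# to read than the raw list; a quick sketch using the degrees and scores computed above
# (very negative scores at high degrees may need plt.ylim to keep the plot readable).
plt.figure(figsize=(10,6))
plt.plot(degrees, scores, marker='o')
plt.xlabel('polynomial degree')
plt.ylabel('mean cross-validated r2 score');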
# Transform our X to include polynomial features
poly_features = PolynomialFeatures(degree=5, include_bias=False)
X_poly = poly_features.fit_transform(X)
# Get train scores, train sizes, and validation scores using `learning_curve`, r2 score
train_sizes, train_scores, test_scores = learning_curve(
    estimator=LinearRegression(),
    X=X_poly,
    y=y,
    train_sizes=train_sizes,
    cv=5
)
# Take the mean of cross-validated train scores and test scores
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
# Plot the learning curves!
plt.figure(figsize=(10,6))
plt.plot(train_sizes, train_scores_mean, label = 'Training score')
plt.plot(train_sizes, test_scores_mean, label = 'Test score')
plt.ylabel('r2 score', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves', fontsize = 18, y = 1.03)
plt.ylim(0,1)
plt.legend();
# create the training size slices
train_sizes = np.linspace(100, 3200, 32, dtype='int')
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
# Get train scores, train sizes, and validation scores using `learning_curve`, r2 score
train_sizes, train_scores, test_scores = learning_curve(estimator=LinearRegression(),
                                                        X=X_poly,
                                                        y=y,
                                                        train_sizes=train_sizes,
                                                        cv=5)
# Take the mean of cross-validated train scores and validation scores
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
# Plot the learning curves!
plt.figure(figsize=(10,6))
plt.plot(train_sizes, train_scores_mean, label = 'training score')
plt.plot(train_sizes, test_scores_mean, label = 'test score')
plt.ylabel('r2 score', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves', fontsize = 18, y = 1.03)
plt.legend();
# Plotting the learning curves
plt.figure(figsize=(10,6))
plt.plot(train_sizes, train_scores_mean, label = 'training score')
plt.plot(train_sizes, test_scores_mean, label = 'test score')
plt.ylabel('r2 score', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves', fontsize = 18, y = 1.03)
# Plotting a line where difference of train and test score becomes <1%
plt.axvline(1400, linestyle='--', c='black')
plt.annotate('Past this line:\ntrain_score - test_score <= 0.01', xy=(1450, 0.7))
# Comparing test scores at that line and at max training data (80% of data)
plt.scatter(train_sizes[14], test_scores_mean[14], c='orange', s=50)
plt.annotate(f"R2: {round(test_scores_mean[14],2)}",
xy=(train_sizes[14] + 50, test_scores_mean[14] - 0.03),
fontsize=12, c='orange')
plt.scatter(train_sizes[31], test_scores_mean[31], c='orange', s=50)
plt.annotate(f"R2: {round(test_scores_mean[31],2)}",
xy=(train_sizes[31] - 200, test_scores_mean[31] - 0.03),
fontsize=12, c='orange')
plt.legend();
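# A possible refinement (not in the original figure): shading one standard deviation around each
# mean curve shows how much the scores vary across the 5 folds, using the train_scores and
# test_scores arrays already returned by learning_curve above.
train_scores_std = np.std(train_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.figure(figsize=(10,6))
plt.plot(train_sizes, train_scores_mean, label='training score')
plt.plot(train_sizes, test_scores_mean, label='test score')
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2)
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2)
plt.ylabel('r2 score', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.legend();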
### 1st Model ###
# preparing the features and target
X = nba_data[['mp', 'poss', 'do_ratio', 'pacing']]
y = nba_data.win_rating
# instantiating a linear regression model
model = LinearRegression()
# fitting the model
model.fit(X, y)
# calculating the MSE
reg_score = np.mean((model.predict(X) - y) ** 2)
print("Regular model MSE:", round(reg_score, 2))
### 2nd Model ###
# preparing the features and target
X = nba_data[['mp', 'poss', 'do_ratio', 'pacing']]
y = nba_data.win_rating
# transforming the features with degree-2 polynomial features
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
# instantiating a linear regression model
model = LinearRegression()
# fitting the model
model.fit(X_poly, y)
# calculating the MSE
poly_score = np.mean((model.predict(X_poly) - y) ** 2)
print("Degree-2 polynomial feature model MSE", round(poly_score, 2))
Regular model MSE: 4.36
Degree-2 polynomial feature model MSE: 1.53
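# Training-set MSE always improves as features are added, so the comparison above can flatter the
# degree-2 model. A minimal sketch of the same comparison on a held-out test set (exact numbers
# will differ from the training MSEs printed above):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
lin_model = LinearRegression().fit(X_train, y_train)
print("Regular model test MSE:", round(np.mean((lin_model.predict(X_test) - y_test) ** 2), 2))
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
poly_model = LinearRegression().fit(X_train_poly, y_train)
print("Degree-2 polynomial test MSE:", round(np.mean((poly_model.predict(X_test_poly) - y_test) ** 2), 2))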