##### Notebook properties for better display
# Allow multiple outputs from single code chunk
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# Surpress Warnings
import warnings
warnings.filterwarnings("ignore")
##### Data Analysis
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
import numpy as np
##### Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
# Inline plotting
%matplotlib inline
##### ML
## Data splitting utilities (from the model selection module)
from sklearn.model_selection import train_test_split
## Linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
## Model evaluation tools
from yellowbrick.regressor import PredictionError, ResidualsPlot
# Load raw data
wine_df = pd.read_csv("/work/data/wine_quality.csv")
# Display the full frame
wine_df
# Summary statistics, rounded to two decimals
wine_df.describe().round(2)
# Pairwise relationships across all columns, colored by wine type;
# corner=True draws only the lower triangle of the scatter matrix
sns.pairplot(wine_df, hue='wine_type', corner=True);
# Same view restricted to the variables of interest for modeling
focus_columns = [
    'quality_score', 'alcohol', 'residual_sugar',
    'density', 'total_sulfur_dioxide',
]
sns.pairplot(wine_df, hue='wine_type', corner=True, vars=focus_columns);
# Pairwise correlations among the numeric columns, rounded to 2 decimals.
# numeric_only=True is required because the frame contains the non-numeric
# 'wine_type' column; without it .corr() raises a TypeError on pandas >= 2.0.
wine_corr_df = (
    wine_df
    .corr(numeric_only=True)
    .round(2)
)
wine_corr_df
# Visualize correlations with a diverging palette anchored at +/-1
sns.heatmap(wine_corr_df, cmap='vlag_r', vmin=-1, vmax=1);
# Data Splitting: hold out 25% of the rows for testing; the fixed
# random_state makes the split reproducible
feature_frame = wine_df[['alcohol']]  # double brackets keep a 2-D DataFrame
target_values = wine_df['quality_score'].values  # 1-D array of labels
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_values,
    train_size=0.75,
    random_state=314,
)
X_train.shape
y_train.shape
# Initialize and fit a simple linear regression: quality_score ~ alcohol.
wine_lm = LinearRegression()
# Trailing semicolon suppresses the estimator repr in the notebook display
wine_lm.fit(X_train, y_train);
# coef_ is a 1-D array with one entry per feature; index it rather than
# calling float() on it — converting a size-1 array to a scalar was
# deprecated in NumPy 1.25 and is an error in NumPy 2.x
"Intercept: {:.2f}. Slope: {:.2f}".format(wine_lm.intercept_, wine_lm.coef_[0])
# Predictions on both splits, reused by the evaluation cells below
y_train_pred = wine_lm.predict(X_train)
y_test_pred = wine_lm.predict(X_test)
# View the test-set predictions next to the actual values
pd.DataFrame(
    {
        'quality_score': y_test,
        'predicted_quality_score': np.round(y_test_pred, 2)
    }
)
# R2 Metric
## Provided by default with the score() method
wine_lm.score(X_train, y_train)
wine_lm.score(X_test, y_test)
# RMSE: take sqrt of the MSE explicitly — the `squared=False` argument
# was deprecated in scikit-learn 1.4 and removed in 1.6. The predictions
# computed above are reused instead of calling predict() again.
np.sqrt(mean_squared_error(y_train, y_train_pred))
np.sqrt(mean_squared_error(y_test, y_test_pred))
# Average prediction error as a proportion of the mean quality score
np.sqrt(mean_squared_error(y_test, y_test_pred)) / np.mean(y_test)
# Collect training-set actuals, predictions, and their differences
# into a single table for plotting
training_results = pd.DataFrame({
    'quality_score': y_train,
    'predicted_quality_score': np.round(y_train_pred, 2),
    'prediction_error': np.round(y_train_pred - y_train, 2),
})
training_results
# Scatter of actual vs. predicted quality on the training data,
# without a fitted regression line
point_style = {'alpha': 0.25}
sns.lmplot(
    data=training_results,
    x="quality_score",
    y="predicted_quality_score",
    fit_reg=False,
    scatter_kws=point_style
)
# Force both axes onto the same 4-10.5 range so the plot is comparable
plt.ylim(4, 10.5)
plt.xlim(4, 10.5);
# Dashed red y = x reference line: perfect predictions would fall on it
plt.axline((0, 0), (1, 1), linestyle='dashed', color='r');
# Yellowbrick prediction-error plot: actual vs. predicted values.
# is_fitted=True skips refitting the already-trained model;
# bestfit=False drops the best-fit line overlay.
wine_lm_pred = PredictionError(wine_lm, is_fitted=True, bestfit=False, alpha=0.4)
# score() evaluates on the test split and draws the points
wine_lm_pred.score(X_test, y_test)
wine_lm_pred.show();
# Residuals diagnostic plot — visible structure in the residuals
# would suggest the linear fit is missing something
wine_lm_residuals = ResidualsPlot(wine_lm, is_fitted=True)
wine_lm_residuals.score(X_test, y_test)
wine_lm_residuals.show();