# Silence every warning category for the rest of the session (this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')
!ls /datasets/datahub/ml-repo/loan-eligibility-xgboost
# import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import sys
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import math
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.feature_selection import RFE
sys.path.append('/work/ml_pipeline')
from eda import plot_histograms, plot_univariate_numeric, plot_univariate_categorical, plot_heatmap, plot_paired_boxplots, plot_paired_scatterplots, plot_residuals, plot_pearson_wrt_target
from stats import chi2, anova
from model_performance import calc_model_performance, compare_model_performance, calc_preds_in_residual_range, calc_preds_in_residual_perc_range
# Load the insurance CSV, then look at the first rows and the column summary.
DATA_PATH = '/datasets/datahub/ml-repo/loan-eligibility-xgboost/insurance.csv'
data = pd.read_csv(DATA_PATH)
data.head()
data.info()

# Separate the regression target from the feature columns.
target = 'charges'
y = data[target]
X = data.drop(columns=target)
X.shape, y.shape
# Exploratory plots. The plot_* helpers come from the local `eda` module.

# Histograms: one call for all features, one for the target alone
# (the target figure is drawn at a reduced height).
plot_histograms(X)
plot_histograms(y.to_frame(), height=300)

# Numeric features against the target.
plot_univariate_numeric(X.select_dtypes(include=np.number), y)

# Categorical-style features against the target; 'children' is passed here
# alongside the string-valued columns.
plot_univariate_categorical(X[['sex', 'smoker', 'region', 'children']], y)

# Heatmap of the numeric features against the target using 10 bins.
plot_heatmap(X[['age', 'bmi', 'children']], y, bins=10)

# Paired views: boxplots for the string-valued columns, scatterplots for all features.
plot_paired_boxplots(X[['sex', 'smoker', 'region']], y)
plot_paired_scatterplots(X, y)

# Relationships among the numeric features themselves:
# pairwise scatter matrix and correlation matrix rendered as an image.
px.scatter_matrix(X.select_dtypes(include=np.number))
px.imshow(X.select_dtypes(include=np.number).corr())
# Statistical tests. `chi2` and `anova` come from the local `stats` module.
SIGNIFICANCE_LEVEL = 0.05

# Chi-squared results for the object-dtype features, then only the rows
# whose p-value falls below the significance level.
X_chi2 = chi2(X.select_dtypes(include=object))
X_chi2
chi2_significant = X_chi2['p_value'] < SIGNIFICANCE_LEVEL
X_chi2[chi2_significant]

# ANOVA results for the feature frame, filtered the same way.
X_anova = anova(X)
X_anova
anova_significant = X_anova['p_value'] < SIGNIFICANCE_LEVEL
X_anova[anova_significant]

# Pearson correlation of the features with respect to the target.
plot_pearson_wrt_target(X, y)

# ANOVA again on the full frame (it includes the target column), keeping
# only the rows whose numeric column is the target.
data_anova = anova(data)
is_target_row = data_anova['num_column'] == 'charges'
anova_wrt_target = data_anova[is_target_row]
anova_wrt_target
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Features left out of the baseline model.
cols_to_drop = [
    'children',
    'region',
    'sex',
]
# Rebind the result instead of dropping in place: the frames returned by
# train_test_split are derived from X, and in-place mutation of such frames
# can raise SettingWithCopyWarning on some pandas/scikit-learn versions
# (warnings are silenced globally in this file, so it would go unnoticed).
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

# One-hot encode the remaining categorical columns. The encoder is fitted on
# the training frame only and then applied as-is to the test frame.
ohe = OneHotEncoder(use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

# Remove the 'smoker_no' indicator produced by the encoder
# (presumably redundant with the remaining smoker indicator -- confirm).
cols_to_drop = ['smoker_no']
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)
# Yeo-Johnson power transform of the target. The transformer is fitted on the
# training target only and then applied unchanged to the test target.
pt = PowerTransformer(method='yeo-johnson')

# The transformer works on 2-D input, so each target is reshaped to a single
# column and flattened back to 1-D afterwards.
y_train_t = pt.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test_t = pt.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

# Histogram of each transformed target.
for transformed_target in (y_train_t, y_test_t):
    pd.Series(transformed_target).hist(figsize=(5, 3))
# Baseline: linear regression on the transformed target.
# Each training row is weighted by its raw charge divided by the smallest
# training charge.
sample_weight = y_train.div(y_train.min())
lr = LinearRegression().fit(X_train, y_train_t, sample_weight=sample_weight)

# Predict in the transformed space, then map the predictions back to the
# original target scale with the fitted power transformer.
y_pred_train = pt.inverse_transform(lr.predict(X_train).reshape(-1, 1)).ravel()
y_pred_test = pt.inverse_transform(lr.predict(X_test).reshape(-1, 1)).ravel()

# Baseline performance on the training and test sets.
base_perf_train = calc_model_performance(y_train, y_pred_train)
base_perf_train
base_perf_test = calc_model_performance(y_test, y_pred_test)
base_perf_test
# Residuals of the baseline model on the original target scale (actual - predicted).
residuals_train = y_train.sub(y_pred_train)
residuals_test = y_test.sub(y_pred_test)

# Q-Q plot of each residual set against a fitted distribution, with a 45-degree
# reference line. `fig` ends up bound to the test-set figure.
for residual_set in (residuals_train, residuals_test):
    fig = sm.qqplot(residual_set, fit=True, line='45')

# Residual plot for the training predictions (helper from the local `eda` module).
plot_residuals(y_true=y_train, y_pred=y_pred_train)

# Residuals against the actual target values, for train and test.
px.scatter(x=y_train, y=residuals_train)
px.scatter(x=y_test, y=residuals_test)
# Split X and y again for the XGBoost model, using a 33% test fraction and a
# fixed seed. X itself still holds every feature column, so this split is not
# restricted to the columns kept for the baseline model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# One-hot encode: learn the encoding from the training frame only, then apply
# the fitted encoder to the test frame.
ohe = OneHotEncoder(use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)
# Two-stage pipeline: recursive feature elimination (ranked by an XGBoost
# regressor) followed by the XGBoost regressor that makes the predictions.
rfe = RFE(estimator=XGBRegressor())
xgb = XGBRegressor()
steps = [('rfe', rfe), ('xgb', xgb)]
pipe = Pipeline(steps=steps)

# Upper bound for the number of features RFE may keep.
_, num_features = X_train.shape

# Hyperparameter ranges explored by the Bayesian search. Keys use the
# '<step name>__<parameter>' form so they are routed to the right pipeline step.
# NOTE(review): the learning-rate range includes 0 -- confirm whether a small
# positive lower bound was intended.
search_spaces = {
    'rfe__n_features_to_select': Integer(1, num_features),  # features kept by RFE
    'xgb__n_estimators': Integer(1, 500),                   # number of boosted trees
    'xgb__max_depth': Integer(2, 8),                        # maximum tree depth
    'xgb__reg_lambda': Integer(1, 200),                     # regularisation term (lambda)
    'xgb__learning_rate': Real(0, 1),                       # learning rate
    'xgb__gamma': Real(0, 2000),                            # gamma
}

# Scoring is negative RMSE because the search maximises its scoring metric.
xgb_bs_cv = BayesSearchCV(
    estimator=pipe,
    search_spaces=search_spaces,
    scoring='neg_root_mean_squared_error',
    n_iter=75,        # optimisation iterations
    cv=3,             # cross-validation folds
    n_jobs=-1,        # use all available cores
    verbose=1,        # show progress
    random_state=0,   # reproducible search
)
xgb_bs_cv.fit(X_train, y_train)
# Tabulate the search results, ordered by rank of the test score.
cv_results = pd.DataFrame(xgb_bs_cv.cv_results_)
cv_results = cv_results.sort_values(by='rank_test_score')
cv_results

# Predictions from the fitted search object on both splits.
y_pred_train_xgb = xgb_bs_cv.predict(X_train)
y_pred_test_xgb = xgb_bs_cv.predict(X_test)

# XGBoost performance on the training and test sets.
xgb_perf_train = calc_model_performance(y_train, y_pred_train_xgb)
xgb_perf_train
xgb_perf_test = calc_model_performance(y_test, y_pred_test_xgb)
xgb_perf_test

# Baseline versus XGBoost, per split.
perf_comp_train = compare_model_performance(base_perf_train, xgb_perf_train)
perf_comp_test = compare_model_performance(base_perf_test, xgb_perf_test)
perf_comp_train
perf_comp_test

# Test-set predictions within an absolute residual range of 2000 ...
calc_preds_in_residual_range(y_true=y_test, y_pred=y_pred_test_xgb, range_=2000)
# ... and within a percentage residual range of 20.
calc_preds_in_residual_perc_range(y_true=y_test, y_pred=y_pred_test_xgb, perc_range=20)