Insurance Pricing

# Silence every warning for the remainder of the session.
import warnings

warnings.filterwarnings(action='ignore')

# IPython shell escape (not plain Python): list the contents of the
# directory that the dataset is read from further down.
!ls /datasets/datahub/ml-repo/loan-eligibility-xgboost

# Standard library
import math
import sys

# Third-party: data handling, plotting, modelling, hyper-parameter search
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from category_encoders import OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real
from xgboost import XGBRegressor

# The helper modules below are imported only after /work/ml_pipeline has
# been added to the module search path (presumably where they live).
sys.path.append('/work/ml_pipeline')
from eda import (
    plot_heatmap,
    plot_histograms,
    plot_paired_boxplots,
    plot_paired_scatterplots,
    plot_pearson_wrt_target,
    plot_residuals,
    plot_univariate_categorical,
    plot_univariate_numeric,
)
from stats import anova, chi2
from model_performance import (
    calc_model_performance,
    calc_preds_in_residual_perc_range,
    calc_preds_in_residual_range,
    compare_model_performance,
)

# Read the raw insurance records into a DataFrame.
data = pd.read_csv(
    '/datasets/datahub/ml-repo/loan-eligibility-xgboost/insurance.csv'
)

# Preview the first rows.
data.head()

# Column dtypes and non-null counts.
data.info()

# Separate the regression target from the predictor columns.
target = 'charges'
y = data[target]
X = data.drop(columns=target)

# Dimensions of the feature frame and of the target series.
X.shape, y.shape

# Histogram of every feature (plotly).
plot_histograms(X)

# Histogram of the target (plotly), on a shorter canvas.
plot_histograms(y.to_frame(), height=300)

# Numeric features against the target.
plot_univariate_numeric(X.select_dtypes(include=np.number), y)

# Categorical features against the target; `children` is treated as a
# category here even though it is also used as a number below.
plot_univariate_categorical(X[['sex', 'smoker', 'region', 'children']], y)

# Heatmap over the numeric columns, using 10 bins.
plot_heatmap(X[['age', 'bmi', 'children']], y, bins=10)

# Paired views of the features against the target.
plot_paired_boxplots(X[['sex', 'smoker', 'region']], y)

plot_paired_scatterplots(X, y)

# Pairwise scatter plots and correlation matrix of the numeric features.
numeric_features = X.select_dtypes(include=np.number)
px.scatter_matrix(numeric_features)

px.imshow(numeric_features.corr())

# Run the project's chi2 helper on the object-dtype (categorical) features.
X_chi2 = chi2(X.select_dtypes(include=object))

X_chi2

# Keep only the rows whose p-value is below the 5% threshold.
X_chi2.loc[X_chi2['p_value'] < 0.05]

# Run the project's anova helper on the feature frame.
X_anova = anova(X)

X_anova

# Keep only the rows whose p-value is below the 5% threshold.
X_anova.loc[X_anova['p_value'] < 0.05]

# Pearson correlation of the features with respect to the target.
plot_pearson_wrt_target(X, y)

# Use data as it contains the target
data_anova = anova(data)
anova_wrt_target = data_anova.loc[data_anova['num_column'] == 'charges']

anova_wrt_target

# Hold out a third of the rows for testing; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Remove the features that are not used by the linear model.
# The frames returned by train_test_split are derived from X, and dropping
# from them in place can raise pandas' SettingWithCopyWarning (hidden here
# because warnings are globally ignored). Rebinding to the result of a
# non-inplace drop yields the same frames without mutating a derived slice.
cols_to_drop = ['children', 'region', 'sex']
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

# One-hot encode the remaining categorical columns. The encoder is fitted
# on the training data only and then reused unchanged on the test data.
ohe = OneHotEncoder(use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

# Drop one of the generated smoker indicator columns (presumably because it
# is redundant with its complement; confirm against the encoder output).
cols_to_drop = ['smoker_no']
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

# Yeo-Johnson power transform of the target. The transformer is fitted on
# the training target only; the test target reuses the fitted parameters.
# The transformer needs a 2-D column, so the values are reshaped going in
# and flattened back to 1-D coming out.
pt = PowerTransformer(method='yeo-johnson')
y_train_t = pt.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test_t = pt.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

# Histograms of the transformed train and test targets.
pd.Series(y_train_t).hist(figsize=(5, 3))
pd.Series(y_test_t).hist(figsize=(5, 3))

# Weight every training row by its charge relative to the smallest charge,
# so the cheapest record has weight 1 and costlier records count for more.
sample_weight = y_train.div(y_train.min())

# Baseline: linear regression fitted on the power-transformed target.
lr = LinearRegression().fit(
    X_train,
    y_train_t,
    sample_weight=sample_weight,
)

# Predict in the transformed space, then map the predictions back to the
# original scale of the target with the fitted power transformer.
y_pred_train = pt.inverse_transform(
    lr.predict(X_train).reshape(-1, 1)
)[:, 0]
y_pred_test = pt.inverse_transform(
    lr.predict(X_test).reshape(-1, 1)
)[:, 0]

# Performance of the baseline model on the training data.
base_perf_train = calc_model_performance(y_train, y_pred_train)

base_perf_train

# Performance of the baseline model on the held-out data.
base_perf_test = calc_model_performance(y_test, y_pred_test)

base_perf_test

# Residuals (actual minus predicted) on the original target scale.
residuals_train = y_train.sub(y_pred_train)
residuals_test = y_test.sub(y_pred_test)

# Q-Q plots of the residuals against a fitted distribution, drawn with a
# 45-degree reference line.
fig = sm.qqplot(residuals_train, fit=True, line='45')

fig = sm.qqplot(residuals_test, fit=True, line='45')

# Residual plot for the training predictions (project helper).
plot_residuals(y_true=y_train, y_pred=y_pred_train)

# Residuals against the actual charges, for train and test.
px.scatter(x=y_train, y=residuals_train)

px.scatter(x=y_test, y=residuals_test)

# Split again from the full feature frame, with the same test size and seed
# as the earlier split, so this model starts from all the original columns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

ohe = OneHotEncoder(use_cat_names=True)
# Fit and transform the training dataset
X_train = ohe.fit_transform(X_train)
# Transform the testing dataset using the fitted encoder
X_test = ohe.transform(X_test)

# Recursive feature elimination (driven by its own XGBoost regressor)
# followed by the XGBoost regressor that makes the final predictions.
rfe = RFE(estimator=XGBRegressor())
xgb = XGBRegressor()

steps = [
    ('rfe', rfe),
    ('xgb', xgb),
]

pipe = Pipeline(steps=steps)

# Hyper-parameter ranges explored by the Bayesian search. Keys follow the
# `<pipeline step>__<parameter>` convention of the pipeline.
num_features = X_train.shape[1]
search_spaces = {
    # Num features returned by RFE
    'rfe__n_features_to_select': Integer(1, num_features),
    # Num trees built by XGBoost
    'xgb__n_estimators': Integer(1, 500),
    # Max depth of trees built by XGBoost
    'xgb__max_depth': Integer(2, 8),
    # Regularisation term (lambda) used in XGBoost
    'xgb__reg_lambda': Integer(1, 200),
    # Learning rate used in XGBoost. The lower bound is strictly positive:
    # a learning rate of exactly 0 scales every tree's contribution to
    # nothing, so such a candidate would only waste a search iteration.
    'xgb__learning_rate': Real(1e-3, 1),
    # Gamma used in XGBoost
    'xgb__gamma': Real(0, 2000),
}

# Bayesian optimisation over the pipeline's hyper-parameters.
xgb_bs_cv = BayesSearchCV(
    # Pipeline
    estimator=pipe,
    # Search spaces
    search_spaces=search_spaces,
    # BayesSearchCV tries to maximise scoring metric, so negative RMSE used
    scoring='neg_root_mean_squared_error',
    # Num of optimisation iterations
    n_iter=75,
    # Number of folds
    cv=3,
    # Uses all available cores to compute
    n_jobs=-1,
    # Show progress
    verbose=1,
    # Ensures reproducible results
    random_state=0,
)

# Run the search on the training data.
xgb_bs_cv.fit(X_train, y_train)

# Cross-validation results of every candidate, best rank first.
cv_results = pd.DataFrame(xgb_bs_cv.cv_results_).sort_values(
    by='rank_test_score'
)

cv_results

# Predictions from the fitted search object on train and test data.
y_pred_train_xgb = xgb_bs_cv.predict(X_train)
y_pred_test_xgb = xgb_bs_cv.predict(X_test)

# Performance of the tuned XGBoost pipeline on the training data.
xgb_perf_train = calc_model_performance(y_train, y_pred_train_xgb)

xgb_perf_train

# Performance of the tuned XGBoost pipeline on the held-out data.
xgb_perf_test = calc_model_performance(y_test, y_pred_test_xgb)

xgb_perf_test

# Baseline linear model versus XGBoost, separately for train and test.
perf_comp_train = compare_model_performance(base_perf_train, xgb_perf_train)
perf_comp_test = compare_model_performance(base_perf_test, xgb_perf_test)

perf_comp_train

perf_comp_test

# Test predictions checked against a residual range of 2000.
calc_preds_in_residual_range(
    y_true=y_test,
    y_pred=y_pred_test_xgb,
    range_=2000,
)

# Test predictions checked against a percentage residual range of 20.
calc_preds_in_residual_perc_range(
    y_true=y_test,
    y_pred=y_pred_test_xgb,
    perc_range=20,
)