Hotel Booking Churn Rate and Sales Forecasting - A Correlation-Causation Study
Background & Objective
Executive Summary
Connecting to Snowflake Data Warehouse
import os
import pandas as pd
import snowflake.connector
WAREHOUSE = "COMPUTE_WH"
DATABASE = "HOTEL_BOOKING"
# read the password from an environment variable rather than hard-coding credentials in the notebook
conn = snowflake.connector.connect(user="amirfaar",
                                   password=os.environ["SNOWFLAKE_PASSWORD"],
                                   account="ba80824.east-us-2.azure",
                                   warehouse=WAREHOUSE,
                                   database=DATABASE,
                                   schema="PUBLIC")
conn.cursor().execute("USE WAREHOUSE " + WAREHOUSE)
conn.cursor().execute("USE DATABASE " + DATABASE)
view_dict = {
'booking_sales_data':
"""SELECT * FROM booking_sales_data;""",
}
for k, v in view_dict.items():
    # materialize each query result as a top-level DataFrame named after its key
    locals()[k] = pd.read_sql(v, conn)
conn.close()
Feature Engineering
df = booking_sales_data.copy()
# lower-case all column names
df.columns = [c.lower() for c in df.columns]
print(df.shape)
display(df)
print(df.dtypes)
Handling Missing Values
print(df.isna().mean().sort_values(ascending = False)[:10])
display(df.loc[:, df.columns[df.isna().mean() != 0]].head())
# plotting libraries used throughout the notebook
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

(df.isna().mean().sort_values(ascending=False)[:4] * 100).plot.bar()
# adding data labels
ax = plt.gca()
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % int(p.get_height()),
            fontsize=14, color='black', ha='center', va='bottom')
plt.yscale('log')
plt.title('Columns with Missing values (% NaN)')
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light', fontsize='medium')
plt.show()
# if the value of agent and company column is null, replace with 0
df[['agent', 'company']] = df[['agent', 'company']].fillna(0.0)
# if the value of country column (string) is null, replace with MODE value
df['country'].fillna(df.country.mode()[0], inplace=True)
# if the value of children column is null, replace with MEDIAN
df['children'] = df['children'].fillna(df.children.median())
# if the sum of adult, baby, and child is 0, drop row
df.drop(df[(df.adults+df.babies+df.children)==0].index, inplace = True)
print(df.shape)
Feature Creation
df_subset = df.copy()
# Create [total_staying_nights] column
df_subset['total_staying_nights'] = df_subset[
'stays_in_week_nights'] + df_subset['stays_in_weekend_nights']
# drop rows where total_staying_nights below 1
df_subset.drop(df_subset[df_subset['total_staying_nights'] < 1].index,
inplace=True)
df_subset = df_subset.reset_index(drop=True)
# Create [room] column
# if assigned room is the same, then 1, else 0
df_subset['room'] = 0
df_subset.loc[df_subset['reserved_room_type'] ==
df_subset['assigned_room_type'], 'room'] = 1
# Create [more_canceled] column
# if the cancellation is higher, then 1, else 0
df_subset['more_canceled'] = 0
df_subset.loc[df_subset['previous_cancellations'] >
df_subset['previous_bookings_not_canceled'], 'more_canceled'] = 1
# Separate year/month/day from the reservation_status_date column
df_subset['reservation_status_date'] = pd.to_datetime(df_subset['reservation_status_date'])
df_subset['reservation_year'] = df_subset['reservation_status_date'].dt.year
df_subset['reservation_month'] = df_subset['reservation_status_date'].dt.month
df_subset['reservation_day'] = df_subset['reservation_status_date'].dt.day
df_subset.drop(df_subset.loc[:, df_subset.columns.str.contains('arrival')],
axis=1,
inplace=True)
# remove unnecessary columns
# drop 'reservation_status' as we will use 'is_canceled': canceled (1) or not (0)
df_subset = df_subset.drop([
'reservation_status', 'booking_changes', 'assigned_room_type',
'reservation_status_date', 'distribution_channel'
],
axis=1)
display(df_subset)
EDA: Cancellation Analysis
mpl.rcParams['figure.figsize'] = (4.5, 3.5)
sns.set_context('talk', font_scale=0.7)
sns.boxplot(y='lead_time',
x="is_canceled",
data=df_subset,
palette='RdBu_r')
plt.yscale('log')
plt.show()
mpl.rcParams['figure.figsize'] = (12, 20)
sns.set_context('talk', font_scale=.78)
cols = [
'hotel', 'total_of_special_requests', 'required_car_parking_spaces',
'room', 'more_canceled', 'market_segment', 'reservation_month',
'deposit_type'
]
col_index = 0
nrows = 4
ncols = 2
fig, ax = plt.subplots(nrows, ncols)
for i in range(nrows):
    for j in range(ncols):
        # % cancelled vs. not cancelled within each level of the current column
        ggg = pd.DataFrame(
            df_subset.groupby([cols[col_index], 'is_canceled'])['is_canceled'].count() /
            df_subset.groupby([cols[col_index]])['is_canceled'].count() *
            100.0).unstack()
        # Grouped bar chart
        labels = list(ggg.index)
        x = np.arange(len(ggg.index))  # the label locations
        width = 0.35  # the width of the bars
        rects1 = ax[i, j].bar(x - width/2, ggg['is_canceled'][0], width, label='Not Canc.')
        rects2 = ax[i, j].bar(x + width/2, ggg['is_canceled'][1], width, label='Canc.')
        # Add text for labels, title and custom x-axis tick labels
        ax[i, j].set_ylabel('Count %')
        ax[i, j].set_xlabel(cols[col_index])
        ax[i, j].tick_params(axis='x', rotation=45)
        ax[i, j].set_xticks(x, labels)
        ax[i, j].legend(loc='best')
        # data labels
        ax[i, j].bar_label(rects1, padding=3, fmt='%.0f', fontsize=10)
        ax[i, j].bar_label(rects2, padding=3, fmt='%.0f', fontsize=10)
        # move on to the next column name in cols
        col_index += 1
fig.tight_layout()
plt.show()
Correlation
mpl.rcParams['figure.figsize'] = (23, 12)
sns.set_context('talk', font_scale=0.7)
df_corr = df_subset.select_dtypes(exclude='object')
# keep only the lower triangle of the correlation matrix (np.bool was removed from NumPy, so use the builtin bool)
sns.heatmap(df_corr.corr(method='pearson').where(
    np.tril(np.ones(df_corr.corr().shape)).astype(bool)),
cmap='PuBu',
linewidths=1,
annot=True,
annot_kws={"size": 12},
fmt=".0%")
plt.show()
mpl.rcParams['figure.figsize'] = (20, 6)
sns.set_context('talk', font_scale=0.8)
df.select_dtypes(exclude='object').corr()['is_canceled'].sort_values(ascending=False)[1:].plot(kind='bar')
plt.xticks(rotation=65,
horizontalalignment='right',
fontweight='light',
fontsize='medium')
plt.title("Pearson Correlation Values for Cancellation")
plt.show()
Encoding Categorical Features
Y = df_subset.pop('is_canceled')
X = df_subset
# Convert children to "int"
X['children'] = X.children.apply(int)
# Hotel is binary, so we will use Pandas replace()
X['hotel'] = X['hotel'].replace({'City Hotel':1, 'Resort Hotel':0})
# country is a nominal (not ordinal) category, so pandas' get_dummies() one-hot encoding would be the natural
# choice, but it would add 177 features to the DataFrame!
# Instead, use a more targeted encoding, assuming we mainly care about whether the country is the US or not:
X['country_US'] = np.where(X['country'].str.contains('USA'), 1, 0)
X.drop('country', inplace = True, axis = 1)
meal_code = {'SC': 0, 'Undefined': 0, 'BB': 1, 'HB': 2, 'FB': 3}
X['meal'] = X['meal'].map(meal_code)
deposit_type_code = {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 2}
X['deposit_type'] = X['deposit_type'].map(deposit_type_code)
# one-hot encoding
cat_cols = ['market_segment', 'reserved_room_type', 'customer_type']
# one-hot encode, prefixing each dummy with its source column name
X = pd.get_dummies(X, columns=cat_cols, drop_first=False)
X.shape
X[['agent', 'company']].corr()
# drop company and agent
X.drop(['company', 'agent'], axis =1, inplace = True)
X.shape
Z-score Scaling
# Checking the variance before
X.var().sort_values(ascending = False)[:10]
cols = list(X.columns)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns = cols)
X.var().sort_values(ascending = False)[:10]
Modeling Churn Rate
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
Y,
test_size=0.20,
stratify=Y)
ML Model Comparison
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
# RandomForest
rd_clf = RandomForestClassifier()  # uses the default n_estimators (number of decision trees in the forest)
rd_clf.fit(X_train, y_train)
pred_rd_clf = rd_clf.predict(X_test)
# XGBoost
xgb = XGBClassifier(booster = 'gbtree', n_estimators = 500)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
from sklearn.metrics import roc_auc_score
models_auc = pd.DataFrame({
'Model': [
'Logistic Regression',
'Random Forest Classifier',
'XgBoost',
],
'AUC': [
round(roc_auc_score(y_test, pred_lr), 4),
round(roc_auc_score(y_test, pred_rd_clf), 4),
round(roc_auc_score(y_test, pred_xgb), 4)
]
})
models_auc.sort_values(by='AUC', ascending=False)
import plotly.express as px
fig = px.bar(data_frame=models_auc,
x='AUC',
y='Model',
orientation='h',
color='AUC',
title='Models Comparison')
fig.show()
# ROC data to a DataFrame
from sklearn.metrics import roc_curve
models = ['lr', 'rd_clf', 'xgb']
roc = pd.DataFrame()
for m in models:
    # predicted probability of the positive class for each fitted model
    y_pred_proba_test = locals()[m].predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_test)
    roc = pd.concat([
        roc,
        pd.DataFrame({
            f'FPR(Fall-out)_{m}': fpr,
            f'TPRate(Recall)_{m}': tpr,
            f'Threshold_{m}': thresholds
        })
    ], axis=1)
roc.drop(roc.columns[roc.columns.str.contains('Threshold')], axis = 1, inplace = True)
roc
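To compare the three classifiers visually, the collected ROC points can be overlaid on a single plot; the sketch below simply assumes the column names generated by the f-strings above.
# overlay the ROC curves of the three fitted models
mpl.rcParams['figure.figsize'] = (8, 8)
for m in models:
    plt.plot(roc[f'FPR(Fall-out)_{m}'], roc[f'TPRate(Recall)_{m}'], label=m)
plt.plot([0, 1], [0, 1], 'k--', label='chance')  # diagonal = random classifier
plt.xlabel('FPR (Fall-out)')
plt.ylabel('TPR (Recall)')
plt.title('ROC Curves: Model Comparison')
plt.legend()
plt.show()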
XGB Best Hyperparameters via Hyperopt Bayesian Optimization
# import machine learning libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from timeit import default_timer as timer
# Define the search space (range of values) for each hyperparameter
space = {
'max_depth': hp.quniform("max_depth", 2, 8, 1),
'gamma': hp.uniform('gamma', 1, 10),
'reg_lambda': hp.uniform('reg_lambda', 1, 2),
'min_child_weight': hp.uniform('min_child_weight', 0.75, 1),
'learning_rate': hp.uniform('learning_rate', 0.1, 0.3),
'n_estimators': 500,
'booster': 'gbtree',
'seed': 0
}
# Define the objective function for the XGB classifier
def objective(space):
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    import xgboost as xgb

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, stratify=Y)
    xgb_clf = xgb.XGBClassifier(
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_lambda=space['reg_lambda'],
        min_child_weight=space['min_child_weight'],  # keep as float (the search range is 0.75-1)
        learning_rate=space['learning_rate'],
        n_estimators=space['n_estimators'],
        booster=space['booster']
    )
    evaluation = [(X_train, y_train), (X_test, y_test)]
    xgb_clf.fit(
        X_train,
        y_train,
        eval_set=evaluation,
        eval_metric="auc",
        early_stopping_rounds=10,
        verbose=False)
    pred = xgb_clf.predict(X_test)
    # classification metric: predict() already returns 0/1 class labels
    accuracy = accuracy_score(y_test, pred)
    print("SCORE:", accuracy)
    return {
        'loss': -accuracy,  # negative of the accuracy because hyperopt minimizes the loss
        'status': STATUS_OK
    }
# Optimization algorithm
# trials database in which to store all the point evaluations of the search
trials = Trials()
# fmin function to minimize the objective function
best_hyperparams = fmin(fn = objective,
space = space,
algo = tpe.suggest,
max_evals = 450, # number of iterations to do
trials = trials)
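After fmin returns, best_hyperparams maps each tuned parameter to its winning value; note that hp.quniform samples come back as floats, which is why max_depth is cast back to int when the model is rebuilt below. A quick inspection:
# inspect the best hyperparameters and the best (highest) accuracy reached during the search
print(best_hyperparams)
print('best accuracy:', -min(trials.losses()))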
XGB Model
def fit_xgb(x, y, test_size):
    '''>>> fits, evaluates and returns an XGBoost classifier;
    >>> x: independent variables as a DataFrame;
    >>> y: dependent variable as a Series;
    >>> test_size: float between 0 and 1
    '''
    # libraries
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style('white')
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
    from xgboost import XGBClassifier
    import warnings
    warnings.filterwarnings('ignore')

    # split the arguments that were passed in (not the globals)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, stratify=y)

    # build the classifier from the tuned hyperparameters
    xgb_clf = XGBClassifier(booster='gbtree',
                            n_estimators=500,
                            gamma=best_hyperparams['gamma'],
                            learning_rate=best_hyperparams['learning_rate'],
                            max_depth=int(best_hyperparams['max_depth']),
                            min_child_weight=best_hyperparams['min_child_weight'],
                            reg_lambda=best_hyperparams['reg_lambda'])
    xgb_clf.fit(X_train, y_train)
    pred_xgb = xgb_clf.predict(X_test)

    # confusion matrix plotting
    mpl.rcParams['figure.figsize'] = (8, 6)
    sns.set_context('talk', font_scale=0.85)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(confusion_matrix(y_test, pred_xgb),
                annot=True, linewidths=0.01, cmap="Greens",
                linecolor="gray", fmt='.1f')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion Matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # report table
    print('=== classification_report ===')
    print(classification_report(y_test, pred_xgb, digits=6))

    # ROC curve
    y_pred_proba_test = xgb_clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_test)
    roc = pd.DataFrame({
        'FPR(Fall-out)': fpr,
        'TPRate(Recall)': tpr,
        'Threshold': thresholds
    })
    mpl.rcParams['figure.figsize'] = (8, 8)
    plt.scatter(fpr, tpr)
    plt.plot(fpr, tpr, 'bo', alpha=0.2)
    plt.xlabel("FPR (Fall-out)")
    plt.ylabel("TPR (Recall)")
    plt.title('ROC Curve')
    plt.show()

    # AUC calc (on the hard class labels, as in the model-comparison step above)
    print('\n AUC: ', round(roc_auc_score(y_test, pred_xgb), 6))

    # feature_importance plot
    mpl.rcParams['figure.figsize'] = (18, 6)
    sns.set_context('talk', font_scale=0.8)
    feature_impt = pd.Series(xgb_clf.feature_importances_,
                             x.columns).sort_values(ascending=False)
    feature_impt.plot(kind='bar')
    plt.yscale('log')
    plt.title('Feature Importance')
    plt.show()

    # top 25th percentile
    feature_impt[feature_impt >= feature_impt.quantile(0.75)].plot(kind='bar')
    plt.title('Feature Importance (top 25th percentile)')
    plt.show()

    # return the fitted model so it can be reused (e.g. by SHAP below)
    return xgb_clf
xgb_model = fit_xgb(X, Y, 0.2)
Model Explainability
SHAP (SHapley Additive exPlanations)
import shap
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X_test)
# Summary Plot (global explanation): feature importance measured as the mean absolute Shapley values
shap.plots.bar(shap_values)
# Summary Plot as a dot chart - global explanation
shap.summary_plot(shap_values, X_test, plot_type='dot')
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot('adr', shap_values.values, features=X_test)
Bayesian Networks Causal Inference for Churn
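df_bayes is not constructed earlier in the notebook, so a minimal sketch of a plausible input follows, assuming a handful of discrete, churn-related columns taken from the cleaned frame df (the exact column choice is an assumption):
# Assumption: restrict the network to discrete churn-related columns from the cleaned frame `df`
bayes_cols = ['is_canceled', 'hotel', 'deposit_type', 'market_segment',
              'customer_type', 'total_of_special_requests', 'required_car_parking_spaces']
df_bayes = df[bayes_cols].copy()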
import bnlearn as bn
DAG = bn.structure_learning.fit(
df_bayes,
methodtype='cl',
scoretype='bic')
# interactive plot
G = bn.plot(DAG,
scale=1,
node_color=None,
node_size=None,
params_interactive={
'directed': True,
'height': '800px',
'width': '70%',
'notebook': False,
'heading': 'Bayesian Causal Inference',
'layout': None,
'font_color': False,
'bgcolor': '#ffffff'
},
params_static={
'width': 25,
'height': 15,
'font_size': 10,
'font_family': 'times new roman',
'alpha': 0
},
interactive=True)
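With a structure learned, bnlearn can also estimate the conditional probability tables and answer queries on them; the sketch below is one possible follow-up, assuming df_bayes keeps the raw categorical labels (the evidence value is purely illustrative):
# hedged sketch: learn CPTs on the learned DAG, then query churn probability under illustrative evidence
model_bn = bn.parameter_learning.fit(DAG, df_bayes)
query = bn.inference.fit(model_bn,
                         variables=['is_canceled'],
                         evidence={'deposit_type': 'Non Refund'})
print(query)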
Sales Forecasting: fbprophet
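df_fbp is not built earlier in the notebook; a minimal sketch, assuming it is a copy of the cleaned frame df (which still carries the arrival-date columns) with the stay length recomputed:
# Assumption: build the forecasting input from the cleaned frame `df`, recomputing total nights stayed
df_fbp = df.copy()
df_fbp['total_staying_nights'] = (df_fbp['stays_in_week_nights'] +
                                  df_fbp['stays_in_weekend_nights'])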
df_fbp['sales'] = df_fbp['total_staying_nights'] * df_fbp['adr']
# create arrival date (Y/M/D)
# create month:digit dictionary
import calendar
month_dict = dict((v, k) for k, v in enumerate(calendar.month_name))
# replace by mapping
df_fbp.arrival_date_month = df_fbp.arrival_date_month.map(month_dict)
# concatenate year, month and day to create the arrival date
cols = ["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"]
df_fbp['arrival_date'] = df_fbp[cols].apply(
lambda x: '-'.join(x.values.astype(str)), axis="columns")
df_fbp['arrival_date'] = pd.to_datetime(df_fbp['arrival_date'],
format='%Y-%m-%d')
df_sales = df_fbp.groupby(['arrival_date'])[['sales']].sum()
df_sales
mpl.rcParams['figure.figsize'] = (18, 10)
sns.set_context('talk', font_scale=0.8)
plt.subplot(2, 2, 1)
plt.plot(df_sales.groupby(df_sales.index.to_period('D')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('D')).sum().reset_index()['sales'])
plt.title('Daily Sales'); plt.xlabel(''); plt.xticks([])
plt.subplot(2, 2, 2)
plt.plot(df_sales.groupby(df_sales.index.to_period('W')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('W')).sum().reset_index()['sales'])
plt.title('Weekly Sales'); plt.xlabel(''); plt.xticks([])
plt.subplot(2, 2, 3)
plt.plot(df_sales.groupby(df_sales.index.to_period('M')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('M')).sum().reset_index()['sales'])
plt.title('Monthly Sales'); plt.tick_params(axis='x', rotation=45)
plt.subplot(2, 2, 4)
plt.plot(df_sales.groupby(df_sales.index.to_period('Q')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('Q')).sum().reset_index()['sales']);
plt.title('Quarterly Sales'); plt.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# set up the model 'm' with a 95% uncertainty interval (the "interval_width" argument)
from fbprophet import Prophet
m = Prophet(seasonality_mode='additive',
interval_width=0.95,
daily_seasonality=False,
yearly_seasonality=True,
weekly_seasonality=False)
# Prophet expects columns named 'ds' (date) and 'y' (value), so reshape df_sales before fitting
df_prophet = df_sales.reset_index().rename(columns={'arrival_date': 'ds', 'sales': 'y'})
model = m.fit(df_prophet)
# forecast
future = m.make_future_dataframe(periods = 365, freq = 'D') # Creating the 'future' DataFrame
forecast = m.predict(future)
forecast.head()
mpl.rcParams['figure.figsize'] = (18, 6)
sns.set_context('talk', font_scale = 0.7)
plot1 = m.plot(forecast)
plt2 = m.plot_components(forecast)