Hotel Booking Churn Rate and Sales Forecasting - A Correlation-Causation Study
Background & Objective
Executive Summary
Connecting to Snowflake Data Warehouse
import os
import pandas as pd
import snowflake.connector
WAREHOUSE = "COMPUTE_WH"
DATABASE = "HOTEL_BOOKING"
# read the password from an environment variable rather than hard-coding credentials in the notebook
conn = snowflake.connector.connect(user="amirfaar",
                                   password=os.environ["SNOWFLAKE_PASSWORD"],
                                   account="ba80824.east-us-2.azure",
                                   warehouse=WAREHOUSE,
                                   database=DATABASE,
                                   schema="PUBLIC")
conn.cursor().execute("USE WAREHOUSE " + WAREHOUSE)
conn.cursor().execute("USE DATABASE " + DATABASE)
view_dict = {
'booking_sales_data':
"""SELECT * FROM booking_sales_data;""",
}
for k, v in view_dict.items():
    # materialize each query result as a top-level DataFrame named after its key
    locals()[k] = pd.read_sql(v, conn)
conn.close()
Feature Engineering
df = booking_sales_data.copy()
# lower-case all column names
df.columns = [c.lower() for c in df.columns]
print(df.shape)
display(df)
print(df.dtypes)
Handling Missing Values
print(df.isna().mean().sort_values(ascending = False)[:10])
display(df.loc[:, df.columns[df.isna().mean() != 0]].head())
# plotting libraries used throughout the notebook
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

(df.isna().mean().sort_values(ascending=False)[:4] * 100).plot.bar()
# adding data labels
ax = plt.gca()
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % int(p.get_height()),
            fontsize=14, color='black', ha='center', va='bottom')
plt.yscale('log')
plt.title('Columns with Missing values (% NaN)')
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light', fontsize='medium')
plt.show()
# if the value of agent and company column is null, replace with 0
df[['agent', 'company']] = df[['agent', 'company']].fillna(0.0)
# if the value of country column (string) is null, replace with MODE value
df['country'].fillna(df.country.mode()[0], inplace=True)
# if the value of children column is null, replace with MEDIAN
df['children'] = df['children'].fillna(df.children.median())
# if the sum of adult, baby, and child is 0, drop row
df.drop(df[(df.adults+df.babies+df.children)==0].index, inplace = True)
print(df.shape)
Feature Creation
df_subset = df.copy()
# Create [total_staying_nights] column
df_subset['total_staying_nights'] = df_subset[
'stays_in_week_nights'] + df_subset['stays_in_weekend_nights']
# drop rows where total_staying_nights below 1
df_subset.drop(df_subset[df_subset['total_staying_nights'] < 1].index,
inplace=True)
df_subset = df_subset.reset_index(drop=True)
# Create [room] column
# if assigned room is the same, then 1, else 0
df_subset['room'] = 0
df_subset.loc[df_subset['reserved_room_type'] ==
df_subset['assigned_room_type'], 'room'] = 1
# Create [more_canceled] column
# if the cancellation is higher, then 1, else 0
df_subset['more_canceled'] = 0
df_subset.loc[df_subset['previous_cancellations'] >
df_subset['previous_bookings_not_canceled'], 'more_canceled'] = 1
# Separate year/month/day from the reservation_status_date column
df_subset['reservation_status_date'] = pd.to_datetime(df_subset['reservation_status_date'])
df_subset['reservation_year'] = df_subset['reservation_status_date'].dt.year
df_subset['reservation_month'] = df_subset['reservation_status_date'].dt.month
df_subset['reservation_day'] = df_subset['reservation_status_date'].dt.day
df_subset.drop(df_subset.loc[:, df_subset.columns.str.contains('arrival')],
axis=1,
inplace=True)
# remove unnecessary columns
# drop 'reservation_status' as we will use 'is_canceled': canceled (1) or not (0)
df_subset = df_subset.drop([
'reservation_status', 'booking_changes', 'assigned_room_type',
'reservation_status_date', 'distribution_channel'
],
axis=1)
display(df_subset)
EDA: Cancellation Analysis
mpl.rcParams['figure.figsize'] = (4.5, 3.5)
sns.set_context('talk', font_scale=0.7)
sns.boxplot(y='lead_time',
x="is_canceled",
data=df_subset,
palette='RdBu_r')
plt.yscale('log')
plt.show()
mpl.rcParams['figure.figsize'] = (12, 20)
sns.set_context('talk', font_scale=.78)
cols = [
'hotel', 'total_of_special_requests', 'required_car_parking_spaces',
'room', 'more_canceled', 'market_segment', 'reservation_month',
'deposit_type'
]
col_index = 0
nrows = 4
ncols = 2
fig, ax = plt.subplots(nrows, ncols)
for i in range(nrows):
    for j in range(ncols):
        # % cancelled vs. not cancelled within each level of the current column
        ggg = pd.DataFrame(
            df_subset.groupby([cols[col_index], 'is_canceled'])['is_canceled'].count() /
            df_subset.groupby([cols[col_index]])['is_canceled'].count() *
            100.0).unstack()
        # Grouped bar chart
        labels = list(ggg.index)
        x = np.arange(len(ggg.index))  # the label locations
        width = 0.35  # the width of the bars
        rects1 = ax[i, j].bar(x - width/2, ggg['is_canceled'][0], width, label='Not Canc.')
        rects2 = ax[i, j].bar(x + width/2, ggg['is_canceled'][1], width, label='Canc.')
        # Add text for labels, title and custom x-axis tick labels
        ax[i, j].set_ylabel('Count %')
        ax[i, j].set_xlabel(cols[col_index])
        ax[i, j].tick_params(axis='x', rotation=45)
        ax[i, j].set_xticks(x, labels)
        ax[i, j].legend(loc='best')
        # data labels
        ax[i, j].bar_label(rects1, padding=3, fmt='%.0f', fontsize=10)
        ax[i, j].bar_label(rects2, padding=3, fmt='%.0f', fontsize=10)
        # move on to the next column name in cols
        col_index += 1
fig.tight_layout()
plt.show()
Correlation
mpl.rcParams['figure.figsize'] = (23, 12)
sns.set_context('talk', font_scale=0.7)
df_corr = df_subset.select_dtypes(exclude='object')
# keep only the lower triangle of the correlation matrix (np.bool was removed from NumPy, so use the builtin bool)
sns.heatmap(df_corr.corr(method='pearson').where(
    np.tril(np.ones(df_corr.corr().shape)).astype(bool)),
cmap='PuBu',
linewidths=1,
annot=True,
annot_kws={"size": 12},
fmt=".0%")
plt.show()
mpl.rcParams['figure.figsize'] = (20, 6)
sns.set_context('talk', font_scale=0.8)
df.select_dtypes(exclude='object').corr()['is_canceled'].sort_values(ascending=False)[1:].plot(kind='bar')
plt.xticks(rotation=65,
horizontalalignment='right',
fontweight='light',
fontsize='medium')
plt.title("Pearson Correlation Values for Cancellation")
plt.show()
Encoding Categorical Features
Y = df_subset.pop('is_canceled')
X = df_subset
# Convert children to "int"
X['children'] = X.children.apply(int)
# Hotel is binary, so we will use Pandas replace()
X['hotel'] = X['hotel'].replace({'City Hotel':1, 'Resort Hotel':0})
# country is a nominal (not ordinal) category, so pandas' get_dummies() one-hot encoding would be the natural
# choice, but it would add 177 features to the DataFrame!
# Instead, use a more targeted encoding, assuming we mainly care about whether the country is the US or not:
X['country_US'] = np.where(X['country'].str.contains('USA'), 1, 0)
X.drop('country', inplace = True, axis = 1)
meal_code = {'SC': 0, 'Undefined': 0, 'BB': 1, 'HB': 2, 'FB': 3}
X['meal'] = X['meal'].map(meal_code)
deposit_type_code = {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 2}
X['deposit_type'] = X['deposit_type'].map(deposit_type_code)
# one-hot encoding
cat_cols = ['market_segment', 'reserved_room_type', 'customer_type']
# one-hot encode, prefixing each dummy with its source column name
X = pd.get_dummies(X, columns=cat_cols, drop_first=False)
X.shape
X[['agent', 'company']].corr()
# drop company and agent
X.drop(['company', 'agent'], axis =1, inplace = True)
X.shape
Z-score Scaling
# Checking the variance before
X.var().sort_values(ascending = False)[:10]
cols = list(X.columns)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns = cols)
X.var().sort_values(ascending = False)[:10]
Modeling Churn Rate
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
Y,
test_size=0.20,
stratify=Y)
ML Model Comparison
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)
# RandomForest
rd_clf = RandomForestClassifier()  # uses the default n_estimators (number of decision trees in the forest)
rd_clf.fit(X_train, y_train)
pred_rd_clf = rd_clf.predict(X_test)
# XGBoost
xgb = XGBClassifier(booster = 'gbtree', n_estimators = 500)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
from sklearn.metrics import roc_auc_score
models_auc = pd.DataFrame({
'Model': [
'Logistic Regression',
'Random Forest Classifier',
'XgBoost',
],
'AUC': [
round(roc_auc_score(y_test, pred_lr), 4),
round(roc_auc_score(y_test, pred_rd_clf), 4),
round(roc_auc_score(y_test, pred_xgb), 4)
]
})
models_auc.sort_values(by='AUC', ascending=False)
import plotly.express as px
fig = px.bar(data_frame=models_auc,
x='AUC',
y='Model',
orientation='h',
color='AUC',
title='Models Comparison')
fig.show()
# ROC data to a DataFrame
from sklearn.metrics import roc_curve
models = ['lr', 'rd_clf', 'xgb']
roc = pd.DataFrame()
for m in models:
    # predicted probability of the positive class for each fitted model
    y_pred_proba_test = locals()[m].predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_test)
    roc = pd.concat([
        roc,
        pd.DataFrame({
            f'FPR(Fall-out)_{m}': fpr,
            f'TPRate(Recall)_{m}': tpr,
            f'Threshold_{m}': thresholds
        })
    ], axis=1)
roc.drop(roc.columns[roc.columns.str.contains('Threshold')], axis = 1, inplace = True)
roc
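To compare the three classifiers visually, the collected ROC points can be overlaid on a single plot; the sketch below simply assumes the column names generated by the f-strings above.
# overlay the ROC curves of the three fitted models
mpl.rcParams['figure.figsize'] = (8, 8)
for m in models:
    plt.plot(roc[f'FPR(Fall-out)_{m}'], roc[f'TPRate(Recall)_{m}'], label=m)
plt.plot([0, 1], [0, 1], 'k--', label='chance')  # diagonal = random classifier
plt.xlabel('FPR (Fall-out)')
plt.ylabel('TPR (Recall)')
plt.title('ROC Curves: Model Comparison')
plt.legend()
plt.show()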
XGB Best Hyperparameters via Hyperopt Bayesian Optimization
# import machine learning libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from timeit import default_timer as timer
# Define the search space (range of values) for each hyperparameter
space = {
'max_depth': hp.quniform("max_depth", 2, 8, 1),
'gamma': hp.uniform('gamma', 1, 10),
'reg_lambda': hp.uniform('reg_lambda', 1, 2),
'min_child_weight': hp.uniform('min_child_weight', 0.75, 1),
'learning_rate': hp.uniform('learning_rate', 0.1, 0.3),
'n_estimators': 500,
'booster': 'gbtree',
'seed': 0
}
# Define the objective function for the XGB classifier
def objective(space):
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    import xgboost as xgb

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, stratify=Y)
    xgb_clf = xgb.XGBClassifier(
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_lambda=space['reg_lambda'],
        min_child_weight=space['min_child_weight'],  # keep as float (the search range is 0.75-1)
        learning_rate=space['learning_rate'],
        n_estimators=space['n_estimators'],
        booster=space['booster']
    )
    evaluation = [(X_train, y_train), (X_test, y_test)]
    xgb_clf.fit(
        X_train,
        y_train,
        eval_set=evaluation,
        eval_metric="auc",
        early_stopping_rounds=10,
        verbose=False)
    pred = xgb_clf.predict(X_test)
    # classification metric: predict() already returns 0/1 class labels
    accuracy = accuracy_score(y_test, pred)
    print("SCORE:", accuracy)
    return {
        'loss': -accuracy,  # negative of the accuracy because hyperopt minimizes the loss
        'status': STATUS_OK
    }
# Optimization algorithm
# trials database in which to store all the point evaluations of the search
trials = Trials()
# fmin function to minimize the objective function
best_hyperparams = fmin(fn = objective,
space = space,
algo = tpe.suggest,
max_evals = 450, # number of iterations to do
trials = trials)
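After fmin returns, best_hyperparams maps each tuned parameter to its winning value; note that hp.quniform samples come back as floats, which is why max_depth is cast back to int when the model is rebuilt below. A quick inspection:
# inspect the best hyperparameters and the best (highest) accuracy reached during the search
print(best_hyperparams)
print('best accuracy:', -min(trials.losses()))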
XGB Model
def fit_xgb(x, y, test_size):
    '''>>> fits, evaluates and returns an XGBoost classifier;
    >>> x: independent variables as a DataFrame;
    >>> y: dependent variable as a Series;
    >>> test_size: float between 0 and 1
    '''
    # libraries
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style('white')
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
    from xgboost import XGBClassifier
    import warnings
    warnings.filterwarnings('ignore')

    # split the arguments that were passed in (not the globals)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, stratify=y)

    # build the classifier from the tuned hyperparameters
    xgb_clf = XGBClassifier(booster='gbtree',
                            n_estimators=500,
                            gamma=best_hyperparams['gamma'],
                            learning_rate=best_hyperparams['learning_rate'],
                            max_depth=int(best_hyperparams['max_depth']),
                            min_child_weight=best_hyperparams['min_child_weight'],
                            reg_lambda=best_hyperparams['reg_lambda'])
    xgb_clf.fit(X_train, y_train)
    pred_xgb = xgb_clf.predict(X_test)

    # confusion matrix plotting
    mpl.rcParams['figure.figsize'] = (8, 6)
    sns.set_context('talk', font_scale=0.85)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(confusion_matrix(y_test, pred_xgb),
                annot=True, linewidths=0.01, cmap="Greens",
                linecolor="gray", fmt='.1f')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion Matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # report table
    print('=== classification_report ===')
    print(classification_report(y_test, pred_xgb, digits=6))

    # ROC curve
    y_pred_proba_test = xgb_clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_test)
    roc = pd.DataFrame({
        'FPR(Fall-out)': fpr,
        'TPRate(Recall)': tpr,
        'Threshold': thresholds
    })
    mpl.rcParams['figure.figsize'] = (8, 8)
    plt.scatter(fpr, tpr)
    plt.plot(fpr, tpr, 'bo', alpha=0.2)
    plt.xlabel("FPR (Fall-out)")
    plt.ylabel("TPR (Recall)")
    plt.title('ROC Curve')
    plt.show()

    # AUC calc (on the hard class labels, as in the model-comparison step above)
    print('\n AUC: ', round(roc_auc_score(y_test, pred_xgb), 6))

    # feature_importance plot
    mpl.rcParams['figure.figsize'] = (18, 6)
    sns.set_context('talk', font_scale=0.8)
    feature_impt = pd.Series(xgb_clf.feature_importances_,
                             x.columns).sort_values(ascending=False)
    feature_impt.plot(kind='bar')
    plt.yscale('log')
    plt.title('Feature Importance')
    plt.show()

    # top 25th percentile
    feature_impt[feature_impt >= feature_impt.quantile(0.75)].plot(kind='bar')
    plt.title('Feature Importance (top 25th percentile)')
    plt.show()

    # return the fitted model so it can be reused (e.g. by SHAP below)
    return xgb_clf
xgb_model = fit_xgb(X, Y, 0.2)
Model Explainability
SHAP (SHapley Additive exPlanations)
import shap
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X_test)
# Summary Plot (global explanation): feature importance measured as the mean absolute Shapley values
shap.plots.bar(shap_values)
# Summary Plot as a dot chart - global explanation
shap.summary_plot(shap_values, X_test, plot_type='dot')
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot('adr', shap_values.values, features=X_test)
Bayesian Networks Causal Inference for Churn
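df_bayes is not constructed earlier in the notebook, so a minimal sketch of a plausible input follows, assuming a handful of discrete, churn-related columns taken from the cleaned frame df (the exact column choice is an assumption):
# Assumption: restrict the network to discrete churn-related columns from the cleaned frame `df`
bayes_cols = ['is_canceled', 'hotel', 'deposit_type', 'market_segment',
              'customer_type', 'total_of_special_requests', 'required_car_parking_spaces']
df_bayes = df[bayes_cols].copy()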
import bnlearn as bn
DAG = bn.structure_learning.fit(
df_bayes,
methodtype='cl',
scoretype='bic')
# interactive plot
G = bn.plot(DAG,
scale=1,
node_color=None,
node_size=None,
params_interactive={
'directed': True,
'height': '800px',
'width': '70%',
'notebook': False,
'heading': 'Bayesian Causal Inference',
'layout': None,
'font_color': False,
'bgcolor': '#ffffff'
},
params_static={
'width': 25,
'height': 15,
'font_size': 10,
'font_family': 'times new roman',
'alpha': 0
},
interactive=True)
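With a structure learned, bnlearn can also estimate the conditional probability tables and answer queries on them; the sketch below is one possible follow-up, assuming df_bayes keeps the raw categorical labels (the evidence value is purely illustrative):
# hedged sketch: learn CPTs on the learned DAG, then query churn probability under illustrative evidence
model_bn = bn.parameter_learning.fit(DAG, df_bayes)
query = bn.inference.fit(model_bn,
                         variables=['is_canceled'],
                         evidence={'deposit_type': 'Non Refund'})
print(query)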
Sales Forecasting: fbprophet
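df_fbp is not built earlier in the notebook; a minimal sketch, assuming it is a copy of the cleaned frame df (which still carries the arrival-date columns) with the stay length recomputed:
# Assumption: build the forecasting input from the cleaned frame `df`, recomputing total nights stayed
df_fbp = df.copy()
df_fbp['total_staying_nights'] = (df_fbp['stays_in_week_nights'] +
                                  df_fbp['stays_in_weekend_nights'])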
df_fbp['sales'] = df_fbp['total_staying_nights'] * df_fbp['adr']
# create arrival date (Y/M/D)
# create month:digit dictionary
import calendar
month_dict = dict((v, k) for k, v in enumerate(calendar.month_name))
# replace by mapping
df_fbp.arrival_date_month = df_fbp.arrival_date_month.map(month_dict)
# concatenate year, month and day to create the arrival date
cols = ["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"]
df_fbp['arrival_date'] = df_fbp[cols].apply(
lambda x: '-'.join(x.values.astype(str)), axis="columns")
df_fbp['arrival_date'] = pd.to_datetime(df_fbp['arrival_date'],
format='%Y-%m-%d')
df_sales = df_fbp.groupby(['arrival_date'])[['sales']].sum()
df_sales
mpl.rcParams['figure.figsize'] = (18, 10)
sns.set_context('talk', font_scale=0.8)
plt.subplot(2, 2, 1)
plt.plot(df_sales.groupby(df_sales.index.to_period('D')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('D')).sum().reset_index()['sales'])
plt.title('Daily Sales'); plt.xlabel(''); plt.xticks([])
plt.subplot(2, 2, 2)
plt.plot(df_sales.groupby(df_sales.index.to_period('W')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('W')).sum().reset_index()['sales'])
plt.title('Weekly Sales'); plt.xlabel(''); plt.xticks([])
plt.subplot(2, 2, 3)
plt.plot(df_sales.groupby(df_sales.index.to_period('M')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('M')).sum().reset_index()['sales'])
plt.title('Monthly Sales'); plt.tick_params(axis='x', rotation=45)
plt.subplot(2, 2, 4)
plt.plot(df_sales.groupby(df_sales.index.to_period('Q')).sum().reset_index()
['arrival_date'].apply(str),
df_sales.groupby(
df_sales.index.to_period('Q')).sum().reset_index()['sales']);
plt.title('Quarterly Sales'); plt.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# set up the model 'm' with a 95% uncertainty interval (the "interval_width" argument)
from fbprophet import Prophet
m = Prophet(seasonality_mode='additive',
interval_width=0.95,
daily_seasonality=False,
yearly_seasonality=True,
weekly_seasonality=False)
# Prophet expects columns named 'ds' (date) and 'y' (value), so reshape df_sales before fitting
df_prophet = df_sales.reset_index().rename(columns={'arrival_date': 'ds', 'sales': 'y'})
model = m.fit(df_prophet)
# forecast
future = m.make_future_dataframe(periods = 365, freq = 'D') # Creating the 'future' DataFrame
forecast = m.predict(future)
forecast.head()
mpl.rcParams['figure.figsize'] = (18, 6)
sns.set_context('talk', font_scale = 0.7)
plot1 = m.plot(forecast)
plt2 = m.plot_components(forecast)