import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
shifts_df = pd.read_csv("data/Chicago MSA Shifts.csv")
hcp_df = pd.read_csv("data/Chicago HCPs Active in October 2021.csv")
cancellations_df = pd.read_csv("data/Chicago Cancel Logs October 2021.csv")
cancellations_df['Cancel Date'] = pd.to_datetime(cancellations_df['Created At']).dt.date
hcp_worker_and_facility_cancels = cancellations_df[cancellations_df["Action"].isin(["WORKER_CANCEL", "NO_CALL_NO_SHOW", "FACILITY_CANCEL_FILLED"])]
hcp_all_worker_cancels = cancellations_df[cancellations_df["Action"].isin(["WORKER_CANCEL", "NO_CALL_NO_SHOW"])]
hcp_soft_cancels = cancellations_df[cancellations_df["Action"] == "WORKER_CANCEL"]
hcp_hard_cancels = cancellations_df[cancellations_df["Action"] == "NO_CALL_NO_SHOW"]
hcf_cancels = cancellations_df[cancellations_df["Action"] == "FACILITY_CANCEL_FILLED"]
print("Start date: " + cancellations_df.sort_values(by=['Created At']).iloc[0]['Created At'])
print("End date: " + cancellations_df.sort_values(by=['Created At']).iloc[-1]['Created At'])
shift_count = shifts_df['ID'].nunique()
print("Unique shifts: " + str(shift_count)) # Unique shifts
def print_cancel_rate(df, title):
rate = round(df['Shift ID'].nunique() / shift_count * 100, 1)
print(title + ": " + str(rate) + "%")
print_cancel_rate(hcp_worker_and_facility_cancels, "Total cancellation rate (worker + facility)")
print_cancel_rate(hcp_all_worker_cancels, "Worker cancellation rate (normal + no call no show)")
print_cancel_rate(hcp_hard_cancels, "Worker no call no show cancellation rate")
print_cancel_rate(hcf_cancels, "Facility cancellation rate")
cancels_per_shift = cancellations_df.groupby('Shift ID').size()
cancels_per_shift = pd.DataFrame({'Shift ID':cancels_per_shift.index, 'count':cancels_per_shift.values})
shifts_w_multiple_cancels = cancels_per_shift[cancels_per_shift['count'] > 1] # About 10% of all shifts with cancels have multiple cancels
cancellations_df[cancellations_df['Shift ID'] == '615738bcfe2d1a018293eb3e'] # Example of shift with multiple cancels
cancellations_df[cancellations_df['Shift ID'].isin(shifts_w_multiple_cancels['Shift ID'])].sort_values(by=['Shift ID']) # List of all shifts with multiple cancels
hcp_all_worker_cancels.groupby('Cancel Date').size().plot(figsize=(12,3), title='All worker cancellations (normal + no show no call)')
hcf_cancels.groupby('Cancel Date').size().plot(figsize=(12,3), title='Facility cancellations')
hcp_soft_cancels['Start'] = pd.to_datetime(hcp_soft_cancels['Start'])
hcp_soft_cancels['Created At'] = pd.to_datetime(hcp_soft_cancels['Created At'])
hcp_soft_cancels['Notice'] = (hcp_soft_cancels['Start'] - hcp_soft_cancels['Created At']).dt.days
hcp_cancels_groupby = hcp_soft_cancels.groupby('Notice').size()
hcp_cancels_groupby[0] += hcp_hard_cancels['Shift ID'].nunique() # Add no call no shows
hcp_cancels_groupby.plot(figsize=(12,3), title='Worker cancellations by notice days', kind='bar')
rate = round(hcp_cancels_groupby[0] / shift_count * 100, 1)
print("Worker cancel rate with less than 24h notice: " + str(rate) + "% of all shifts")
hcp_soft_cancels_same_day = hcp_soft_cancels[hcp_soft_cancels['Notice'] < 1]
hcp_soft_cancels_same_day['Notice Hours'] = (hcp_soft_cancels_same_day['Start'] - hcp_soft_cancels_same_day['Created At']).dt.total_seconds()/(60*60)
hcp_soft_cancels_same_day['Notice Hours'] = hcp_soft_cancels_same_day['Notice Hours'].round()
hcp_cancels_same_day_groupby = hcp_soft_cancels_same_day.groupby('Notice Hours').size()
hcp_cancels_same_day_groupby[0] += hcp_hard_cancels['Shift ID'].nunique() # Add no call no shows
hcp_cancels_same_day_groupby.plot(figsize=(12,3), title='Worker same day cancellations by notice hours', kind='bar')
less_than_6h_count = sum([hcp_cancels_same_day_groupby[x] for x in range(7)])
rate = round(less_than_6h_count / shift_count * 100, 1)
print("Worker cancel rate with less than 6h notice: " + str(rate) + "% of all shifts")
hcf_cancels['Start dt'] = pd.to_datetime(hcf_cancels['Start'])
hcf_cancels['Created At dt'] = pd.to_datetime(hcf_cancels['Created At'])
hcf_cancels['Notice'] = (hcf_cancels['Start dt'] - hcf_cancels['Created At dt']).dt.days
hcf_cancels_groupby = hcf_cancels.groupby('Notice').size()
less_than_24h_count = sum([hcf_cancels_groupby[x] for x in [-26, -1, 0]])
hcf_cancels_groupby[0] = less_than_24h_count
del[hcf_cancels_groupby[-1]]
del[hcf_cancels_groupby[-26]]
hcf_cancels_groupby.plot(figsize=(12,3), title='HCF cancellations by notice days', kind='bar')
rate = round(hcf_cancels_groupby[0] / shift_count * 100, 1)
print("Facility cancel rate with less than 24h notice: " + str(rate) + "% of all shifts")
hcf_cancels_same_day = hcf_cancels[hcf_cancels['Notice'] < 1]
hcf_cancels_same_day['Notice Hours'] = (hcf_cancels_same_day['Start dt'] - hcf_cancels_same_day['Created At dt']).dt.total_seconds()/(60*60)
hcf_cancels_same_day['Notice Hours'] = hcf_cancels_same_day['Notice Hours'].round()
hcf_cancels_same_day_groupby = hcf_cancels_same_day.groupby('Notice Hours').size()
idx_less_than_6h = [x for x in list(hcf_cancels_same_day_groupby.index) if x <= 6]
less_than_6h_count = sum([hcf_cancels_same_day_groupby[x] for x in idx_less_than_6h])
idx_less_than_0h = [x for x in list(hcf_cancels_same_day_groupby.index) if x <= 0]
show_up_no_shift_count = sum([hcf_cancels_same_day_groupby[x] for x in idx_less_than_0h])
hcf_cancels_same_day_groupby[0] = show_up_no_shift_count
for x in [i for i in idx_less_than_0h if i != 0]:
del[hcf_cancels_same_day_groupby[x]]
hcf_cancels_same_day_groupby.plot(figsize=(12,3), title='Facility same day cancellations by notice hours', kind='bar')
less_than_6h_rate = round(less_than_6h_count / shift_count * 100, 1)
show_up_no_shift_rate = round(show_up_no_shift_count / shift_count * 100, 1)
print("Facility cancel rate with less than 6h notice: " + str(less_than_6h_rate) + "% of all shifts")
print("Show up no shift: " + str(show_up_no_shift_rate) + "% of all shifts")
no_show_shift_ids = list(hcp_hard_cancels['Shift ID'].values)
no_show_shifts = shifts_df[shifts_df['ID'].isin(no_show_shift_ids)]
no_show_confirmed = no_show_shifts[no_show_shifts['Confirmed'] == True].shape[0]
no_show_not_confirmed = no_show_shifts[no_show_shifts['Confirmed'] != True].shape[0]
confirmed_no_show_rate = round(no_show_confirmed / shifts_df[shifts_df['Confirmed'] == True].shape[0] * 100, 1)
not_confirmed_no_show_rate = round(no_show_not_confirmed / shifts_df[shifts_df['Confirmed'] != True].shape[0] * 100, 1)
print("No show rate with confirmation: " + str(confirmed_no_show_rate) + "%")
print("No show rate without confirmation: " + str(not_confirmed_no_show_rate) + "%")
no_show_shifts_grouped = no_show_shifts.groupby('Agent Req').size()
all_shifts_grouped = shifts_df.groupby('Agent Req').size()
ratio_grouped = (no_show_shifts_grouped / all_shifts_grouped * 100).fillna(0)
ratio_grouped = ratio_grouped.round(1)
ratio_grouped.plot(figsize=(12,3), title='No call no show by worker type (in %)', kind='bar')
all_shifts_grouped.plot(figsize=(12,3), title='All shifts by worker type', kind='bar')
no_show_shifts_grouped = no_show_shifts.groupby('Shift Type').size()
all_shifts_grouped = shifts_df.groupby('Shift Type').size()
ratio_grouped = (no_show_shifts_grouped / all_shifts_grouped * 100).fillna(0)
ratio_grouped = ratio_grouped.round(1)
ratio_grouped.plot(figsize=(12,3), title='No call no show by shift type (in %)', kind='bar')
all_shifts_grouped.plot(figsize=(12,3), title='All shifts by shift type', kind='bar')
no_show_shifts_grouped = no_show_shifts.groupby('By Facility').size()
all_shifts_grouped = shifts_df.groupby('By Facility').size()
ratio_grouped = (no_show_shifts_grouped / all_shifts_grouped * 100).fillna(0)
ratio_grouped = ratio_grouped.round(1)
ratio_grouped.plot(figsize=(12,3), title='No call no show by facility post (in %)', kind='bar')
all_shifts_grouped.plot(figsize=(12,3), title='All shifts by facility post', kind='bar')
hcp_df['Cancel Rate'] = ((hcp_df['Cancelled Shifts'] / (hcp_df['Cancelled Shifts'] + hcp_df['Completed Shifts'])) * 100).round(1)
# hcp_df['Binned Cancel Rate'] = pd.cut(hcp_df['Cancel Rate'], [-0.1, 20, 40, 60, 80, 100])
share_values = []
share_index = []
for i in range(5):
sub_hcp_df = hcp_df[hcp_df['Cancel Rate'] < (i+1) * 20]
sub_hcp_df = sub_hcp_df[sub_hcp_df['Cancel Rate'] >= i * 20]
share = round(sum(sub_hcp_df['Cancelled Shifts'].values) / sum(hcp_df['Cancelled Shifts'].values) * 100, 1)
# share_arr.append([str(i) + "-" + str(i+1), share])
share_values.append(share)
share_index.append("workers with " + str(i*20) + "% - " + str((i+1)*20) + "% cancellation rate")
pd.DataFrame(share_values, index=share_index, columns=['% of cancelled shifts']).plot(figsize=(12,3), title='% of all cancelled shifts per worker group', kind='bar')
bins = [0, 1, 5, 10, 50, 100, 500]
# hcp_df['Binned Completed Shifts'] = pd.cut(hcp_df['Completed Shifts'], bins)
# hcp_df.groupby('Binned Completed Shifts').size().plot(figsize=(12,3), title='Nbr of workers per cancel rates', kind='bar')
share_values = []
share_index = []
for i in range(len(bins) - 1):
sub_hcp_df = hcp_df[hcp_df['Completed Shifts'] < bins[i+1]]
sub_hcp_df = sub_hcp_df[sub_hcp_df['Completed Shifts'] >= bins[i]]
share = round(sum(sub_hcp_df['Cancelled Shifts'].values) / sum(hcp_df['Cancelled Shifts'].values) * 100, 1)
# share_arr.append([str(i) + "-" + str(i+1), share])
share_values.append(share)
share_index.append("workers with " + str(bins[i]) + " - " + str(bins[i+1]) + " completed shifts")
pd.DataFrame(share_values, index=share_index, columns=['% of cancelled shifts']).plot(figsize=(12,3), title='% of all cancelled shifts per worker group', kind='bar')