# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("/home/lyrax/matplotlib-dracula/dracula.mplstyle")
import seaborn as sns
from tqdm import tqdm, trange
data = pd.read_excel("data/Claims Data 2.xlsx", parse_dates=['participant_date_of_birth', 'treatment_created_date',
'claim_finalized_date'])
data.sample(3)
# shape of dataset
data.shape
# info
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170995 entries, 0 to 170994
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Serial Number 170321 non-null float64
1 claim_status 170995 non-null object
2 provider_type 144502 non-null object
3 provider_region 167860 non-null object
4 program_type 170995 non-null object
5 program_cover 170995 non-null object
6 participant_date_of_birth 170989 non-null datetime64[ns]
7 participant_gender 170989 non-null object
8 treatment_created_date 170995 non-null datetime64[ns]
9 claim_finalized_date 154567 non-null datetime64[ns]
10 item_status 170995 non-null object
11 item_name 170995 non-null object
12 item_amount 170995 non-null float64
13 item_currency 170995 non-null object
14 item_quantity 170995 non-null int64
15 total_item_amount 170995 non-null float64
16 diagnoses 166207 non-null object
17 treatment_type 170995 non-null object
dtypes: datetime64[ns](3), float64(3), int64(1), object(11)
memory usage: 23.5+ MB
# missing data statistics as percentage
data.isnull().sum() *100 / len(data)
plt.figure(figsize=(10,6))
sns.heatmap(data.isnull(), cbar=False, yticklabels=False)
plt.title("Missing Data Heatmap")
plt.tight_layout()
plt.savefig("plots/missing data heatmap.png", dpi=1000)
plt.figure(figsize=(8,8))
pd.DataFrame(pd.Series(data.isnull().sum() *100 / len(data)).sort_values()[8:]).plot(kind='bar')
plt.title("Missing Data Barplot")
plt.tight_layout()
plt.savefig("plots/missing data barplot.png", dpi=1000)
# Serial number
data['Serial Number'].nunique()
len(data)
# claim status distribution
print(data.claim_status.value_counts())
plt.figure(figsize=(8,6))
sns.countplot(y=data.claim_status)
Approved 153549
Submitted 7392
PartiallyRejected 5978
ReSubmitted 3058
Rejected 1018
Name: claim_status, dtype: int64
data.claim_status.replace(['ReSubmitted', 'Submitted', 'PartiallyRejected', 'Rejected'], 'Not Approved', inplace=True)
print(data.claim_status.value_counts(), "\nPercentages:\n", data.claim_status.value_counts(normalize=True))
plt.figure(figsize=(8,6))
sns.countplot(x=data.claim_status)
plt.title("Claim Status")
plt.tight_layout()
plt.savefig("plots/claim status distribution.png", dpi=1000)
Approved 153549
Not Approved 17446
Name: claim_status, dtype: int64
Percentages:
Approved 0.897974
Not Approved 0.102026
Name: claim_status, dtype: float64
# provider type
print(data.provider_type.value_counts())
plt.figure(figsize=(8,10))
sns.countplot(y=data.provider_type)
Clinic 106374
Hospital 15369
Medical Clinic/ Centre 5891
Specialist 4868
Medical Centre 4681
Health Centre 3706
Nursing Home 2052
Medical Clinic 833
Clinic, Level 1 281
Maternity And Nursing Home 241
Primary Hospital-Category 2 85
Dentists 76
Pharmacy 30
Radiology Unit 15
Name: provider_type, dtype: int64
data.provider_type.replace(['Medical Clinic/ Centre', 'Medical Clinic', 'Clinic, Level 1', 'Medical Centre', 'Health Centre'],
'Clinic', inplace=True)
data.provider_type.replace(['Nursing Home', 'Maternity And Nursing Home', 'Primary Hospital-Category 2'], 'Hospital', inplace=True)
print(data.provider_type.value_counts())
plt.figure(figsize=(8,8))
sns.countplot(y=data.provider_type)
plt.title("Provider Type")
plt.tight_layout()
plt.savefig("plots/provider type distribution.png", dpi=1000)
Clinic 121766
Hospital 17747
Specialist 4868
Dentists 76
Pharmacy 30
Radiology Unit 15
Name: provider_type, dtype: int64
# provider region
print(f"Number of regions: {data.provider_region.nunique()}\n")
print(data.provider_region.value_counts())
Number of regions: 46
NAIROBI 69785
MOMBASA 15038
KISUMU 9964
KIAMBU 8480
KAKAMEGA 6596
NAKURU 4621
BUNGOMA 4553
KAJIADO 4206
TRANS-NZOIA 4073
UASIN GISHU 4033
KISII 2954
KERICHO 2816
NYERI 2611
MIGORI 2529
MACHAKOS 2448
KILIFI 2289
NANDI 2198
LAIKIPIA 2017
MERU 1861
GARISSA 1450
BUSIA 1428
SIAYA 1326
EMBU 1297
HOMABAY 1264
BARINGO 1242
KIRINYAGA 1125
TAITA-TAVETA 1025
KITUI 924
TURKANA 606
BOMET 397
MAKUENI 383
ISIOLO 359
WAJIR 342
KWALE 313
MURANG'A\n 285
VIHIGA 245
NAROK 211
WEST POKOT 137
MARSABIT 115
THARAKA-NITHI 86
LAMU 74
NYAMIRA 60
SAMBURU 35
ELGEYO-MARAKWET 25
NYANDARUA 25
MANDERA 9
Name: provider_region, dtype: int64
# correct error in MURANG'A listing
data.provider_region.replace("MURANG'A\n", "MURANG'A", inplace=True)
# program type
data.program_type.value_counts()
# drop the program type column as it has only one unique value
data.drop('program_type', axis=1, inplace=True)
# program cover
data.program_cover.value_counts()
data.program_cover.unique()
data.program_cover.replace(['OUT-PATIENT', 'AAR-Outpatient', 'outpatient', 'OUT-PATIENT ', 'Outpatient', 'OUTPATIENT ',
'Out-Patient', 'AAR - OP', 'Out Patient', 'OUTPATIENT '], 'OUTPATIENT', inplace=True)
data.program_cover.replace(['Dental', 'dental', 'AAR - Dental'], 'DENTAL', inplace=True)
data.program_cover.replace(['Optical', 'AAR - Optical', 'optical', 'SPECTACLE FRAME', 'AAR - Spectacle Frame'], 'OPTICAL', inplace=True)
data.program_cover.replace(['Annual check up', 'ANNUAL_CHECKUP', 'ANNUAL MEDICAL CHECK-UP',
'ANNUAL MEDICAL CHECK-UP FOR MAIN MEMBER / EMPLOYEE', 'ANNUAL_CHECK-UP',
'ANNUAL MEDICAL CHECK-UP at AAR healthcare clinics', 'AAR- Checkup'], 'ANNUAL CHECKUP', inplace=True)
data.program_cover.replace(['VACCINES - PRIVATE VACCINES', 'Well Baby Clinic', 'BABY FRIENDLY VACCINES / WELL BABY CLINIC',
'MATERNITY', 'BABY FRIENDLY VACCINES', 'Baby Friendly'], 'BABY FRIENDLY', inplace=True)
data.program_cover.unique()
plt.figure(figsize=(10,8))
sns.countplot(y=data.program_cover)
plt.title("Program Cover")
plt.tight_layout()
plt.savefig("plots/program cover distribution.png", dpi=1000)
# oldest participant/customer
print(f"Oldest participant is {2021 - data.participant_date_of_birth.min().year} years old\n\
Date of birth: {data.participant_date_of_birth.min()}")
Oldest participant is 93 years old
Date of birth: 1928-10-05 00:00:00
# youngest
print(f"Youngest participant is {2021 - data.participant_date_of_birth.max().year} years old\n\
Date of birth: {data.participant_date_of_birth.max()}")
Youngest participant is 0 years old
Date of birth: 2021-07-01 00:00:00
# save below two cells for feature engineering
# new features from DOB
"""data['participant_yearOB'] = data.participant_date_of_birth.dt.year.astype(int)
data['participant_monthOB'] = data.participant_date_of_birth.dt.month.astype(int)
data['participant_dayOB'] = data.participant_date_of_birth.dt.day.astype(int)
data.sample(3)"""
#data.participant_date_of_birth.dt.day_name().map({'Monday': 'weekday', 'Tuesday': 'weekday', 'Wednesday': 'weekday',
# 'Thursday': 'weekday', 'Friday': 'weekday', 'Saturday': 'weekend', 'Sunday': 'weekend'})
# gender
print(data.participant_gender.value_counts())
plt.figure(figsize=(8,6))
sns.countplot(x=data.participant_gender)
FEMALE 95751
MALE 75238
Name: participant_gender, dtype: int64
# treatment date
# earliest
data.treatment_created_date.min()
# latest
data.treatment_created_date.max()
# claim finalized date
data.claim_finalized_date.min()
data.claim_finalized_date.max()
# item status
print(data.item_status.value_counts())
plt.figure(figsize=(8,6))
sns.countplot(x=data.item_status)
APPROVED 152792
SUBMITTED 16726
REJECTED 1477
Name: item_status, dtype: int64
data.claim_status.value_counts()
len(data[(data.item_status == "REJECTED") & (data.claim_status == 'Approved')])
# item name
data.item_name.value_counts()[:20]
# item amount
sns.distplot(data.item_amount)
/home/lyrax/.local/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
# max and min amount
print(f"max: {data.item_amount.max()}\nmin: {data.item_amount.min()}")
max: 120000.0
min: 0.0
# number of amounts greater than 20k
len(data[data.item_amount > 20000])
# drop item currency column since it has only one unique value
data.drop('item_currency', axis=1, inplace=True)
# item quantity
data.item_quantity.value_counts()[:20]
data.diagnoses.value_counts()
# drop treatment type as it has only one unique value
data.drop('treatment_type', axis=1, inplace=True)
# In progress