# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("/home/lyrax/matplotlib-dracula/dracula.mplstyle")
import seaborn as sns
from tqdm import tqdm, trange
data = pd.read_excel("data/Claims Data 2.xlsx", parse_dates=['participant_date_of_birth', 'treatment_created_date',
'claim_finalized_date'])
data.sample(3)
# shape of dataset
data.shape
# info
data.info()
# missing data statistics as percentage
data.isnull().sum() *100 / len(data)
plt.figure(figsize=(10,6))
sns.heatmap(data.isnull(), cbar=False, yticklabels=False)
plt.title("Missing Data Heatmap")
plt.tight_layout()
plt.savefig("plots/missing data heatmap.png", dpi=1000)
plt.figure(figsize=(8,8))
pd.DataFrame(pd.Series(data.isnull().sum() *100 / len(data)).sort_values()[8:]).plot(kind='bar')
plt.title("Missing Data Barplot")
plt.tight_layout()
plt.savefig("plots/missing data barplot.png", dpi=1000)
# Serial number
data['Serial Number'].nunique()
len(data)
# claim status distribution
print(data.claim_status.value_counts())
plt.figure(figsize=(8,6))
sns.countplot(y=data.claim_status)
data.claim_status.replace(['ReSubmitted', 'Submitted', 'PartiallyRejected', 'Rejected'], 'Not Approved', inplace=True)
print(data.claim_status.value_counts(), "\nPercentages:\n", data.claim_status.value_counts(normalize=True))
plt.figure(figsize=(8,6))
sns.countplot(x=data.claim_status)
plt.title("Claim Status")
plt.tight_layout()
plt.savefig("plots/claim status distribution.png", dpi=1000)
# provider type
print(data.provider_type.value_counts())
plt.figure(figsize=(8,10))
sns.countplot(y=data.provider_type)
data.provider_type.replace(['Medical Clinic/ Centre', 'Medical Clinic', 'Clinic, Level 1', 'Medical Centre', 'Health Centre'],
'Clinic', inplace=True)
data.provider_type.replace(['Nursing Home', 'Maternity And Nursing Home', 'Primary Hospital-Category 2'], 'Hospital', inplace=True)
print(data.provider_type.value_counts())
plt.figure(figsize=(8,8))
sns.countplot(y=data.provider_type)
plt.title("Provider Type")
plt.tight_layout()
plt.savefig("plots/provider type distribution.png", dpi=1000)
# provider region
print(f"Number of regions: {data.provider_region.nunique()}\n")
print(data.provider_region.value_counts())
# correct error in MURANG'A listing
data.provider_region.replace("MURANG'A\n", "MURANG'A", inplace=True)
# program type
data.program_type.value_counts()
# drop the program type column as it has only one unique value
data.drop('program_type', axis=1, inplace=True)
# program cover
data.program_cover.value_counts()
data.program_cover.unique()
data.program_cover.replace(['OUT-PATIENT', 'AAR-Outpatient', 'outpatient', 'OUT-PATIENT ', 'Outpatient', 'OUTPATIENT ',
'Out-Patient', 'AAR - OP', 'Out Patient', 'OUTPATIENT '], 'OUTPATIENT', inplace=True)
data.program_cover.replace(['Dental', 'dental', 'AAR - Dental'], 'DENTAL', inplace=True)
data.program_cover.replace(['Optical', 'AAR - Optical', 'optical', 'SPECTACLE FRAME', 'AAR - Spectacle Frame'], 'OPTICAL', inplace=True)
data.program_cover.replace(['Annual check up', 'ANNUAL_CHECKUP', 'ANNUAL MEDICAL CHECK-UP',
'ANNUAL MEDICAL CHECK-UP FOR MAIN MEMBER / EMPLOYEE', 'ANNUAL_CHECK-UP',
'ANNUAL MEDICAL CHECK-UP at AAR healthcare clinics', 'AAR- Checkup'], 'ANNUAL CHECKUP', inplace=True)
data.program_cover.replace(['VACCINES - PRIVATE VACCINES', 'Well Baby Clinic', 'BABY FRIENDLY VACCINES / WELL BABY CLINIC',
'MATERNITY', 'BABY FRIENDLY VACCINES', 'Baby Friendly'], 'BABY FRIENDLY', inplace=True)
data.program_cover.unique()
plt.figure(figsize=(10,8))
sns.countplot(y=data.program_cover)
plt.title("Program Cover")
plt.tight_layout()
plt.savefig("plots/program cover distribution.png", dpi=1000)
# oldest participant/customer
print(f"Oldest participant is {2021 - data.participant_date_of_birth.min().year} years old\n\
Date of birth: {data.participant_date_of_birth.min()}")
# youngest
print(f"Youngest participant is {2021 - data.participant_date_of_birth.max().year} years old\n\
Date of birth: {data.participant_date_of_birth.max()}")
# save below two cells for feature engineering
# new features from DOB
"""data['participant_yearOB'] = data.participant_date_of_birth.dt.year.astype(int)
data['participant_monthOB'] = data.participant_date_of_birth.dt.month.astype(int)
data['participant_dayOB'] = data.participant_date_of_birth.dt.day.astype(int)
data.sample(3)"""
#data.participant_date_of_birth.dt.day_name().map({'Monday': 'weekday', 'Tuesday': 'weekday', 'Wednesday': 'weekday',
# 'Thursday': 'weekday', 'Friday': 'weekday', 'Saturday': 'weekend', 'Sunday': 'weekend'})
# gender
print(data.participant_gender.value_counts())
plt.figure(figsize=(8,6))
sns.countplot(x=data.participant_gender)
# treatment date
# earliest
data.treatment_created_date.min()
# latest
data.treatment_created_date.max()
# claim finalized date
data.claim_finalized_date.min()
data.claim_finalized_date.max()
# item status
print(data.item_status.value_counts())
plt.figure(figsize=(8,6))
sns.countplot(x=data.item_status)
data.claim_status.value_counts()
len(data[(data.item_status == "REJECTED") & (data.claim_status == 'Approved')])
# item name
data.item_name.value_counts()[:20]
# item amount
sns.distplot(data.item_amount)
# max and min amount
print(f"max: {data.item_amount.max()}\nmin: {data.item_amount.min()}")
# number of amounts greater than 20k
len(data[data.item_amount > 20000])
# drop item currency column since it has only one unique value
data.drop('item_currency', axis=1, inplace=True)
# item quantity
data.item_quantity.value_counts()[:20]
data.diagnoses.value_counts()
# drop treatment type as it has only one unique value
data.drop('treatment_type', axis=1, inplace=True)
# In progress