Exploratory Data Analysis
Import Libraries
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np
Run to view results
Load Dataset
from ucimlrepo import fetch_ucirepo
# Download UCI repository dataset id 296
# ("Diabetes 130-US hospitals for years 1999-2008").
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Feature table (X) and target table (y), both pandas DataFrames.
X, y = (
    diabetes_130_us_hospitals_for_years_1999_2008.data.features,
    diabetes_130_us_hospitals_for_years_1999_2008.data.targets,
)

# Dataset-level metadata first, then the per-variable description table.
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
print(diabetes_130_us_hospitals_for_years_1999_2008.variables)
Run to view results
# Bare expression: shows the feature table's column labels when run as a notebook cell.
X.columns
Run to view results
Data
# Bare expression: renders the feature table when run as a notebook cell.
X
Run to view results
# Bare expression: renders the target table when run as a notebook cell.
y
Run to view results
Analysis of Demographic Variables
# One pie chart per demographic column; with only `names` given, px.pie sizes
# each slice by how many rows carry that category.
# Column/title pairs are spelled out so the chart titles stay exactly as before.
# NOTE(review): how missing values in these columns are rendered is not
# visible from this code — confirm against the rendered charts.
demographic_charts = (
    ('race', 'Distribution of Race'),
    ('age', 'Distribution of Age'),
    ('gender', 'Distribution of Gender'),
    ('weight', 'Distribution of Weight'),
)
for column, title in demographic_charts:
    # X[[column]] is already a single-column DataFrame, so no extra
    # pd.DataFrame(...) wrapper is needed.
    fig = px.pie(X[[column]], names=column, title=title)
    fig.show()
Run to view results
Analysis of ICD-9 Diagnosis Codes
# Frequency table of the primary diagnosis code (diag_1), most common first.
diag1_df = (
    X['diag_1']
    .value_counts()
    .rename_axis('diag_1')
    .reset_index(name='count')
)

# Plot only the 50 most frequent codes.
top_icd9 = diag1_df.iloc[:50]
fig = px.bar(top_icd9, x='diag_1', y='count')
fig.show()
Run to view results
ICD-9 Diagnosis Codes (Diabetes Codes Only)
# Keep only primary diagnoses whose code starts with '250' (the diabetes codes).
# `na=False` makes missing diag_1 values count as non-matches, so null entries
# are excluded without a separate notna() check or a string-based query().
is_diabetes = X['diag_1'].str.startswith('250', na=False)

# Frequency table of the matching codes, most common first.
diag1_df = X.loc[is_diabetes, 'diag_1'].value_counts().reset_index()
diag1_df.columns = ['diabetes_diag_1', 'count']

# Plot only the 20 most frequent diabetes codes.
top_diabetes = diag1_df.head(20)
fig = px.bar(top_diabetes, x='diabetes_diag_1', y='count')
fig.show()
Run to view results
ICD-9 Diagnosis Codes (combining codes that share their first three characters, e.g. all 250.xx diabetes codes)
# Truncate every primary diagnosis code to its first three characters so that
# sub-codes collapse into one bucket. The truncation happens on a temporary
# Series; X itself is left unchanged.
code_prefixes = X['diag_1'].str[:3]
diag1_df = code_prefixes.value_counts().reset_index()
diag1_df.columns = ['diag_1', 'count']

# Plot only the 50 most frequent prefixes.
top_icd9 = diag1_df.iloc[:50]
fig = px.bar(top_icd9, x='diag_1', y='count')
fig.show()
Run to view results
Analysis of Time in Hospital
# Histogram of the time_in_hospital column, capped at 14 bins.
time = X['time_in_hospital'].to_frame()
fig = px.histogram(
    time,
    x='time_in_hospital',
    nbins=14,
    title='Distribution of Time in Days',
)
fig.show()
Run to view results
# Put features and targets side by side so both can be grouped together.
combine = pd.concat((X, y), axis='columns')

# Row count for every (time_in_hospital, readmitted) combination.
time_with_readmissions = (
    combine
    .groupby(['time_in_hospital', 'readmitted'])
    .size()
    .reset_index(name='count')
)

# Bars per time_in_hospital value, coloured by readmission outcome.
fig = px.bar(
    time_with_readmissions,
    x="time_in_hospital",
    y="count",
    color="readmitted",
)
fig.show()
Run to view results
Analysis of Number of Inpatient Visits
# Pair each row's number_inpatient value with its readmitted target.
long_df = pd.concat([X['number_inpatient'], y], axis=1)

# Row count for every (number_inpatient, readmitted) combination.
count_df = long_df.groupby(['number_inpatient', 'readmitted']).size().reset_index(name='count')

# The five number_inpatient values that cover the most rows overall; the plot
# is restricted to them so the long tail of rare values does not clutter it.
top_inpatient_values = count_df.groupby('number_inpatient')['count'].sum().nlargest(5).index
count_df_top = count_df[count_df['number_inpatient'].isin(top_inpatient_values)]

# Bars per number_inpatient value, coloured by readmission outcome.
fig = px.bar(count_df_top, x="number_inpatient", y="count", color="readmitted")
fig.show()
Run to view results
Analysis of Readmissions
# Count how often each readmission outcome occurs in the target table and
# label the two resulting columns for plotting.
readmissions = (
    y.value_counts()
    .reset_index()
    .set_axis(['readmissions', 'count'], axis=1)
)
fig = px.bar(readmissions, x='readmissions', y='count')
fig.show()
Run to view results