import pandas as pd
import numpy as np
from glob import glob
import s2n_default_notebook.s2n_helpers as s2n
import os
os.getcwd()
# CPT Codes
lobectomy = ['32480','32482']
lobectomy_vats = ['32663','32670']
wedge = ['32505','32506','32507']
wedge_vats = ['32666','32666','32668']
pneumonectomy = ['32440','32442','32445']
pneumonectomy_vats = ['32671']
segment = ['32484','32488']
segment_vats = ['32669']
lvrs = ['32491']
lvrs_vats = ['32672']
other_vats = ['32601','32604','32606','32607','32607',
'32608','32609','32650','32651','32652','32653',
'32654','32655','32656','32658','32659','32661',
'32662','32664','32673','32674']
cpt_cats = [lobectomy,lobectomy_vats,wedge,wedge_vats,pneumonectomy,pneumonectomy_vats,segment,segment_vats,lvrs,lvrs_vats]
cpt_codes = lobectomy + lobectomy_vats + wedge + wedge_vats + pneumonectomy + pneumonectomy_vats + segment + segment_vats + lvrs + lvrs_vats
# ICD Codes
wedge_icd = s2n.regex_to_code(['0BB[BCDFGHKLM]0..'],code_type='ICD_10_PR')
wedge_vats_icd = s2n.regex_to_code(['0BB[BCDFGHKLM][4]..'],code_type='ICD_10_PR')
lobectomy_icd = s2n.regex_to_code(['0BT[BCDFGH]0..'],code_type='ICD_10_PR')
lobectomy_vats_icd = s2n.regex_to_code(['0BT[BCDFGH][4]..'],code_type='ICD_10_PR')
pneumonectomy_icd = s2n.regex_to_code(['0BT[KLM]0..'],code_type='ICD_10_PR')
pneumonectomy_vats_icd = s2n.regex_to_code(['0BT[KLM][4]..'],code_type='ICD_10_PR')
bronchial_icd = s2n.regex_to_code(['0B[BT][37]0..'],code_type='ICD_10_PR')
bronchial_vats_icd = s2n.regex_to_code(['0B[BT][37][4]..'],code_type='ICD_10_PR')
bronch_lobe_icd = s2n.regex_to_code(['0B[BT][4568B]0..'],code_type='ICD_10_PR')
bronch_lobe_vats_icd = s2n.regex_to_code(['0B[BT][4568B][4]..'],code_type='ICD_10_PR')
icd_codes = wedge_icd + wedge_vats_icd + lobectomy_icd + lobectomy_vats_icd + pneumonectomy_icd + pneumonectomy_vats_icd + bronchial_icd + bronchial_vats_icd + bronch_lobe_icd + bronch_lobe_vats_icd
dx_codes_cancer = s2n.regex_to_code(['C3[49]..'],code_type="ICD_10_DX")
dx_codes_noncancer = s2n.regex_to_code(['J4[34].'],code_type="ICD_10_DX")
robotic_pr = s2n.regex_to_code(['8E0W.C.'],code_type="ICD_10_PR")
robotic_pr
pip install pyarrow
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Collecting pyarrow
Downloading pyarrow-3.0.0-cp37-cp37m-manylinux2014_x86_64.whl (20.7 MB)
|████████████████████████████████| 20.7 MB 19.8 MB/s
Requirement already satisfied: numpy>=1.16.6 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pyarrow) (1.19.5)
Installing collected packages: pyarrow
Successfully installed pyarrow-3.0.0
WARNING: You are using pip version 20.1.1; however, version 21.0.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
files = glob('/datasets/s2n-data/NIS*201[678]*Core*.parquet')
PR = ["PR"+str(i) for i in range(1,16)]
DX = ["DX"+str(i) for i in range(1,31)]
df = pd.DataFrame()
for f in files:
chunk = pd.read_parquet(f)
#mask = chunk['DX1'].isin(codes) #This is for primary diagnosis code
#mask = chunk['PR1'].isin(codes) #This is for primary procedure code
mask = chunk[PR].isin(icd_codes).any(1) #This is for all list procedure code
#mask = chunk[DX].isin(codes).any(1) #This is for all list diagnosis code
df = df.append(chunk[mask])
cancer_dx_mask = df[DX].isin(dx_codes_cancer).any(1)
#df = df[~cancer_dx_mask]
#df = df[df[DX].isin(dx_codes_noncancer).any(1)]
df = df[cancer_dx_mask]
code_cats = [wedge_icd , wedge_vats_icd , lobectomy_icd , lobectomy_vats_icd ,
pneumonectomy_icd , pneumonectomy_vats_icd , bronchial_icd , bronchial_vats_icd,
bronch_lobe_icd,bronch_lobe_vats_icd]
Years = [2016,2017,2018]
out = pd.DataFrame(columns=list("0123"))
for cat in code_cats:
for year in Years:
df_year = df[df['YEAR']==year]
mask = df_year[PR].isin(cat).any(1)
temp_df = df_year[mask]
discharges = temp_df.DISCWT.sum()
robotic = temp_df[temp_df[PR].isin(robotic_pr).any(1)].DISCWT.sum() / temp_df.DISCWT.sum()
df2 = pd.DataFrame([[str(cat),str(year),str(discharges),str(robotic)]],columns=list("0123"))
out = out.append(df2)
out
out.to_excel('lung_cancer_procedures.xlsx')
print('total discwt: ', df.groupby('YEAR').DISCWT.sum())
print('average los: ', df.LOS.mean())
total discwt: YEAR
2016 36259.990639
2017 36360.001471
2018 36959.999570
Name: DISCWT, dtype: float64
average los: 6.236493885745574
out = pd.DataFrame(df.groupby('DX1').DISCWT.sum().sort_values(ascending=False)).reset_index()
out['desc'] = s2n.icd10_dx_desc(out['DX1'])
files = glob('/datasets/s2n-data/NAS*2018*.parquet')
CPT = ["CPT"+str(i) for i in range(1,31)]
df_nass = pd.DataFrame()
for f in files:
chunk = pd.read_parquet(f)
#mask = chunk['CPT1'].isin(codes) #primary listed cpt code
mask = chunk[CPT].isin(cpt_codes).any(1) #primary listed cpt code
df_nass = df_nass.append(chunk[mask])
df_nass
files = glob('/datasets/s2n-data/CMS*PSPS*2018*.parquet')
df_psps = pd.DataFrame()
for f in files:
chunk = pd.read_parquet(f)
mask = chunk['HCPCS_CD'].isin(cpt_codes) #primary listed cpt code
df_psps = df_psps.append(chunk[mask])
sum(df_psps['PROVIDER_SPEC_CD']=="02")
out_df = pd.DataFrame()
for cat in cpt_cats:
df_temp = df_psps[df_psps['HCPCS_CD'].isin(cat)]
discharges = df_temp.PSPS_ALLOWED_SERVICES_CNT.sum()
inpatient = df_temp[df_temp['PLACE_OF_SERVICE_CD']==21].PSPS_ALLOWED_SERVICES_CNT.sum() / df_temp.PSPS_ALLOWED_SERVICES_CNT.sum()
thoracic = df_temp[df_temp['PROVIDER_SPEC_CD']=="33"].PSPS_ALLOWED_SERVICES_CNT.sum() / df_temp.PSPS_ALLOWED_SERVICES_CNT.sum()
cardiac = df_temp[df_temp['PROVIDER_SPEC_CD']=='78'].PSPS_ALLOWED_SERVICES_CNT.sum() / df_temp.PSPS_ALLOWED_SERVICES_CNT.sum()
gen = df_temp[df_temp['PROVIDER_SPEC_CD']=='02'].PSPS_ALLOWED_SERVICES_CNT.sum() / df_temp.PSPS_ALLOWED_SERVICES_CNT.sum()
df2 = pd.DataFrame([[str(cat),discharges,inpatient,thoracic,cardiac,gen]])
out_df = out_df.append(df2)
out_df
out_df.to_excel('psps_lung_cancer_procedures.xlsx')
print(df_psps.PSPS_ALLOWED_SERVICES_CNT.sum())
df_psps.groupby('PLACE_OF_SERVICE_CD').PSPS_ALLOWED_SERVICES_CNT.sum() / df_psps.PSPS_ALLOWED_SERVICES_CNT.sum()
40288.0
print('Total Discharges', df.PSPS_ALLOWED_SERVICES_CNT.sum())
df.groupby('PLACE_OF_SERVICE_CD').PSPS_ALLOWED_SERVICES_CNT.sum() / df.PSPS_ALLOWED_SERVICES_CNT.sum()
Total Discharges 9746.0
(df.groupby('PROVIDER_SPEC_CD').PSPS_ALLOWED_SERVICES_CNT.sum() /
df.PSPS_ALLOWED_SERVICES_CNT.sum()).sort_values(ascending=False).head(10)
# Get the files -- Change the year to the year you are interested in
files = glob('/datasets/s2n-data/NIS*2017*Core*.parquet')
codes = ['067C3DZ','067D3DZ','067F3DZ','067G3DZ','067M3DZ','067N3DZ']
PR = ["PR"+str(i) for i in range(1,26)]
DX = ["DX"+str(i) for i in range(1,31)]
df = pd.DataFrame()
for f in files:
chunk = pd.read_parquet(f)
#mask = chunk['DX1'].isin(codes) #This is for primary diagnosis code
#mask = chunk['PR1'].isin(codes) #This is for primary procedure code
mask = chunk[PR].isin(codes).any(1) #This is for all list procedure code
#mask = chunk[DX].isin(codes).any(1) #This is for all list diagnosis code
df = df.append(chunk[mask])
total_discharges = df.DISCWT.sum()
total_discharges
df.groupby('PAY1').DISCWT.sum() / total_discharges
files = glob('/datasets/s2n-data/NAS*2018*.parquet')
codes = ['37238'] #Venous stenting
CPT = ["CPT"+str(i) for i in range(1,31)]
df = pd.DataFrame()
for f in files:
chunk = pd.read_parquet(f)
#mask = chunk['CPT1'].isin(codes) #primary listed cpt code
mask = chunk[CPT].isin(codes).any(1) #primary listed cpt code
df = df.append(chunk[mask])
df.DISCWT.sum()
df.groupby('PAY1').DISCWT.sum() / df.DISCWT.sum()