# A01_Ohm_Parikh

# Start writing code here...

import numpy as np
import pandas as pd
import seaborn as sns
# NumPy 2.0 removed the capitalised ``NaN`` alias; importing ``nan`` under the
# same name keeps ``NaN`` available on every NumPy version.
from numpy import nan as NaN

# Read the stroke dataset into a DataFrame and peek at the first rows.
df = pd.read_csv(
    r"/work/healthcare-dataset-stroke-data.csv",
)
df.head()

# .count() shows that some bmi values are missing, so every row that
# contains a missing value is discarded (in place) before going further.
df.dropna(axis="index", inplace=True)
df.count()

# Task A - Row and Column Names
# rename() receives an old-name -> new-name mapping through ``columns``, so
# only the listed columns are renamed; all others keep their names.
df.rename(
    columns=dict(
        hypertension="HTN",
        heart_disease="CAD",
        Residence_type="residence",
        work_type="work",
    ),
    inplace=True,
)
df.head()

# Make 'id' the row index so that each individual is addressed by their id.
# verify_integrity=True makes pandas raise ValueError if any id occurs more
# than once, replacing a by-eye comparison of the unique-id count with the
# row count. Setting the index by column name also removes the 'id' column,
# so no separate drop() is required.
df.set_index("id", inplace=True, verify_integrity=True)
df.head()

# Task B - Groupby and Pivot Tables
# groupby demonstration: one row per gender, summarising each listed column
# with the aggregation named in ``d``.
d = {
    "age": "mean",
    "HTN": "sum",
    "CAD": "sum",
    "avg_glucose_level": "mean",
    "bmi": "mean",
    "stroke": "sum",
}

# Output labels of the form "<column>_<aggregation>". They live in their own
# dict: aliasing ``d`` and writing the labels into it would overwrite the
# aggregation spec itself.
cols = {column: column + "_" + func for column, func in d.items()}

# Group once and reuse the grouping for both the summary and the row count.
by_gender = df.groupby(by="gender")
df_mf = by_gender.agg(d).rename(columns=cols)

# Number of non-null 'age' entries per gender, i.e. the group size.
df_mf["count"] = by_gender["age"].count()
df_mf.head()

# pivot / pivot_table demonstration
# Median BMI for every (gender, HTN, CAD) combination, split into columns by
# whether or not a stroke occurred; the 0/1 stroke labels are renamed to
# No/Yes to reflect this. Combinations without data are filled with 0 and
# margins=True appends the overall 'All' row and column.
df_pivot = df.pivot_table(
    values="bmi",
    index=["gender", "HTN", "CAD"],
    columns="stroke",
    aggfunc="median",
    fill_value=0,
    margins=True,
).rename(columns={0: "No", 1: "Yes"})
df_pivot.head()

# Task C - Dataframe transformation using functions
# Total number of strokes per group, obtained with groupby.

# By age, highest stroke count first.
(
    df.groupby("age")["stroke"]
    .sum()
    .sort_values(ascending=False)
)

# By residence type, lowest stroke count first.
(
    df.groupby("residence")["stroke"]
    .sum()
    .sort_values(ascending=True)
)

# By marriage status, lowest stroke count first.
(
    df.groupby("ever_married")["stroke"]
    .sum()
    .sort_values(ascending=True)
)

# Minimum, maximum and median blood glucose level (BGL), computed in a single
# Series aggregation call.
df["avg_glucose_level"].agg(["min", "max", "median"])

# Element-wise sum of the HTN and CAD series; a value missing on one side is
# treated as 0.
df["HTN"].add(df["CAD"], fill_value=0)

# Element-wise quotient of each person's BGL and BMI, in that order.
df["avg_glucose_level"].div(df["bmi"])

# Task D - Create new columns from existing columns
# Adds a 'diff' column to the pivot table: the BMI value of people who had a
# stroke minus that of people who didn't.
bmi_gap = df_pivot["Yes"] - df_pivot["No"]

# The pivot table is built with fill_value=0, so a 0 in 'Yes' or 'No' means
# "no patients in this group" rather than a real BMI. Subtracting it would
# report a difference as large as a whole BMI, so those rows get NaN instead.
both_present = df_pivot["Yes"].ne(0) & df_pivot["No"].ne(0)
df_pivot["diff"] = bmi_gap.where(both_present)
df_pivot.head()

# Creates a new dataframe with a numeric risk-factor score per patient and
# pivots it into stroke counts per score.
# An explicit copy keeps the helper columns added below out of ``df``.
df_rf = df.copy()

# 1st, assign numbers to the smoking_status. The scores are keyed by label so
# the result does not depend on the order in which unique() returns them.
# NOTE(review): labels assumed to be those of the Kaggle stroke dataset -
# confirm against the CSV. Unknown status maps to NaN and is dropped below.
list_in = ["formerly smoked", "never smoked", "smokes", "Unknown"]
list_out = [0.5, 0, 1, np.nan]
smoking_scores = dict(zip(list_in, list_out))

# Fail loudly on a label without a score instead of silently dropping rows.
unexpected = set(df_rf["smoking_status"].dropna().unique()) - set(list_in)
if unexpected:
    raise ValueError(
        "unmapped smoking_status values: "
        + ", ".join(sorted(str(label) for label in unexpected))
    )

df_rf["smoking"] = df_rf["smoking_status"].map(smoking_scores)
df_rf = df_rf.drop(columns="smoking_status").dropna(axis=0)

# Creates a new col, 'risk_factors', for a numerical representation of a
# patient's risk factors. It is computed from df_rf, which is the frame that
# actually holds the 'smoking' column.
df_rf["risk_factors"] = df_rf[["CAD", "HTN", "smoking"]].sum(axis=1)

# Pivots the data: total strokes for each gender/work/residence group, with
# one column per risk-factor score (plus the 'All' margins).
df_rf = pd.pivot_table(
    df_rf,
    values="stroke",
    index=["gender", "work", "residence"],
    columns="risk_factors",
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
df_rf.head()

# Task E - Descriptive Statistics
# Smallest and largest BMI in the data, and the range spanned between them.
bmi_min = df["bmi"].min()
bmi_max = df["bmi"].max()
bmi_range = bmi_max - bmi_min

# sep="" joins the pieces exactly as string concatenation would.
print(
    "calculation for range of bmi: ", bmi_range,
    " = ", bmi_max,
    " - ", bmi_min,
    sep="",
)

# Reports the mean and the standard deviation of the blood glucose level
# (BGL), each on its own output line.
print(
    "the average BGL is: ", df["avg_glucose_level"].mean(),
    "\nthe standard deviation in BGL is: ", df["avg_glucose_level"].std(),
    sep="",
)