# Exploratory pandas analysis of the healthcare stroke dataset.
import pandas as pd
import numpy as np
import seaborn as sns
from numpy import NaN
# Read the stroke dataset from its CSV file into a DataFrame.
csv_path = r"/work/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)
df.head()
# Per the original author's .count() inspection, some bmi values are missing.
# Every row that contains a missing value is discarded in place
# (dropna works on rows by default).
df.dropna(inplace=True)
# Non-null count per column after the cleanup.
df.count()
#Task A - Row and Column Names
# Shorten selected column labels. rename() only touches the labels that
# appear as keys in the mapping; every other column keeps its name.
new_labels = {
    'hypertension': 'HTN',
    'heart_disease': 'CAD',
    'Residence_type': 'residence',
    'work_type': 'work',
}
df.rename(columns=new_labels, inplace=True)
df.head()
# Before using 'id' as the index, compare the number of distinct ids with the
# per-column row counts: they must match for the ids to be unique.
len(df['id'].unique())
df.count()
# NOTE(review): the original author recorded 5110 for both figures; confirm
# that number against the output, since rows with missing values were dropped
# earlier in the script.
# Promote 'id' to the row index; set_index removes it from the columns.
df.set_index('id', inplace=True)
df.head()
#Task B - Groupby and Pivot Tables
#groupby demonstration
# Aggregation applied to each column within every gender group.
d = {'age': 'mean', 'HTN': 'sum', 'CAD': 'sum', 'avg_glucose_level': 'mean', 'bmi': 'mean', 'stroke': 'sum'}
# Rename map '<column>' -> '<column>_<aggregation>' (e.g. 'age' -> 'age_mean').
# It is built as its own dict: a plain `cols = d` would alias the aggregation
# spec, so filling in the new names would overwrite d as well.
cols = {key: key + '_' + value for key, value in d.items()}
# Group once and reuse the grouping for both the aggregation and the row count.
by_gender = df.groupby(by='gender')
df_mf = by_gender.agg(d)
df_mf.rename(columns=cols, inplace=True)
# Number of non-null 'age' entries per gender, used as the group size.
df_mf['count'] = by_gender['age'].count()
df_mf.head()
#pivot / pivot_table demonstration
# Median BMI for every gender / HTN / CAD combination, split into one column
# per value of 'stroke' (relabelled 0 -> 'No', 1 -> 'Yes').
# fill_value=0 means a combination with no observations shows 0, not NaN;
# margins=True appends the 'All' row and column.
df_pivot = pd.pivot_table(
    df,
    values='bmi',
    index=['gender', 'HTN', 'CAD'],
    columns='stroke',
    aggfunc='median',
    fill_value=0,
    margins=True,
)
df_pivot.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
df_pivot.head()
#Task C - Dataframe transformation using functions
# Sum of the 'stroke' column for each age, largest total first.
strokes_by_age = df.groupby('age')['stroke'].sum()
strokes_by_age.sort_values(ascending=False)
# The same total per residence type and per marital status, smallest first.
df.groupby('residence')['stroke'].sum().sort_values()
df.groupby('ever_married')['stroke'].sum().sort_values()
#
# Minimum, maximum and median of the average glucose level in one call.
df['avg_glucose_level'].agg(['min', 'max', 'median'])
# Element-wise HTN + CAD; fill_value=0 makes a value missing on one side
# count as 0 so the sum is still defined.
df['HTN'].add(df['CAD'], fill_value=0)
# Element-wise ratio of average glucose level to BMI.
df['avg_glucose_level'].div(df['bmi'])
#Task D - Create new columns from existing columns
# Adds a 'diff' column to the pivot table: median BMI of people who had a
# stroke minus median BMI of people who did not.
# The pivot table was built with fill_value=0, so a 0 in 'Yes' or 'No' marks a
# group with no observations on that side, not a real BMI of zero (assumes BMI
# is always positive). Subtracting such a placeholder would report a spurious
# difference, so those rows get NaN instead.
has_both = df_pivot['Yes'].ne(0) & df_pivot['No'].ne(0)
df_pivot['diff'] = (df_pivot['Yes'] - df_pivot['No']).where(has_both)
df_pivot.head()
# Builds a separate risk-factor frame. df.copy() gives df_rf its own data so
# the columns added below never leak back into df.
df_rf = df.copy()
# 1st, assign a numeric score to each smoking_status category.
# NOTE(review): scores are matched to categories by order of first appearance
# in the data (unique() preserves that order), so the mapping depends on row
# order. Presumably the intent is formerly smoked -> 0.5, never smoked -> 0,
# smokes -> 1, unknown -> missing; confirm against list_in and consider an
# explicit {label: score} dict.
list_in = list(df_rf['smoking_status'].unique())
# np.nan is the canonical spelling; the NaN alias was removed in NumPy 2.0.
list_out = [.5, 0, 1, np.nan]
# astype(float) keeps the scores numeric however replace() infers the dtype.
df_rf['smoking'] = df_rf['smoking_status'].replace(list_in, list_out).astype(float)
df_rf = df_rf.drop('smoking_status', axis=1)
# Rows whose smoking score is missing are removed.
df_rf = df_rf.dropna(axis=0)
#Creates a new col, 'risk_factors', for a numerical representation of a pt's risk factors
# 'smoking' is only ever assigned on df_rf, so the sum must read from df_rf;
# reading it from df only works if the two frames happen to share storage.
df_rf['risk_factors'] = df_rf[['CAD', 'HTN', 'smoking']].sum(axis=1)
# Total strokes per gender / work / residence combination, one column per
# distinct risk-factor score; empty combinations show 0 and margins=True
# appends the 'All' row and column.
df_rf = pd.pivot_table(
    df_rf,
    values='stroke',
    index=['gender', 'work', 'residence'],
    columns='risk_factors',
    aggfunc='sum',
    fill_value=0,
    margins=True,
)
df_rf.head()
#Task E - Descriptive Statistics
# Smallest and largest BMI and the spread between them.
bmi_values = df['bmi']
bmi_min, bmi_max = bmi_values.min(), bmi_values.max()
bmi_range = bmi_max - bmi_min
# !s formats each number with str(), exactly as string concatenation would.
print(f'calculation for range of bmi: {bmi_range!s} = {bmi_max!s} - {bmi_min!s}')
# Mean and standard deviation of the average glucose level (BGL).
glucose = df['avg_glucose_level']
print(f'the average BGL is: {glucose.mean()!s}\nthe standard deviation in BGL is: {glucose.std()!s}')