# do the things
import pandas as pd
import numpy as np
# do the things but bigger * https://tenor.com/view/more-kylo-ren-adam-driver-screaming-star-wars-gif-17734637
pd.options.display.max_columns=100
pd.options.display.max_rows=100
select
sum(CASE WHEN Gender='Male' THEN 1 ELSE 0 END) as male_count,
sum(CASE WHEN Gender='Female' THEN 1 ELSE 0 END) as female_count,
Resident_District_Institution_Name as district
from bt_syn_individual_test.by_syn_individual
group by Resident_District_Institution_Name
order by district
syn_table_sample.groupby(['Resident_District_Institution_Name'
,'Gender'])['Gender'].count()
pd.concat([pd.Series(syn_table_sample.loc[
syn_table_sample['Gender']=='Male'].groupby(
'Resident_District_Institution_Name')[
'Resident_District_Institution_Name'].count().values,
name='male_count'),
pd.Series(syn_table_sample.loc[
syn_table_sample['Gender']=='Female'].groupby(
'Resident_District_Institution_Name')[
'Resident_District_Institution_Name'].count().values,
name='female_count'),
pd.Series(syn_table_sample.loc[
syn_table_sample['Gender']=='Female'].groupby(
'Resident_District_Institution_Name')[
'Resident_District_Institution_Name'].count().index,
name='district')],axis=1).sort_values('district')
# just a little check on numbers and stuff
syn_table_sample.groupby(['Resident_District_Institution_Name'])['Race_Ethnicity'].count().reset_index()
syn_table_group = syn_table_sample.groupby(['Resident_District_Institution_Name',
'Race_Ethnicity'])['age'].count().reset_index()
syn_table_group
syn_copy = syn_table_sample.copy() # not necessary, but just cautious!
syn_copy['district_total'] = 0
syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[0,0],
['district_total']] = test.iloc[0,1]
syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[1,0],
['district_total']] = test.iloc[1,1]
syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[2,0],
['district_total']] = test.iloc[2,1]
syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[3,0],
['district_total']] = test.iloc[3,1]
syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[4,0],
['district_total']] = test.iloc[4,1]
syn_percentage_group = ((syn_copy
.groupby(['Resident_District_Institution_Name',
'Race_Ethnicity'])['age']
.agg('count')
/ syn_copy
.groupby(['Resident_District_Institution_Name',
'Race_Ethnicity'])['district_total']
.agg('max')) * 100)
syn_percentage_group_df = syn_percentage_group.reset_index(name='percentage')
syn_percentage_group_df.sort_values(['Resident_District_Institution_Name', 'percentage'], ascending=[True, False])
pd.concat(
[(syn_table_sample.groupby(['Resident_District_Institution_Name'])
['Race_Ethnicity'].value_counts(normalize=True) * 100).rename('Percentage'),
(syn_table_sample.groupby(['Resident_District_Institution_Name'])
['Race_Ethnicity'].value_counts(normalize=False).rename('Count'))],
axis=1)
select by_syn_individual.Resident_District_Institution_Name district,
by_syn_individual.Race_Ethnicity race_ethnicity,
count(*) as race_ethnicity_count,
(count(*) / sum(count(*)) over(partition by Resident_District_Institution_Name) * 100) race_ethnicity_percentage
from bt_syn_individual_test.by_syn_individual
group by Resident_District_Institution_Name, Race_Ethnicity
order by district asc, race_ethnicity_count desc;