SQL vs Pandas: For Novices by a Novice

# do the things import pandas as pd import numpy as np

# do the things but bigger * https://tenor.com/view/more-kylo-ren-adam-driver-screaming-star-wars-gif-17734637 pd.options.display.max_columns=100 pd.options.display.max_rows=100

select sum(CASE WHEN Gender='Male' THEN 1 ELSE 0 END) as male_count, sum(CASE WHEN Gender='Female' THEN 1 ELSE 0 END) as female_count, Resident_District_Institution_Name as district from bt_syn_individual_test.by_syn_individual group by Resident_District_Institution_Name order by district

syn_table_sample.groupby(['Resident_District_Institution_Name' ,'Gender'])['Gender'].count()

pd.concat([pd.Series(syn_table_sample.loc[ syn_table_sample['Gender']=='Male'].groupby( 'Resident_District_Institution_Name')[ 'Resident_District_Institution_Name'].count().values, name='male_count'), pd.Series(syn_table_sample.loc[ syn_table_sample['Gender']=='Female'].groupby( 'Resident_District_Institution_Name')[ 'Resident_District_Institution_Name'].count().values, name='female_count'), pd.Series(syn_table_sample.loc[ syn_table_sample['Gender']=='Female'].groupby( 'Resident_District_Institution_Name')[ 'Resident_District_Institution_Name'].count().index, name='district')],axis=1).sort_values('district')

# just a little check on numbers and stuff syn_table_sample.groupby(['Resident_District_Institution_Name'])['Race_Ethnicity'].count().reset_index()

syn_table_group = syn_table_sample.groupby(['Resident_District_Institution_Name', 'Race_Ethnicity'])['age'].count().reset_index() syn_table_group

syn_copy = syn_table_sample.copy() # not necessary, but just cautious! syn_copy['district_total'] = 0

syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[0,0], ['district_total']] = test.iloc[0,1] syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[1,0], ['district_total']] = test.iloc[1,1] syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[2,0], ['district_total']] = test.iloc[2,1] syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[3,0], ['district_total']] = test.iloc[3,1] syn_copy.loc[syn_copy['Resident_District_Institution_Name']==test.iloc[4,0], ['district_total']] = test.iloc[4,1]

syn_percentage_group = ((syn_copy .groupby(['Resident_District_Institution_Name', 'Race_Ethnicity'])['age'] .agg('count') / syn_copy .groupby(['Resident_District_Institution_Name', 'Race_Ethnicity'])['district_total'] .agg('max')) * 100) syn_percentage_group_df = syn_percentage_group.reset_index(name='percentage')

syn_percentage_group_df.sort_values(['Resident_District_Institution_Name', 'percentage'], ascending=[True, False])

pd.concat( [(syn_table_sample.groupby(['Resident_District_Institution_Name']) ['Race_Ethnicity'].value_counts(normalize=True) * 100).rename('Percentage'), (syn_table_sample.groupby(['Resident_District_Institution_Name']) ['Race_Ethnicity'].value_counts(normalize=False).rename('Count'))], axis=1)

select by_syn_individual.Resident_District_Institution_Name district, by_syn_individual.Race_Ethnicity race_ethnicity, count(*) as race_ethnicity_count, (count(*) / sum(count(*)) over(partition by Resident_District_Institution_Name) * 100) race_ethnicity_percentage from bt_syn_individual_test.by_syn_individual group by Resident_District_Institution_Name, Race_Ethnicity order by district asc, race_ethnicity_count desc;