Structured data analysis 3: religion

# Don't change this cell; just run it. import numpy as np import pandas as pd # Safe settings for Pandas. pd.set_option('mode.chained_assignment', 'raise') %matplotlib inline import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') from client.api.notebook import Notebook ok = Notebook('religion.ok')

# Read the table from the CSV file in the working directory into a data
# frame, then show it.
religion = pd.read_csv('oliner_tab6_6.csv')
religion

# Check the loaded data frame against the notebook's tests.
_ = ok.grade('q_01_religion')

# Keep only the first four rows of the table, and label those rows by the
# values in the 'level' column instead of the default integer index.
top_religion = religion.head(4).set_index('level')
top_religion

_ = ok.grade('q_02_top_religion')

# Select every row, but only the 'rescuers' and 'actives' columns.
# No empty data frame is created first: the selection below already
# produces the data frame that the name should refer to.
rescuers_actives = top_religion.loc[:, ['rescuers', 'actives']]
rescuers_actives

_ = ok.grade('q_03_rescuers_actives')

# Run this cell
# Rebuild one label per person from the table of counts.
# Column totals give the number of people in each group; row totals give
# the number of people at each level of religiousness.
group_totals = rescuers_actives.sum(axis=0)
level_totals = rescuers_actives.sum(axis=1)
group = np.repeat(['rescuers', 'actives'], group_totals)
religiousness = np.repeat(
    ['Very', 'Somewhat', 'Not very', 'Not at all'],
    level_totals)
# Shuffling the religiousness labels in place pairs them with the group
# labels at random.
np.random.shuffle(religiousness)
# Cross-tabulate the shuffled labels against the groups, and show the
# resulting fake table.
fake_table = pd.crosstab(religiousness, group)
fake_table

# Run this cell
# The observed count in the 'Very' row of the 'actives' column.
actual_by_very = rescuers_actives.at['Very', 'actives']
actual_by_very

# Run this cell
# The matching count ('Very' row, 'actives' column) from the fake table
# built from shuffled labels.
fake_by_very = fake_table.at['Very', 'actives']
fake_by_very

# Build 1000 fake tables from shuffled labels, recording the
# ('Very', 'actives') count from each one.
very_actives = np.zeros(1000)
for i in np.arange(1000):
    # Shuffle in place to give a new random pairing of religiousness
    # labels with group labels.
    np.random.shuffle(religiousness)
    # The table must be rebuilt from the newly shuffled labels on every
    # iteration; reading from a table made before the loop would store
    # the same count 1000 times.
    fake_table = pd.crosstab(religiousness, group)
    fake_by_very = fake_table.loc['Very', 'actives']
    very_actives[i] = fake_by_very
# Show the first five values
very_actives[:5]

_ = ok.grade('q_04_very_actives')

#- Plot a histogram of the very_actives values, calculate the proportion
# NOTE(review): only the histogram is drawn below; no proportion is
# calculated in this cell yet.
plt.hist(very_actives)

# Run this cell
# Give each rescuer a numerical score by repeating 3, 2, 1, 0 according to
# the counts in the 'rescuers' column.
# Assumes the rows are in the order Very, Somewhat, Not very, Not at all,
# so that 3 means 'Very' and 0 means 'Not at all' - confirm against the table.
rescuer_counts = rescuers_actives['rescuers']
rescuer_scores = np.repeat([3, 2, 1, 0], rescuer_counts)
rescuer_scores

# Mean score across all rescuers.
rescuer_scores.mean()

# Same scoring as for the rescuers: repeat 3, 2, 1, 0 according to the
# counts in the 'actives' column, giving one score per person.
active_counts = rescuers_actives['actives']
active_scores = np.repeat([3, 2, 1, 0], active_counts)
active_scores

_ = ok.grade('q_05_active_scores')

# Run this cell
# Observed difference in mean score: rescuers minus actives.
observed = rescuer_scores.mean() - active_scores.mean()
observed

# Permutation test for the difference in mean scores.
# Pool all the scores into one new array; shuffling this array does not
# change rescuer_scores or active_scores.
pooled = np.append(rescuer_scores, active_scores)
# The fake rescuers group must have as many members as the real one.
n_rescuers = len(rescuer_scores)
fake_mean_diffs = np.zeros(10000)
for i in np.arange(10000):
    # Shuffle in place, so each trial assigns the scores to the two
    # groups at random.
    np.random.shuffle(pooled)
    # Split the shuffled scores into two non-overlapping groups; taking
    # the same slice for both would make every difference exactly zero.
    fake_rescuers_scores = pooled[:n_rescuers]
    fake_actives_scores = pooled[n_rescuers:]
    fake_mean = np.mean(fake_rescuers_scores) - np.mean(fake_actives_scores)
    fake_mean_diffs[i] = fake_mean
# Show the first five values
fake_mean_diffs[:5]

_ = ok.grade('q_06_fake_mean_diffs')

#- Use this cell to plot histogram and calculate proportion.
plt.hist(fake_mean_diffs)
# Proportion of the fake differences that are at least as large as the
# observed difference.
n_at_least_observed = np.count_nonzero(fake_mean_diffs >= observed)
very_fake_prop = n_at_least_observed / len(fake_mean_diffs)
very_fake_prop

# For your convenience, you can run this cell to run all the tests at once!
import os

# Each entry in the "tests" directory whose name starts with 'q' names a
# test; the last three characters of the file name are dropped
# (presumably the ".py" extension) to get the name passed to the grader.
test_names = [fname[:-3] for fname in os.listdir("tests")
              if fname.startswith('q')]
_ = [ok.grade(test_name) for test_name in test_names]