Structured data analysis 5: Oxford-AstraZeneca

# Don't change this cell; just run it. import numpy as np import pandas as pd # Safe settings for Pandas. pd.set_option('mode.chained_assignment', 'raise') import matplotlib.pyplot as plt %matplotlib inline # The OKpy testing system. from client.api.notebook import Notebook ok = Notebook('oxford_astrazeneca.ok')

# Run this cell.
ox_vax = pd.read_csv('ox_astra_cov002.csv')
ox_vax

# Overall proportion of cases: total of the "Cases" column divided by the
# total of the "N" column, over every row of the table.  This is calculated
# directly here because the `calc_prop_cases` function is only defined in a
# later cell (calling it at this point raises a NameError).
prop_covid = ox_vax['Cases'].sum() / ox_vax['N'].sum()
prop_covid

_ = ok.grade('q_prop_covid')

def calc_prop_cases(df):
    """ Return the overall proportion of cases in the counts table `df`

    Parameters
    ----------
    df : DataFrame
        Table with a column "N" (count per row) and a column "Cases"
        (number of cases per row).

    Returns
    -------
    prop : float
        Sum of the "Cases" column divided by the sum of the "N" column.
    """
    # Work on the table that was passed in, not on the global `ox_vax`, so
    # the function gives the right answer for any table (e.g. a subset).
    return df['Cases'].sum() / df['N'].sum()

# Check this function returns the same value as you calculated above, when
# called on the whole table.
calc_prop_cases(ox_vax)

_ = ok.grade('q_calc_prop_cases')

# Vaccine efficiency is 1 minus the relative risk, where the relative risk is
# the proportion of cases among the vaccinated divided by the proportion of
# cases among the controls.  No `relative_risk` function is defined in this
# notebook, so the calculation is written out here.
# NOTE(review): assumes every row whose "Group" is not 'Control' belongs to
# the vaccine arm - confirm against the data file.
is_control = ox_vax['Group'] == 'Control'
control_rows = ox_vax[is_control]
vaccine_rows = ox_vax[~is_control]
control_risk = control_rows['Cases'].sum() / control_rows['N'].sum()
vaccine_risk = vaccine_rows['Cases'].sum() / vaccine_rows['N'].sum()
vax_eff = 1 - vaccine_risk / control_risk
vax_eff

_ = ok.grade('q_vax_eff')

def calc_efficiency(df):
    """ Return the vaccine efficiency (1 - relative risk) for table `df`

    Parameters
    ----------
    df : DataFrame
        Counts table with columns "Group", "N" and "Cases".  Rows with
        "Group" equal to 'Control' are the control arm; all other rows are
        pooled as the vaccine arm (NOTE(review): confirm there is only one
        non-control label in the data).

    Returns
    -------
    efficiency : float
        1 minus (proportion of cases in the vaccine rows divided by the
        proportion of cases in the control rows).
    """
    # Use the `df` argument throughout; assigning to a local called `ox_vax`
    # before reading it raised UnboundLocalError in the earlier version.
    is_control = df['Group'] == 'Control'
    control_rows = df[is_control]
    vaccine_rows = df[~is_control]
    # Risk is cases per participant, not the raw case count, so that arms of
    # different sizes are compared fairly.
    control_risk = control_rows['Cases'].sum() / control_rows['N'].sum()
    vaccine_risk = vaccine_rows['Cases'].sum() / vaccine_rows['N'].sum()
    return 1 - vaccine_risk / control_risk

# Run this cell, check you get the same answer as previously.
calc_efficiency(ox_vax)

_ = ok.grade('q_calc_efficiency')

# Efficiency within each dose regime: select the rows for that dose, then
# apply the same efficiency calculation as for the whole table.
ld_vax_eff = calc_efficiency(ox_vax[ox_vax['Dose'] == 'LD'])
sd_vax_eff = calc_efficiency(ox_vax[ox_vax['Dose'] == 'SD'])
# NOTE(review): difference taken as LD minus SD - confirm this is the
# direction the test expects.
vax_eff_diff = ld_vax_eff - sd_vax_eff
print('LD efficiency', ld_vax_eff)
print('SD efficiency', sd_vax_eff)
print('Efficiency difference', vax_eff_diff)

_ = ok.grade('q_vax_eff_diff')

def calc_ld_sd_ediff(df):
    """ Return LD vaccine efficiency minus SD vaccine efficiency for `df`

    Parameters
    ----------
    df : DataFrame
        Counts table with columns "Dose" (values 'LD' and 'SD'), "Group",
        "N" and "Cases".  Rows with "Group" equal to 'Control' are the
        control arm; all other rows are pooled as the vaccine arm.

    Returns
    -------
    ediff : float
        Efficiency (1 - relative risk) for the 'LD' rows minus efficiency
        for the 'SD' rows.
    """
    def efficiency_for(dose):
        # Efficiency restricted to the rows for one dose regime.  Written
        # out here so this function does not depend on other cells.
        rows = df[df['Dose'] == dose]
        is_control = rows['Group'] == 'Control'
        control_rows = rows[is_control]
        vaccine_rows = rows[~is_control]
        control_risk = control_rows['Cases'].sum() / control_rows['N'].sum()
        vaccine_risk = vaccine_rows['Cases'].sum() / vaccine_rows['N'].sum()
        return 1 - vaccine_risk / control_risk

    return efficiency_for('LD') - efficiency_for('SD')

# Run this cell, check you get the same answer as previously.
# (The call sits on its own line so that it actually runs, rather than being
# part of the comment.)
calc_ld_sd_ediff(ox_vax)

_ = ok.grade('q_calc_ld_sd_ediff')

# What the first five rows will look like.
person_start = pd.DataFrame()
person_start['Dose'] = np.repeat(['LD'], [5])
person_start['Group'] = np.repeat(['Control'], [5])
# Boolean False rather than the string 'False', to match the True / False
# values the real "Case" column holds.
person_start['Case'] = np.repeat([False], [5])
person_start

person_df = pd.DataFrame()
n_per_cell = np.array(ox_vax['N'])
# One "Dose" label per person: each label is repeated by the "N" for the
# matching row of the counts table.
# NOTE(review): the hard-coded labels assume the rows of `ox_vax` are in the
# order LD, LD, SD, SD - confirm against the data file.
person_df['Dose'] = np.repeat(['LD', 'LD', 'SD', 'SD'], n_per_cell)
# Show the first five rows
person_df.head()

# One "Group" label per person: take the label from each row of the counts
# table and repeat it by that row's "N", so the labels line up with "Dose".
person_df['Group'] = np.repeat(np.array(ox_vax['Group']), n_per_cell)
# Show the first five rows so far.
person_df.head()

_ = ok.grade('q_person_df_group')

# Run this cell.
# "Case" will be False for non-case, True for case.
# Start with all False
to_repeat = np.repeat([False], len(n_per_cell) * 2)
# Set every other value, from the second, to True
to_repeat[1:len(to_repeat):2] = True
# Note that we can do the same thing with this short-cut.
to_repeat[1::2] = True
# Show the result
to_repeat

# Start with an integer array of zeros, of the right length.
repeat_nos = np.repeat([0], len(to_repeat))
n_cases = np.array(ox_vax['Cases'])
# `to_repeat` alternates False, True: one pair per row of the counts table.
# The False in each pair is repeated once per non-case (N minus Cases), the
# True once per case, so each row contributes exactly N people.
repeat_nos[0::2] = n_per_cell - n_cases
repeat_nos[1::2] = n_cases
person_df['Case'] = np.repeat(to_repeat, repeat_nos)
# Show the first five rows.
person_df.head()

_ = ok.grade('q_person_df_with_case')

# Make a thing that knows how to "group" the rows "by" the pairs of labels in
# "Dose" and "Group".
grouped = person_df.groupby(['Dose', 'Group'])
# Aggregate within these groups, by
# * counting the number of rows
#   (put this value into the column "N")
# * counting the number of True values in the "Case" column
#   (put this value into the column "Cases")
aggregated = grouped.agg(N=('Case', len),
                         Cases=('Case', np.count_nonzero))
# Drop the fancy index (row labels) to make an ordinary data frame.
tabulated = aggregated.reset_index()
tabulated

def cases_to_counts(full_df):
    """ Tabulate one-row-per-person `full_df` into "N" and "Cases" counts

    Parameters
    ----------
    full_df : DataFrame
        Table with columns "Dose", "Group" and "Case".

    Returns
    -------
    counts : DataFrame
        One row per ("Dose", "Group") pair, with columns "Dose", "Group",
        "N" (number of rows in the pair) and "Cases" (number of non-zero /
        True values of "Case" in the pair).
    """
    by_cell = full_df.groupby(['Dose', 'Group'])
    counts = by_cell.agg(
        N=('Case', len),
        Cases=('Case', np.count_nonzero),
    )
    # Turn the ("Dose", "Group") row labels back into ordinary columns.
    return counts.reset_index()

# This should return values identical to the original "ox_vax" data frame.
cases_to_counts(person_df)

# This should evaluate to (and show) True.
cases_to_counts(person_df).equals(ox_vax)

fake_df = person_df.copy()
fake_df['Dose'] = np.random.permutation(person_df['Dose'])
fake_df.head()

n_iters = 1000
fake_ediffs = np.zeros(n_iters)
for i in range(n_iters):
    # Shuffle the "Dose" labels across people, re-tabulate the shuffled
    # table into counts, and record the LD / SD efficiency difference for
    # this shuffle.
    fake_df['Dose'] = np.random.permutation(person_df['Dose'])
    fake_ediffs[i] = calc_ld_sd_ediff(cases_to_counts(fake_df))
# Show the first 10 efficiency differences.
fake_ediffs[:10]

_ = ok.grade('q_fake_ediffs')

#- Histogram of eff_diffs
plt.hist(fake_ediffs)

# Proportion of shuffled differences greater than or equal to the difference
# stored in `vax_eff_diff`.
prop_ediff_ge = np.count_nonzero(fake_ediffs >= vax_eff_diff) / n_iters
# Show the result
prop_ediff_ge

_ = ok.grade('q_prop_ediff_ge')

# For your convenience, you can run this cell to run all the tests at once!
import os
# Sorted so the tests run in a predictable order.
for test_file in sorted(os.listdir("tests")):
    # Only grade files named q*.py; stripping the last three characters from
    # anything else would give a wrong test name.
    if test_file.startswith('q') and test_file.endswith('.py'):
        _ = ok.grade(test_file[:-len('.py')])