# importing the required libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
# making plots look like R
plt.style.use('ggplot')
plt.rc('axes', facecolor='white', edgecolor='black', axisbelow=True, grid=False)
plt.rcParams['figure.figsize'] = (7,6)
# an array to store the simulated exposure and disease status of each individual
population = np.array([])
# randomly determining the size of the population
population_size = np.random.randint(5000,20000)
# probabilistically determining the probability of exposure (maximum possibly probability == 30%)
p_exposed = np.random.uniform(high =0.3)
# probabilistically determining how dangerous the exposure is, this will set the probability of disease given exposure
exposure_danger = np.random.uniform(low =0.2, high = 0.4)
# probabilistically determining the unconditional risk of the disease (set to be no higher than the risk given exposure)
disease_general = np.random.uniform(high = exposure_danger)
print('\nDetails of simulated population and parameters:')
print('\nPopulation size =', population_size )
print('Probability of exposure to hazrard = ', np.round(p_exposed,2))
print('Probability of disease given exposure to hazard = ', np.round(exposure_danger, 2))
print('Probability of having the disease, unconditional on exposure status =', np.round(disease_general,2))
# probabilistically assigning a disease status to each member of the population, where the probability of having the
# disease changes conditionally on exposure status
for i in np.arange(population_size):
exposed_or_not = np.random.choice(['exposed','nonexposed'], p = [p_exposed, 1-p_exposed])
if exposed_or_not == 'exposed':
population = np.append(population, np.random.choice(['exposed_disease','exposed_no_disease'],
p=[exposure_danger,1-exposure_danger]))
if exposed_or_not == 'nonexposed':
population = np.append(population, np.random.choice(['nonexposed_disease','nonexposed_no_disease'],
p=[disease_general,1-disease_general]))
# displaying the population counts
pop_counts = pd.Series(population).value_counts()
display(pop_counts)
# plotting the counts
pop_counts.plot(kind = 'barh')
plt.xlabel('Frequency');
# creating counts based on exposure and disease status
population_n = len(population)
nonexposed_no_disease = np.count_nonzero(population == 'nonexposed_no_disease')
nonexposed_disease = np.count_nonzero(population == 'nonexposed_disease')
number_nonexposed = nonexposed_no_disease + nonexposed_disease
exposed_no_disease = np.count_nonzero(population == 'exposed_no_disease')
exposed_disease = np.count_nonzero(population == 'exposed_disease')
number_exposed = exposed_no_disease + exposed_disease
total_disease = nonexposed_disease + exposed_disease
# storing the counts in a dataframe
df_pop = pd.DataFrame(index = ['exposed:','nonexposed:', 'total:'], columns = ['disease:','no_disease:', 'total:'])
df_pop.loc['exposed:', 'disease:'] = exposed_disease
df_pop.loc['exposed:', 'no_disease:'] = exposed_no_disease
df_pop.loc['exposed:', 'total:'] = exposed_disease + exposed_no_disease
df_pop.loc['nonexposed:', 'disease:']= nonexposed_disease
df_pop.loc['nonexposed:', 'no_disease:'] = nonexposed_no_disease
df_pop.loc['nonexposed:', 'total:'] = nonexposed_disease + nonexposed_no_disease
df_pop.loc['total:', 'disease:'] = exposed_disease + nonexposed_disease
df_pop.loc['total:', 'no_disease:'] = exposed_no_disease + nonexposed_no_disease
df_pop.loc['total:', 'total:'] = population_n
# correcting 0 counts (if there are any) so the odds ratio is defined
if df_pop.iloc[0,0] == 0:
df_pop.iloc[0,0] = 0.5
if df_pop.iloc[0,1] == 0:
df_pop.iloc[0,1] = 0.5
if df_pop.iloc[1,0] == 0:
df_pop.iloc[1,0] = 0.5
if df_pop.iloc[1,1] == 0:
df_pop.iloc[1,1] = 0.5
# show the counts
print('\nSimulated Population Data:')
display(df_pop)
# calculating the population effect size
odds_disease_giv_exposure = exposed_disease/exposed_no_disease
odds_disease_giv_nonexposure = nonexposed_disease/nonexposed_no_disease
true_odds_ratio = odds_disease_giv_exposure/odds_disease_giv_nonexposure
true_log_odds_ratio = np.log(true_odds_ratio)
# print the true effect size
print()
print('True Odds Ratio:', true_odds_ratio)
print()
# plotting the population of odds of disease given exposure status
plt.figure(figsize = (10,6))
plt.bar(x = ['Exposed', 'Nonexposed'], height = [odds_disease_giv_exposure, odds_disease_giv_nonexposure]
, color = ['red', 'blue'])
plt.xlabel('Exposure Status')
plt.ylabel('Odds of Disease')
plt.title('Population Odds of Disease Given Exposure/Nonexposure')
plt.show()
# simulating studies for an inverse-variance fixed effect meta-analysis
no_of_studies = np.random.randint(5,40)
studies = []
# randomly sampling from the population
for i in np.arange(no_of_studies):
sample_size = np.random.randint(10,800)
study = np.random.choice(population, size = sample_size, replace = False)
studies.append(study)
counts = np.array([])
# creating a dataframe for the study counts
df = pd.DataFrame(columns = ['exposed_disease', 'exposed_no_disease',
'nonexposed_disease','nonexposed_no_disease'])
# creating counts for disease status and exposure status within each study
for i in studies:
temp_exposed_disease = np.count_nonzero(i == 'exposed_disease')
temp_exposed_no_disease = np.count_nonzero(i == 'exposed_no_disease')
temp_nonexposed_disease = np.count_nonzero(i == 'nonexposed_disease')
temp_nonexposed_no_disease = np.count_nonzero(i == 'nonexposed_no_disease')
study = [temp_exposed_disease,temp_exposed_no_disease,temp_nonexposed_disease,temp_nonexposed_no_disease]
for i in np.arange(len(study)):
if study[i] == 0:
study[i] = 0.5
df = df.append(pd.Series(study, index = ['exposed_disease', 'exposed_no_disease',
'nonexposed_disease','nonexposed_no_disease']), ignore_index = True)
df['sample_size']= df['exposed_disease'] + df['exposed_no_disease']+ df['nonexposed_disease'] + df['nonexposed_no_disease']
df.index.name = 'study number:'
# calculating the odds ratio for each simulated study
df['odds_ratio'] = (df['exposed_disease']/df['exposed_no_disease'])/(df['nonexposed_disease']/df['nonexposed_no_disease'])
# ensuring all datatypes are float
for i in df.columns:
df[i] = df[i].astype('float')
# calculating the log odds ratio for each simulated study
df['log_odds_ratio']= np.log(df['odds_ratio'])
# calculating the variance, standard error and 95% confidence intervals of the log odds ratio for each study
SE_log_odds_ratio = np.array([])
var_log_odds_ratio = np.array([])
var_log_odds_ratio = 1/df['exposed_disease'] + 1/df['exposed_no_disease'] + 1/df['nonexposed_disease'] + 1/df['nonexposed_no_disease']
SE_log_odds_ratio = np.sqrt(var_log_odds_ratio)
df['var_log_odds_ratio'] = var_log_odds_ratio
df['ub_ci_log_odds_ratio'] = df['log_odds_ratio'] + 1.96 * SE_log_odds_ratio
df['lb_ci_log_odds_ratio'] = df['log_odds_ratio'] - 1.96 * SE_log_odds_ratio
df['weight'] = 1/var_log_odds_ratio
# display the data
display(df)
# creating a forest plot of odds ratios
y = []
zeros = []
for i in np.arange(len(df.index)+2):
y.append(i)
zeros.append(0)
study_list = list(df.index)
plt.figure(figsize = (14,10))
plt.plot(zeros,y, label = 'line of no effect')
plt.xlabel('Log Odds Ratio')
plt.ylabel('Study Number')
plt.yticks(study_list)
plt.gca().invert_yaxis()
# plotting the confidence intervals
for i in np.arange(len(df)):
xs = np.linspace(df['lb_ci_log_odds_ratio'].iloc[i],df['ub_ci_log_odds_ratio'].iloc[i])
ys = []
for x in xs:
ys.append(study_list[i])
plt.plot(xs, ys, color = 'cyan')
plt.plot(xs, ys, color = 'cyan', label = '95% confidence interval')
# creating weights to scale the points on the plot by the weight the study is given in the meta-analysis
stndrd_weights = df['weight']/np.sum(df['weight']) * 100
# plotting the study effect sizes
for i in np.arange(len(df)):
plt.plot(df['log_odds_ratio'].iloc[i], i, 'o', color = 'darkblue', markersize = stndrd_weights[i])
# calculating the fixed effect meta-analysis statistics
effect_sizes = df['log_odds_ratio'].copy()
weights = df['weight'].copy()
sum_of_weights = np.sum(weights)
effect_size_times_weight = effect_sizes * weights
sum_effect_size_times_weight = np.sum(effect_size_times_weight)
weighted_mean = sum_effect_size_times_weight/sum_of_weights
var_weighted_mean = 1/sum_of_weights
se_weighted_mean = np.sqrt(var_weighted_mean)
ub_ci_wm = weighted_mean + 1.96 * se_weighted_mean
lb_ci_wm = weighted_mean - 1.96 * se_weighted_mean
# calculating a 95% confidence interval for the summary effect
ci_line = np.linspace(lb_ci_wm,ub_ci_wm)
ci_ys = []
for i in np.arange(len(ci_line)):
ci_ys.append(max(study_list)+1)
# plot the data
plt.plot(ci_line,ci_ys, color = 'red', label = '95% confidence interval')
plt.plot(df['log_odds_ratio'].iloc[len(df['log_odds_ratio'])-1], len(df['log_odds_ratio'])-1, 'o', color = 'darkblue', label = 'Study Log Odds Ratio (size indicates weight)')
plt.plot(weighted_mean,max(study_list)+1, 'o', color = 'crimson', label = 'Meta-analytic Estimate of Population Log Odds Ratio')
plt.plot(true_log_odds_ratio,max(df.index)+2, 'o', color = 'blue', label = 'True Log Odds Ratio')
# turning off a deprecation warning
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
ax = plt.subplot()
ax.legend(loc='center', bbox_to_anchor=(0.5, -0.15), shadow=True, ncol=1)
plt.show()
# calculating the significance of the summary effect size
z = weighted_mean/se_weighted_mean
p = 2 * (1 - scipy.stats.norm.cdf(abs(z)))
if p == 0:
p = 0.001
print('Z statistic of summary effect estimate = ', z)
print('p < ', p)
else:
print('Z statistic of summary effect estimate = ', z)
print('p = ', p)