02 (Permutation) Hypothesis Testing - Simulation and Visual Intro

library(ggdist) library(stringr) library(ggplot2) library(dplyr) library(distributional) library(patchwork)

# Number of simulated relabelings used to build each difference distribution.
K <- 1000

# Recorded binary (0/1) results for email A.
emailA_res <- c(1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1)

# Sample size, then the share of 1s rounded to two decimals.
n_a <- length(emailA_res)
p_a <- round(sum(emailA_res) / n_a, 2)

# Report both summary values.
print(str_glue("p_a={p_a}, n_a={n_a}"))

# Recorded binary (0/1) results for email B.
emailB_res <- c(0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1)

# Sample size, then the share of 1s rounded to two decimals.
n_b <- length(emailB_res)
p_b <- round(sum(emailB_res) / n_b, 2)

# Report both summary values.
print(str_glue("p_b={p_b}, n_b={n_b}"))

# Observed gap between the two proportions. p_a and p_b are rounded to two
# decimals where they are defined, so this difference inherits that rounding.
observed_diff <- p_a - p_b
print(observed_diff)

# Build the distribution of the A-minus-B proportion difference under random
# relabeling of the pooled results.
pooled_res <- c(emailA_res, emailB_res)

# One label per pooled result, with the real group sizes. Shuffling this vector
# is a true permutation: every relabeling keeps length(emailA_res) "A"s and
# length(emailB_res) "B"s, so neither group can end up empty (which would give
# NaN) and the group sizes match the observed ones.
group_labels <- rep(
  c("A", "B"),
  times = c(length(emailA_res), length(emailB_res))
)

# K permuted differences, collected in one vectorised pass instead of writing
# into the tibble row by row.
sim_results <- tibble(
  difference = vapply(
    seq_len(K),
    function(i) {
      shuffled <- sample(group_labels)
      mean(pooled_res[shuffled == "A"]) - mean(pooled_res[shuffled == "B"])
    },
    numeric(1)
  )
)

# Dot plot of the simulated differences; the blue line marks zero difference.
p4 <- ggplot(sim_results, aes(x = difference)) +
  stat_dotsinterval() +
  xlim(-0.15, 0.15) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  ) +
  ggtitle("difference distribution (n=500)") +
  geom_vline(xintercept = 0, color = "blue4")
p4

# Same plot as p4 with a gold line added at the observed difference.
p5 <- p4 +
  geom_vline(xintercept = observed_diff, color = "darkgoldenrod3")
p5

# Replace both samples with simulated Bernoulli draws:
# 3000 results for A with p = 0.56 and 2300 results for B with p = 0.5.
emailA_distr <- dist_bernoulli(0.56)
emailA_res <- as.numeric(generate(emailA_distr, 3000)[[1]])
emailB_distr <- dist_bernoulli(0.5)
emailB_res <- as.numeric(generate(emailB_distr, 2300)[[1]])

# The recomputation of observed_diff is left commented out, so the gold line in
# the plot below still marks the value computed earlier in the script.
# p_a = round(sum(emailA_res)/length(emailA_res),2)
# p_b = round(sum(emailB_res)/length(emailB_res),2)
#
# observed_diff = p_a - p_b

# Pool the results and shuffle the real group labels. Shuffling (rather than
# drawing labels with replacement) keeps the group sizes fixed at 3000 and 2300
# in every relabeling, which is what a permutation test requires.
pooled_res <- c(emailA_res, emailB_res)
group_labels <- rep(
  c("A", "B"),
  times = c(length(emailA_res), length(emailB_res))
)

# K permuted A-minus-B proportion differences.
sim_results <- tibble(
  difference = vapply(
    seq_len(K),
    function(i) {
      shuffled <- sample(group_labels)
      mean(pooled_res[shuffled == "A"]) - mean(pooled_res[shuffled == "B"])
    },
    numeric(1)
  )
)

# Dot plot of the simulated differences: blue line at zero, gold line at
# observed_diff.
ggplot(sim_results, aes(x = difference)) +
  stat_dotsinterval() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  ) +
  xlim(-0.15, 0.15) +
  ggtitle("difference distribution (n=~5000)") +
  geom_vline(xintercept = 0, color = "blue4") +
  geom_vline(xintercept = observed_diff, color = "darkgoldenrod3")

# Replace both samples with simulated Bernoulli draws that share the same
# success probability: 300 results for A and 200 for B, both with p = 0.5.
emailA_distr <- dist_bernoulli(0.5)
emailA_res <- as.numeric(generate(emailA_distr, 300)[[1]])
emailB_distr <- dist_bernoulli(0.5)
emailB_res <- as.numeric(generate(emailB_distr, 200)[[1]])

# The recomputation of observed_diff is left commented out, so the gold line in
# the plot below still marks the value computed earlier in the script.
# p_a = round(sum(emailA_res)/length(emailA_res),2)
# p_b = round(sum(emailB_res)/length(emailB_res),2)
#
# observed_diff = p_a - p_b

# Pool the results and shuffle the real group labels. Shuffling (rather than
# drawing labels with replacement) keeps the group sizes fixed at 300 and 200
# in every relabeling, which is what a permutation test requires.
pooled_res <- c(emailA_res, emailB_res)
group_labels <- rep(
  c("A", "B"),
  times = c(length(emailA_res), length(emailB_res))
)

# K permuted A-minus-B proportion differences.
sim_results <- tibble(
  difference = vapply(
    seq_len(K),
    function(i) {
      shuffled <- sample(group_labels)
      mean(pooled_res[shuffled == "A"]) - mean(pooled_res[shuffled == "B"])
    },
    numeric(1)
  )
)

# Dot plot of the simulated differences: blue line at zero, gold line at
# observed_diff. The title states the actual pooled size (300 + 200 = 500).
ggplot(sim_results, aes(x = difference)) +
  stat_dotsinterval() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  ) +
  xlim(-0.15, 0.15) +
  ggtitle("difference distribution (n=500)") +
  geom_vline(xintercept = 0, color = "blue4") +
  geom_vline(xintercept = observed_diff, color = "darkgoldenrod3")