library(ggdist)
library(stringr)
library(ggplot2)
library(dplyr)
library(distributional)
library(patchwork)
# Number of simulated label assignments used to build each null distribution.
K <- 1000

# 0/1 outcomes recorded for email A (presumably 1 = success -- confirm).
emailA_res <- c(1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1)
# Keep the proportion at full precision; it is rounded only for display so
# that observed_diff is not distorted by rounding each proportion first.
p_a <- mean(emailA_res)
n_a <- length(emailA_res)
print(str_glue("p_a={round(p_a, 2)}, n_a={n_a}"))

# 0/1 outcomes recorded for email B (same coding as email A).
emailB_res <- c(0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1)
p_b <- mean(emailB_res)
n_b <- length(emailB_res)
print(str_glue("p_b={round(p_b, 2)}, n_b={n_b}"))

# Observed difference in proportions (A minus B), computed from the unrounded
# proportions so it is on the same scale as the simulated differences.
observed_diff <- p_a - p_b
print(observed_diff)
# Null distribution of the difference in proportions (A minus B) for the
# observed data, built by permuting group labels K times.
#
# The labels are shuffled rather than drawn independently with probability
# 0.5, so every simulated split keeps the real group sizes. Independent draws
# would produce roughly balanced groups and therefore a null distribution that
# is narrower than the one implied by the actual (unbalanced) design.
pooled_res <- c(emailA_res, emailB_res)
group_labels <- rep(
  c("A", "B"),
  times = c(length(emailA_res), length(emailB_res))
)

sim_results <- tibble(
  difference = vapply(
    seq_len(K),
    function(i) {
      # sample() without a size returns a random permutation of its input.
      shuffled <- sample(group_labels)
      mean(pooled_res[shuffled == "A"]) - mean(pooled_res[shuffled == "B"])
    },
    numeric(1)
  )
)
# Dot plot (with interval summary) of the simulated differences, restricted
# to [-0.15, 0.15]. The y axis title, text and ticks are hidden, and a
# vertical line marks a difference of zero.
p4 <- ggplot(sim_results, aes(x = difference)) +
  stat_dotsinterval() +
  geom_vline(xintercept = 0, color = "blue4") +
  xlim(-0.15, 0.15) +
  ggtitle("difference distribution (n=500)") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
p4

# Same plot with the observed difference overlaid as a second vertical line.
p5 <- p4 +
  geom_vline(xintercept = observed_diff, color = "darkgoldenrod3")
p5
# Synthetic samples: 3000 Bernoulli(0.56) draws for email A and 2300
# Bernoulli(0.5) draws for email B, converted to numeric 0/1 vectors.
emailA_distr <- dist_bernoulli(0.56)
emailA_res <- as.numeric(generate(emailA_distr, 3000)[[1]])
emailB_distr <- dist_bernoulli(0.5)
emailB_res <- as.numeric(generate(emailB_distr, 2300)[[1]])

# NOTE(review): observed_diff is not recomputed for this synthetic sample, so
# the plot that follows still marks the value computed from the original data.
# This looks deliberate (same difference, larger n) -- confirm.
# p_a = round(sum(emailA_res)/length(emailA_res),2)
# p_b = round(sum(emailB_res)/length(emailB_res),2)
#
# observed_diff = p_a - p_b

# Null distribution of the difference in proportions (A minus B), built by
# permuting group labels K times. Shuffling the real label vector keeps the
# 3000/2300 group sizes in every simulated split; drawing each label
# independently with probability 0.5 would not.
pooled_res <- c(emailA_res, emailB_res)
group_labels <- rep(
  c("A", "B"),
  times = c(length(emailA_res), length(emailB_res))
)

sim_results <- tibble(
  difference = vapply(
    seq_len(K),
    function(i) {
      # sample() without a size returns a random permutation of its input.
      shuffled <- sample(group_labels)
      mean(pooled_res[shuffled == "A"]) - mean(pooled_res[shuffled == "B"])
    },
    numeric(1)
  )
)
# Dot plot (with interval summary) of the simulated differences for the larger
# synthetic sample, restricted to [-0.15, 0.15]. Vertical lines mark zero and
# the current value of observed_diff; the y axis decorations are hidden.
ggplot(sim_results, aes(x = difference)) +
  stat_dotsinterval() +
  geom_vline(xintercept = 0, color = "blue4") +
  geom_vline(xintercept = observed_diff, color = "darkgoldenrod3") +
  xlim(-0.15, 0.15) +
  ggtitle("difference distribution (n=~5000)") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
# Synthetic samples with no true difference: 300 Bernoulli(0.5) draws for
# email A and 200 Bernoulli(0.5) draws for email B, as numeric 0/1 vectors.
emailA_distr <- dist_bernoulli(0.5)
emailA_res <- as.numeric(generate(emailA_distr, 300)[[1]])
emailB_distr <- dist_bernoulli(0.5)
emailB_res <- as.numeric(generate(emailB_distr, 200)[[1]])

# NOTE(review): observed_diff is not recomputed for this synthetic sample, so
# the plot that follows still marks the value computed from the original data
# -- confirm this is intended.
# p_a = round(sum(emailA_res)/length(emailA_res),2)
# p_b = round(sum(emailB_res)/length(emailB_res),2)
#
# observed_diff = p_a - p_b

# Null distribution of the difference in proportions (A minus B), built by
# permuting group labels K times. Shuffling the real label vector keeps the
# 300/200 group sizes in every simulated split; drawing each label
# independently with probability 0.5 would not.
pooled_res <- c(emailA_res, emailB_res)
group_labels <- rep(
  c("A", "B"),
  times = c(length(emailA_res), length(emailB_res))
)

sim_results <- tibble(
  difference = vapply(
    seq_len(K),
    function(i) {
      # sample() without a size returns a random permutation of its input.
      shuffled <- sample(group_labels)
      mean(pooled_res[shuffled == "A"]) - mean(pooled_res[shuffled == "B"])
    },
    numeric(1)
  )
)
# Dot plot (with interval summary) of the simulated differences for the
# 300 + 200 synthetic sample, restricted to [-0.15, 0.15]. Vertical lines mark
# zero and the current value of observed_diff; the y axis decorations are
# hidden.
ggplot(sim_results, aes(x = difference)) +
  stat_dotsinterval() +
  geom_vline(xintercept = 0, color = "blue4") +
  geom_vline(xintercept = observed_diff, color = "darkgoldenrod3") +
  xlim(-0.15, 0.15) +
  # The title previously read "n=~5000", copied from the larger simulation;
  # this sample has 300 + 200 = 500 observations.
  ggtitle("difference distribution (n=500)") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )