import numpy as np
import pandas as pd
import random
import statistics as stats
n, p = 1, .5
counter = 0
for i in range(1,5000):
X = lst = [0] * 40
x = random.sample(range(0,40),30)
for k in range(1,30):
X[x[k]] ==1
w = np.random.binomial(1, 0.5,40)
X_tran = np.transpose(np.array(X))
W_tran = np.transpose(np.array(w))
dataset = pd.DataFrame({'X': X, 'W':w }, columns=['X', 'W'])
subset_df = dataset[dataset["W"] ==1]
column_count = len(subset_df['W'])/len(dataset)
if column_count < 0.95: counter +=1
percent = counter/5000
print('Percent poorly balanced',percent*100, 'percent')
n, p = 1, .5
simulated_stat = []
Yw1 = []
Yw0 = []
cnttreat = 3750
for i in range(1,5000):
X = lst = [0] * 5000
x = random.sample(range(0,5000),3750)
for k in range(1,3750):
X[x[k]] ==1
w = np.random.binomial(1, 0.5,5000)
X_tran = np.transpose(np.array(X))
W_tran = np.transpose(np.array(w))
dataset = pd.DataFrame({'X': X, 'W':w }, columns=['X', 'W'])
treated_df = dataset[dataset["W"] ==1]
untreated_df = dataset[dataset["W"] ==0]
mean_treated_df=treated_df.mean()
mean_untreated_df=untreated_df.mean()
mean_diff=mean_treated_df-mean_untreated_df
simulated_stat.append(abs(mean_diff))
Yw1.append(mean_treated_df)
Yw0.append(mean_untreated_df)
estimate = np.mean(Yw1)/cnttreat -np.mean(Yw0)/(5000-cnttreat)
estimate
estimate = np.mean(Yw1)/cnttreat -np.mean(Yw0)/(5000-cnttreat)
sigma1 = np.var(np.array(Yw1))
sigma2 = np.var(np.array(Yw0))
range_ = 1.96*np.sqrt(sigma1/cnttreat +sigma2/(40-cnttreat))
Lower_bound = estimate -range_
Upper_bound = estimate + range_
print('The CI is [',Lower_Bound,':',Upper_bound,']')
![Proof for 3 part a ii ](C:/Users/pault/Desktop/Harvard Data Science Masters 2020/Spring 2022/Stat 186/HW/HW 2/3a.png)
from IPython import display
display.Image("/work/3a.png")
from IPython import display
display.Image("/work/3b.png")
data = pd.read_csv('HS.csv')
data.head()
# generate y and w
data['y'] = data['Steps.Day11'] - data['Steps.Day10']
data['w'] = data['Planning.Eve.Day10']
data['w'].value_counts()
N = len(data['w'])
Nt = len(data[data['w']==1])
constant = (N/(Nt*(N-Nt)))
Test_statistic = constant*(np.sum(data['y']*(data['w']-constant)))
Test_statistic