Lets compare the coverage of two confidence intervals - one made using Normal approximation (equation 3.6 or HC2 robust standard errors), the other under the constant treatment effects assumption.

# Create Dataset

library(dplyr)
library(randomizr)
library(knitr)

set.seed(100)

data <- data_frame(Y0 = rnorm(500, mean = 0, sd = 8), Y1 = Y0 + rnorm(500, mean = 5,
sd = 4), X1 = as.numeric(Y0 >= 0), X2 = randomizr::complete_ra(N = 500,
m = 250), C1 = ifelse(Y0 <= -1, 1, ifelse((Y0 > -1) & (Y0 <= 0), 2, ifelse((Y0 >
0) & (Y0 <= 1), 3, 4))), C2 = sample(x = c(1, 2, 3, 4), size = 500, replace = T,
prob = c(0.25, 0.25, 0.25, 0.25)))

kable(head(data), caption = "Dataset for Confidence Interval Converage Test",
row.names = T)
Dataset for Confidence Interval Converage Test
Y0 Y1 X1 X2 C1 C2
1 -4.0175388 -4.802689 0 1 1 2
2 1.0522493 7.315672 1 1 4 1
3 -0.6313367 2.997673 0 1 2 1
4 7.0942785 4.368866 1 0 4 2
5 0.9357702 6.907054 1 1 3 4
6 2.5490407 6.097969 1 1 4 4
true_ate <- mean(data$Y1 - data$Y0)  # True ATE
true_ate
## [1] 5.28487

# Sampling distribution of ATE estimates

declaration <- declare_ra(N = nrow(data), m = 250)  # Complete random assignment, m=250, N=500

Z <- conduct_ra(declaration)

# Get the sampling distribution of all ATE estimates

D <- obtain_permutation_matrix(declaration)

ate_sims <- rep(NA, ncol(D))

for (i in 1:ncol(D)) {
Z_temp <- D[, i]
Y <- data$Y1 * Z_temp + data$Y0 * (1 - Z_temp)
ate_sims[i] <- mean(Y[Z_temp == 1]) - mean(Y[Z_temp == 0])
}

hist(ate_sims, breaks = 100, main = "Sampling Distribution of ATE Estimates")

# Performance of Normal approximation confidence intervals

To get the coverage of confidence intervals constructed using equation 3.6 I run a simulation in which for different samples of the population ($$N_{pop}=500$$ and $$n_{sample}=200$$), I compute the ate estimate, standard error estimate, and check if the resulting confidence interval contains the true ATE (which is known from the above dataset: $$E[\tau_i] = 5.28487$$).

library(commarobust)

# Define output vector
coverage_normal <- rep(NA, 1000)
# For a random sample, create a declaration in randomizr
sample_declaration <- declare_rs(N = 500, n = 200)

for (i in 1:1000) {
# Generate a random sample
data$selected <- draw_rs(sample_declaration) sample <- data %>% filter(selected == 1) # Conduct complete random assignment Z_temp <- complete_ra(N = 200, m = 100) # Switching equation Y <- sample$Y1 * Z_temp + sample$Y0 * (1 - Z_temp) model <- lm(Y ~ Z_temp) # ATE estimate ate_estimate <- commarobust(model)[2, 1] # SE estimate (HC2 robust SE) se_estimate <- commarobust(model)[2, 2] # Construct CI ci_lower <- ate_estimate - 1.96 * se_estimate ci_upper <- ate_estimate + 1.96 * se_estimate # Check if this CI contains true ate coverage_normal[i] <- as.numeric((true_ate >= ci_lower) & (true_ate <= ci_upper)) } # Coverage of CI mean(coverage_normal) ## [1] 0.951 # Performance of Gerber and Green SEs To get the coverage of confidence intervals constructed under the constant treatment effects assumption, I run a simulation in which for different samples ($$n_{sample} = 200$$) of the population ($$N_{pop} = 500$$), I compute the ate estimate, use that to get a hypothetical schedule of potential outcomes under the fixed effects assumption, and check if the resulting confidence interval contains the true ATE (5.28487) coverage_gerbgreen <- rep(NA, 1000) sample_declaration <- declare_rs(N = 500, n = 200) set.seed(32) for (i in 1:1000) { # Generate a random sample data$selected <- draw_rs(sample_declaration)
sample <- data %>% filter(selected == 1)
# Conduct complete random assignment
Z_temp <- complete_ra(N = 200, m = 100)
# Switching equation
Y <- sample$Y1 * Z_temp + sample$Y0 * (1 - Z_temp)
model <- lm(Y ~ Z_temp)
# ATE estimate
ate_estimate <- commarobust(model)[2, 1]
# SE estimate (under constant effects) Schedule of potential outcomes, under
# fixed effects
Y0_temp <- Y - ate_estimate * Z_temp
Y1_temp <- Y0_temp + ate_estimate
# Create permutation matrix
declaration <- declare_ra(N = length(Y), m = length(Y)/2)
D <- obtain_permutation_matrix(declaration, maximum_permutations = 1000)
# Loop to get sampling distribution
ate_sims <- rep(NA, ncol(D))
for (j in 1:ncol(D)) {
z_sim <- D[, j]
# Compute ATE
ate_sims[j] <- mean(Y1_temp[z_sim == 1]) - mean(Y0_temp[z_sim == 0])
}
# Construct CI
ci_lower <- quantile(ate_sims, probs = 0.025)
ci_upper <- quantile(ate_sims, probs = 0.975)
# Check if this CI contains true ate
coverage_gerbgreen[i] <- as.numeric((true_ate >= ci_lower) & (true_ate <=
ci_upper))
}

# Coverage of CI
mean(coverage_gerbgreen)
## [1] 0.956

# Small and large sample properties

I write a function that computes coverage for different sample sizes: $$n_{sample} \in \{10,50,100,250\}$$.

Sample.Size Coverage.NormalApprox Coverage.GerberGreen
10 0.912 0.896
50 0.944 0.932
100 0.961 0.951
250 0.949 0.947