The complete R package logreg2ph
and code for the
simulation settings included in this paper.
Part of the R package TwoPhaseReg
as described in this paper.
This package combines elements of the R packages logreg2ph
and TwoPhaseReg
To install the package, run the following in your R
console:
devtools::install_github("Epic-Doughnut/sleev")
Inside the simulations
subdirectory, you will find the
following:
Table1_SimSetup.R
: simulations with outcome
misclassification and a binary error-prone covariate, intended to
inspect increasing Phase I and Phase II sample sizesTables2&3_SimSetup.R
: Simulations with outcome
misclassification and a continuous covariate with additive errors,
intended to inspect varied error varianceTable4_SimSetup.R
: Simulations with outcome
misclassification and a continuous covariate with additive errors,
intended to inspect varied outcome error ratesTableS2_SimSetup.R
: Simulations with outcome
misclassification and binary error-prone covariate, intended to inspect
robustness of the fully-parametric MLE and proposed SMLE under complex
covariate errorTableS3_SimSetup.R
: Simulations with outcome
misclassification and binary error-prone covariate, intended to inspect
robustness of the fully-parametric MLE and proposed SMLE under complex
outcome and covariate errorTableS4_SimSetup.R
: Simulations with continuous
covariate with additive errors (classical measurement error), intended
to compare performance of proposed SMLE to traditional regression
calibration (RC) approachTableS5_SimSetup.R
: Simulation results for the naive
estimator under outcome misclassification and a continuous covariate
under varied parameterizationsTableS6_SimSetup.R
: Simulations with outcome
misclassification and continuous covariate with differentially biased
additive errors, intended to demonstrate robustness of the proposed SMLE
to different covariate error typesTableS7_SimSetup.R
: Simulations with outcome
misclassification and continuous covariate with differential
multiplicative errors, intended to demonstrate robustness of the
proposed SMLE to different covariate error typesInside each of the files above, you will find code to generate the appropriate data for that simulation setting, e.g.,
```{r, eval = F, tidy = TRUE} set.seed(918)
N <- 1000 # Phase I size = N n <- 250 # Phase II/audit size = n
Xa <- rbinom(n = N, size = 1, prob = 0.25) Xb <- rbinom(n = N, size = 1, prob = 0.5) Y <- rbinom(n = N, size = 1, prob = (1 + exp(-(- 0.65 - 0.2 * Xb - 0.1 * Xa))) ^ (- 1))
sensX <- specX <- 0.75 delta0 <- - log(specX / (1 - specX)) delta1 <- - delta0 - log((1 - sensX) / sensX) Xbstar <- rbinom(n = N, size = 1, prob = (1 + exp(- (delta0 + delta1 * Xb + 0.5 * Xa))) ^ (- 1))
sensY <- 0.95 specY <- 0.90 theta0 <- - log(specY / (1 - specY)) theta1 <- - theta0 - log((1 - sensY) / sensY) Ystar <- rbinom(n = N, size = 1, prob = (1 + exp(- (theta0 - 0.2 * Xbstar + theta1 * Y - 0.2 * Xb - 0.1 * Xa))) ^ (- 1))
Then, the user has the option of two audit designs: simple random sampling (SRS) or 1:1 case-control sampling based on $Y^*$ (naive case-control). Based on these designs, the validation indicators V are generated as follows:
```{r, eval = FALSE}
# Choose audit design: SRS or -----------------------------
## Naive case-control: case-control based on Y^* ----
audit <- "SRS" #or "Naive case-control"
# Draw audit of size n based on design --------------------
## V is a TRUE/FALSE vector where TRUE = validated --------
if(audit == "SRS")
{
V <- seq(1, N) %in% sample(x = seq(1, N), size = n, replace = FALSE)
}
if(audit == "Naive case-control")
{
V <- seq(1, N) %in% c(sample(x = which(Ystar == 0), size = 0.5 * n, replace = FALSE),
sample(x = which(Ystar == 1), size = 0.5 * n, replace = FALSE))
}
Finally, combine the generated data and validation indicators into an analytical dataset:
{r, eval = F, tidy = T} # Build dataset -------------------------------------------- sdat <- cbind(Y, Xb, Ystar, Xbstar, Xa, V) # Make Phase-II variables Y, Xb NA for unaudited subjects --- sdat[!V, c("Y", "Xb")] <- NA
The R
scripts each contain implementations for the
estimators discussed in the paper. Examples of each are provided
below:
{r, eval = F, tidy = T} naive <- glm(Ystar ~ Xbstar + Xa, family = "binomial", data = data.frame(sdat)) beta_naive <- naive$coefficients[2] se_naive <- sqrt(diag(vcov(naive)))[2]
{r, eval = F, tidy = T} cc <- glm(Y[V] ~ Xb[V] + Xa[V], family = "binomial") beta_cc <- cc$coefficients[2] se_cc <- sqrt(diag(vcov(cc)))[2]
{r, eval = F, tidy = T} library(sandwich) if (audit == "Naive case-control") { sample_wts <- ifelse(Ystar[V] == 0, 1 / ((0.5 * n) / (table(Ystar)[1])), 1 / ((0.5 * n) / (table(Ystar)[2]))) ht <- glm(Y[V] ~ Xb[V] + Xa[V], family = "binomial", weights = sample_wts) beta_ht <- ht$coefficients[2] se_ht <- sqrt(diag(sandwich(ht)))[2] }
```{r, eval = F, tidy = T} ### Influence function for logistic regression ### Taken from: https://github.com/T0ngChen/multiwave/blob/master/sim.r inf.fun <- function(fit) { dm <- model.matrix(fit) Ihat <- (t(dm) %% (dm fit\(fitted.values * (1 - fit\)fitted.values))) / nrow(dm) ## influence function infl <- (dm * resid(fit, type = “response”)) %*% solve(Ihat) infl }
naive_infl <- inf.fun(naive) # error-prone influence functions based on naive model colnames(naive_infl) <- paste0(“if”, 1:3)
sdat <- cbind(id = 1:N, sdat, naive_infl)
library(survey) if (audit == “SRS”) { sstudy <- twophase(id = list(~id, ~id), data = data.frame(sdat), subset = ~V) } else if (audit == “Naive case-control”) { sstudy <- twophase(id = list(~id, ~id), data = data.frame(sdat), strat = list(NULL, ~Ystar), subset = ~V) } scal <- calibrate(sstudy, ~ if1 + if2 + if3, phase = 2, calfun = “raking”)
rake <- svyglm(Y ~ Xb + Xa, family = “binomial”, design = scal) beta_rake <- rake$coefficients[2] se_rake <- sqrt(diag(vcov(rake)))[2]
#### 5. Maximum Likelihood Estimator (MLE) (for Binary Xb* Only)
```{r, eval = F, tidy = T}
# Script: two-phase log-likelihood specification adapted from Tang et al. (2015) named ~/code/Tang_twophase_loglik_binaryX.R
source("Tang_twophase_loglik_binaryX.R")
fit_Tang <- nlm(f = Tang_twophase_loglik,
p = rep(0, 12),
hessian = TRUE,
Val = "V",
Y_unval = "Ystar",
Y_val="Y",
X_unval = "Xbstar",
X_val = "Xb",
C = "Xa",
data = sdat)
beta_mle <- fit_Tang$estimate[10]
se_mle <- sqrt(diag(solve(fit_Tang$hessian)))[10]
```{r, eval = F, tidy = T} # Construct B-spline basis ——————————- nsieve <- 4 B <- matrix(0, nrow = N, ncol = nsieve) B[which(Xa == 0 & Xbstar == 0), 1] <- 1 B[which(Xa == 0 & Xbstar == 1), 2] <- 1 B[which(Xa == 1 & Xbstar == 0), 3] <- 1 B[which(Xa == 1 & Xbstar == 1), 4] <- 1 colnames(B) <- paste0(“bs”, seq(1, nsieve)) sdat <- cbind(sdat, B)
library(“logreg2ph”) smle <- logreg2ph(Y_unval = “Ystar”, Y_val = “Y”, X_unval = “Xbstar”, X_val = “Xb”, C = “Xa”, Validated = “V”, Bspline = colnames(B), data = sdat, noSE = FALSE, MAX_ITER = 1000, TOL = 1E-4) beta_smle <- smle\(Coefficients\)Coefficient[2] se_smle <- smle\(Coefficients\)SE[2] ```