library(mvtBinaryEP) library(geepack) library(binGroup) library(BSDA) #corstr is correlation structure. you can choose between "independence", #"exchangeable", "unstructured", "fixed". When "fixed" is chosen, the R option #should also be specified. For two traits, we can stick with "exchangeable". #start and gt.control option are the same as in gtreg() in binGroup. gtreg.mt <- function(formula, data, subject, groupn, trait, sens, spec, linkf = c("logit", "probit", "cloglog"), method = c("ES", "GEE"), R = NULL, corstr = "independence", start = NULL, control = gt.control(...), ...) { call <- match.call() mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data", "subject", "groupn", "trait"), names(mf), 0) mf <- mf[c(1, m)] mf$drop.unused.levels <- TRUE mf[[1]] <- as.name("model.frame") mf <- eval(mf, parent.frame()) mt <- attr(mf, "terms") subj <- model.extract(mf, "subject") gr <- model.extract(mf, "groupn") tr <- model.extract(mf, "trait") Y <- model.response(mf, "any") if (length(dim(Y)) == 1) { nm <- rownames(Y) dim(Y) <- NULL if (!is.null(nm)) names(Y) <- nm } X <- if (!is.empty.model(mt)) model.matrix(mt, mf) else matrix(, NROW(Y), 0) linkf <- match.arg(linkf) CORSTRS <- c("independence", "exchangeable", "unstructured", "fixed") corstrv <- pmatch(corstr, CORSTRS, -1) if (corstrv == -1) stop("invalid corstr.") if (corstrv == 4) { if (is.null(R)) stop("need R matrix for fixed corstr.") else { if (!is.matrix(R)) R <- as.matrix(R) if ((nrow(R) != ncol(R)) || nrow(R) != length(sens)) stop("R needs to be a square matrix of dimensions equal to number of traits.") } } fit <- switch(method <- match.arg(method), "ES" = ES(Y, X, subj, gr, tr, sens, spec, linkf, R, corstrv, start, control), "GEE" = GEE.gt(Y, X, gr, tr, sens, spec, linkf, R, corstrv, start, control)) fit <- c(fit, list(call = call, formula = formula, link = linkf, terms = mt)) class(fit) <- c("gt.mt", "gt") fit } #using 2nd order approximation method to estimate alpha ES <- function(Y, X, subject, groupn, trait, sens, spec, linkf, R, corstrv, start = NULL, control = gt.control()) { if (control$time) start.time <- proc.time() nt <- length(sens) sam <- length(Y)/nt vec <- 1:sam groupn1 <- groupn[trait == 1] num.g <- max(groupn1) X1 <- X[order(subject), ] temp <- by(matrix(Y, nrow = sam), groupn1, tail, n = 1) z <- as.matrix(do.call(rbind, temp)) colnames(z) <- NULL if (is.null(start)) { groupn2 <- groupn + (trait - 1) * num.g temp <- by(X, groupn2, mean) cova.mean <- do.call(rbind, temp) sub2 <- rep(1:num.g, nt) cova.mean <- cova.mean[order(sub2), ] z <- z[order(sub2)] beta.old <- geese.fit(cova.mean, z, id = sort(sub2), family = binomial(link = linkf), corstr = "exch")$beta } else beta.old <- start beta.old <- as.vector(beta.old) group.sizes <- rep(NA, num.g) g2 <- unique(groupn1) for (i in 1:num.g) { group.sizes[i] <- length(groupn1[groupn1 == g2[i]]) } theta <- prodp <- matrix(NA, nrow = num.g, ncol = nt) counts <- 1 diff <- 1 extra.loop <- FALSE next.loop <- TRUE while (next.loop) { xib <- matrix(X %*% beta.old, nrow = sam) pijk <- switch(linkf, logit = plogis(xib), probit = pnorm(xib), cloglog = 1 - exp(-exp(xib))) new.data <- numeric(0) for (i in 1:nt) { sens1 <- sens[i] spec1 <- spec[i] prodp[, i] <- tapply(1 - pijk[, i], groupn1, prod) theta[, i] <- (1 - spec1) * prodp[, i] + sens1 * (1 - prodp[, i]) expect <- rep(NA, times = sam) y <- Y[trait == i] den <- rep((1 - spec1) * prodp[, i] + sens1 * (1 - prodp[, i]), group.sizes) den2 <- rep(spec1 * prodp[, i] + (1 - sens1) * (1 - prodp[, i]), group.sizes) for (j in vec) { if (y[j] == 0) expect[j] <- (1 - sens1) * pijk[j, i]/den2[j] else expect[j] <- sens1 * pijk[j, i]/den[j] } expect[expect > 1] <- 1 expect[expect < 0] <- 0 new.data <- c(new.data, expect) } resi <- z - theta rjk <- matrix(new.data, sam) - pijk if (corstrv != 1) { if (corstrv == 4) workR <- R if (corstrv == 2 || corstrv == 3) { dp <- sqrt(pijk/(1 - pijk)) sum1 <- t.lin <- t.sq <- matrix(0, nrow = nt, ncol = nt) for (j in 1:(nt - 1)) for (j1 in (j + 1):nt) { cz <- resi[, j] * resi[, j1] sum1[j, j1] <- sum1[j, j1] + sum(cz) af <- (1 - sens[j] - spec[j]) * (1 - sens[j1] - spec[j1]) cf <- tapply(dp[, j] * dp[, j1], groupn1, sum) ffa <- af * prodp[, j] * prodp[, j1] * cf aka <- cz/ffa for (grn in 1:num.g) { gs <- group.sizes[grn] vec1 <- vec[groupn1 == grn] m <- Combinations(gs, 2) m1 <- vec1[m[1, ]] m2 <- vec1[m[2, ]] cf2 <- sum(dp[m1, j] * dp[m1, j1] * dp[m2, j] * dp[m2, j1]) coef.sq <- af * cf2 * prodp[grn, j] * prodp[grn, j1] coef.lin <- af * cf[grn] * prodp[grn, j] * prodp[grn, j1] t.lin[j, j1] <- t.lin[j, j1] + coef.lin t.sq[j, j1] <- t.sq[j, j1] + coef.sq } } if (corstrv == 2) { t.linex <- sum(t.lin) t.sqex <- sum(t.sq) sum1ex <- sum(sum1) if ((sqm <- (t.linex^2 + 4 * t.sqex * sum1ex)) < 0) ak <- sum1ex/t.linex else ak <- as.numeric((-t.linex + sqrt(sqm))/2/t.sqex) alpha <- ak workR <- matrix(rep(alpha, nt^2), nrow = nt) diag(workR) <- 1 } else { workR <- matrix(0, nrow = nt, ncol = nt) for (j in 1:(nt - 1)) for (j1 in (j + 1):nt) { if ((sqm <- (t.lin[j, j1]^2 + 4 * t.sq[j, j1] * sum1[j, j1])) < 0) ak <- sum1[j, j1]/t.lin[j, j1] else ak <- as.numeric((-t.lin[j, j1] + sqrt(sqm))/2/t.sq[j, j1]) workR[j, j1] <- ak } workR <- workR + t(workR) + diag(nt) } } zcor <- rep(workR[lower.tri(workR)], sam) #print(workR) } else workR <- diag(nt) if (!extra.loop) { new.data <- new.data[order(subject)] suppress <- function(w) if (any(grepl("non-integer #successes in a binomial glm", w))) invokeRestart("muffleWarning") if (corstrv != 1) { mod.fit <- withCallingHandlers(geese.fit(X1, new.data, id = sort(subject), family = binomial(link = linkf), corstr = "fixed", zcor = zcor), warning = suppress) } else { mod.fit <- withCallingHandlers(geese.fit(X1, new.data, id = sort(subject), family = binomial(link = linkf)), warning = suppress) } relc <- (beta.old - mod.fit$beta)/beta.old diff <- max(abs(relc)) beta.old <- mod.fit$beta if (control$trace) cat("beta is", beta.old, "\tdiff is", diff, "\n") counts <- counts + 1 if (diff <= control$tol || counts > control$maxit) extra.loop <- TRUE } else next.loop <- FALSE } pt1 <- pt3 <- 0 for (grn in 1:num.g) { gs <- group.sizes[grn] vec1 <- vec[groupn1 == grn] nb <- matrix(NA, nrow = gs, ncol = nt) for (i1 in 1:gs) { nb[i1, ] <- make.link(linkf)$mu.eta(xib[vec1[i1], ]) } p2k <- X[groupn1 == grn, ] pt2 <- 0 for (i in vec1) { seq1 <- seq(i - vec1[1] + 1, nt * gs, by = gs) dxik <- t(p2k[seq1, ]) dpijk <- make.link(linkf)$mu.eta(xib[i, ]) delta.inv <- diag(dpijk) dik <- dxik %*% delta.inv bik <- diag(sqrt(pijk[i, ] * (1 - pijk[i, ]))) vik <- bik %*% workR %*% bik aa <- rep(NA, gs) aa1 <- matrix(NA, nrow = nt, ncol = gs) for (j in 1:nt) { fm <- (1 - sens[j] - spec[j]) * prodp[grn, j] + sens[j] for (i1 in 1:gs) { if (vec1[i1] == i) { ge1 <- (sens[j] * fm + (1 - sens[j] - spec[j]) * prodp[grn, j] /(1 - pijk[i, j]) * sens[j] * pijk[i, j])/fm^2 ge2 <- ((1 - sens[j]) * (1 - fm) - (1 - sens[j] - spec[j]) * prodp[grn, j] /(1 - pijk[i, j]) * (1 - sens[j]) * pijk[i, j])/(1 - fm)^2 aa[i1] <- ge1 * theta[grn, j] + ge2 * (1 - theta[grn, j]) } else { ge1 <- ((1 - sens[j] - spec[j]) * prodp[grn, j] /(1 - pijk[vec1[i1], j]) * sens[j] * pijk[i, j])/fm^2 ge2 <- -((1 - sens[j] - spec[j]) * prodp[grn, j] /(1 - pijk[vec1[i1], j]) * (1 - sens[j]) * pijk[i, j])/(1 - fm)^2 aa[i1] <- ge1 * theta[grn, j] + ge2 * (1 - theta[grn, j]) } } aa1[j, ] <- aa } aa2 <- matrix(0, nrow = nt, ncol = gs * nt) for (j in 1:nt) { ca <- (1 + (j - 1) * gs) : (j * gs) aa2[j, ca] <- aa1[j, ] * nb[, j] } dex <- aa2 %*% p2k add <- dik %*% solve(vik) %*% (t(dik) - dex) pt1 <- pt1 + add add2 <- dik %*% solve(vik) %*% rjk[i, ] pt2 <- pt2 + add2 } pt3 <- pt3 + pt2 %*% t(pt2) } cov.san <- solve(pt1) %*% pt3 %*% solve(pt1) cov.nai <- solve(pt1) if (diff > control$tol && counts > control$maxit) warning("ES algorithm did not converge.") if (control$time) { end.time <- proc.time() save.time <- end.time - start.time cat("\n Number of minutes running:", round(save.time[3]/60, 2), "\n \n") } list(coefficients = beta.old, robust.variance = cov.san, naive.variance = cov.nai, theta = theta, R = workR, counts = counts - 1, residuals = resi) } GEE.gt <- function(Y, X, groupn, trait, sens, spec, linkf, R, corstrv, start = NULL, control = gt.control()) { if (control$time) start.time <- proc.time() nt <- length(sens) sam <- length(Y)/nt vec <- 1:sam groupn1 <- groupn[trait == 1] num.g <- max(groupn1) K <- ncol(X) temp <- by(matrix(Y, nrow = sam), groupn1, tail, n = 1) z <- as.matrix(do.call(rbind, temp)) colnames(z) <- NULL if (is.null(start)) { groupn2 <- groupn + (trait - 1) * num.g temp <- by(X, groupn2, mean) cova.mean <- do.call(rbind, temp) sub2 <- rep(1:num.g, nt) cova.mean <- cova.mean[order(sub2), ] z <- z[order(sub2)] beta.old <- geese.fit(cova.mean, z, id = sort(sub2), family = binomial(link = linkf), corstr = "exch")$beta } else beta.old <- start beta.old <- as.vector(beta.old) group.sizes <- rep(NA, num.g) g2 <- unique(groupn1) for (i in 1:num.g) { group.sizes[i] <- length(groupn1[groupn1 == g2[i]]) } theta <- matrix(NA, nrow = num.g, ncol = nt) counts <- 1 diff <- 1 extra.loop <- FALSE next.loop <- TRUE while (next.loop) { xib <- matrix(X %*% beta.old, nrow = sam) pijk <- switch(linkf, logit = plogis(xib), probit = pnorm(xib), cloglog = 1 - exp(-exp(xib))) for (j in 1:nt) { prodp <- tapply(1 - pijk[, j], groupn1, prod) theta[, j] <- (1 - spec[j]) * prodp + sens[j] * (1 - prodp) } resi <- z - theta rjk <- resi/sqrt(theta * (1 - theta)) for (i in 1:num.g) for (j in 1:nt) if (!is.finite(rjk[i, j])) rjk[i, j] <- 0 if (corstrv != 1) { if (corstrv == 4) workR <- R if (corstrv == 2 || corstrv == 3) { sum1 <- matrix(0, nrow = nt, ncol = nt) for (j in 1:(nt - 1)) for (j1 in (j + 1):nt) { temp <- sum(rjk[, j] * rjk[, j1]) sum1[j, j1] <- sum1[j, j1] + temp } if (corstrv == 2) { cs <- nt * (nt - 1) * num.g/2 alpha <- sum(sum1)/(cs - K) workR <- matrix(rep(alpha, nt^2), nrow = nt) diag(workR) <- 1 } else { workR <- matrix(0, nrow = nt, ncol = nt) for (j in 1:(nt - 1)) for (j1 in (j + 1):nt) { ak <- sum1[j, j1]/num.g workR[j, j1] <- ak } workR <- workR + t(workR) + diag(nt) } } } else workR <- diag(nt) pt1 <- pt2 <- pt3 <- 0 for (grn in 1:num.g) { gs <- group.sizes[grn] vec1 <- vec[groupn1 == grn] p2k <- X[groupn1 == grn, ] p1k <- matrix(0, nrow = nt, ncol = gs * nt) for (j in 1:nt) { pro <- switch(linkf, logit = 1 + exp(xib[, j]), probit = 1 - pnorm(xib[, j]), cloglog = exp(-exp(xib[, j]))) d1 <- tapply(pro, groupn1, prod) for (i1 in 1:gs) { col1 <- i1 + (j - 1) * gs p1k[j, col1] <- -(1 - sens[j] - spec[j]) * switch(linkf, logit = plogis(xib[vec1[i1], j])/d1[grn], probit = -dnorm(xib[vec1[i1], j])/ (1 - pnorm(xib[vec1[i1], j])) * d1[grn], cloglog = -exp(xib[vec1[i1], j]) * d1[grn]) } } dik <- p1k %*% p2k bik <- diag(sqrt(theta[grn, ] * (1 - theta[grn, ]))) vik <- bik %*% workR %*% bik add <- t(dik) %*% solve(vik) %*% dik pt1 <- pt1 + add add2 <- t(dik) %*% solve(vik) %*% resi[grn, ] pt2 <- pt2 + add2 cov.zk <- resi[grn, ] %*% t(resi[grn, ]) add3 <- t(dik) %*% solve(vik) %*% cov.zk %*% solve(vik) %*% dik pt3 <- pt3 + add3 } if (!extra.loop) { beta.n <- beta.old + qr.solve(pt1, pt2, tol = 1e-10) relc <- (beta.n - beta.old)/beta.old diff <- max(abs(relc)) beta.old <- as.vector(beta.n) names(beta.old) <- colnames(X) if (control$trace) cat("beta is", beta.old, "\tdiff is", diff, "\n") counts <- counts + 1 if (diff <= control$tol || counts > control$maxit) extra.loop <- TRUE } else next.loop <- FALSE } cov.san <- solve(pt1) %*% pt3 %*% solve(pt1) cov.nai <- solve(pt1) if (diff > control$tol && counts > control$maxit) warning("Fisher's scoring algorithm did not converge.") if (control$time) { end.time <- proc.time() save.time <- end.time - start.time cat("\n Number of minutes running:", round(save.time[3]/60, 2), "\n \n") } list(coefficients = beta.old, robust.variance = cov.san, naive.variance = cov.nai, theta = theta, R = workR, counts = counts - 1, residuals = resi) } print.gt.mt <- function (x, digits = max(3, getOption("digits") - 3)) { cat("\nCall:\n", deparse(x$call), "\n\n", sep = "") if (length(coef(x))) { cat("Coefficients:\n") print.default(format(coef(x), digits = digits), print.gap = 2, quote = FALSE) } else cat("No coefficients\n") cat("\nEstimated Correlation Matrix:\n") print(x$R) cat("\nNumber of Iterations:", x$counts, "\n") invisible(x) } summary.gt.mt <- function(obj) { coef.p <- obj$coefficients cov.mat <- obj$robust.variance dimnames(cov.mat) <- list(names(coef.p), names(coef.p)) var.cf <- diag(cov.mat) s.err <- sqrt(var.cf) zvalue <- coef.p/s.err dn <- c("Estimate", "san.se") pvalue <- 2 * pnorm(-abs(zvalue)) coef.table <- cbind(coef.p, s.err, zvalue, pvalue) dimnames(coef.table) <- list(names(coef.p), c(dn, "z value", "Pr(>|z|)")) keep <- match(c("call", "link", "counts", "R"), names(obj), 0) ans <- c(obj[keep], list(coefficients = coef.table, resid = residuals(obj, type="pearson"), cov.mat =cov.mat)) class(ans) <- "summary.gt.mt" return(ans) } print.summary.gt.mt <- function(obj, digits = max(3, getOption("digits") - 3), signif.stars = getOption("show.signif.stars"), ...) { cat("\nCall:\n") cat(paste(deparse(obj$call), sep = "\n", collapse = "\n"), "\n\n", sep = "") cat("Summary of Pearson Residuals: \n") obj$resid <- quantile(obj$resid, na.rm = TRUE) names(obj$resid) <- c("Min", "1Q", "Median", "3Q", "Max") print.default(obj$resid, digits = digits, na.print = "", print.gap = 2) cat("\nCoefficients:\n") coefs <- obj$coefficients printCoefmat(coefs, digits = digits, signif.stars = signif.stars, na.print = "NA", ...) cat("\nEstimated Correlation Matrix:\n") print(round(obj$R, 4)) cat("\nNumber of Iterations:", obj$counts, "\n") invisible(obj) } residuals.gt.mt <- function(object, type = c("pearson", "response")) { type <- match.arg(type) r <- object$residuals zhat <- object$theta res <- switch(type, response = r, pearson = r/sqrt(zhat * (1 - zhat))) res } gt.control <- function (tol = 0.001, maxit = 500, trace = FALSE, time = TRUE) { if (!is.numeric(tol) || tol <= 0) stop("value of 'tol' must be > 0") if (!is.numeric(maxit) || maxit <= 0) stop("maximum number of iterations must be > 0") list(tol = tol, maxit = maxit, trace = trace, time = time) } sim.mt <- function(x = NULL, gshape = 17, gscale = 1.4, par, R, linkf = c("logit", "probit", "cloglog"), sample.size = NULL, group.size, sens, spec, sens.ind = NULL, spec.ind = NULL, print.Bounds = T) { nt <- length(sens) if (is.null(sens.ind)) sens.ind <- sens if (is.null(spec.ind)) spec.ind <- spec if (is.null(x)) { x <- rgamma(n = sample.size, shape = gshape, scale = gscale) X <- cbind(1, x) } else { X <- cbind(1, x) sample.size <- nrow(X) } linkf <- match.arg(linkf) par <- matrix(par, ncol = nt, byrow = T) pijk <- switch(linkf, logit = plogis(X %*% par), probit = pnorm(X %*% par), cloglog = 1 - exp(-exp(X %*% par))) if (print.Bounds) { for (i in 1:(nt - 1)) for (j in (i + 1):nt) { p1 <- pijk[, i] p2 <- pijk[, j] a <- as.vector(sqrt(p1 * (1 - p2)/(p2 * (1 - p1)))) b <- as.vector(sqrt(p1 * p2/(1 - p1)/(1 - p2))) upper <- min(cbind(a, 1/a)) lower <- max(cbind(-b, -1/b)) cat("The corr between trait", i, "&", j, "is bounded above by", upper, "\n", "and bounded below by", lower, "\n") } } y <- matrix(NA, nrow = sample.size, ncol = nt) for (i in 1:sample.size) { y[i, ] <- ep(mu = pijk[i, ], R = R)$y } base.data <- data.frame(subject = 1:sample.size, x = x) if (ncol(X) > 2) for (i in 2:ncol(X)) colnames(base.data)[i] <- paste("x", i - 1, sep="") lowest <- sample.size%/%group.size group.numb <- rep(1:(lowest + 1), each = group.size) groupn <- group.numb[1:sample.size] num.g <- max(groupn) group.size <- tapply(1:sample.size, groupn, length) ret <- matrix(NA, nrow = sample.size, ncol = nt) gr.data <- base.data for (i in 1:nt) { sens1 <- sens[i] spec1 <- spec[i] trait <- y[, i] save.sum <- tapply(trait, groupn, sum) save.group <- as.vector(ifelse(save.sum > 0, 1, 0)) save.obs <- rep(NA, num.g) for (j in 1:num.g) save.obs[j] <- ifelse(save.group[j] == 1, rbinom(1, 1, sens1), 1 - rbinom(1, 1, spec1)) gres <- rep(save.obs, group.size) gr.data <- data.frame(gr.data, gres) for (j in 1:sample.size) { if (gres[j] == 1) ret[j, i] <- ifelse(trait[j] == 1, rbinom(1, 1, sens.ind), rbinom(1, 1, 1 - spec.ind)) } } gr.data <- data.frame(gr.data, groupn) ind.data <- data.frame(base.data, trait = rep(1:nt, each = sample.size), res = as.vector(y)) ind.data <- ind.data[order(ind.data$subject), ] gr.data <- reshape(gr.data, idvar = "subject", timevar = "trait", varying = list((1 + ncol(X)):(ncol(X) + nt)), v.names = "gres", direction = "long") gr.data <- data.frame(gr.data, retest = as.vector(ret)) list(group = gr.data, individual = ind.data) } ################################################################################# # Examples for a simulated data set (not in the paper) #Simulate data set.seed(461) alpha<-0.6 gee1 <- sim.mt(par=c(-7, -6, 0.13, 0.1), R=matrix(c(1,alpha, alpha,1),nrow=2), sample.size=2000, group.size=5, sens=rep(.95,2), spec=rep(.95,2)) #exchangeable correlation structure, ES algorithm fit1 <- gtreg.mt(formula=gres~factor(trait)+x:factor(trait)-1, data=gee1$group, subject=subject, groupn=groupn, trait=trait, method="ES", sens=rep(.95,2), spec=rep(.95,2), start=c(-7, -6, 0.13, 0.1), corstr="exch") summary(fit1) #exchangeable correlation structure, GEE fit2 <- gtreg.mt(formula=gres~factor(trait)+x:factor(trait)-1, data=gee1$group, subject=subject, groupn=groupn, trait=trait, method="GEE", sens=rep(.95,2), spec=rep(.95,2), start=c(-7, -6, 0.13, 0.1), corstr="exch") summary(fit2)