#AC data ################################################################################ # Initial look at data y <- c(3,5,7,18,43,85,91,98,100,130,230,487) t <- mean(y) t #EDF par(pty = "s", mfrow = c(1,2), lend = "square") plot.ecdf(x = y, verticals = TRUE, do.p = FALSE, main = "EDF for AC failure times", lwd = 2, xlim = c(0,600), panel.first = grid(), ylab = expression(hat(F)), xlab = "y") #Notice the "rate" parameter is 1/scale parameter - I am using the MLE (sample mean) in it curve(expr = pexp(q = x, rate = 1/t), xlim = c(0, 600), col = "red", add = TRUE) #QQ-Plot exp.quant <- qexp(p = seq(from = 1/(length(y)+1), to = 1-1/(length(y)+1), by = 1/(length(y)+1)), rate = 1/t) plot(y = sort(y), x = exp.quant, main = "QQ-Plot for AC failure times", ylab = "y", xlab = "Exp. quantiles", panel.first = grid(), ylim = c(0,600)) data.frame(exp.quant, y) abline(a = 0, b = 1, col = "red") ################################################################################ # Method #1: Take resamples and calculate interval #One resample set.seed(8912) y.star <- sample(x = y, replace = TRUE) y.star table(y.star) mean(y.star) # t* #Another way to take a resample set.seed(8912) index.star <- sample(x = 1:12, replace = TRUE) y[index.star] #Large number of resamples and calculate t* for each R <- 4999 t.star <- numeric(R) set.seed(8912) for (r in 1:R) { y.star <- sample(x = y, replace = TRUE) t.star[r] <- mean(y.star) } #EDF par(pty = "s", mfrow = c(1,2), lend = "square") #par(mar = c(5, 5, 4, 2) + 0.1) plot.ecdf(x = t.star - t, verticals = TRUE, do.p = FALSE, main = "EDF for t* - t", lwd = 2, panel.first = grid(), ylab = "EDF", xlab = "t* - t") hist(x = t.star - t, xlab = "t* - t", main = "Histogram for t* - t") alpha <- 0.025 order.val <- c((R+1)*alpha, (R+1)*(1-alpha)) sort(t.star)[order.val] quantile(x = t.star, type = 1, probs = c(alpha, 1-alpha)) #type = 1 is inverse of EDF #(1-2*alpha)100% basic bootstrap interval low <- 2*t - sort(t.star)[(R+1)*(1-alpha)] up <- 2*t - sort(t.star)[(R+1)*alpha] data.frame(low, up) ################################################################################ # Method #2: Take resamples and calculate interval library(boot) #Function for the statistic # First element is the original or resampled data. # Second element represents the indices of the data. For example, the indices will be 1:length(y) for the observed # data. calc.t <- function(data, i) { d <- data[i] mean(d) } #Try it calc.t(data = y, i = 1:length(y)) #Try it again set.seed(8912) calc.t(data = y, i = sample(x = 1:length(y), size = 12, replace = TRUE)) #Do bootstrap set.seed(8912) boot.res <- boot(data = y, statistic = calc.t, R = 4999, sim = "ordinary") boot.res names(boot.res) boot.res$t0 # t head(boot.res$t) # t* boot.res$statistic class(boot.res) methods(class = boot) plot(boot.res) #List of indices from the resamples options(width = 60) save.ind <- boot.array(boot.out = boot.res, indices = TRUE) head(save.ind) #CI - Different resamples were taken than with my for() function # implementation, so that's why the interval is different than earlier. boot.ci(boot.out = boot.res, conf = 0.95, type = "basic") ################################################################################ # Usual t-distribution based interval n <- length(y) alpha <- 0.025 mean(y) + qt(p = c(alpha, 1 - alpha), df = n - 1) * sd(y)/sqrt(n) ################################################################################ # Bias and variance #Bias-corrected estimate b <- mean(boot.res$t[,1]) - boot.res$t0 # mean(boot.res$t) also works for first part because only one column b boot.res$t0 - b #bootstrap estimated variance var(boot.res$t[,1]) sd(boot.res$t[,1]) # s/sqrt(n) sd(y)/sqrt(length(y)) #s^2 / n var(y)/length(y) #jackknife l.jack <- empinf(data = y, statistic = calc.t, stype = "i", type = "jack") var.linear(l.jack) sum(l.jack^2)/length(y)^2 #regression estimates - Equation 2.46 in Davison and Hinkley (1996) l.reg <- empinf(boot.out = boot.res) var.linear(l.reg) ################################################################################ # Studentized interval ################## #Jackknife version calc.t2 <- function(data, i) { d <- data[i] mean(d) } calc.t <- function(data, i) { d <- data[i] l.jack <- empinf(data = d, statistic = calc.t2, stype = "i", type = "jack") v.jack <- var.linear(L = l.jack) c(mean(d), v.jack) } #Try it calc.t(data = y, i = 1:length(y)) #Do bootstrap set.seed(8912) boot.res <- boot(data = y, statistic = calc.t, R = 4999, sim = "ordinary") boot.res head(boot.res$t) boot.res$t0 z.star <- (boot.res$t[,1] - boot.res$t0[1])/sqrt(boot.res$t[,2]) alpha <- 0.025 z.quant <- quantile(x = z.star, type = 1, probs = c(alpha, 1-alpha)) z.quant qt(p = c(alpha, 1 - alpha), df = length(y) - 1) low <- boot.res$t0[1] - z.quant[2]*sqrt(boot.res$t0[2]) up <- boot.res$t0[1] - z.quant[1]*sqrt(boot.res$t0[2]) data.frame(low, up) boot.ci(boot.out = boot.res, conf = 0.95, type = "stud") #Safer way to use boot.ci() boot.ci(boot.out = boot.res, conf = 0.95, type = "stud", var.t0 = boot.res$t0[2], var.t = boot.res$t[,2], t0 = boot.res$t0[1], t = boot.res$t[,1]) #Plot Z* and t-distribution par(pty = "s", mfrow = c(1,2), lend = "square") #par(mar = c(5, 5, 4, 2) + 0.1) plot.ecdf(x = z.star, verticals = TRUE, do.p = FALSE, main = "EDF for z*", lwd = 2, panel.first = grid(), ylab = "EDF", xlab = "z*") curve(expr = pt(q = x, df = length(y) - 1), xlim = c(min(z.star), max(z.star)), col = "red", add = TRUE) hist(x = z.star, xlab = "z*", main = "Histogram for z*", freq = FALSE, ylim = c(0, 0.3+0.1)) curve(expr = dt(x = x, df = length(y) - 1), xlim = c(min(z.star), max(z.star)), col = "red", add = TRUE) #################### #Double boot version calc.t2<-function(data, i) { d2 <- data[i] mean(d2) } calc.t<-function(data, i, M) { d <- data[i] boot.res.M <- boot(data = d, statistic = calc.t2, R = M, sim = "ordinary") v.boot <- var(boot.res.M$t) c(mean(d), v.boot) } #Testing calc.t(data = y, i = 1:length(y), M = 999) #Find start time start.time <- proc.time() set.seed(8912) boot.res2 <- boot(data = y, statistic = calc.t, R = 4999, sim="ordinary", M = 999) end.time<-proc.time() save.time<-end.time-start.time cat("\n Number of minutes running:", save.time[3]/60, "\n \n") boot.res2 z.star2 <- (boot.res2$t[,1] - boot.res2$t0[1])/sqrt(boot.res2$t[,2]) z.quant2 <- quantile(x = z.star2, type = 1, probs = c(alpha, 1-alpha)) z.quant2 boot.ci(boot.out = boot.res2, conf = 0.95, type = "stud") #Examine the standard deviations produced by the double boot par(mfrow = c(1,1)) summary(sqrt(boot.res$t[,2])) hist(x = sqrt(boot.res$t[,2]), main = "S.D. produced by double bootstrap", xlab = "sqrt(v*)") abline(h=0) ################################################################################ # Percentile interval boot.ci(boot.out = boot.res, conf = 0.95, type = c("basic", "perc")) alpha <- 0.025 quantile(x = t.star, type = 1, probs = c(alpha, 1-alpha)) #type = 1 is inverse of EDF order.val <- c((R+1)*alpha, (R+1)*(1-alpha)) sort(t.star)[order.val] ################################################################################ # BCa interval #BCa boot.ci(boot.out = boot.res, conf = 0.95, type = "bca") #Empirical influence values estimated by the jackknife l.jack <- empinf(data = y, statistic = calc.t, stype = "i", type = "jack") # l.reg <- empinf(boot.out = boot.res) #Boot estimated # l.j <- y - mean(y) #Actual values #Acceleration a.jack < -1/6 * sum(l.jack^3)/sum(l.jack^2)^(3/2) # a.reg<-1/6 * sum(l.reg^3)/sum(l.reg^2)^(3/2) # a<-1/6 * sum(l.j^3)/sum(l.j^2)^(3/2) #Bias correction sum(boot.res$t[,1]<=boot.res$t0[1])/(R+1) sum(boot.res$t[,1]<=boot.res$t0[1])/R w <- qnorm(p = sum(boot.res$t[,1]<=boot.res$t0[1])/(R+1)) #This is how Davison and Hinkley (1997) does the calculation w <- qnorm(p = sum(boot.res$t[,1]<=boot.res$t0[1])/R) #This is how bca.ci() does the calculation w #BCa calculations Davison and Hinkley (1997, Table 5.4) alpha <- c(0.025, 0.975) z.tilde <- w + qnorm(p = alpha) alpha.tilde <- pnorm(q = w + z.tilde/(1-a.jack*z.tilde)) d <- (R+1)*alpha.tilde d #Note that r will need to be an integer or the quantile function can be used as below #quantile(x = boot.res$t[,1], probs = alpha.tilde, type = 1) limit.ceil <- sort(boot.res$t[,1])[ceiling(d)] limit.floor <- sort(boot.res$t[,1])[floor(d)] #Alternatively, interpolation could be used as outlined in Davison and Hinkley (1997, p. 195) data.frame(alpha, z.tilde, alpha.tilde, d, limit.ceil, limit.floor) ################################################################################ # Additional aspects of boot.ci() #All intervals boot.ci(boot.out = boot.res2, conf = 0.95, type = "all") lower <- t - b - qnorm(0.975, mean = 0, sd = 1)*sd(boot.res$t[,1]) upper <- t - b - qnorm(0.025, mean = 0, sd = 1)*sd(boot.res$t[,1]) data.frame(name = "Normal interval calc in boot.ci", lower, upper) #Basic and studentized using log transformation hdot.func <- function(u) { 1/u } save.tran.ci <- boot.ci(boot.out = boot.res2, conf = 0.95, type = c("basic", "stud"), h = log, hinv = exp, hdot = hdot.func) save.tran.ci #