######################################################################
# NAME:  Chris Bilder                                                #
# DATE:  8-15-06                                                     #
# UPDATE:                                                            #
# Purpose: Automate part of the linear regression diagnostic         #
#          procedures into one function.                             #
#                                                                    #
# NOTES: Based on examine.mod.multiple.R used in Chapter 6.          #
#        This function needs to be run just once before it is used.  #
#        One needs to call the function with model fit object and    #
#        first.order (how many "first order" model terms).           #
#        For example,                                                #
#                                                                    #
#  examine.mod.multiple.final(mod.fit.obj = mod.fit, first.order = 4)#
#                                                                    #
#        The first.order requirement tells R the number of predictor #
#        variables there are before examining squared terms,         #
#        interactions, ... .  These predictor variables need to be   #
#        given FIRST in the formula statement when fitting the model.#
#                                                                    #
#        The seed for jittering can be changed with the              #
#        seed option.  The Levene and Breusch-Pagan tests can be     #
#        performed if the var.test option is changed to TRUE.  Note  #
#        that the car and lmtest packages need to be loaded for these#
#        tests.  The Y optional values can be used if a              #
#        transformation of Y was used for the model.  For            #
#        example, if log(Y) was used, then fit the model with log(Y) #
#        and include the Y = log(original Y variable) when invoking  #
#        this function.  Unlike examine.mod.simple.R, there is not a #
#        similar X option.  The boxcox.find = TRUE option produces   #
#        an estimate for lambda.                                     #
#                                                                    #
######################################################################

examine.mod.multiple.final<-function(mod.fit.obj, first.order, seed = 9180, const.var.test = FALSE, Y = NULL, 
                                     boxcox.find = FALSE, labels = 1:n) {

  n<-length(mod.fit.obj$residuals)
  p<-length(mod.fit.obj$coefficients)
  
  #X<-mod.fit.obj$model[,2:(2+first.order-1)]
  X<-as.matrix(mod.fit.obj$model[,2:(2+first.order-1)]) 
  #Need as.matrix() since first.order may be 1, which would cause problems in boxplot(x = X[,j], ...) part of code
  #  If first.order = 1, R will not know how to interpret X[,j] since it will be of a vector, not matrix, class type
  
  if (is.null(Y)) { Y<-mod.fit.obj$model[,1]}
 
  r.i<-rstandard(model = mod.fit.obj)
  t.i<-rstudent(model = mod.fit.obj)
 
   
  ##############################################################################
  #Graph window set #1
  
    #Open a new plotting window 1x2
    win.graph(width = 6, height = 6, pointsize = 10)
    par(mfrow = c(1,2)) 
    
    set.seed(seed) 
    boxplot(x = Y, col = "lightblue", main = "Box plot", ylab = "Response variable", xlab = " ")                   
    stripchart(x = Y, method = "jitter", vertical = TRUE, pch = 1, main = "Dot plot", ylab = "Response variable")  
      
      
    counter<-1 #Need for opening new plot windows below
    win.graph(width = 6, height = 6, pointsize = 10)
    par(mfrow = c(2,2)) 

    #X.ij box and dot plots
    for(j in 1:first.order) {
 
      if (counter/2 > 1) { 
         win.graph(width = 6, height = 6, pointsize = 10)
         par(mfrow = c(2,2))
         counter<-1  

      }
       
      boxplot(x = X[,j], col = "lightblue", main = "Box plot", ylab = paste("Predictor variable", j), xlab = " ")                 
      stripchart(x = X[,j], method = "jitter", vertical = TRUE, pch = 1, main = "Dot plot", ylab = paste("Predictor variable", j)) 

      counter<-counter + 1
    }
    
    #Initial summary statistics
    summary.data<-summary(data.frame(Y,X))
  
  
  ##############################################################################
  #Graph window set #2
  
    #Open a new plotting window 1x1
    win.graph(width = 6, height = 6, pointsize = 10)
    par(mfrow = c(1,1), pty = "m") 
 
    #e.i vs. Y.hat.i
    plot(x = mod.fit.obj$fitted.values, y = mod.fit.obj$residuals, xlab = "Estimated mean response", ylab = "Residuals", 
         main = "Residuals vs. estimated mean response", panel.first = grid(col = "gray", lty = "dotted"))
    abline(h = 0, col = "darkgreen")


    #Open a new plotting window 1x2
    win.graph(width = 8, height = 6, pointsize = 10)
    par(mfrow = c(1,2), pty = "s") 

    #r.i vs. Y.hat.i
    plot(x = mod.fit.obj$fitted.values, y = r.i, xlab = "Estimated mean response", 
        ylab = "Studentized residuals", main = expression(paste(r[i], " vs. estimated mean response")), 
        panel.first = grid(col = "gray", lty = "dotted"),
        ylim = c(min(qt(p = 0.05/(2*n), df = mod.fit.obj$df.residual), min(r.i)), 
                 max(qt(p = 1-0.05/(2*n), df = mod.fit.obj$df.residual), max(r.i))))
    abline(h = 0, col = "darkgreen")
    abline(h = c(qt(p = 0.01/2, df = mod.fit.obj$df.residual),qt(p = 1-0.01/2, df = mod.fit.obj$df.residual)), 
           col = "red", lwd = 2)
    abline(h = c(qt(p = 0.05/(2*n), df = mod.fit.obj$df.residual),qt(p = 1-0.05/(2*n), df = mod.fit.obj$df.residual)), 
           col = "darkred", lwd = 2)
    identify(x = mod.fit.obj$fitted.values, y = r.i, labels = labels)
        
 
    #t.i vs. Y.hat.i
    plot(x = mod.fit.obj$fitted.values, y = t.i, xlab = "Estimated mean response", 
        ylab = "Studentized deleted residuals", main = expression(paste(t[i], " vs. estimated mean response")), 
        panel.first = grid(col = "gray", lty = "dotted"),
        ylim = c(min(qt(p = 0.05/(2*n), df = mod.fit.obj$df.residual-1), min(t.i)), 
                 max(qt(p = 1-0.05/(2*n), df = mod.fit.obj$df.residual-1), max(t.i))))
    abline(h = 0, col = "darkgreen")
    abline(h = c(qt(p = 0.01/2, df = mod.fit.obj$df.residual-1),qt(p = 1-0.01/2, df = mod.fit.obj$df.residual-1)), col = "red", lwd = 2)
    abline(h = c(qt(p = 0.05/(2*n), df = mod.fit.obj$df.residual-1),qt(p = 1-0.05/(2*n), df = mod.fit.obj$df.residual-1)), col = "darkred", lwd = 2)
    identify(x = mod.fit.obj$fitted.values, y = t.i, labels = labels)

    #Fox's outlier.test() function gives the Bonferroni adjusted p-value for the largest in absolute value 
    #  studentized deleted residual; see p. 194 of Fox
    library(car)
    outlier.res<-outlierTest(mod.fit.obj)
    

  ##############################################################################
  #Graph window set #3 - could also do partial regression plots: cr.plots(model = mod.fit.obj, ask=FALSE, span=span)  

    win.graph(width = 6, height = 6, pointsize = 10)
    par(mfrow = c(2,2), pty = "m") 
    counter<-1 #Need for opening new plot windows below

    #e.i vs. X.ij
    for(j in 1:first.order) {
  
      if (counter/4 > 1) { 
         win.graph(width = 6, height = 6, pointsize = 10)
         par(mfrow = c(2,2))
         counter<-1  #Set back to 0 (Will be 1 after counter<-counter+1
      }
        
      plot(x = X[,j], y = mod.fit.obj$residuals, ylab = "Residuals", main = paste("Residuals vs. predictor",j), 
          xlab = paste("Predictor variable",j), panel.first = grid(col = "gray", lty = "dotted"))
      abline(h = 0, col = "darkgreen")
      
     counter<-counter + 1
    }



  ##############################################################################
  #Graph window set #4

    #Open a new plotting window 2x2
    win.graph(width = 6, height = 6, pointsize = 10)
    par(mfrow = c(2,2)) 

    #(1,1) - e.i vs. obs. number
    plot(x = 1:n,, y = mod.fit.obj$residuals, xlab = "Observation number", ylab = "Residuals", type = "o",
         main = "Residuals vs. observation number", panel.first = grid(col = "gray", lty = "dotted"))
    abline(h = 0, col = "darkgreen")
    
    #(1,2) - Histogram of the residuals with normal distribution overlay
    hist(x = mod.fit.obj$residuals, main = "Histogram of residuals", xlab = "Residuals",
         freq = FALSE)
    curve(expr = dnorm(x, mean = mean(mod.fit.obj$residuals), sd = sd(mod.fit.obj$residuals)), col = "red", add = TRUE)

    #(2,1) - QQ-plot done by R
    qqnorm(y = mod.fit.obj$residuals, ylab = "Residuals", panel.first = grid(col = "gray", lty = "dotted"))
    qqline(y = mod.fit.obj$residuals, col = "red")
 
 
  ##############################################################################
  #Graph window set #5

    #Open a new plotting window 2x2
    win.graph(width = 8, height = 6, pointsize = 10)
    par(mfrow = c(2,1))   
    
    #(1,1) - DFFITS vs. observation number
    dffits.i<-dffits(model = mod.fit.obj)
    plot(x = 1:n, y = dffits.i, xlab = "Observation number", ylab = "DFFITS", main = "DFFITS vs. observation number", 
         panel.first = grid(col = "gray", lty = "dotted"), 
         ylim = c(min(-1, -2*sqrt(p/n), min(dffits.i)), max(1, 2*sqrt(p/n), max(dffits.i))))
    abline(h = 0, col = "darkgreen")
    abline(h = c(-2*sqrt(p/n), 2*sqrt(p/n)), col = "red", lwd = 2)
    abline(h = c(-1,1), col = "darkred", lwd = 2)
    identify(x = 1:n, y = dffits.i, labels = labels)
    
    #(1,2) - Cook's distance vs. observation number
    cook.i<-cooks.distance(model = mod.fit.obj)
    plot(x = 1:n, y = cook.i, xlab = "Observation number", ylab = "Cook's D", main = "Cook's D vs. observation number", 
         panel.first = grid(col = "gray", lty = "dotted"), 
         ylim = c(0, max(qf(p=0.5,df1=p, df2=mod.fit.obj$df.residual), cook.i)))
    abline(h = 0, col = "darkgreen")
    abline(h = qf(p=0.5,df1=p, df2=mod.fit.obj$df.residual), col = "red", lwd = 2)
    identify(x = 1:n, y = cook.i, labels = labels)


    #Open a new plotting window 2x2
    win.graph(width = 8, height = 6, pointsize = 10)
    par(mfrow = c(2,2))   
    counter<-1 #Need for opening new plot windows below

    #DFBETAS vs. observation number
    dfbeta.all<-dfbetas(model = mod.fit.obj) 
    for(j in 1:(p-1)) {
    
      if (counter/4 > 1) { 
         win.graph(width = 8, height = 6, pointsize = 10)
         par(mfrow = c(2,2))
         counter<-0  #Set back to 0 (Will be 1 after counter<-counter+1
      }
  
      plot(x = 1:n, y = dfbeta.all[,1+j], xlab = "Observation number", ylab = "DFBETAS", 
           main = paste("DFBETAS for term", j, "vs. observation number"), 
           panel.first = grid(col = "gray", lty = "dotted"), 
           ylim = c(min(-1, -2/sqrt(n), min(dfbeta.all[,1+j])), max(1, 2/sqrt(n), max(dfbeta.all[,1+j]))))
      abline(h = 0, col = "darkgreen")
      abline(h = c(-2/sqrt(n), 2/sqrt(n)), col = "red", lwd = 2)
      abline(h = c(-1,1), col = "darkred", lwd = 2)
      identify(x = 1:n, y = dfbeta.all[,1+j], labels = labels)
      
      counter<-counter + 1
    }
  
  #Only need to do the VIF if more than 1 predictor variable
  if (first.order > 1) { vif1<-vif(mod.fit.obj)}
  else { vif1<-NA }
  
 
  #Save all results so far
  save.res<-list(sum.data = summary.data, dffits = round(dffits.i,2), cook = round(cook.i,2), 
                 studentized = r.i, stud.del = t.i, dfbetas = dfbeta.all, outlier.test = outlier.res,
                 vif = vif1)
 

 
  ##############################################################################
  #Levene and BP tests - Examine the residuals for normality
  
  if (const.var.test) {
  
    library(car)     #The Levene's Test function is in the package for Fox's book
    
    save.levene<-matrix(NA, nrow = first.order, ncol = 3)
    for(j in 1:first.order) {
    
      group<-ifelse(X[,j] < median(X[,j]), 1, 2)
      save.levene.test<-leveneTest(y = mod.fit.obj$residuals, group = factor(group))
      save.levene[j,]<-c(j, round(save.levene.test$"F value"[1],4), save.levene.test$"Pr(>F)"[1])
 
    } 
 
    library(lmtest)  #Location of BP test function
    save.bp<-bptest(mod.fit.obj,  studentize = FALSE)  #KNN Version of the test
    
    save.res$levene<-save.levene
    save.res$bp<-save.bp
    
  }
  
  
  ##############################################################################
  #Box-cox transformation
  
  if (boxcox.find) {
  
    library(MASS)     #Function is in the MASS library
    save.bc<-boxcox(object = mod.fit.obj, lambda = seq(from = -2, to = 2, by = 0.01))
    title(main = "Box-Cox transformation plot")
    lambda.hat<-save.bc$x[save.bc$y == max(save.bc$y)] 
    save.res$lambda.hat<-lambda.hat
    
  }
  
  ##############################################################################
  #Return results
  
    save.res

}