# Examine models for placekicking data more closely options(width = 70) placekick <- read.csv(file = "C:\\data\\placekick.csv") head(placekick) tail(placekick) ##################################################################### # Aggregate data according to variables we plan to put into the model # Find the observed proportion of successes at each distance w <- aggregate(x = good ~ distance, data = placekick, FUN = sum) n <- aggregate(x = good ~ distance, data = placekick, FUN = length) w.n <- data.frame(distance = w$distance, success = w$good, trials = n$good, prop = round(w$good/n$good,4)) head(w.n) tail(w.n) # Fit model using aggregated data mod.fit.bin <- glm(formula = success/trials ~ distance, weights = trials, family = binomial(link = logit), data = w.n) summary(mod.fit.bin) # Show how to find the hat matrix using matrix algebra X <- model.matrix(mod.fit.bin) # mod.fit.bin$weights is a shortcut for n*hat(pi)*(1-hat(pi)) # Compare: w.n$trials*mod.fit.bin$fitted.values*(1-mod.fit.bin$fitted.values) V <- diag(mod.fit.bin$weights) H <- sqrt(V)%*%X%*%solve(t(X)%*%V%*%X)%*%t(X)%*%sqrt(V) head(diag(H)) head(hatvalues(mod.fit.bin)) # Matches ##################################################################### # Individual diagnostic measures pi.hat <- predict(mod.fit.bin, type = "response") s.res <- rstandard(mod.fit.bin, type = "pearson") # If want Pearson residuals # p.res <- residuals(mod.fit.bin, type = "pearson") prob.smaller <- pbinom(q = w.n$success, size = w.n$trials, prob = pi.hat, lower.tail = TRUE) prob.higher <- pbinom(q = w.n$success, size = w.n$trials, prob = pi.hat, lower.tail = FALSE) + dbinom(x = w.n$success, size = w.n$trials, prob = pi.hat) # Mininum of P(W <= w_m) and P(W >= w_m) tail.prob <- apply(X = cbind(prob.smaller, prob.higher), MARGIN = 1, FUN = min) lin.pred <- mod.fit.bin$linear.predictors w.n <- data.frame(w.n, pi.hat, s.res, tail.prob, lin.pred) round(head(w.n), digits = 3) round(w.n[abs(w.n$s.res) > 2, ], digits = 3) round(w.n[w.n$tail.prob < 0.025, ], digits = 3) #################################################################### # Residual plots dev.new(height = 7, width = 13, pointsize=20) par(mfrow = c(1,3)) # Standardized Pearson residual vs x plot plot(x = w.n$distance, y = w.n$s.res, xlab = "Distance", ylab = "Standardized Pearson residuals", main = "Standardized residuals vs. \n X") abline(h = c(3, 2, 0, -2, -3), lty = 3, col = "blue") # Add loess model to help visualize trend smooth.stand <- loess(formula = s.res ~ distance, data = w.n, weights = trials) # Make sure that loess estimates are ordered by "x" for the plots, so that they are displayed properly order.dist <- order(w.n$distance) lines(x = w.n$distance[order.dist], y = predict(smooth.stand)[order.dist], lty = 3, col = "red", lwd = 3) # Standardized Pearson residual vs pi plot plot(x = w.n$pi.hat, y = w.n$s.res, xlab = "Estimated probability of success", ylab = "Standardized Pearson residuals", main = "Standardized residuals vs. \n pi.hat") abline(h = c(3, 2, 0, -2, -3), lty = "dotted", col = "blue") smooth.stand <- loess(formula = s.res ~ pi.hat, data = w.n, weights = trials) # Make sure that loess estimates are ordered by "X" for the plots, so that they are displayed properly order.pi.hat <- order(w.n$pi.hat) lines(x = w.n$pi.hat[order.pi.hat], y = predict(smooth.stand)[order.pi.hat], lty = 3, col = "red", lwd = 3) # Standardized Pearson residual vs linear predictor plot plot(x = w.n$lin.pred, y = w.n$s.res, xlab = "Linear predictor", ylab = "Standardized Pearson residuals", main = "Standardized residuals vs. \n Linear predictor") abline(h = c(3, 2, 0, -2, -3), lty = "dotted", col = "blue") smooth.stand <- loess(formula = s.res ~ lin.pred, data = w.n, weights = trials) # Make sure that loess estimates are ordered by "X" for the plots, so that they are displayed properly order.lin.pred <- order(w.n$lin.pred) lines(x = w.n$lin.pred[order.lin.pred], y = predict(smooth.stand)[order.lin.pred], lty = 3, col = "red", lwd = 3) # dev.off() # Create plot for book ################################################################################ # Use Examine.logistic.reg.R source(file = "C:\\Rprograms\\Examine.logistic.reg.R") save.info1 <- examine.logistic.reg(mod.fit.obj = mod.fit.bin, identify.points = TRUE) names(save.info1) data.frame(w.n[,-4], prop = round(w.n[,4],2), pi.hat = round(mod.fit.bin$fitted.values,2), std.Pe = round(save.info1$stand.resid,2), tail.p = round(save.info1$tail.prob,2), dXsq = round(save.info1$deltaXsq,2), cook = round(save.info1$cookd,2)) save.info2 <- examine.logistic.reg(mod.fit.obj = mod.fit.bin, identify.points = TRUE, scale.n = sqrt, scale.cookd = sqrt) one.fourth.root <- function(x) { x^0.25 } one.fourth.root(16) # Example save.info3 <- examine.logistic.reg(mod.fit.obj = mod.fit.bin, identify.points = TRUE, scale.n = one.fourth.root, scale.cookd = one.fourth.root) # Now include distance and PAT w2 <- aggregate(x = good ~ distance + PAT, data = placekick, FUN = sum) n2 <- aggregate(x = good ~ distance + PAT, data = placekick, FUN = length) w.n2 <- data.frame(distance = w2$distance, PAT = w2$PAT, success = w2$good, trials = n2$good, prop = round(w2$good/n2$good,4)) head(w.n2) tail(w.n2) mod.fit.bin2 <- glm(formula = success/trials ~ distance + PAT, weights = trials, family = binomial(link = logit), data = w.n2) summary(mod.fit.bin2) # Note that this function will not work because the 44th EVP is the only one with # PAT = 1. The standardized residual value is given as "NaN" due to the perfect fit save.info.dist.PAT <- examine.logistic.reg(mod.fit.obj = mod.fit.bin2, identify.points = TRUE) #