##################################################################### # NAME: Chris Bilder # # DATE: 9-1-11 # # Purpose: Logistic regression with the placekick data set # # # # NOTES: # ##################################################################### ##################################################################### # Reading the data placekick<-read.csv("C:\\chris\\unl\\Dropbox\\NEW\\workshop\\Gallup\\placekick.csv") head(placekick) ##################################################################### # Fitting the model #Estimate the model with the raw data mod.fit<-glm(formula = good ~ change + pat + wind + distance + distance:wind, data = placekick, family = binomial(link = logit)) summary(mod.fit) names(mod.fit) #Equiavlent to the above mod.fit2<-glm(formula = good ~ change + pat + wind + distance*wind, data = placekick, family = binomial(link = logit)) mod.fit2$coefficients #Equiavlent to the above mod.fit3<-glm(formula = good ~ change + pat + wind + (distance + wind)^2, data = placekick, family = binomial(link = logit)) mod.fit3$coefficients #Convert data to a binomial form set1<-aggregate(formula = good ~ change + pat + wind + distance, data = placekick, FUN = sum) head(set1) set2<-aggregate(formula = good ~ change + pat + wind + distance, data = placekick, FUN = length) head(set2) placekick.bin<-data.frame(set1[,-5], success = set1$good, trials = set2$good, proportion = round(set1$good/set2$good, 4)) head(placekick.bin) #Estimate the model with the binomial form of the data mod.fit.bin<-glm(formula = success/trials ~ change + pat + wind + distance + distance:wind, data = placekick.bin, weight = trials, family = binomial(link = logit)) summary(mod.fit.bin) #################################################################### #Estimating the response #Estimate P(success) for change = 1, pat = 0, wind = 0, distance = 30 beta.hat<-mod.fit.bin$coefficients beta.hat exp(beta.hat[1] + beta.hat[2]*1 + beta.hat[5]*30) / (1 + exp(beta.hat[1] + beta.hat[2]*1 + beta.hat[5]*30)) plogis(q = beta.hat[1] + beta.hat[2]*1 + beta.hat[5]*30) as.numeric(plogis(q = beta.hat[1] + beta.hat[2]*1 + beta.hat[5]*30)) #Removes label predict(object = mod.fit.bin, newdata = data.frame(change = 1, pat = 0, wind = 0, distance = 30), type = "response") save.lp<-predict(object = mod.fit.bin, newdata = data.frame(change = 1, pat = 0, wind = 0, distance = 30), type = "link") save.lp plogis(q = save.lp) #Confidence interval save.lp<-predict(object = mod.fit.bin, newdata = data.frame(change = 1, pat = 0, wind = 0, distance = 30), type = "link", se = TRUE) save.lp alpha<-0.05 lower.lp<-save.lp$fit-qnorm(p = 1-alpha/2)*save.lp$se.fit upper.lp<-save.lp$fit+qnorm(p = 1-alpha/2)*save.lp$se.fit lower.pi<-plogis(q = lower.lp) upper.pi<-plogis(q = upper.lp) data.frame(lower.pi, upper.pi) #Function for the confidence interval ci.pi<-function(newdata, mod.fit.obj, alpha){ save.lp<-predict(object = mod.fit.obj, newdata = newdata, type = "link", se = TRUE) lower.lp<-save.lp$fit-qnorm(1-alpha/2)*save.lp$se.fit upper.lp<-save.lp$fit+qnorm(1-alpha/2)*save.lp$se.fit lower.pi<-plogis(q = lower.lp) upper.pi<-plogis(q = upper.lp) list(pi.hat = plogis(save.lp$fit), lower = lower.pi, upper = upper.pi) } ci.pi(newdata = data.frame(change = 1, pat = 0, wind = 0, distance = c(30, 40)), mod.fit.obj = mod.fit.bin, alpha = 0.05) #Dummy plot of the estimated proportion of success at each distance plot(x = placekick.bin$distance, y = placekick.bin$proportion, xlab="Distance in Yards", ylab="Estimated Probability of Success", type="n", panel.first=grid(col = "gray", lty = "dotted"), main = "Estimated probability of success of a field goal (PAT=0)") #Put estimated logistic regression model on the plot - change=0, wind=0 curve(expr = plogis(beta.hat[1] + beta.hat[5]*x), lwd=2, col = "red", add = TRUE, n = 1000) #Another way to do the same curve as above #curve(expr = predict(object = mod.fit.bin, newdata = data.frame(change = 1, pat = 0, wind = 0, distance = x), type = "response"), # lwd=2, col = "red", add = TRUE, n = 1000) #Put estimated logistic regression model on the plot - change=1, wind=0 curve(expr = plogis(beta.hat[1] + beta.hat[2]*1 + beta.hat[5]*x), lty=3, lwd=2, col = "green", add = TRUE) #Put estimated logistic regression model on the plot - change=0, wind=1 curve(expr = plogis(beta.hat[1] + beta.hat[4]*1 + beta.hat[5]*x + beta.hat[6]*1*x), lty=4, lwd=2, col = "blue", add = TRUE) #Put estimated logistic regression model on the plot - change=1, wind=1 curve(expr = plogis(beta.hat[1] + beta.hat[2]*1 + beta.hat[4]*1 + beta.hat[5]*x + beta.hat[6]*1*x), lty=2, lwd=2, col = "purple", add = TRUE) names1<-c("Change=0, Wind=0", "Change=1, Wind=0", "Change=0, Wind=1", "Change=1, Wind=1") legend(locator(1), legend = names1, lty = c(1,3,4,2), col = c("red","green","blue","purple"), bty="n", cex=0.75, lwd=2) ############################################### #Another plot with the confidence iterval bands plot(x = placekick.bin$distance, y = placekick.bin$proportion, xlab="Distance in Yards", ylab="Estimated Probability of Success", type="n", panel.first=grid(col = "gray", lty = "dotted"), main = "Estimated probability of success of a field goal (PAT=0)") #Change=0 and Wind=0 curve(expr = ci.pi(newdata = data.frame(change = 0, pat = 0, wind = 0, distance = x), mod.fit.obj = mod.fit.bin, alpha = 0.10)$pi.hat, lty = 1, lwd = 2, col = "red", add = TRUE) curve(expr = ci.pi(newdata = data.frame(change = 0, pat = 0, wind = 0, distance = x), mod.fit.obj = mod.fit.bin, alpha = 0.10)$lower, lty = 3, lwd = 2, col = "red", add = TRUE) curve(expr = ci.pi(newdata = data.frame(change = 0, pat = 0, wind = 0, distance = x), mod.fit.obj = mod.fit.bin, alpha = 0.10)$upper, lty = 3, lwd = 2, col = "red", add = TRUE) #Change=1 and Wind=1 curve(expr = ci.pi(newdata = data.frame(change = 1, pat = 0, wind = 1, distance = x), mod.fit.obj = mod.fit.bin, alpha = 0.10)$pi.hat, lty = 2, lwd = 2, col = "purple", add = TRUE) curve(expr = ci.pi(newdata = data.frame(change = 1, pat = 0, wind = 1, distance = x), mod.fit.obj = mod.fit.bin, alpha = 0.10)$lower, lty = 3, lwd = 2, col = "purple", add = TRUE) curve(expr = ci.pi(newdata = data.frame(change = 1, pat = 0, wind = 1, distance = x), mod.fit.obj = mod.fit.bin, alpha = 0.10)$upper, lty = 3, lwd = 2, col = "purple", add = TRUE) text(x = 25, y = 0.4, "Lowest Number of Risk Factors", cex=0.75) names1<-c("Estimated Probability", "90% Confidence Interval") legend(x = 17, y = 0.38, legend = names1, lty = c(1,3), col = c("red","red"), bty = "n", lwd = 2, cex = 0.75) text(x = 25, y = 0.25, "Highest Number of Risk Factors", cex=0.75) names1<-c("Estimated Probability", "90% Confidence Interval") legend(x = 17, y = 0.23, legend =names1, lty = c(2,3), col = c("purple","purple"), bty = "n", lwd = 2, cex = 0.75) ##################################################################### # Object oriented language class(mod.fit.bin) methods(class = glm) #Method functions associated with glm() vcov(object = mod.fit.bin) confint(object = mod.fit.bin, level = 0.95) head(residuals(object = mod.fit.bin)) anova(object = mod.fit.bin, test = "Chisq") deviance(object = mod.fit.bin) head(rstudent(model = mod.fit.bin)) influence(model = mod.fit.bin) plot(x = mod.fit) #Example with the Anova function from the car package library(package = car) Anova(mod = mod.fit, test = "LR") #Example of testing a full model vs. reduced model mod.fit.bin.reduced<-glm(formula = success/trials ~ change + pat, data = placekick.bin, weight = trials, family = binomial(link = logit)) anova(mod.fit.bin.reduced, mod.fit.bin, test = "Chisq") ##################################################################### # Writing your own functions #The examine.model() function is in this program: source("C:\\chris\\unl\\Dropbox\\NEW\\workshop\\Gallup\\examine.model.logistic.reg.R") #Diagnostics for model save.it<-examine.model(mod.fit.obj = mod.fit.bin) names(save.it) #