############################################################################
# NAME:  Chris Bilder                                                      #
# DATE:  12-20-03, 12-10-05, 7-16-06                                       #
# PURPOSE: Chapter 1 example with the GPA data set                         #
#                                                                          #
# NOTES: 1)                                                                #
#                                                                          #
############################################################################

#Read in the data
gpa<-read.table(file = "C:\\chris\\UNL\\STAT870\\Chapter1\\gpa.txt", header=TRUE, sep = "")

#gpa.csv<-read.csv(file = "C:\\chris\\UNL\\Dropbox\\NEW\\STAT870\\Chapter1\\gpa.csv")

#Print data set
gpa

#Print one variable
gpa$HS.GPA
gpa[,1]

#Summary statistics for variables
summary(gpa)

#Simple scatter plot
plot(x = gpa$HS.GPA, y = gpa$College.GPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA", 
     xlim = c(0,4.5), ylim = c(0,4.5), col = "red", pch = 1, cex = 1.0, panel.first = grid(col = "gray", lty = "dotted"))



#Read in an Excel version of the file
library(RODBC)
z<-odbcConnectExcel("C:\\chris\\UNL\\STAT870\\R_intro\\gpa.xls")
gpa.excel<-sqlFetch(z, "sheet1")
close(z)
gpa.excel


########################################################################
# Find estimated simple linear regression model (sample model)
     
  #Fit the simple linear regression model and save the results in mod.fit
  mod.fit<-lm(formula = College.GPA ~ HS.GPA, data = gpa)

  #A very brief look of what is inside of mod.fit - see the summary function for a better way
  mod.fit

  #See the names of all of the object components
  names(mod.fit)
  mod.fit$coefficients 
  mod.fit$residuals

  #Put some of the components into a data.frame object
  save.fit<-data.frame(gpa, College.GPA.hat = round(mod.fit$fitted.values,2), residuals = round(mod.fit$residuals,2))

  #Print contents save.fit 
  save.fit

  #Summarize the information stored in mod.fit
  summary(mod.fit)
  
  
  #Prediction - note that the actual function used here is predict.lm()
  predict(object = mod.fit)
  predict.lm(object = mod.fit)
  new.data<-data.frame(HS.GPA = c(2,3))
  save.pred<-predict(object = mod.fit, newdata = new.data)
  round(save.pred,2)
   

  #Other ways to find MSE:
  
    #Method #1
    names(mod.fit)
    mod.fit$residuals
    sum(mod.fit$residuals^2)/mod.fit$df.residual

    #Method #2
    summary.fit<-summary(mod.fit)
    names(summary.fit)
    summary.fit$sigma
    summary.fit$sigma^2



########################################################################
#Put sample model on plot

  #Open a new graphics window
  win.graph(width = 6, height = 6, pointsize = 10)

  #Same scatter plot as before
  plot(x = gpa$HS.GPA, y = gpa$College.GPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA", 
       xlim = c(0,4.5), ylim = c(0,4.5), col = "red", pch = 1, cex = 1.0, panel.first=grid(col = "gray", lty = "dotted"))

  #Puts the line y = a + bx on the plot
  abline(a = mod.fit$coefficients[1], b = mod.fit$coefficients[2], lty = 1, col = "blue", lwd = 2)


  #Notice the above line goes outside of the range of the x-values.  To prevent this, we can use the segments function
  #Open a new graphics window - do not need to
  win.graph(width = 6, height = 6, pointsize = 10)
 
  #Same scatter plot as before
  plot(x = gpa$HS.GPA, y = gpa$College.GPA, xlab = "HS GPA", ylab = "College GPA", main = "College GPA vs. HS GPA", 
       xlim = c(0,4.5), ylim = c(0,4.5), col = "red", pch = 1, cex = 1.0, panel.first=grid(col = "gray", lty = "dotted"))

  #Draw a line from (x0, y0) to (x1, y1)
  segments(x0 = min(gpa$HS.GPA), y0 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*min(gpa$HS.GPA), 
           x1 = max(gpa$HS.GPA), y1 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*max(gpa$HS.GPA),
           lty = "solid", col = "blue", lwd = 2)
  #Another way
  #curve(expr = mod.fit$coefficients[1] +  mod.fit$coefficients[2]*x, xlim =
  #  c(min(gpa$HS.GPA), max(gpa$HS.GPA)), add = TRUE, col = "blue")




############################################################################################################
# Create a function to find the sample model and put the line on a scatter plot

  my.reg.func<-function(x, y, data) {
  
    #Fit the simple linear regression model and save the results in mod.fit
    mod.fit<-lm(y ~ x, data = data)
  
    #Open a new graphics window - do not need to
    win.graph(width = 6, height = 6, pointsize = 10)
 
    #Same scatter plot as before
    plot(x = x, y = y, xlab = "x", ylab = "y", main = "y vs. x", 
         col = "red", pch = 1, cex = 1.0, panel.first=grid(col = "gray", lty = "dotted"))

    #Draw a line from (x0, y0) to (x1, y1)
    segments(x0 = min(x), y0 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*min(x), 
             x1 = max(x), y1 = mod.fit$coefficients[1] + mod.fit$coefficients[2]*max(x),
             lty = 1, col = "blue", lwd = 2) #lty = 1 is "solid"
    
    #This is the object returned
    mod.fit           
  }


  save.it<-my.reg.func(gpa$HS.GPA, gpa$College.GPA, gpa)  
  names(save.it)
  summary(save.it)
  
  








#