# Source:  www.openintro.org/stat/supplements.php
# License: CC BY-SA
#          http://creativecommons.org/licenses/by-sa/3.0/


#_____ Data Download _____#
download.file("http://www.openintro.org/stat/data/cc.RData", destfile = "cc.RData")
load("cc.RData")
cc <- countyComplete

#_____ Libraries and Data _____#
library(openintro)
library(maps)
data(COL)
data(countyComplete)


#_____ Initialize Variables _____#
these <- c("FIPS", "pop2010", "pop2000",
           "age_under_5", "age_under_18",
           "age_over_65", "female", "black",
           "hispanic", "white_not_hispanic",
           "no_move_in_one_plus_year",
           "foreign_born",
           "foreign_spoken_at_home", "hs_grad",
           "bachelors", "mean_work_travel",
           "home_ownership",
           "housing_multi_unit",
           "median_val_owner_occupied",
           "persons_per_household",
           "per_capita_income", "poverty",
           "sales_per_capita", "density")
d <- countyComplete[,these]
d <- na.omit(d)
dim(d)
growth <- 100*(d$pop2010/d$pop2000-1)
dd     <- data.frame(growth, d)
# dd is identical to downloaded data
identical(dd, cc)
row.names(dd) <- NULL


#_____ Mapping Function _____#
# Source: OpenIntro Statistics source
# www.openintro.org/stat/textbook.php
countyMap <- function(values, FIPS,
                      col=c("red", "green", "blue"),
                      varTrans=I, gtlt="", ...){
  if(missing(FIPS)){
    stop("Must provide the county FIPS")
  }
  
  #===> Drop NAs <===#
  FIPS   <- FIPS[!is.na(values)]
  values <- values[!is.na(values)]
  
  #===> Scale Values <===#
  MI  <- min(values)
  MA  <- max(values)
  Leg <- seq(MI, MA, length.out=50)
  if(identical(varTrans, log)){
    VAL    <- varTrans(values+0.1)
    valCol <- varTrans(values+0.1)
    LegCol <- varTrans(Leg+0.1)
  } else {
    VAL    <- varTrans(values)
    valCol <- varTrans(values)
    LegCol <- varTrans(Leg)
  }
  valCol <- 0.98*(valCol - MI)/(MA - MI) + 0.01
  LegCol <- 0.98*(LegCol - MI)/(MA - MI) + 0.01
  
  val.04 <- 0.4*(1 - valCol)
  val.06 <- 0.6*(1 - valCol)
  val.07 <- 0.7*(1 - valCol)
  val.08 <- 0.8*(1 - valCol)
  val.10 <- 1.0*(1 - valCol)
  
  Leg.04 <- 0.4*(1 - LegCol)
  Leg.06 <- 0.6*(1 - LegCol)
  Leg.07 <- 0.7*(1 - LegCol)
  Leg.08 <- 0.8*(1 - LegCol)
  Leg.10 <- 1.0*(1 - LegCol)
  
  if(col[1] == "red"){
    col <- rgb(val.10, val.07, val.07)
    COL <- rgb(Leg.10, Leg.07, Leg.07)
  } else if(col[1] == "green"){
    col <- rgb(val.07, val.10, val.07)
    COL <- rgb(Leg.07, Leg.10, Leg.07)
  } else if(col[1] == "bg"){
    col <- rgb(val.06, val.08, val.10)
    COL <- rgb(Leg.06, Leg.08, Leg.10)
  } else if(col[1] == "ye"){
    col <- rgb(val.10, val.10, val.04)
    COL <- rgb(Leg.10, Leg.10, Leg.04)
  } else {
    col <- rgb(val.06, val.06, val.10)
    COL <- rgb(Leg.06, Leg.06, Leg.10)
  }
  
  #===> Remove These <===#
  data(county.fips)
  col <- col[match(county.fips$fips, FIPS)]
  plot(0,0,type="n", axes=FALSE, xlab="", ylab="")
  par(mar=rep(0.1,4), usr=c(-0.385,0.41,0.44,0.91))
  map("county", col=col, fill=TRUE, resolution=0,
    lty=0, projection="polyconic", mar=rep(0.1,4), add=TRUE, ...)
  
  x1 <- 0.33
  x2 <- 0.36
  for(i in 1:50){
    y1 <- i/50 * 0.25 + 0.5
    y2 <- (i-1)/50 * 0.25 + 0.5
    rect(x1, y1, x2, y2, border="#00000000", col=COL[i])
  }
  
  VR    <- range(VAL)
  VR[3] <- VR[2]
  VR[2] <- mean(VR[c(1,3)])
  
  VR1    <- c()
  VR1[1] <- values[which.min(abs(VAL - VR[1]))]
  VR1[2] <- values[which.min(abs(VAL - VR[2]))]
  VR1[2] <- values[which.min(abs(VAL - VR[3]))]
  
  VR    <- round(VR)
  if(gtlt %in% c(">", "><")){
    VR[3] <- paste(">", VR[3], sep="")
  }
  if(gtlt %in% c("<", "><")){
    VR[1] <- paste("<", VR[1], sep="")
  }
  text(0.36, 0.51, VR[1], pos=4)
  text(0.36, 0.625, VR[2], pos=4)
  text(0.36, 0.74, VR[3], pos=4)
}


#_____ Growth Figures And Summaries _____#
summary(growth)
sd(growth)
hist(growth, breaks = 100, xlab="Growth (percent)",
     main="", col=COL[1])
val <- (county$pop2010 - county$pop2000) / county$pop2000
val <- val*100
val[val > 30]  <- 30
val[val < -30] <- -30
countyMap(val, countyComplete$FIPS, "green", gtlt="><")


#_____ Population Summaries _____#
summary(d$pop2010)
sd(d$pop2010)
hist(d$pop2000[d$pop2000 < 1e6], breaks = 100, xlim = c(0, 1000000),
     xlab="Population, Year 2000", col=COL[1], main="")
temp <- paste(sum(d$pop2000 > 1e6),
              "counties with a population\ngreater than 1,000,000\nare not shown")
text(7.5e5, 200, temp)
hist(log(d$pop2000), breaks = 60,
     xlab="Natural Log of Population, Year 2000",
     col=COL[1], main="")
th  <- c(1e3, 1e4, 1e5, 1e6)
thl <- c("1,000", "10,000", "100,000", "1,000,000")
segments(log(th), rep(0, length(th)),
         log(th), rep(185, length(th)), col=COL[4],
         lwd=2.5)
text(log(th)+c(-0.4,-0.6,0.65,0.75),
     rep(195, length(th)), thl,
     col=COL[4])


#_____ Collinearity _____#
plot(d$age_under_5, d$age_under_18,
     xlab="Percent under Age 5",
     ylab="Percent under Age 18",
     col=COL[1,4], pch=19, cex=1.3)
plot(d$black, d$white_not_hispanic,
     xlab="Percent Black",
     ylab="Percent White (not Hispanic)",
     col=COL[1,4], pch=19, cex=1.3)
abline(100, -1, col="#FFFFFF", lty=2, lwd=2)
abline(100, -1, col=COL[6], lty=2)
plot(d$hs_grad, d$bachelors,
     xlab="Percent With High School Degree",
     ylab="Percent With Bachelors Degree",
     col=COL[1,4], pch=19, cex=1.3,
     xlim=c(40,100))


#_____ Models _____#
m1 <- lm(growth ~ log(pop2000) + age_under_5 + age_under_18 + age_over_65 + female + black + hispanic + white_not_hispanic + no_move_in_one_plus_year + foreign_born + foreign_spoken_at_home + hs_grad + bachelors + mean_work_travel + home_ownership + housing_multi_unit + median_val_owner_occupied + persons_per_household + per_capita_income + poverty + sales_per_capita + density, dd)
summary(m1)

m2 <- lm(growth ~ log(pop2000) + age_under_5 + age_under_18 + age_over_65 + female + hispanic + white_not_hispanic + no_move_in_one_plus_year + foreign_born + foreign_spoken_at_home + bachelors + mean_work_travel + housing_multi_unit + median_val_owner_occupied + persons_per_household + per_capita_income + poverty + sales_per_capita, dd)
summary(m2)


#_____ Diagnostics _____#
qqnorm(m2$res, main="",
       xlab="", pch=19, col=COL[1,4])

plot(m2$fit, m2$res,
       xlab="Predicted Values",
       ylab="Residuals",
       pch=19, col=COL[1,4])
abline(h=0, col=COL[6], lty=2)
text(-45, 32, "Kalawao County, Hawaii")
points(-59.4, 20.6, cex=3)

val <- m2$res
val <- val
val[val > 18]  <- 18
val[val < -18] <- -18
countyMap(val, dd$FIPS, "green", gtlt="><")


#_____ IQR Summary _____#
apply(m2$model[,-1], 2, IQR) * m2$coef[-1]