Friday
Monday
Friday
Monday
For a given data set with many potential predictors, there exists many possible models.
How should we systematically evaluate those models?
bridge <- read.table("http://www.stat.tamu.edu/~sheather/book/docs/datasets/bridge.txt", header=TRUE) head(bridge)
## Case Time DArea CCost Dwgs Length Spans ## 1 1 78.8 3.60 82.4 6 90 1 ## 2 2 309.5 5.33 422.3 12 126 2 ## 3 3 184.5 6.29 179.8 9 78 1 ## 4 4 69.6 2.20 100.0 5 60 1 ## 5 5 68.8 1.44 103.0 5 60 1 ## 6 6 95.7 5.40 134.4 5 60 1
logDArea <- log(bridge$DArea) logCCost <- log(bridge$CCost) logDwgs <- log(bridge$Dwgs) logLength <- log(bridge$Length) logSpans <- log(bridge$Spans) X <- cbind(logDArea,logCCost,logDwgs,logLength,logSpans) library(leaps) b <- regsubsets(as.matrix(X), log(bridge$Time)) summary(b)$outmat
## logDArea logCCost logDwgs logLength logSpans ## 1 ( 1 ) " " " " "*" " " " " ## 2 ( 1 ) " " " " "*" " " "*" ## 3 ( 1 ) " " "*" "*" " " "*" ## 4 ( 1 ) "*" "*" "*" " " "*" ## 5 ( 1 ) "*" "*" "*" "*" "*"
summary(b)$outmat
## logDArea logCCost logDwgs logLength logSpans ## 1 ( 1 ) " " " " "*" " " " " ## 2 ( 1 ) " " " " "*" " " "*" ## 3 ( 1 ) " " "*" "*" " " "*" ## 4 ( 1 ) "*" "*" "*" " " "*" ## 5 ( 1 ) "*" "*" "*" "*" "*"
m1 <- lm(log(bridge$Time) ~ logDwgs) m2 <- lm(log(bridge$Time) ~ logDwgs + logSpans) m3 <- lm(log(bridge$Time) ~ logDwgs + logSpans + logCCost) m4 <- lm(log(bridge$Time) ~ logDwgs + logSpans + logCCost + logDArea) m5 <- lm(log(bridge$Time) ~ logDwgs + logSpans + logCCost + logDArea + logLength) models <- list(m1, m2, m3, m4, m5)
summary(m2)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.6617 0.26871 9.905 1.489e-12 ## logDwgs 1.0416 0.15420 6.755 3.259e-08 ## logSpans 0.2853 0.09095 3.137 3.116e-03
summary(m3)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.3317 0.3577 6.519 7.902e-08 ## logDwgs 0.8356 0.2135 3.914 3.356e-04 ## logSpans 0.1963 0.1107 1.773 8.371e-02 ## logCCost 0.1483 0.1075 1.380 1.752e-01
The two and the three predictor model both do well, but the two predictor model is preferred because of the statistical significance of the predictors.
Be aware that when looking at p-values of many many models, the burden of proof should be much higher. That is, we need very low p-values to be convinced of statistical significance.
backAIC <- step(m5, direction = "backward", data = bridge)
## Start: AIC=-98.71 ## log(bridge$Time) ~ logDwgs + logSpans + logCCost + logDArea + ## logLength ## ## Df Sum of Sq RSS AIC ## - logLength 1 0.006 3.85 -100.6 ## - logDArea 1 0.013 3.86 -100.6 ## <none> 3.84 -98.7 ## - logCCost 1 0.182 4.03 -98.6 ## - logSpans 1 0.266 4.11 -97.7 ## - logDwgs 1 1.454 5.30 -86.3 ## ## Step: AIC=-100.6 ## log(bridge$Time) ~ logDwgs + logSpans + logCCost + logDArea ## ## Df Sum of Sq RSS AIC ## - logDArea 1 0.020 3.87 -102.4 ## <none> 3.85 -100.6 ## - logCCost 1 0.181 4.03 -100.6 ## - logSpans 1 0.315 4.16 -99.1 ## - logDwgs 1 1.449 5.30 -88.3 ## ## Step: AIC=-102.4 ## log(bridge$Time) ~ logDwgs + logSpans + logCCost ## ## Df Sum of Sq RSS AIC ## <none> 3.87 -102.4 ## - logCCost 1 0.180 4.05 -102.4 ## - logSpans 1 0.297 4.17 -101.1 ## - logDwgs 1 1.445 5.31 -90.1
backBIC <- step(m5, direction = "backward", data = bridge, k = log(nrow(bridge)))
## Start: AIC=-87.87 ## log(bridge$Time) ~ logDwgs + logSpans + logCCost + logDArea + ## logLength ## ## Df Sum of Sq RSS AIC ## - logLength 1 0.006 3.85 -91.6 ## - logDArea 1 0.013 3.86 -91.5 ## - logCCost 1 0.182 4.03 -89.6 ## - logSpans 1 0.266 4.11 -88.7 ## <none> 3.84 -87.9 ## - logDwgs 1 1.454 5.30 -77.2 ## ## Step: AIC=-91.61 ## log(bridge$Time) ~ logDwgs + logSpans + logCCost + logDArea ## ## Df Sum of Sq RSS AIC ## - logDArea 1 0.020 3.87 -95.2 ## - logCCost 1 0.181 4.03 -93.4 ## - logSpans 1 0.315 4.16 -91.9 ## <none> 3.85 -91.6 ## - logDwgs 1 1.449 5.30 -81.0 ## ## Step: AIC=-95.19 ## log(bridge$Time) ~ logDwgs + logSpans + logCCost ## ## Df Sum of Sq RSS AIC ## - logCCost 1 0.180 4.05 -97.0 ## - logSpans 1 0.297 4.17 -95.7 ## <none> 3.87 -95.2 ## - logDwgs 1 1.445 5.31 -84.7 ## ## Step: AIC=-96.95 ## log(bridge$Time) ~ logDwgs + logSpans ## ## Df Sum of Sq RSS AIC ## <none> 4.05 -97.0 ## - logSpans 1 0.95 5.00 -91.3 ## - logDwgs 1 4.40 8.45 -67.7
mint <- lm(log(Time) ~ 1, data = bridge) forwardAIC <- step(mint, scope = list(lower = ~1, upper = ~log(DArea) + log(CCost) + log(Dwgs) + log(Length) + log(Spans)), direction = "forward", data = bridge)
## Start: AIC=-41.35 ## log(Time) ~ 1 ## ## Df Sum of Sq RSS AIC ## + log(Dwgs) 1 12.18 5.00 -94.9 ## + log(CCost) 1 11.61 5.56 -90.1 ## + log(DArea) 1 10.29 6.88 -80.5 ## + log(Length) 1 10.01 7.16 -78.7 ## + log(Spans) 1 8.73 8.45 -71.3 ## <none> 17.17 -41.3 ## ## Step: AIC=-94.9 ## log(Time) ~ log(Dwgs) ## ## Df Sum of Sq RSS AIC ## + log(Spans) 1 0.949 4.05 -102.4 ## + log(CCost) 1 0.832 4.17 -101.1 ## + log(Length) 1 0.669 4.33 -99.4 ## + log(DArea) 1 0.476 4.52 -97.4 ## <none> 5.00 -94.9 ## ## Step: AIC=-102.4 ## log(Time) ~ log(Dwgs) + log(Spans) ## ## Df Sum of Sq RSS AIC ## + log(CCost) 1 0.1796 3.87 -102 ## <none> 4.05 -102 ## + log(DArea) 1 0.0185 4.03 -101 ## + log(Length) 1 0.0169 4.03 -101 ## ## Step: AIC=-102.4 ## log(Time) ~ log(Dwgs) + log(Spans) + log(CCost) ## ## Df Sum of Sq RSS AIC ## <none> 3.87 -102 ## + log(DArea) 1 0.0196 3.85 -101 ## + log(Length) 1 0.0129 3.86 -101
forwardBIC <- step(mint, scope = list(lower = ~1, upper = ~log(DArea) + log(CCost) + log(Dwgs) + log(Length) + log(Spans)), direction = "forward", data = bridge,k = log(nrow(bridge)))
## Start: AIC=-39.54 ## log(Time) ~ 1 ## ## Df Sum of Sq RSS AIC ## + log(Dwgs) 1 12.18 5.00 -91.3 ## + log(CCost) 1 11.61 5.56 -86.5 ## + log(DArea) 1 10.29 6.88 -76.9 ## + log(Length) 1 10.01 7.16 -75.1 ## + log(Spans) 1 8.73 8.45 -67.7 ## <none> 17.17 -39.5 ## ## Step: AIC=-91.28 ## log(Time) ~ log(Dwgs) ## ## Df Sum of Sq RSS AIC ## + log(Spans) 1 0.949 4.05 -97.0 ## + log(CCost) 1 0.832 4.17 -95.7 ## + log(Length) 1 0.669 4.33 -93.9 ## + log(DArea) 1 0.476 4.52 -92.0 ## <none> 5.00 -91.3 ## ## Step: AIC=-96.95 ## log(Time) ~ log(Dwgs) + log(Spans) ## ## Df Sum of Sq RSS AIC ## <none> 4.05 -97.0 ## + log(CCost) 1 0.1796 3.87 -95.2 ## + log(DArea) 1 0.0185 4.03 -93.4 ## + log(Length) 1 0.0169 4.03 -93.3
Backward Elimination
Optimal model based on AIC included log(CCost)
, log(Dwgs)
, and log(Spans)
. Optimal model based on BIC included only log(Dwgs)
and log(Spans)
.
Forward Selection
Using AIC, the optimal model is the same as by backward AIC. Using BIC, the optimal model is the same as backward BIC.
We have the same choice between the 2 and 3 predictor models.
Build a full model (with no transformations) to the Haldcement.txt
file from the book's website to predict the Y using all of the X's.