Multicollinearity
Data Cars2004nh
# Load the Cars2004nh data set from the mffSM package and preview its first rows.
data("Cars2004nh", package = "mffSM")
head(Cars2004nh, n = 6L)
## vname type drive price.retail price.dealer price cons.city cons.highway
## 1 Chevrolet.Aveo.4dr 1 1 11690 10965 11327.5 8.4 6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch 1 1 12585 11802 12193.5 8.4 6.9
## 3 Chevrolet.Cavalier.2dr 1 1 14610 13697 14153.5 9.0 6.4
## 4 Chevrolet.Cavalier.4dr 1 1 14810 13884 14347.0 9.0 6.4
## 5 Chevrolet.Cavalier.LS.2dr 1 1 16385 15357 15871.0 9.0 6.4
## 6 Dodge.Neon.SE.4dr 1 1 13670 12849 13259.5 8.1 6.5
## consumption engine.size ncylinder horsepower weight iweight lweight wheel.base length width
## 1 7.65 1.6 4 103 1075 0.0009302326 6.980076 249 424 168
## 2 7.65 1.6 4 103 1065 0.0009389671 6.970730 249 389 168
## 3 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 4 7.70 2.2 4 140 1214 0.0008237232 7.101676 264 465 173
## 5 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 6 7.30 2.0 4 132 1171 0.0008539710 7.065613 267 442 170
## ftype fdrive
## 1 personal front
## 2 personal front
## 3 personal front
## 4 personal front
## 5 personal front
## 6 personal front
# Number of rows (cars) and columns (variables) in the raw data.
dim(Cars2004nh)
## [1] 425 20
# Column-wise overview: quantiles and means for numeric columns, level counts
# for the factors, and the number of missing values (NA's) where present.
summary(Cars2004nh)
## vname type drive price.retail price.dealer
## Length:425 Min. :1.000 Min. :1.000 Min. : 10280 Min. : 9875
## Class :character 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 20370 1st Qu.: 18973
## Mode :character Median :1.000 Median :1.000 Median : 27905 Median : 25672
## Mean :2.219 Mean :1.692 Mean : 32866 Mean : 30096
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.: 39235 3rd Qu.: 35777
## Max. :6.000 Max. :3.000 Max. :192465 Max. :173560
##
## price cons.city cons.highway consumption engine.size ncylinder
## Min. : 10078 Min. : 6.20 Min. : 5.100 Min. : 5.65 Min. :1.300 Min. :-1.000
## 1st Qu.: 19600 1st Qu.:11.20 1st Qu.: 8.100 1st Qu.: 9.65 1st Qu.:2.400 1st Qu.: 4.000
## Median : 26656 Median :12.40 Median : 9.000 Median :10.70 Median :3.000 Median : 6.000
## Mean : 31481 Mean :12.36 Mean : 9.142 Mean :10.75 Mean :3.208 Mean : 5.791
## 3rd Qu.: 37514 3rd Qu.:13.80 3rd Qu.: 9.800 3rd Qu.:11.65 3rd Qu.:3.900 3rd Qu.: 6.000
## Max. :183012 Max. :23.50 Max. :19.600 Max. :21.55 Max. :8.300 Max. :12.000
## NA's :14 NA's :14 NA's :14
## horsepower weight iweight lweight wheel.base length
## Min. :100.0 Min. : 923 Min. :0.0003067 Min. :6.828 Min. :226.0 Min. :363.0
## 1st Qu.:165.0 1st Qu.:1412 1st Qu.:0.0005542 1st Qu.:7.253 1st Qu.:262.0 1st Qu.:450.0
## Median :210.0 Median :1577 Median :0.0006341 Median :7.363 Median :272.0 Median :472.0
## Mean :216.8 Mean :1626 Mean :0.0006412 Mean :7.373 Mean :274.9 Mean :470.6
## 3rd Qu.:255.0 3rd Qu.:1804 3rd Qu.:0.0007082 3rd Qu.:7.498 3rd Qu.:284.0 3rd Qu.:490.0
## Max. :500.0 Max. :3261 Max. :0.0010834 Max. :8.090 Max. :366.0 Max. :577.0
## NA's :2 NA's :2 NA's :2 NA's :2 NA's :26
## width ftype fdrive
## Min. :163.0 personal:242 front:223
## 1st Qu.:175.0 wagon : 30 rear :110
## Median :180.0 SUV : 60 4x4 : 92
## Mean :181.1 pickup : 24
## 3rd Qu.:185.0 sport : 49
## Max. :206.0 minivan : 20
## NA's :28
# Add weightTon, the weight column rescaled by a factor of 1/1000.
Cars2004nh[["weightTon"]] <- Cars2004nh[["weight"]] / 1000
To be able to compare models fitted using different sets of covariates, we shall consider a subset of data where all covariate values are known.
# Keep only the response and the candidate covariates ...
CarsNoNA <- Cars2004nh[
  ,
  c("consumption", "fdrive", "ftype", "weightTon", "engine.size",
    "horsepower", "wheel.base", "length", "width"),
  drop = FALSE
]
# ... and drop every row with a missing value in any of them, so that all
# models below are fitted to the same set of cars.
CarsNoNA <- na.omit(CarsNoNA)
dim(CarsNoNA)
## [1] 384 9
Regression of consumption on fdrive, ftype, weightTon, engine.size, horsepower, wheel.base, length, width
# Pairwise correlations between the response and the numeric covariates,
# rounded to two decimal places.
round(
  cor(
    CarsNoNA[c("consumption", "weightTon", "engine.size", "horsepower",
               "wheel.base", "length", "width")]
  ),
  digits = 2
)
## consumption weightTon engine.size horsepower wheel.base length width
## consumption 1.00 0.88 0.81 0.70 0.51 0.42 0.65
## weightTon 0.88 1.00 0.81 0.62 0.75 0.65 0.80
## engine.size 0.81 0.81 1.00 0.78 0.63 0.62 0.72
## horsepower 0.70 0.62 0.78 1.00 0.39 0.37 0.49
## wheel.base 0.51 0.75 0.63 0.39 1.00 0.87 0.75
## length 0.42 0.65 0.62 0.37 0.87 1.00 0.74
## width 0.65 0.80 0.72 0.49 0.75 0.74 1.00
# Custom colour palette for the plots: three named colours followed by five
# HCL rainbow colours. rainbow_hcl() is qualified with its package
# (colorspace), like every other package function used in this script, so the
# line does not depend on colorspace having been attached beforehand.
palette(c("darkblue", "red3", "olivedrab", colorspace::rainbow_hcl(5)))
# Scatterplot matrix of the response and all numeric covariates: histograms on
# the diagonal, no regression lines and no smoothers in the panels.
# Written with the car (>= 3.0) argument names. The former `reg.line`,
# `spread` and `diagonal = "histogram"` are not part of that interface, so the
# defaults (regression line, density estimate on the diagonal) would apply.
# `spread` is dropped: it only concerns the smoother, which is switched off.
car::scatterplotMatrix(
  ~ consumption + weightTon + engine.size + horsepower +
    wheel.base + length + width,
  data = CarsNoNA,
  regLine = FALSE,
  smooth = FALSE,
  diagonal = list(method = "histogram"),
  pch = 16
)
# Alternative pairs plot from the psych package for the same variables, with
# a fitted straight line (lm = TRUE) in the scatterplot panels and no
# ellipses, smoothers or rugs.
psych::pairs.panels(
  CarsNoNA[
    ,
    c("consumption", "weightTon", "engine.size", "horsepower",
      "wheel.base", "length", "width"),
    drop = FALSE
  ],
  bg = "palegoldenrod",
  col = "red3",
  pch = 21,
  ellipses = FALSE,
  smooth = FALSE,
  lm = TRUE,
  hist.col = "lightblue",
  rug = FALSE
)
# Full model m1: least-squares fit of consumption on the two factors (fdrive,
# ftype) and all six numeric covariates, using the complete-case data.
# summary() prints the coefficient table with t-tests, the residual standard
# error, R-squared and the overall F-test.
m1 <- lm(consumption ~ fdrive + ftype + weightTon + engine.size + horsepower + wheel.base + length + width, data = CarsNoNA)
summary(m1)
##
## Call:
## lm(formula = consumption ~ fdrive + ftype + weightTon + engine.size +
## horsepower + wheel.base + length + width, data = CarsNoNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1230 -0.4497 -0.0620 0.4725 3.4650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.679118 1.343556 7.204 3.28e-12 ***
## fdriverear 0.199015 0.116199 1.713 0.08760 .
## fdrive4x4 0.243282 0.119890 2.029 0.04315 *
## ftypewagon -0.002042 0.147780 -0.014 0.98898
## ftypeSUV 0.886129 0.165514 5.354 1.51e-07 ***
## ftypesport 0.134164 0.177934 0.754 0.45132
## ftypeminivan 0.344158 0.212437 1.620 0.10607
## weightTon 4.544234 0.352521 12.891 < 2e-16 ***
## engine.size 0.462633 0.086767 5.332 1.69e-07 ***
## horsepower 0.004428 0.001052 4.208 3.23e-05 ***
## wheel.base -0.017565 0.005548 -3.166 0.00167 **
## length -0.004357 0.002813 -1.549 0.12225
## width -0.012166 0.009613 -1.265 0.20649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7274 on 371 degrees of freedom
## Multiple R-squared: 0.8801, Adjusted R-squared: 0.8763
## F-statistic: 227 on 12 and 371 DF, p-value: < 2.2e-16
# F-tests for deleting each term of m1 in turn; a factor is tested as a whole
# (all of its dummy columns at once), unlike the per-level t-tests in summary().
drop1(m1, test = "F")
## Single term deletions
##
## Model:
## consumption ~ fdrive + ftype + weightTon + engine.size + horsepower +
## wheel.base + length + width
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 196.28 -231.694
## fdrive 2 2.953 199.24 -229.959 2.7912 0.062637 .
## ftype 4 16.205 212.49 -209.233 7.6573 6.160e-06 ***
## weightTon 1 87.915 284.20 -91.572 166.1690 < 2.2e-16 ***
## engine.size 1 15.041 211.32 -205.342 28.4292 1.692e-07 ***
## horsepower 1 9.370 205.65 -215.787 17.7105 3.230e-05 ***
## wheel.base 1 5.303 201.59 -223.457 10.0239 0.001673 **
## length 1 1.269 197.55 -231.219 2.3992 0.122254
## width 1 0.847 197.13 -232.040 1.6015 0.206490
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (Generalized) variance inflation factors of the terms in m1, used to judge
# how strongly each term is collinear with the remaining ones.
car::vif(m1)
## GVIF Df GVIF^(1/(2*Df))
## fdrive 2.602373 2 1.270113
## ftype 7.771294 4 1.292146
## weightTon 9.109377 1 3.018174
## engine.size 5.552078 1 2.356285
## horsepower 3.891614 1 1.972717
## wheel.base 7.197089 1 2.682739
## length 6.422375 1 2.534241
## width 4.863856 1 2.205415
# NOTE(review): plotLM() is provided by mffSM; presumably it draws diagnostic
# plots for the fitted model m1 -- confirm against the package documentation.
mffSM::plotLM(m1)
# Reduced model m2: same response, data and factors as m1, but with
# engine.size, length and width left out of the covariates.
m2 <- lm(consumption ~ fdrive + ftype + weightTon + horsepower + wheel.base, data = CarsNoNA)
summary(m2)
##
## Call:
## lm(formula = consumption ~ fdrive + ftype + weightTon + horsepower +
## wheel.base, data = CarsNoNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1277 -0.4482 -0.0782 0.4373 3.7606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0370056 0.9172752 7.672 1.49e-13 ***
## fdriverear 0.3474035 0.1137757 3.053 0.00242 **
## fdrive4x4 0.2171375 0.1212333 1.791 0.07409 .
## ftypewagon -0.0461471 0.1521680 -0.303 0.76186
## ftypeSUV 0.9970747 0.1574768 6.332 6.96e-10 ***
## ftypesport 0.1166273 0.1740838 0.670 0.50330
## ftypeminivan 0.3100956 0.2030457 1.527 0.12755
## weightTon 4.9946908 0.3192539 15.645 < 2e-16 ***
## horsepower 0.0069427 0.0009768 7.108 6.01e-12 ***
## wheel.base -0.0229337 0.0043961 -5.217 3.02e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7527 on 374 degrees of freedom
## Multiple R-squared: 0.8706, Adjusted R-squared: 0.8675
## F-statistic: 279.5 on 9 and 374 DF, p-value: < 2.2e-16
# F-tests for deleting each term of the reduced model m2 in turn (factors are
# tested as a whole).
drop1(m2, test = "F")
## Single term deletions
##
## Model:
## consumption ~ fdrive + ftype + weightTon + horsepower + wheel.base
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 211.91 -208.275
## fdrive 2 5.885 217.80 -201.755 5.1934 0.005961 **
## ftype 4 24.807 236.72 -173.765 10.9453 2.105e-08 ***
## weightTon 1 138.686 350.60 -16.944 244.7625 < 2.2e-16 ***
## horsepower 1 28.626 240.54 -161.619 50.5213 6.012e-12 ***
## wheel.base 1 15.421 227.33 -183.301 27.2158 3.021e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (Generalized) variance inflation factors of the terms in the reduced model m2.
car::vif(m2)
## GVIF Df GVIF^(1/(2*Df))
## fdrive 2.236020 2 1.222838
## ftype 5.184095 4 1.228384
## weightTon 6.976128 1 2.641236
## horsepower 3.131296 1 1.769547
## wheel.base 4.219425 1 2.054124
# NOTE(review): plotLM() is provided by mffSM; presumably it draws diagnostic
# plots for the fitted model m2 -- confirm against the package documentation.
mffSM::plotLM(m2)