NMSA407 Linear Regression: Tutorial

Joint inference on a vector of estimable parameters

Data Cars2004nh

Introduction

Load used data and calculate basic summaries

data(Cars2004nh, package = "mffSM")

##                         vname type drive price.retail price.dealer   price cons.city cons.highway
## 1          Chevrolet.Aveo.4dr    1     1        11690        10965 11327.5       8.4          6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch    1     1        12585        11802 12193.5       8.4          6.9
## 3      Chevrolet.Cavalier.2dr    1     1        14610        13697 14153.5       9.0          6.4
## 4      Chevrolet.Cavalier.4dr    1     1        14810        13884 14347.0       9.0          6.4
## 5   Chevrolet.Cavalier.LS.2dr    1     1        16385        15357 15871.0       9.0          6.4
## 6           Dodge.Neon.SE.4dr    1     1        13670        12849 13259.5       8.1          6.5
##   consumption engine.size ncylinder horsepower weight      iweight  lweight wheel.base length width
## 1        7.65         1.6         4        103   1075 0.0009302326 6.980076        249    424   168
## 2        7.65         1.6         4        103   1065 0.0009389671 6.970730        249    389   168
## 3        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 4        7.70         2.2         4        140   1214 0.0008237232 7.101676        264    465   173
## 5        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 6        7.30         2.0         4        132   1171 0.0008539710 7.065613        267    442   170
##      ftype fdrive
## 1 personal  front
## 2 personal  front
## 3 personal  front
## 4 personal  front
## 5 personal  front
## 6 personal  front

dim(Cars2004nh)

## [1] 425  20

summary(Cars2004nh)

##     vname                type           drive        price.retail     price.dealer
##  Length:425         Min.   :1.000   Min.   :1.000   Min.   : 10280   Min.   :  9875
##  Class :character   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 20370   1st Qu.: 18973
##  Mode  :character   Median :1.000   Median :1.000   Median : 27905   Median : 25672
##                     Mean   :2.219   Mean   :1.692   Mean   : 32866   Mean   : 30096
##                     3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.: 39235   3rd Qu.: 35777
##                     Max.   :6.000   Max.   :3.000   Max.   :192465   Max.   :173560
##
##      price          cons.city      cons.highway     consumption     engine.size      ncylinder
##  Min.   : 10078   Min.   : 6.20   Min.   : 5.100   Min.   : 5.65   Min.   :1.300   Min.   :-1.000
##  1st Qu.: 19600   1st Qu.:11.20   1st Qu.: 8.100   1st Qu.: 9.65   1st Qu.:2.400   1st Qu.: 4.000
##  Median : 26656   Median :12.40   Median : 9.000   Median :10.70   Median :3.000   Median : 6.000
##  Mean   : 31481   Mean   :12.36   Mean   : 9.142   Mean   :10.75   Mean   :3.208   Mean   : 5.791
##  3rd Qu.: 37514   3rd Qu.:13.80   3rd Qu.: 9.800   3rd Qu.:11.65   3rd Qu.:3.900   3rd Qu.: 6.000
##  Max.   :183012   Max.   :23.50   Max.   :19.600   Max.   :21.55   Max.   :8.300   Max.   :12.000
##                   NA's   :14      NA's   :14       NA's   :14
##    horsepower        weight        iweight             lweight        wheel.base        length
##  Min.   :100.0   Min.   : 923   Min.   :0.0003067   Min.   :6.828   Min.   :226.0   Min.   :363.0
##  1st Qu.:165.0   1st Qu.:1412   1st Qu.:0.0005542   1st Qu.:7.253   1st Qu.:262.0   1st Qu.:450.0
##  Median :210.0   Median :1577   Median :0.0006341   Median :7.363   Median :272.0   Median :472.0
##  Mean   :216.8   Mean   :1626   Mean   :0.0006412   Mean   :7.373   Mean   :274.9   Mean   :470.6
##  3rd Qu.:255.0   3rd Qu.:1804   3rd Qu.:0.0007082   3rd Qu.:7.498   3rd Qu.:284.0   3rd Qu.:490.0
##  Max.   :500.0   Max.   :3261   Max.   :0.0010834   Max.   :8.090   Max.   :366.0   Max.   :577.0
##                  NA's   :2      NA's   :2           NA's   :2       NA's   :2       NA's   :26
##      width            ftype       fdrive
##  Min.   :163.0   personal:242   front:223
##  1st Qu.:175.0   wagon   : 30   rear :110
##  Median :180.0   SUV     : 60   4x4  : 92
##  Mean   :181.1   pickup  : 24
##  3rd Qu.:185.0   sport   : 49
##  Max.   :206.0   minivan : 20
##  NA's   :28


Complete cases subset used here

To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption, lweight and engine.size are known.

isComplete <- complete.cases(Cars2004nh[, c("consumption", "lweight", "engine.size")])
sum(!isComplete)

## [1] 16

CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size"))
dim(CarsNow)

## [1] 409   6

summary(CarsNow)

##   consumption        drive         fdrive        weight        lweight       engine.size
##  Min.   : 5.65   Min.   :1.000   front:212   Min.   : 923   Min.   :6.828   Min.   :1.300
##  1st Qu.: 9.65   1st Qu.:1.000   rear :108   1st Qu.:1415   1st Qu.:7.255   1st Qu.:2.400
##  Median :10.70   Median :1.000   4x4  : 89   Median :1577   Median :7.363   Median :3.000
##  Mean   :10.75   Mean   :1.699               Mean   :1622   Mean   :7.371   Mean   :3.178
##  3rd Qu.:11.65   3rd Qu.:2.000               3rd Qu.:1804   3rd Qu.:7.498   3rd Qu.:3.800
##  Max.   :21.55   Max.   :3.000               Max.   :2903   Max.   :7.973   Max.   :6.000


Dependence of consumption on fdrive

Overall and group means

(ybar <- with(CarsNow, mean(consumption)))

## [1] 10.75134

(ybargr <- with(CarsNow, tapply(consumption, fdrive, mean)))

##     front      rear       4x4
##  9.741274 11.293981 12.498876


Basic plots to start

set.seed(20010911)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(CarsNow[, "drive"] + runif(nrow(CarsNow), -0.2, 0.2), CarsNow[, "consumption"], xaxt = "n", col = COL, bg = BGC, pch = PCH,
xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)
points(1:3, ybargr, pch = 22, col = "darkgreen", bg = "seagreen", cex = 2)
axis(1, at = 1:3, labels = levels(CarsNow[, "fdrive"]), cex.axis = 1.2)


FCOL <- rainbow_hcl(3)
names(FCOL) <- levels(CarsNow[, "fdrive"])
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ fdrive, data = CarsNow, col = FCOL, xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)


One-way analysis of variance

A well-known one-way ANOVA is used below to evaluate the effect of fdrive on consumption.

a1 <- aov(consumption ~ fdrive, data = CarsNow)
summary(a1)

##              Df Sum Sq Mean Sq F value Pr(>F)
## fdrive        2  519.9  259.94   78.92 <2e-16 ***
## Residuals   406 1337.4    3.29
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Linear model with reference group parameterization

Covariate fdrive is categorical with three levels. So called reference group pseudocontrasts are used to include this covariate in a linear model.

levels(CarsNow[["fdrive"]])                       ## Which group is the first one?

## [1] "front" "rear"  "4x4"

mTrt <- lm(consumption ~ fdrive, data = CarsNow)
summary(mTrt)

##
## Call:
## lm(formula = consumption ~ fdrive, data = CarsNow)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -4.0913 -1.2489 -0.0440  0.9587  9.0511
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   9.7413     0.1247  78.149  < 2e-16 ***
## fdriverear    1.5527     0.2146   7.237 2.32e-12 ***
## fdrive4x4     2.7576     0.2292  12.030  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.2764
## F-statistic: 78.91 on 2 and 406 DF,  p-value: < 2.2e-16


Matrix $$\mbox{MS}_e\bigl(\mathbf{X}^\top\mathbf{X}\bigr)^{-1}$$

vcov(mTrt)

##             (Intercept)  fdriverear   fdrive4x4
## (Intercept)  0.01553763 -0.01553763 -0.01553763
## fdriverear  -0.01553763  0.04603742  0.01553763
## fdrive4x4   -0.01553763  0.01553763  0.05254861


Inference on $$\theta = \mathbb{L}\beta$$

Matrix $$\mathbb{L}$$ is chosen such that $$\theta$$ provides the group means.

Matrix $$\mathbb{L}$$

Lmu <- cbind(1, contr.treatment(3))
rownames(Lmu) <- levels(CarsNow[["fdrive"]])
colnames(Lmu) <- names(coef(mTrt))
print(Lmu)

##       (Intercept) fdriverear fdrive4x4
## front           1          0         0
## rear            1          1         0
## 4x4             1          0         1


BLUE of $$\theta = \mathbb{L}\beta$$

(muhat <- Lmu %*% coef(mTrt))

##            [,1]
## front  9.741274
## rear  11.293981
## 4x4   12.498876


Confidence intervals for elements of $$\theta$$

• LSest is a function from package mffSM.
• Each confidence interval has a univariate coverage of 95%.
library("mffSM")
(muCI <- LSest(mTrt, L = Lmu))

##        Estimate Std. Error  t value    P value     Lower     Upper
## front  9.741274  0.1246500 78.14899 < 2.22e-16  9.496234  9.986314
## rear  11.293981  0.1746419 64.66937 < 2.22e-16 10.950666 11.637297
## 4x4   12.498876  0.1923824 64.96892 < 2.22e-16 12.120686 12.877066


Confidence ellipse for $$\theta_1$$ and $$\theta_2$$

Parameters of the confidence ellipse

(Lnow <- Lmu[1:2,])

##       (Intercept) fdriverear fdrive4x4
## front           1          0         0
## rear            1          1         0

(mNow <- nrow(Lnow))                                             ## number of parameters to estimate

## [1] 2

(elCenter <- as.numeric(Lnow %*% coef(mTrt)))                    ## center of the ellipse

## [1]  9.741274 11.293981

(elShape <- Lnow %*% vcov(mTrt) %*% t(Lnow))                     ## shape of the ellipse

##            front       rear
## front 0.01553763 0.00000000
## rear  0.00000000 0.03049979

(elRadius <- sqrt(mNow * qf(0.95, mNow, mTrt[["df.residual"]]))) ## radius of the ellipse

## [1] 2.456805


Coordinates of the ellipse to plot

• ellipse is a function from package car.
library("car")
el <- ellipse(elCenter, elShape, elRadius, draw = FALSE)
#print(el)


Plot of the confidence ellipse

plot(y ~ x, data = el, type = "l", xlab = rownames(Lnow)[1], ylab = rownames(Lnow)[2], col = "red3", lwd = 2)
points(elCenter[1], elCenter[2], pch = 21, col = "red3", bg = "orange", cex = 2)


Plot of the confidence ellipse again

• confidenceEllipse is a function from package car
• We also add lines showing the univariate confidence intervals into the plot.
confidenceEllipse(mTrt, L = Lnow, col = "red3", fill = TRUE, center.cex = 2)
abline(v = c(muCI[["Lower"]][1], muCI[["Upper"]][1]), col = "blue", lwd = 2)
abline(h = c(muCI[["Lower"]][2], muCI[["Upper"]][2]), col = "blue", lwd = 2)
title(main = paste("(", paste(attr(muCI, "row.names")[c(1, 2)], collapse = ", "), ")", sep = ""))


Confidence ellipses for each pair of the group means

• Each ellipse has a JOINT (bivariate) coverage of 95%.
par(bty = BTY, mar = c(4, 4, 4, 1) + 0.1)
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (j in 1:2){
for (l in (j+1):3){
confidenceEllipse(mTrt, L = Lmu[c(j, l),], col = "red3", fill = TRUE, center.cex = 2)
abline(v = c(muCI[["Lower"]][j], muCI[["Upper"]][j]), col = "blue", lwd = 2)
abline(h = c(muCI[["Lower"]][l], muCI[["Upper"]][l]), col = "blue", lwd = 2)
title(main = paste("(", paste(attr(muCI, "row.names")[c(j, l)], collapse = ", "), ")", sep = ""))
}
}


par(mfrow = c(1, 1))


Confidence ellipsoid for all three group means

• The ellipsoid has a joint coverage of 95%.

Parameters of the ellipsoid

(mmu <- length(muhat))

## [1] 3

(el3DCenter <- as.numeric(Lmu %*% coef(mTrt)))                   ## center of the ellipsoid

## [1]  9.741274 11.293981 12.498876

(el3DShape <- Lmu %*% vcov(mTrt) %*% t(Lmu))                     ## shape of the ellipsoid

##              front       rear          4x4
## front 1.553763e-02 0.00000000 3.469447e-18
## rear  0.000000e+00 0.03049979 0.000000e+00
## 4x4   3.469447e-18 0.00000000 3.701098e-02

(el3DRadius <- sqrt(mmu * qf(0.95, mmu, mTrt[["df.residual"]]))) ## radius of the ellipsoid

## [1] 2.807249


Coordinates of the ellipse to plot

• ellipse3d is a function from package rgl.
library("rgl")
el3D <- ellipse3d(x = el3DShape, centre = el3DCenter, t = el3DRadius)
#print(el3D)


Dynamic plot of the ellipsoid

plot3d(el3D, xlab = rownames(Lmu)[1], ylab = rownames(Lmu)[2], zlab = rownames(Lmu)[3], type = "wire", col = "blue")
#rgl.snapshot("/home/komarek/teach/mff_2015/nmsa407_LinRegr/Lecture/RkoTutor/figPNG/LinRegr-NormalLM-02-06.png")

plot3d(el3D, xlab = rownames(Lmu)[1], ylab = rownames(Lmu)[2], zlab = rownames(Lmu)[3],  col = "blue")
#rgl.snapshot("/home/komarek/teach/mff_2015/nmsa407_LinRegr/Lecture/RkoTutor/figPNG/LinRegr-NormalLM-02-07.png")