# NMSA407 Linear Regression: Tutorial

Basic Regression Diagnostics

Data Cars2004nh

## Introduction

### Load used data and calculate basic summaries

data(Cars2004nh, package = "mffSM")  # load the dataset; this does not attach mffSM
head(Cars2004nh)                     # first six rows, as listed in the output below

##                         vname type drive price.retail price.dealer   price cons.city cons.highway
## 1          Chevrolet.Aveo.4dr    1     1        11690        10965 11327.5       8.4          6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch    1     1        12585        11802 12193.5       8.4          6.9
## 3      Chevrolet.Cavalier.2dr    1     1        14610        13697 14153.5       9.0          6.4
## 4      Chevrolet.Cavalier.4dr    1     1        14810        13884 14347.0       9.0          6.4
## 5   Chevrolet.Cavalier.LS.2dr    1     1        16385        15357 15871.0       9.0          6.4
## 6           Dodge.Neon.SE.4dr    1     1        13670        12849 13259.5       8.1          6.5
##   consumption engine.size ncylinder horsepower weight      iweight  lweight wheel.base length width
## 1        7.65         1.6         4        103   1075 0.0009302326 6.980076        249    424   168
## 2        7.65         1.6         4        103   1065 0.0009389671 6.970730        249    389   168
## 3        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 4        7.70         2.2         4        140   1214 0.0008237232 7.101676        264    465   173
## 5        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 6        7.30         2.0         4        132   1171 0.0008539710 7.065613        267    442   170
##      ftype fdrive
## 1 personal  front
## 2 personal  front
## 3 personal  front
## 4 personal  front
## 5 personal  front
## 6 personal  front

dim(Cars2004nh)  # number of rows (cars) and columns (variables)

## [1] 425  20

summary(Cars2004nh)  # per-variable summaries, including counts of missing values (NA's)

##     vname                type           drive        price.retail     price.dealer
##  Length:425         Min.   :1.000   Min.   :1.000   Min.   : 10280   Min.   :  9875
##  Class :character   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 20370   1st Qu.: 18973
##  Mode  :character   Median :1.000   Median :1.000   Median : 27905   Median : 25672
##                     Mean   :2.219   Mean   :1.692   Mean   : 32866   Mean   : 30096
##                     3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.: 39235   3rd Qu.: 35777
##                     Max.   :6.000   Max.   :3.000   Max.   :192465   Max.   :173560
##
##      price          cons.city      cons.highway     consumption     engine.size      ncylinder
##  Min.   : 10078   Min.   : 6.20   Min.   : 5.100   Min.   : 5.65   Min.   :1.300   Min.   :-1.000
##  1st Qu.: 19600   1st Qu.:11.20   1st Qu.: 8.100   1st Qu.: 9.65   1st Qu.:2.400   1st Qu.: 4.000
##  Median : 26656   Median :12.40   Median : 9.000   Median :10.70   Median :3.000   Median : 6.000
##  Mean   : 31481   Mean   :12.36   Mean   : 9.142   Mean   :10.75   Mean   :3.208   Mean   : 5.791
##  3rd Qu.: 37514   3rd Qu.:13.80   3rd Qu.: 9.800   3rd Qu.:11.65   3rd Qu.:3.900   3rd Qu.: 6.000
##  Max.   :183012   Max.   :23.50   Max.   :19.600   Max.   :21.55   Max.   :8.300   Max.   :12.000
##                   NA's   :14      NA's   :14       NA's   :14
##    horsepower        weight        iweight             lweight        wheel.base        length
##  Min.   :100.0   Min.   : 923   Min.   :0.0003067   Min.   :6.828   Min.   :226.0   Min.   :363.0
##  1st Qu.:165.0   1st Qu.:1412   1st Qu.:0.0005542   1st Qu.:7.253   1st Qu.:262.0   1st Qu.:450.0
##  Median :210.0   Median :1577   Median :0.0006341   Median :7.363   Median :272.0   Median :472.0
##  Mean   :216.8   Mean   :1626   Mean   :0.0006412   Mean   :7.373   Mean   :274.9   Mean   :470.6
##  3rd Qu.:255.0   3rd Qu.:1804   3rd Qu.:0.0007082   3rd Qu.:7.498   3rd Qu.:284.0   3rd Qu.:490.0
##  Max.   :500.0   Max.   :3261   Max.   :0.0010834   Max.   :8.090   Max.   :366.0   Max.   :577.0
##                  NA's   :2      NA's   :2           NA's   :2       NA's   :2       NA's   :26
##      width            ftype       fdrive
##  Min.   :163.0   personal:242   front:223
##  1st Qu.:175.0   wagon   : 30   rear :110
##  Median :180.0   SUV     : 60   4x4  : 92
##  Mean   :181.1   pickup  : 24
##  3rd Qu.:185.0   sport   : 49
##  Max.   :206.0   minivan : 20
##  NA's   :28


### Complete cases subset used here

To make the model fitted here comparable with later models that include additional covariates, we restrict ourselves to the subset of the dataset in which the variables consumption, lweight and engine.size are all known.

isComplete <- complete.cases(
  Cars2004nh$consumption,
  Cars2004nh$lweight,
  Cars2004nh$engine.size
)                 # TRUE for rows where all three variables are observed
sum(!isComplete)  # number of rows with at least one of them missing

## [1] 16

CarsUsed <- Cars2004nh[
  isComplete,
  c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size"),
  drop = FALSE
]              # complete rows only, restricted to the six selected columns
dim(CarsUsed)  # rows and columns of the working subset

## [1] 409   6

summary(CarsUsed)  # per-variable summaries of the working subset

##   consumption        drive         fdrive        weight        lweight       engine.size
##  Min.   : 5.65   Min.   :1.000   front:212   Min.   : 923   Min.   :6.828   Min.   :1.300
##  1st Qu.: 9.65   1st Qu.:1.000   rear :108   1st Qu.:1415   1st Qu.:7.255   1st Qu.:2.400
##  Median :10.70   Median :1.000   4x4  : 89   Median :1577   Median :7.363   Median :3.000
##  Mean   :10.75   Mean   :1.699               Mean   :1622   Mean   :7.371   Mean   :3.178
##  3rd Qu.:11.65   3rd Qu.:2.000               3rd Qu.:1804   3rd Qu.:7.498   3rd Qu.:3.800
##  Max.   :21.55   Max.   :3.000               Max.   :2903   Max.   :7.973   Max.   :6.000


## Dependence of consumption on lweight

### Scatterplot

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)  # single panel
plot(
  consumption ~ lweight,
  data = CarsUsed,
  pch = PCH,  # NOTE(review): BTY, PCH, COL and BGC are not defined in this
  col = COL,  #   section; presumably set in a setup chunk -- confirm
  bg = BGC,
  xlab = "Log(weight) [log(kg)]",
  ylab = "Consumption [l/100 km]"
)


#lines(lowess(CarsUsed[, "lweight"], CarsUsed[, "consumption"]), col = "blue", lwd = 2)


## Regression line

### Fit

m1 <- lm(consumption ~ lweight, data = CarsUsed)  # regression line: consumption on log(weight)
summary(m1)  # coefficient table, residual standard error, R-squared and F-test

##
## Call:
## lm(formula = consumption ~ lweight, data = CarsUsed)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -3.6544 -0.7442 -0.1526  0.5160  5.1616
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -58.2480     1.8941  -30.75   <2e-16 ***
## lweight       9.3606     0.2569   36.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.035 on 407 degrees of freedom
## Multiple R-squared:  0.7654, Adjusted R-squared:  0.7648
## F-statistic:  1328 on 1 and 407 DF,  p-value: < 2.2e-16


### Scatterplot with the fitted line

Ybar <- round(mean(CarsUsed$consumption), 2)  # sample mean of the response
be0 <- round(coef(m1)["(Intercept)"], 2)      # estimated intercept
be1 <- round(coef(m1)["lweight"], 4)          # estimated slope
                                              # NOTE(review): Ybar, be0, be1 are not used in the visible code; confirm
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(
  consumption ~ lweight,
  data = CarsUsed,
  pch = PCH,
  col = COL2,
  bg = BGC2,
  xlab = "Log(weight) [log(kg)]",
  ylab = "Consumption [l/100 km]"
)
abline(m1, col = "red2", lwd = 2)  # overlay the fitted regression line


## Quantities of the regression diagnostics

• We always print only the first 20 values in the output.

### Diagonal elements of the $$\mathbb{H}$$ matrix (leverages)

head(hatvalues(m1), n = 20)  # leverages h_ii; restricted to the first 20 values shown below

##           1           2           3           4           5           6           7           8
## 0.011878156 0.012334248 0.007704019 0.006925319 0.007704019 0.008204039 0.007583579 0.007764917
##           9          10          11          12          13          14          15          16
## 0.007857115 0.007857115 0.006736268 0.010678880 0.009477400 0.009259385 0.014420754 0.013592323
##          17          18          19          20
## 0.012520926 0.007464931 0.007464931 0.006656819


### Diagonal elements of the $$\mathbb{M}$$ matrix

head(1 - hatvalues(m1), n = 20)  # m_ii = 1 - h_ii; restricted to the first 20 values shown below

##         1         2         3         4         5         6         7         8         9        10
## 0.9881218 0.9876658 0.9922960 0.9930747 0.9922960 0.9917960 0.9924164 0.9922351 0.9921429 0.9921429
##        11        12        13        14        15        16        17        18        19        20
## 0.9932637 0.9893211 0.9905226 0.9907406 0.9855792 0.9864077 0.9874791 0.9925351 0.9925351 0.9933432


#### Descriptive statistics for the diagonal elements of the $$\mathbb{M}$$ matrix

summary(1 - hatvalues(m1))  # quartiles, mean and range of m_ii over all observations

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.9752  0.9941  0.9967  0.9951  0.9974  0.9976


### (Raw) residuals

head(residuals(m1), n = 20)  # raw residuals; restricted to the first 20 values shown below

##           1           2           3           4           5           6           7           8
##  0.56060511  0.64808761 -0.31710554 -0.52763929 -0.31710554 -0.59007308 -0.74859610  0.04867956
##           9          10          11          12          13          14          15          16
## -0.37759277  0.07240723 -0.23145786 -0.53008371 -1.68953471 -0.83893092  0.97471267  0.82944403
##          17          18          19          20
##  0.63331084 -0.12998107 -0.12998107 -0.35442857


### Standardized residuals

head(rstandard(m1), n = 20)  # standardized residuals; restricted to the first 20 values shown below

##           1           2           3           4           5           6           7           8
##  0.54508926  0.63029600 -0.30767983 -0.51175489 -0.30767983 -0.57267795 -0.72630061  0.04723405
##           9          10          11          12          13          14          15          16
## -0.36639740  0.07026040 -0.22446853 -0.51510011 -1.64078149 -0.81463308  0.94895728  0.80718799
##          17          18          19          20
##  0.61598309 -0.12610230 -0.12610230 -0.34371215


## Basic residual plots

• The `plot` method for objects of class lm (the result of the fitting function `lm`) provides the basic residual plots. Its argument `which` determines which plot is produced.

### Residuals versus fitted values (which = 1)

par(mar = c(4, 4, 1, 1) + 0.1)  # margins in lines: bottom, left, top, right
plot(m1, which = 1, pch = 21, col = "blue4", bg = "skyblue")  # residuals versus fitted values


### Normal QQ plot based on standardized residuals (which = 2)

par(mar = c(4, 4, 1, 1) + 0.1)  # margins in lines: bottom, left, top, right
plot(m1, which = 2, pch = 21, col = "blue4", bg = "skyblue")  # normal QQ plot of standardized residuals


### Scale-location plot (which = 3)

par(mar = c(4, 4, 1, 1) + 0.1)  # margins in lines: bottom, left, top, right
plot(m1, which = 3, pch = 21, col = "blue4", bg = "skyblue")  # scale-location plot


### Three residual plots

• The function `plotLM` from the package mffSM produces all three of the above residual plots at once.
par(mar = c(4, 4, 3, 1) + 0.1)  # top margin of 3 lines instead of the 1 used for the single plots
mffSM::plotLM(m1)               # namespace-qualified: no library(mffSM) call is visible in this section