NMSA407 Linear Regression: Tutorial

Numeric covariate: simple transformation, polynomial regression, regression splines

Data Houses1987




Introduction

Load used data and calculate basic summaries

data(Houses1987, package = "mffSM")
head(Houses1987)
##   price ground bed bath floor garage airco gas fbed fbath ffloor fgarage fairco fgas
## 1 42000    544   3    1     2      1     0   0    3     1      2       1     No   No
## 2 38500    372   2    1     1      0     0   0  <=2     1      1       0     No   No
## 3 49500    285   3    1     1      0     0   0    3     1      1       0     No   No
## 4 60500    619   3    1     2      0     0   0    3     1      2       0     No   No
## 5 61000    592   2    1     1      0     0   0  <=2     1      1       0     No   No
## 6 66000    387   3    1     1      0     1   0    3     1      1       0    Yes   No
dim(Houses1987)
## [1] 546  14
summary(Houses1987)
##      price            ground            bed             bath           floor           garage      
##  Min.   : 25000   Min.   : 153.0   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 49125   1st Qu.: 335.0   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median : 62000   Median : 428.0   Median :3.000   Median :1.000   Median :2.000   Median :0.0000  
##  Mean   : 68122   Mean   : 479.1   Mean   :2.965   Mean   :1.286   Mean   :1.808   Mean   :0.6923  
##  3rd Qu.: 82000   3rd Qu.: 592.0   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :190000   Max.   :1507.0   Max.   :6.000   Max.   :4.000   Max.   :4.000   Max.   :3.0000  
##      airco             gas           fbed     fbath     ffloor    fgarage   fairco     fgas    
##  Min.   :0.0000   Min.   :0.00000   <=2:138   1  :402   1  :227   0  :300   No :373   No :521  
##  1st Qu.:0.0000   1st Qu.:0.00000   3  :301   2  :133   2  :238   1  :126   Yes:173   Yes: 25  
##  Median :0.0000   Median :0.00000   4  : 95   >=3: 11   >=3: 81   >=2:120                      
##  Mean   :0.3168   Mean   :0.04579   >=5: 12                                                    
##  3rd Qu.:1.0000   3rd Qu.:0.00000                                                              
##  Max.   :1.0000   Max.   :1.00000




Dependence of price on ground

Basis scatterplot

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")

plot of chunk fig-ParamCovar-01-01

Scatterplot equiped with the LOWESS smoother

lws <- lowess(Houses1987[, "ground"], Houses1987[, "price"])
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
lines(lws, col = "blue", lwd = 2)

plot of chunk fig-ParamCovar-01-02




Model with log-transformed covariate

Fit

m1 <- lm(price ~ log(ground), data = Houses1987)
summary(m1)
## 
## Call:
## lm(formula = price ~ log(ground), data = Houses1987)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55841 -14946  -2805  10965 105124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -161279      14538  -11.09   <2e-16 ***
## log(ground)    37658       2381   15.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22120 on 544 degrees of freedom
## Multiple R-squared:  0.3149, Adjusted R-squared:  0.3137 
## F-statistic: 250.1 on 1 and 544 DF,  p-value: < 2.2e-16

Fitted regression function (original \(z\)-scale)

grpred <- seq(150, 1510, length = 500)
pm1 <- predict(m1, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
#lines(lws, col = "blue", lwd = 2)
lines(grpred, pm1, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-03

#legend(160, 190000, legend = c("Y ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Fitted regression function (transformed \(\log(z)\)-scale)

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ log(ground), data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("log(Ground size) [log(", m^2, ")]")), ylab = "Price [CAD]")
#lines(lowess(log(Houses1987[, "ground"]), Houses1987[, "price"]), col = "blue", lwd = 2)
abline(m1, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-04

#legend(5, 190000, legend = c("Y ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Basic residual plots

library("mffSM")
plotLM(m1)

plot of chunk fig-ParamCovar-01-05




Additional log transformation of response

We additionally consider a log-transformation of the response which improves some other aspects of the model.

Fit

m2 <- lm(log(price) ~ log(ground), data = Houses1987)
summary(m2)
## 
## Call:
## lm(formula = log(price) ~ log(ground), data = Houses1987)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8571 -0.1988  0.0046  0.1929  0.8969 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.75625    0.19933   38.91   <2e-16 ***
## log(ground)  0.54216    0.03265   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3033 on 544 degrees of freedom
## Multiple R-squared:  0.3364, Adjusted R-squared:  0.3351 
## F-statistic: 275.7 on 1 and 544 DF,  p-value: < 2.2e-16

Fitted regression function (original \(z\)-scale, transformed \(\log(y)\)-scale)

pm2 <- predict(m2, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
#lines(lowess(Houses1987[, "ground"], log(Houses1987[, "price"])), col = "blue", lwd = 2)
lines(grpred, pm2, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-06

#legend(160, log(190000), legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Fitted regression function (transformed \(\log(z)\)-scale and \(\log(y)\)-scale)

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ log(ground), data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("X = log(Z) = log(Ground size) [log(", m^2, ")]")), ylab = "Y = log(Price)")
#lines(lowess(log(Houses1987[, "ground"]), log(Houses1987[, "price"])), col = "blue", lwd = 2)
abline(m2, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-07

#legend(5, log(190000), legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Prediction line (both \(x\) and \(y\) axes use original units)

pm2 <- predict(m2, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
#lines(lws, col = "blue", lwd = 2)
lines(grpred, exp(pm2), col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-08

#legend(160, 190000, legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Basic residual plots

plotLM(m2)

plot of chunk fig-ParamCovar-01-09




Polynomial models with log(price) as response

Fits

p <- list()
p[["0"]] <- lm(log(price) ~ 1, data = Houses1987)
p[["1"]] <- lm(log(price) ~ ground, data = Houses1987)
p[["2"]] <- lm(log(price) ~ ground + I(ground^2), data = Houses1987)
p[["3"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
p[["4"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Houses1987)
p[["5"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4) + I(ground^5), data = Houses1987)

sapply(p, summary)
## $`0`
## 
## Call:
## lm(formula = log(price) ~ 1, data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93233 -0.25685 -0.02407  0.25551  1.09582 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.05896    0.01592   694.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.372 on 545 degrees of freedom
## 
## 
## $`1`
## 
## Call:
## lm(formula = log(price) ~ ground, data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96501 -0.20546  0.00402  0.19730  0.88491 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.058e+01  3.452e-02  306.50   <2e-16 ***
## ground      1.001e-03  6.641e-05   15.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3127 on 544 degrees of freedom
## Multiple R-squared:  0.2947, Adjusted R-squared:  0.2934 
## F-statistic: 227.3 on 1 and 544 DF,  p-value: < 2.2e-16
## 
## 
## $`2`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89001 -0.19452  0.00597  0.19397  0.91316 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.020e+01  6.698e-02  152.28  < 2e-16 ***
## ground       2.470e-03  2.339e-04   10.56  < 2e-16 ***
## I(ground^2) -1.200e-06  1.838e-07   -6.53 1.52e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3014 on 543 degrees of freedom
## Multiple R-squared:  0.3461, Adjusted R-squared:  0.3437 
## F-statistic: 143.7 on 2 and 543 DF,  p-value: < 2.2e-16
## 
## 
## $`3`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3), 
##     data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87279 -0.19903  0.00212  0.19780  0.90934 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.965e+00  1.371e-01  72.682  < 2e-16 ***
## ground       3.784e-03  7.109e-04   5.323 1.49e-07 ***
## I(ground^2) -3.306e-06  1.092e-06  -3.028  0.00258 ** 
## I(ground^3)  9.700e-10  4.958e-10   1.957  0.05091 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared:  0.3507, Adjusted R-squared:  0.3471 
## F-statistic: 97.57 on 3 and 542 DF,  p-value: < 2.2e-16
## 
## 
## $`4`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3) + 
##     I(ground^4), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91381 -0.19784  0.00291  0.20086  0.93320 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.071e+01  2.634e-01  40.653  < 2e-16 ***
## ground      -1.788e-03  1.834e-03  -0.975  0.33013    
## I(ground^2)  1.052e-05  4.339e-06   2.424  0.01569 *  
## I(ground^3) -1.256e-08  4.143e-09  -3.032  0.00254 ** 
## I(ground^4)  4.450e-12  1.353e-12   3.290  0.00107 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2979 on 541 degrees of freedom
## Multiple R-squared:  0.3634, Adjusted R-squared:  0.3587 
## F-statistic: 77.21 on 4 and 541 DF,  p-value: < 2.2e-16
## 
## 
## $`5`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3) + 
##     I(ground^4) + I(ground^5), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90198 -0.19046  0.00679  0.19546  0.94797 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.145e+01  4.996e-01  22.923   <2e-16 ***
## ground      -9.058e-03  4.532e-03  -1.998   0.0462 *  
## I(ground^2)  3.589e-05  1.511e-05   2.376   0.0179 *  
## I(ground^3) -5.238e-08  2.308e-08  -2.269   0.0236 *  
## I(ground^4)  3.277e-11  1.621e-11   2.022   0.0437 *  
## I(ground^5) -7.379e-15  4.209e-15  -1.753   0.0801 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2973 on 540 degrees of freedom
## Multiple R-squared:  0.367,  Adjusted R-squared:  0.3612 
## F-statistic: 62.62 on 5 and 540 DF,  p-value: < 2.2e-16

Fitted regression functions

pp <- lapply(p, predict, newdata = data.frame(ground = grpred))

Plot of data together with the fitted regression functions (original \(z\)-scale)

COLS <- c("black", "brown", "red", "orange", "darkgreen", "blue")
names(COLS) <- names(p)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pp)){
  lines(grpred, pp[[i]], col = COLS[i], lwd = 2)
}  
lines(grpred, exp(pm2), col = "red3", lwd = 2)
legend(1300, 10.90, legend = 0:5, lty = 1, lwd = 2, col = COLS)
text(1300, 10.95, labels = "Degree", pos = 4, font = 2)

plot of chunk fig-ParamCovar-01-10

Residuals versus fitted values

par(mfrow = c(2, 3))
for (i in 1:length(p)){
  plot(p[[i]], which = 1, pch = 21, col = "blue4", bg = "skyblue", caption = paste("Degree ", i - 1, sep = ""))
}  

plot of chunk fig-ParamCovar-01-11

par(mfrow = c(1, 1))

Fitted regression function for a cubic polynomial

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pp[["3"]], col = "red3", lwd = 2)