# NMSA407 Linear Regression: Tutorial

Weighted least squares

Data Kojeni and wKojeni

## Introduction

### Load data Kojeni and calculate basic summaries

``````data(Kojeni, package = "mffSM")
``````
``````##   bplace n.child educ bfeed.dur bfeed24 gender bweight blength weight24 length24 breast father teat
## 1      1       3    2        18       0      1    3200      50     8000       75      1      0    1
## 2      1       2    3         5       0      1    3720      52     8110       68      0      0    1
## 3      1       1    2        24       1      1    4100      53     8750       64      1      0    1
## 4      1       1    3        14       0      1    3960      53     8350       72      1      1    1
## 5      1       1    1        24       1      0    3700      52     6700       68      0      0    1
## 6      1       1    2        16       0      0    3700      51     7990       68      1      0    1
##   plan mheight fheight mage fage fbplace     feduc fbfeed24 fgender fbreast ffather fteat fplan
## 1    1     160     180   26   30  Prague Secondary       No     Boy     Yes      No   Yes   Yes
## 2    1     160     187   35   38  Prague  Tertiary       No     Boy      No      No   Yes   Yes
## 3    1     173     186   26   28  Prague Secondary      Yes     Boy     Yes      No   Yes   Yes
## 4    1     159     177   24   26  Prague  Tertiary       No     Boy     Yes     Yes   Yes   Yes
## 5    0     168     183   22   28  Prague   Primary      Yes    Girl      No      No   Yes    No
## 6    1     173     186   24   29  Prague Secondary       No    Girl     Yes      No   Yes   Yes
``````
``````dim(Kojeni)
``````
``````## [1] 99 26
``````
``````summary(Kojeni)
``````
``````##      bplace         n.child           educ         bfeed.dur        bfeed24           gender
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   : 0.00   Min.   :0.0000   Min.   :0.0000
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 7.00   1st Qu.:0.0000   1st Qu.:0.0000
##  Median :1.000   Median :1.000   Median :2.000   Median :16.00   Median :0.0000   Median :0.0000
##  Mean   :1.293   Mean   :1.566   Mean   :1.838   Mean   :14.43   Mean   :0.2828   Mean   :0.4949
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:24.00   3rd Qu.:1.0000   3rd Qu.:1.0000
##  Max.   :2.000   Max.   :4.000   Max.   :3.000   Max.   :24.00   Max.   :1.0000   Max.   :1.0000
##     bweight        blength        weight24       length24         breast           father
##  Min.   :2040   Min.   :46.0   Min.   :5850   Min.   :62.00   Min.   :0.0000   Min.   :0.0000
##  1st Qu.:3200   1st Qu.:50.0   1st Qu.:7025   1st Qu.:66.00   1st Qu.:0.0000   1st Qu.:0.0000
##  Median :3480   Median :51.0   Median :7770   Median :68.00   Median :0.0000   Median :0.0000
##  Mean   :3470   Mean   :50.6   Mean   :7690   Mean   :68.55   Mean   :0.4848   Mean   :0.3636
##  3rd Qu.:3720   3rd Qu.:52.0   3rd Qu.:8250   3rd Qu.:70.00   3rd Qu.:1.0000   3rd Qu.:1.0000
##  Max.   :4400   Max.   :54.0   Max.   :9950   Max.   :78.00   Max.   :1.0000   Max.   :1.0000
##       teat             plan           mheight       fheight           mage           fage
##  Min.   :0.0000   Min.   :0.0000   Min.   :153   Min.   :162.0   Min.   :18.0   Min.   :19.00
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:164   1st Qu.:175.0   1st Qu.:23.0   1st Qu.:26.00
##  Median :1.0000   Median :1.0000   Median :167   Median :179.0   Median :25.0   Median :28.00
##  Mean   :0.7677   Mean   :0.5859   Mean   :167   Mean   :179.3   Mean   :25.7   Mean   :28.89
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:171   3rd Qu.:185.0   3rd Qu.:28.0   3rd Qu.:32.00
##  Max.   :1.0000   Max.   :1.0000   Max.   :187   Max.   :195.0   Max.   :38.0   Max.   :43.00
##      fbplace         feduc    fbfeed24 fgender   fbreast  ffather  fteat    fplan
##  Prague  :70   Primary  :34   No :71   Girl:50   No :51   No :63   No :23   No :41
##  Province:29   Secondary:47   Yes:28   Boy :49   Yes:48   Yes:36   Yes:76   Yes:58
##                Tertiary :18
##
##
##
``````

### Load data wKojeni and calculate basic summaries

``````data(wKojeni, package = "mffSM")
print(wKojeni)
``````
``````##    blength  w  bweight weight24 length24
## 46      46  1 2040.000 6900.000 64.00000
## 47      47  2 2825.000 7385.000 66.00000
## 48      48  5 2986.000 6820.000 66.00000
## 49      49 15 3156.000 7282.667 67.60000
## 50      50 24 3281.250 7490.000 67.83333
## 51      51 21 3562.762 7938.095 69.47619
## 52      52 21 3745.714 8079.048 69.76190
## 53      53  9 4144.444 7895.889 69.11111
## 54      54  1 4000.000 9070.000 72.00000
``````
``````dim(wKojeni)
``````
``````## [1] 9 5
``````
``````summary(wKojeni)
``````
``````##     blength         w         bweight        weight24       length24
##  Min.   :46   Min.   : 1   Min.   :2040   Min.   :6820   Min.   :64.00
##  1st Qu.:48   1st Qu.: 2   1st Qu.:2986   1st Qu.:7283   1st Qu.:66.00
##  Median :50   Median : 9   Median :3281   Median :7490   Median :67.83
##  Mean   :50   Mean   :11   Mean   :3305   Mean   :7651   Mean   :67.98
##  3rd Qu.:52   3rd Qu.:21   3rd Qu.:3746   3rd Qu.:7938   3rd Qu.:69.48
##  Max.   :54   Max.   :24   Max.   :4144   Max.   :9070   Max.   :72.00
``````

### Print some columns from both datasets

``````print(Kojeni[, c("blength", "bweight")])
``````
``````##    blength bweight
## 1       50    3200
## 2       52    3720
## 3       53    4100
## 4       53    3960
## 5       52    3700
## 6       51    3700
## 7       50    2860
## 8       51    3420
## 9       52    3820
## 10      50    2900
## 11      52    4050
## 12      51    3550
## 13      49    3080
## 14      51    3333
## 15      51    3840
## 16      50    3550
## 17      48    3160
## 18      51    3310
## 19      51    3330
## 20      52    3290
## 21      53    4310
## 22      49    3070
## 23      49    3200
## 24      50    3420
## 25      50    3200
## 26      49    3310
## 27      53    4080
## 28      50    3300
## 29      50    3770
## 30      51    3650
## 31      52    3640
## 32      51    3860
## 33      50    3230
## 34      51    3700
## 35      50    2980
## 36      49    2820
## 37      50    3590
## 38      52    3720
## 39      50    3300
## 40      49    3500
## 41      54    4000
## 42      51    4115
## 43      49    3080
## 44      48    3020
## 45      51    3480
## 46      50    3730
## 47      50    3250
## 48      49    2880
## 49      52    4000
## 50      52    3960
## 51      52    3730
## 52      52    3330
## 53      49    3150
## 54      49    3300
## 55      52    3650
## 56      53    4400
## 57      52    4000
## 58      50    3530
## 59      48    3550
## 60      51    3980
## 61      49    3460
## 62      52    3720
## 63      47    2450
## 64      49    3190
## 65      51    3600
## 66      49    2900
## 67      50    2850
## 68      51    3005
## 69      51    3430
## 70      50    3140
## 71      52    3700
## 72      52    3890
## 73      50    3600
## 74      53    4360
## 75      52    3940
## 76      53    3490
## 77      50    2850
## 78      48    2200
## 79      53    4200
## 80      50    3250
## 81      50    3200
## 82      51    3400
## 83      51    3400
## 84      52    4100
## 85      52    3550
## 86      50    3150
## 87      50    3300
## 88      51    3600
## 89      49    3400
## 90      51    3555
## 91      51    3560
## 92      47    3200
## 93      52    3450
## 94      53    4400
## 95      50    3600
## 96      46    2040
## 97      52    3700
## 98      48    3000
## 99      49    3000
``````
``````print(wKojeni[, c("blength", "bweight", "w")])
``````
``````##    blength  bweight  w
## 46      46 2040.000  1
## 47      47 2825.000  2
## 48      48 2986.000  5
## 49      49 3156.000 15
## 50      50 3281.250 24
## 51      51 3562.762 21
## 52      52 3745.714 21
## 53      53 4144.444  9
## 54      54 4000.000  1
``````

## Dependence of birth weight on birth length

### Scatterplots

``````YLIM <- with(Kojeni, range(bweight))
par(mfrow = c(1, 2), bty = BTY, mar = c(5, 4, 1, 1)+0.1)
plot(bweight ~ blength, data = Kojeni, col = COL, bg = BGC, ylim = YLIM,
pch = PCH, xlab = "Birth length [cm]", ylab = "Birth weight [g]", main = "Kojeni")
plot(bweight ~ blength, data = wKojeni, col = COL, bg = BGC, ylim = YLIM,
pch = PCH, xlab = "Birth length [cm]", ylab = "Birth weight [g]",
main = "wKojeni", cex = 1.2)
``````

### Scatterplots (weights shown on the plot)

``````par(mfrow = c(1, 2), bty = BTY, mar = c(5, 4, 1, 1)+0.1)
plot(bweight ~ blength, data = Kojeni, col = COL, bg = BGC, ylim = YLIM,
pch = PCH, xlab = "Birth length [cm]", ylab = "Birth weight [g]", main = "Kojeni")
plot(bweight ~ blength, data = wKojeni, col = COL, bg = BGC, ylim = YLIM,
pch = PCH, xlab = "Birth length [cm]", ylab = "Birth weight [g]",
cex = w/5, main = "wKojeni")
``````

## Least squares estimation

### Ordinary least squares using original (complete) data

``````m1 <- lm(bweight ~ blength, data = Kojeni)
summary(m1)
``````
``````##
## Call:
## lm(formula = bweight ~ blength, data = Kojeni)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -685.93 -152.83  -30.76  196.83  664.07
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7905.80     895.45  -8.829 4.52e-14 ***
## blength       224.83      17.69  12.709  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 271.7 on 97 degrees of freedom
## Multiple R-squared:  0.6248, Adjusted R-squared:  0.6209
## F-statistic: 161.5 on 1 and 97 DF,  p-value: < 2.2e-16
``````
``````confint(m1)
``````
``````##                  2.5 %     97.5 %
## (Intercept) -9683.0226 -6128.5847
## blength       189.7184   259.9372
``````

### Weighted least squares using averaged data

``````wm1 <- lm(bweight ~ blength, weights = w, data = wKojeni)
summary(wm1)
``````
``````##
## Call:
## lm(formula = bweight ~ blength, data = wKojeni, weights = w)
##
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max
## -396.28 -234.90   10.75  223.76  403.12
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7905.80     975.42  -8.105 8.39e-05 ***
## blength       224.83      19.27  11.667 7.68e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 295.9 on 7 degrees of freedom
## Multiple R-squared:  0.9511, Adjusted R-squared:  0.9441
## F-statistic: 136.1 on 1 and 7 DF,  p-value: 7.676e-06
``````
``````confint(wm1)
``````
``````##                   2.5 %     97.5 %
## (Intercept) -10212.3079 -5599.2995
## blength        179.2623   270.3934
``````

### Ordinary least squares based on replicated data from wKojeni

• Many numbers in the output are wrong!!!
``````replKojeni <- data.frame(bweight = rep(wKojeni[, "bweight"], wKojeni[, "w"]),
blength = rep(wKojeni[, "blength"], wKojeni[, "w"]))
m1repl <- lm(bweight ~ blength, data = replKojeni)
summary(m1repl)
``````
``````##
## Call:
## lm(formula = bweight ~ blength, data = replKojeni)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -396.28  -54.34    2.35   45.24  163.90
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7905.804    262.033  -30.17   <2e-16 ***
## blength       224.828      5.177   43.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79.5 on 97 degrees of freedom
## Multiple R-squared:  0.9511, Adjusted R-squared:  0.9506
## F-statistic:  1886 on 1 and 97 DF,  p-value: < 2.2e-16
``````
``````confint(m1repl)
``````
``````##                  2.5 %     97.5 %
## (Intercept) -8425.8658 -7385.7416
## blength       214.5539   235.1018
``````

## Confidence bands around the regression line

### Calculation

``````blengthNew <- seq(45.8, 54.2, length = 500)
newdata <- data.frame(blength = blengthNew)
p1mean  <- predict(m1, newdata = newdata, interval = "confidence", level = 0.95)
wp1mean <- predict(wm1, newdata = newdata, interval = "confidence", level = 0.95)
p1replmean <- predict(m1repl, newdata = newdata, interval = "confidence", level = 0.95)
``````

### Confidence bands based on OLS (original data) and WLS (averaged data)

``````par(mfrow = c(1, 1), bty = BTY, mar = c(5, 4, 1, 1)+0.1)
plot(bweight ~ blength, data = Kojeni, col = COL2, bg = BGC2, ylim = YLIM, pch = PCH,
xlab = "Birth length [cm]", ylab = "Birth weight [g]")
abline(wm1, col = "red3", lwd = 2)

lines(blengthNew, wp1mean[, "lwr"], col = "red2", lwd = 2, lty = 5)
lines(blengthNew, wp1mean[, "upr"], col = "red2", lwd = 2, lty = 5)

lines(blengthNew, p1mean[, "lwr"], col = "blue", lwd = 2, lty = 5)
lines(blengthNew, p1mean[, "upr"], col = "blue", lwd = 2, lty = 5)

legend(46, 3900, legend = c("Averaged data (n = 9, WLS)", "Original data (n = 99, OLS)"),
col = c("red2", "blue"), lwd = 2, lty = 5)
text(46, 4000, labels = "Confidence band around the regression line", font = 2, cex = 1, pos = 4)
``````

### Confidence bands based on OLS (original data) and WLS (averaged data)

``````par(mfrow = c(1, 1), bty = BTY, mar = c(5, 4, 1, 1)+0.1)
plot(bweight ~ blength, data = wKojeni, col = COL2, bg = BGC2, ylim = YLIM, pch = PCH,
xlab = "Birth length [cm]", ylab = "Birth weight [g]", cex = 1.2)
abline(wm1, col = "red3", lwd = 2)

lines(blengthNew, wp1mean[, "lwr"], col = "red2", lwd = 2, lty = 5)
lines(blengthNew, wp1mean[, "upr"], col = "red2", lwd = 2, lty = 5)

lines(blengthNew, p1mean[, "lwr"], col = "blue", lwd = 2, lty = 5)
lines(blengthNew, p1mean[, "upr"], col = "blue", lwd = 2, lty = 5)

legend(46, 3900, legend = c("Averaged data (n = 9, WLS)", "Original data (n = 99, OLS)"),
col = c("red2", "blue"), lwd = 2, lty = 5)
text(46, 4000, labels = "Confidence band around the regression line", font = 2, cex = 1, pos = 4)
``````

### Confidence bands based on OLS (original data), WLS (averaged data), OLS (replicated data)

``````par(mfrow = c(1, 1), bty = BTY, mar = c(5, 4, 1, 1)+0.1)
plot(bweight ~ blength, data = wKojeni, col = COL2, bg = BGC2, ylim = YLIM, pch = PCH,
xlab = "Birth length [cm]", ylab = "Birth weight [g]", cex = 1.2)
abline(wm1, col = "red3", lwd = 2)

lines(blengthNew, wp1mean[, "lwr"], col = "red2", lwd = 2, lty = 5)
lines(blengthNew, wp1mean[, "upr"], col = "red2", lwd = 2, lty = 5)

lines(blengthNew, p1mean[, "lwr"], col = "blue", lwd = 2, lty = 5)
lines(blengthNew, p1mean[, "upr"], col = "blue", lwd = 2, lty = 5)

lines(blengthNew, p1replmean[, "lwr"], col = "darkgreen", lwd = 2, lty = 5)
lines(blengthNew, p1replmean[, "upr"], col = "darkgreen", lwd = 2, lty = 5)

legend(46, 3900, legend = c("Averaged data (n = 9, WLS)", "Original data (n = 99, OLS)", "Replicated data (n = 99, OLS)"),
col = c("red2", "blue", "darkgreen"), lwd = 2, lty = 5)
text(46, 4000, labels = "Confidence band around the regression line", font = 2, cex = 1, pos = 4)
``````