# NMSA407 Linear Regression: Tutorial

Multicollinearity

Data IQ

## Introduction

### Load used data and calculate basic summaries

``````data(IQ, package = "mffSM")
``````
``````##   gender fgender  iq  zn7  zn8
## 1      0    girl  94 1.84 1.81
## 2      0    girl 134 1.00 1.00
## 3      0    girl 108 1.69 1.90
## 4      1     boy  96 2.60 2.90
## 5      0    girl 101 1.92 1.72
## 6      1     boy  97 2.00 2.25
``````
``````dim(IQ)
``````
``````## [1] 111   5
``````
``````summary(IQ)
``````
``````##      gender       fgender         iq             zn7             zn8
##  Min.   :0.0000   girl:54   Min.   : 72.0   Min.   :1.000   Min.   :1.000
##  1st Qu.:0.0000   boy :57   1st Qu.:101.0   1st Qu.:1.300   1st Qu.:1.270
##  Median :1.0000             Median :108.0   Median :1.620   Median :1.720
##  Mean   :0.5135             Mean   :109.2   Mean   :1.746   Mean   :1.782
##  3rd Qu.:1.0000             3rd Qu.:118.0   3rd Qu.:2.070   3rd Qu.:2.090
##  Max.   :1.0000             Max.   :141.0   Max.   :3.300   Max.   :3.360
``````

### Summary by gender

Boys

``````summary(subset(IQ, fgender == "boy", select = c("iq", "zn7", "zn8")))
``````
``````##        iq             zn7             zn8
##  Min.   : 72.0   Min.   :1.000   Min.   :1.000
##  1st Qu.: 97.0   1st Qu.:1.380   1st Qu.:1.270
##  Median :106.0   Median :1.840   Median :1.900
##  Mean   :107.5   Mean   :1.968   Mean   :2.012
##  3rd Qu.:117.0   3rd Qu.:2.610   3rd Qu.:2.550
##  Max.   :141.0   Max.   :3.300   Max.   :3.360
``````

Girls

``````summary(subset(IQ, fgender == "girl", select = c("iq", "zn7", "zn8")))
``````
``````##        iq             zn7             zn8
##  Min.   : 80.0   Min.   :1.000   Min.   :1.000
##  1st Qu.:102.2   1st Qu.:1.208   1st Qu.:1.270
##  Median :111.0   Median :1.460   Median :1.540
##  Mean   :111.1   Mean   :1.511   Mean   :1.538
##  3rd Qu.:118.5   3rd Qu.:1.830   3rd Qu.:1.810
##  Max.   :141.0   Max.   :2.230   Max.   :2.450
``````

## Dependence of IQ on grades from 7th and 8th year and on gender

### Scatterplots (and histograms) of all numeric variables and gender represented by zeros and ones

• Function `pairs.panels` from package `psych` is used here. It also shows pairwise correlation coefficients.
``````library("psych")
pairs.panels(subset(IQ, select = c("iq", "gender", "zn7", "zn8")), bg = "palegoldenrod", col = "red3", pch = 21,
ellipses = FALSE, smooth = FALSE, lm = TRUE, hist.col = "lightblue", rug = FALSE)
``````

### Correlation coefficients (again)

``````round(cor(IQ[, c("iq", "gender", "zn7", "zn8")]), 2)
``````
``````##           iq gender   zn7   zn8
## iq      1.00  -0.12 -0.69 -0.66
## gender -0.12   1.00  0.37  0.38
## zn7    -0.69   0.37  1.00  0.95
## zn8    -0.66   0.38  0.95  1.00
``````

### One more useful plot

``````COLS <- c("red3", "darkblue")
BGCOLS <- rainbow_hcl(2, start = 30, end = 230)
PCH <- c(25, 24)
names(COLS) <- names(BGCOLS) <- names(PCH) <- c("girl", "boy")
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
par(mar = c(5, 4, 1, 1) + 0.1)
plot(iq ~ fgender, col = BGCOLS, data=IQ)
plot(iq ~ zn7, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
plot(iq ~ zn8, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
legend(2.5, 140, legend = c("Girl", "Boy"), col = COLS, pt.bg = BGCOLS, pch = PCH)
``````

``````#par(mfrow = c(1, 1))
``````

## Linear model with all regressors included additively

### LSE fit

``````(sm1 <- summary(m1 <- lm(iq ~ gender + zn7 + zn8, data = IQ)))
``````
``````##
## Call:
## lm(formula = iq ~ gender + zn7 + zn8, data = IQ)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -22.1677  -7.5243  -0.4338   7.1780  26.4095
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  138.222      3.119  44.314  < 2e-16 ***
## gender         4.563      2.221   2.055  0.04232 *
## zn7          -16.767      5.536  -3.029  0.00308 **
## zn8           -1.149      5.557  -0.207  0.83658
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.81 on 107 degrees of freedom
## Multiple R-squared:  0.4943, Adjusted R-squared:  0.4801
## F-statistic: 34.87 on 3 and 107 DF,  p-value: 8.472e-16
``````

### Standardized variables (regressors of unity standard deviation)

``````(sm1st <- summary(m1st <- lm(scale(iq) ~ scale(gender) + scale(zn7) + scale(zn8), data = IQ)))
``````
``````##
## Call:
## lm(formula = scale(iq) ~ scale(gender) + scale(zn7) + scale(zn8),
##     data = IQ)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.47790 -0.50164 -0.02892  0.47855  1.76069
##
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)    1.169e-16  6.844e-02   0.000  1.00000
## scale(gender)  1.528e-01  7.434e-02   2.055  0.04232 *
## scale(zn7)    -6.989e-01  2.308e-01  -3.029  0.00308 **
## scale(zn8)    -4.800e-02  2.321e-01  -0.207  0.83658
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.721 on 107 degrees of freedom
## Multiple R-squared:  0.4943, Adjusted R-squared:  0.4801
## F-statistic: 34.87 on 3 and 107 DF,  p-value: 8.472e-16
``````

### Standardized variables (regressors of unity norm)

``````n <- nrow(IQ)
IQ <- transform(IQ, siq     = scale(iq) / sqrt(n - 1),
sgender = scale(gender) / sqrt(n - 1),
szn7    = scale(zn7) / sqrt(n - 1),
szn8    = scale(zn8) / sqrt(n - 1))
(sm1st2 <- summary(m1st2 <- lm(siq ~ sgender + szn7 + szn8, data = IQ)))
``````
``````##
## Call:
## lm(formula = siq ~ sgender + szn7 + szn8, data = IQ)
##
## Residuals:
##       Min        1Q    Median        3Q       Max
## -0.140912 -0.047829 -0.002757  0.045628  0.167875
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  9.731e-18  6.525e-03   0.000  1.00000
## sgender      1.528e-01  7.434e-02   2.055  0.04232 *
## szn7        -6.989e-01  2.308e-01  -3.029  0.00308 **
## szn8        -4.800e-02  2.321e-01  -0.207  0.83658
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06875 on 107 degrees of freedom
## Multiple R-squared:  0.4943, Adjusted R-squared:  0.4801
## F-statistic: 34.87 on 3 and 107 DF,  p-value: 8.472e-16
``````

### Variance inflation factors

• Function `vif` is from package `car`.
``````detach("package:psych")
library("car")
vif(m1)
``````
``````##   gender      zn7      zn8
##  1.16923 11.26866 11.40240
``````
``````vif(lm(iq ~ fgender + zn7 + zn8, data = IQ))
``````
``````##  fgender      zn7      zn8
##  1.16923 11.26866 11.40240
``````

## Linear models where gender and only either grade from the 7th year of from the 8th year included

### Gender and grade from the 7th year included

``````(sm27 <- summary(m27 <- lm(iq ~ gender + zn7, data = IQ)))
``````
``````##
## Call:
## lm(formula = iq ~ gender + zn7, data = IQ)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -21.9606  -7.4290  -0.1927   7.0047  26.5244
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  138.093      3.043  45.376   <2e-16 ***
## gender         4.513      2.198   2.054   0.0424 *
## zn7          -17.852      1.765 -10.116   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.77 on 108 degrees of freedom
## Multiple R-squared:  0.4941, Adjusted R-squared:  0.4848
## F-statistic: 52.74 on 2 and 108 DF,  p-value: < 2.2e-16
``````
``````vif(m27)
``````
``````##  gender     zn7
## 1.15531 1.15531
``````

### Gender and grade from the 8th year included

``````(sm28 <- summary(m28 <- lm(iq ~ gender + zn8, data = IQ)))
``````
``````##
## Call:
## lm(formula = iq ~ gender + zn8, data = IQ)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -25.5378  -7.9585  -0.0763   7.1273  31.0778
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  137.402      3.223  42.634  < 2e-16 ***
## gender         4.474      2.303   1.943   0.0547 .
## zn8          -17.095      1.846  -9.263 2.21e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.22 on 108 degrees of freedom
## Multiple R-squared:  0.451,  Adjusted R-squared:  0.4408
## F-statistic: 44.36 on 2 and 108 DF,  p-value: 8.673e-15
``````
``````vif(m28)
``````
``````##   gender      zn8
## 1.169022 1.169022
``````

### Fitted regression functions

``````pzn <- c(1, 3.4)
pdata <- data.frame(zn7 = rep(pzn, 2), zn8 = rep(pzn, 2), gender = rep(c(0, 1), each = 2))

pm27 <- predict(m27, newdata = pdata)
pm28 <- predict(m28, newdata = pdata)
``````

### Plot with fitted regression functions

``````par(mfrow = c(1, 2), mar = c(5, 4, 1, 1) + 0.1)
plot(iq ~ zn7, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
lines(pzn, pm27[1:2], col = COLS["girl"], lwd = 2)
lines(pzn, pm27[3:4], col = COLS["boy"], lwd = 2)
legend(2.5, 140, legend = c("Girl", "Boy"), col = COLS, pt.bg = BGCOLS, pch = PCH, lty = 1, lwd = 2)
plot(iq ~ zn8, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
lines(pzn, pm28[1:2], col = COLS["girl"], lwd = 2)
lines(pzn, pm28[3:4], col = COLS["boy"], lwd = 2)
legend(2.5, 140, legend = c("Girl", "Boy"), col = COLS, pt.bg = BGCOLS, pch = PCH, lty = 1, lwd = 2)
``````

``````par(mfrow = c(1, 1))
``````