### ---------------------------------------------------------------------###
### Winter Term 2019/2020 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #2 Simple Linear Regression with Nitrogen in Peat Data ###
### ###
### ---------------------------------------------------------------------###
### Setting the working directory
### =============================
### NOTE: this path is machine-specific; the data file below is read relative to it.
setwd("H:/nmsa407_LinRegr/")
### Loading necessary packages
### ==========================
### In K10/K11, we have to add the network directory L:/Statistika/R/library to the path
### where the R packages are looked for:
### .libPaths("L:/Statistika/R/library")
### This is not needed on most private PCs, where R and all extension packages are
### installed in one place.
### If you encounter problems when loading this package in K4/K10/K11 because of an old
### version of R (< 3.2.0), you can run R from the
### network disk L, where version 3.2.2 is installed. The path is
### L:\Statistika\R\bin\x64\Rgui.exe
library("mffSM")
### Everything in today's exercises will be exemplified
### on the sample data containing nitrogen concentrations at various peat depths.
### The data are not included in the mffSM package and need to be downloaded separately.
###
### The command below assumes that a data file peat.csv
### has been downloaded into the subdirectory 'Data'
### of the working directory.
### Read the local copy: the first row holds the column names and
### character columns are converted to factors.
### (header = TRUE is spelled out: T is an ordinary variable that can be reassigned.)
peat <- read.csv("./Data/peat.csv", header = TRUE, stringsAsFactors = TRUE)
### Alternatively, the data file can be loaded directly if the computer is connected to the internet.
### (Running this line overwrites the object 'peat' created above.)
peat <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/peat.csv", header = TRUE, stringsAsFactors = TRUE)
### First rows and a column-wise summary of the data
head(peat)
summary(peat)
### ===========================================================
### Insight into the data
### ===========================================================
### In the following, we would like to investigate the nitrogen concentration using
### the information available in the data and the linear regression framework.
### Plotting symbols (PCH), outline colours (COL) and fill colours (BGC);
### each vector is named by the group labels so that it can be indexed by group
PCH <- setNames(c(21, 22, 25, 25), levels(peat[["group"]]))
COL <- setNames(heat_hcl(4), levels(peat[["group"]]))
BGC <- setNames(diverge_hcl(4), levels(peat[["group"]]))
### Axis labels and axis ranges shared by the plots below
XLAB <- "Depth [cm]"
YLAB <- "Nitrogen concentration [weight %]"
XLIM <- range(peat[["depth"]])
YLIM <- range(peat[["N"]])
### All information from the data in one single plot:
plot(
  N ~ depth,
  data = peat,
  pch = 21, bg = "lightblue",
  xlab = XLAB, ylab = YLAB,
  xlim = XLIM, ylim = YLIM
)
### But it can be done better: one panel per group, all on common axes
par(mfrow = c(2, 2), bty = "o") ## 2x2 panels on one page
for (g in 1:4) {
  Group <- levels(peat[["group"]])[g]
  ## depth and nitrogen values of the g-th group only
  xx <- with(peat, depth[which(group == Group)])
  yy <- with(peat, N[which(group == Group)])
  plot(
    xx, yy,
    pch = PCH[g], col = COL[g], bg = BGC[g],
    xlab = XLAB, ylab = YLAB, main = Group,
    xlim = XLIM, ylim = YLIM
  )
}
par(mfrow = c(1, 1)) ## back to a single panel per page
### Alternatively: all groups in one plot, distinguished by symbol and colour
plot(
  N ~ depth,
  data = peat,
  pch = PCH[group], col = COL[group], bg = BGC[group],
  xlab = XLAB, ylab = YLAB
)
legend(
  x = 1, y = 1.7,
  legend = levels(peat[["group"]]),
  pch = PCH, col = COL, pt.bg = BGC,
  y.intersp = 1.2
)
### How many different values of depth are contained in the data?
table(peat[, "depth"])
### Let us create a jittered version of the depth (a random shift from the uniform distribution on [-.5,+.5])
set.seed(20010911) ## fixed seed so that the jittered depths are reproducible
peat[, "jdepth"] <- peat[, "depth"] + runif(nrow(peat), -0.5, 0.5)
summary(peat)
### Another plot, this time with the jittered depth.
### The default x axis is suppressed (xaxt = "n") and replaced by ticks at even depths;
### dashed vertical lines at odd depths separate the neighbouring depth levels.
plot(N ~ jdepth, data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB, xaxt = "n")
axis(1, at = seq(0, 14, by = 2))
abline(v = seq(1, 13, by = 2), col = "lightblue", lty = 5, lwd = 2)
legend(1.3, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL, pt.bg = BGC, y.intersp = 1.2)
### Similar plot using the R function jitter()
plot(N ~ jitter(depth), data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB, xaxt = "n")
## xaxt = "n" suppresses the default x axis, so it has to be drawn explicitly
## (otherwise the plot has no x axis at all)
axis(1, at = seq(0, 14, by = 2))
### =================================================================
### Conditional empirical characteristics by groups
### =================================================================
Groups <- levels(peat[, "group"])
print(Groups)
### Averages, standard deviations and numbers of observations of N
### for the depths in the individual groups
tabs <- vector("list", length(Groups)) ## preallocated: one summary table per group
names(tabs) <- Groups
### try to perform one pass of the loop by yourself by setting g <- 1
for (g in seq_along(Groups)) {
  ## data subset containing only the g-th group
  subdata <- subset(peat, group == Groups[g])
  ## averages for particular depths
  Mean <- with(subdata, tapply(N, depth, mean))
  ## standard deviations for particular depths
  StdDev <- with(subdata, tapply(N, depth, sd))
  ## numbers of observations for particular depths
  ns <- with(subdata, tapply(N, depth, length))
  cat("\n",Groups[g],"\n")
  print(Mean)
  print(StdDev)
  print(ns)
  ## the depth values are the names of the tapply() results
  tabs[[g]] <- data.frame(Depth = as.numeric(names(Mean)), Mean = Mean, Std.Dev = StdDev, n = ns)
  rownames(tabs[[g]]) <- seq_len(nrow(tabs[[g]]))
}
print(tabs)
### Site-by-site plots with the per-depth averages added as large diamonds
par(mfrow = c(2, 2))
for (g in 1:4) {
  Group <- Groups[g]
  ## observations of the g-th group
  xx <- with(peat, depth[which(group == Group)])
  yy <- with(peat, N[which(group == Group)])
  plot(
    xx, yy,
    pch = PCH[g], col = COL[g], bg = BGC[g],
    xlab = XLAB, ylab = YLAB, main = Group,
    xlim = XLIM, ylim = YLIM
  )
  ## averages of N at the individual depths, taken from tabs
  points(
    tabs[[g]][["Depth"]], tabs[[g]][["Mean"]],
    pch = 23, cex = 2, col = "cadetblue", bg = BGC[g]
  )
}
par(mfrow = c(1, 1)) ## back to a single panel per page
### Do you agree that a linear regression line is (for each separate group) suitable
### to describe the nitrogen-depth relationship?
### =================================================================
### Regression line for a selected group
### =================================================================
### Estimate of the regression line for the CB-VJJ group
Group <- "CB-VJJ" ### Change this into anything else if you want to analyze another group
stopifnot(Group %in% levels(peat[, "group"])) ## fail early on a misspelled group name
## index of Group among the factor levels; derived from Group (not hard-coded)
## so that it stays correct when Group is changed
g <- match(Group, levels(peat[, "group"]))
fit1 <- lm(N ~ depth, data = peat, subset = (group == Group))
print(fit1)
### --> only estimated regression coefficients (by LS)
### Object fit1 contains many interesting quantities...
### Check what's inside.
names(fit1)
### fit1 is in fact a list; its components
### are accessible either as fit1[["COMPONENT"]] or as fit1$COMPONENT
fit1[["coefficients"]]
fit1$coefficients
coef(fit1) ## hat{beta}
fit1[["fitted.values"]]
fitted(fit1) ## hat{Y}
### How are the fitted values computed?
rownames(subset(peat, group == Group)) ## row names of the observations in the selected group
Yvalues <- peat$N[peat$group == Group]
Xvalues <- peat$depth[peat$group == Group]
### hat{Y} = X %*% hat{beta}, where X has a column of ones and a column of depths
Yhat <- cbind(1, Xvalues) %*% coef(fit1)
all.equal(as.numeric(Yhat), unname(fitted(fit1))) ## agrees with fitted(fit1) up to numerical tolerance
### Data, per-depth averages, fitted values (red) and the fitted line
plot(Yvalues ~ Xvalues, pch = PCH[Group], col = COL[Group], bg = BGC[Group], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
points(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group])
points(fitted(fit1) ~ Xvalues, pch = 16, col = "red", bg = BGC[Group], cex = 1.5)
## the same line drawn twice: from the two coefficients and directly from the lm object
abline(a = coef(fit1)[1], b = coef(fit1)[2])
abline(fit1, col = BGC[Group], lwd = 2)
### Residuals: stored in the object, via the extractor, and computed manually
fit1[["residuals"]]
residuals(fit1) ## U = Y - hat{Y}
(Res <- as.vector(Yvalues - Yhat))
### Rank of the model matrix and residual degrees of freedom
fit1[["rank"]] ## r
fit1[["df.residual"]] ## n - r
(r <- qr(cbind(1, Xvalues))[["rank"]])
n <- length(Yvalues)
n - r
### Many other things can be extracted from the object returned by summary(fit1).
summary(fit1) ## --> all basic inferential quantities
### summary statistics of the residuals
summary(Res)
### Residual sum of squares
deviance(fit1) ## SS_e
### Manually reconstructed:
Yvalues <- with(peat, N[group == Group])
Xvalues <- with(peat, depth[group == Group])
(SSe <- sum((Yvalues - (coef(fit1)[[1]] + coef(fit1)[[2]] * Xvalues))^2))
sum(residuals(fit1)^2)
sum(Res^2)
## --> can you reconstruct the other quantities by yourself?
### What is this good for?
### Intercept-only model fitted to the same group
fit0 <- lm(
  N ~ 1,
  data = peat,
  subset = (group == Group)
)
summary(fit0)
mean(Yvalues) ## the estimated coefficient
sd(Yvalues) ## the residual standard error
sd(Yvalues) / sqrt(n) ## the estimated standard deviation of the coefficient estimate
deviance(fit0) ## SS_T
###--> which can be obtained as
sum((Yvalues - mean(Yvalues))^2)
### Overall F-test (once more)
summary(fit1)
anova(fit0, fit1)
### Matrix MS_e * (X'X)^{-1}
vcov(fit1)
### Correlation matrix derived from vcov(fit1)
cov2cor(vcov(fit1))
### Confidence intervals for the regression coefficients
confint(fit1, level = 0.95)
### A plot with the conditional averages and the regression line
xx <- with(peat, depth[which(group == Group)])
yy <- with(peat, N[which(group == Group)])
plot(
  xx, yy,
  pch = PCH[Group], col = COL[Group], bg = BGC[Group],
  xlab = XLAB, ylab = YLAB, main = Group,
  xlim = XLIM, ylim = YLIM
)
points(
  tabs[[Group]][["Depth"]], tabs[[Group]][["Mean"]],
  pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group]
)
abline(fit1, col = BGC[Group], lwd = 2)
### Does the line seem to be a suitable model for estimating the nitrogen-depth relationship?
### Compare with the model where each depth level is considered separately
(fit2 <- lm(N ~ as.factor(depth), data = peat, subset = (group == Group)))
points(
  with(peat, depth[which(group == Group)]), fitted(fit2),
  pch = 20, cex = 2, col = "red"
)
### ... or with the model for a single depth level only
depthlevel <- 0
(fit3 <- lm(N ~ 1, data = peat, subset = ((group == Group) & (depth == depthlevel))))
points(
  with(peat, depth[which((group == Group) & (depth == depthlevel))]), fitted(fit3),
  pch = 18, cex = 2, col = 1
)
### note the differences in Std. Error
summary(fit2)
summary(fit3)
### Is it possible to judge (just visually) from the third column
### whether one assumption of the classical linear model is satisfied? Which assumption?
print(tabs[[Group]])
### Regression line estimate once again;
### x = TRUE stores the model matrix in the fit1 object this time
fit1 <- lm(N ~ depth, data = peat, subset = (group == Group), x = TRUE)
### The element 'x' is additionally present here
names(fit1)
### Model matrix and response
X <- fit1[["x"]]
print(X)
y <- with(peat, N[which(group == Group)])
print(y)
(XtX <- t(X) %*% X) ### What are the elements of XtX?
(Xty <- t(X) %*% y) ### What are the elements of Xty?
(b <- solve(XtX, Xty)) ### What is this?
coef(fit1)
### Projection matrix
H <- X %*% solve(XtX) %*% t(X) ### What is its purpose?
dim(H)
all.equal(c(H %*% y), unname(fitted(fit1)))
### A better way to get it (using the QR decomposition)
Q <- qr.Q(fit1[["qr"]]) ## Q matrix (orthogonal - columns are orthogonal vectors)
H <- Q %*% t(Q)
dim(H)
H[seq_len(10), seq_len(10)] ## part of the H matrix
summary(c(abs(X %*% solve(XtX) %*% t(X) - H))) ## numerically zero
### Fitted values, computed in two ways
yhat <- H %*% y
yhat2 <- X %*% b
summary(yhat - yhat2) ### Numerically only zeros. Is it surprising?
### Residuals
mean(y - yhat) ### Numerically zero. Is it surprising?
### Complement of the projection matrix
M <- diag(nrow(H)) - H ### What is its purpose?
### M is the projection matrix into the space of residuals
all.equal(c(M %*% y), unname(fit1[["residuals"]]))
### SS_e (residual sum of squares)
deviance(fit1)
(SSe <- as.numeric(t(y) %*% M %*% y))
(SSe2 <- as.numeric(crossprod(y - yhat))) ## The same. Is it surprising?
### Residual mean square (and its square root)
(df <- length(y) - 2)
(s2 <- SSe / df)
(s <- sqrt(s2))
summary(fit1)
### What are we going to calculate by this? What is its purpose?
deviance(fit0)
(SST <- as.numeric(crossprod(y - mean(y))))
(WhatIsIt <- 1 - SSe / SST)
summary(fit1) ### Is the number from above somewhere here?
### Coefficient of determination - the square of the coefficient of (multiple)
### correlation between Y and the regressors
cor(Xvalues, Yvalues)^2
### Variances and standard deviations of the regression coefficients' estimates.
### Why is it important or useful?
(varb <- s2 * solve(XtX))
(sb1 <- sqrt(varb[1L, 1L]))
(sb2 <- sqrt(varb[2L, 2L]))
summary(fit1) ### Are the numbers from above somewhere here?
### Does the nitrogen concentration depend on the depth significantly?
### The test statistic is called Tstat (not T) so that the built-in alias T for TRUE is not masked.
(Tstat <- (b[2] - 0) / sb2) ### Is there a connection with the test statistic of the one-sample t-test?
### critical value
alpha <- 0.05
(K <- qt(1 - alpha / 2, df = df)) ### What is our decision?
### p-value (two-sided)
(Pval <- 2 * pt(-abs(Tstat), df = df)) ### Is it correct and why?
### What is our decision?
### Is the test from above somewhere here?
summary(fit1)
### Determine the confidence interval for the expected increase/decrease
### of the nitrogen concentration if the depth is increased by 1cm.
(delta.beta2 <- K * sb2) ### What is this?
(CI.beta2 <- b[2] + c(-1, 1) * delta.beta2) ### Does this remind you of the confidence interval
### for the mean of a normally distributed random sample?
### The coverage argument of confint() is called 'level'; an argument named
### 'conf.level' would be silently ignored and the default 0.95 used instead.
confint(fit1, level = 1 - alpha)
LSest(fit1, L = c(0, 1), conf.level = 0.95) ### function from mffSM package
### - provides inference for theta = L*beta
### (assuming it is estimable)
### Also one-sided tests/confidence intervals can be calculated
### with the LSest function.
LSest(fit1, L = c(0, 1), alternative = "greater") ## What is this saying?
# H0: beta = 0
# H1: beta > 0
### Also tests against other than zero value can be calculated.
LSest(fit1, L = c(0, 1), theta0 = -0.02) ## What is this saying?
### Construct the confidence interval for the expected change
### of the nitrogen concentration if the depth is increased by 10cm.
### Is it significant that the concentration DECREASES with the increasing depth?
### Formulate the hypothesis with the corresponding alternative and perform the test.
### Determine the p-value of the test.
LSest(fit1, L = c(0, 1), alternative = "less")
### Finally, the model-based estimated mean concentration for a sequence of depths,
### including the 95% confidence intervals
depth.grid <- 0:14
pdata <- data.frame(depth = depth.grid, row.names = paste("depth =", depth.grid))
print(pdata)
predict(fit1, newdata = pdata) ## only point estimates
predict(fit1, newdata = pdata, se.fit = TRUE) ## including standard errors
## What is the "residual.scale"?
predict(fit1, newdata = pdata, interval = "confidence", level = 0.95) ## including the confidence intervals
### Plot: data, per-depth averages, fitted line and the confidence band
depth.grid2 <- seq(0, 14, length.out = 100) ## denser grid than before
pfit1 <- predict(
  fit1,
  newdata = data.frame(depth = depth.grid2),
  interval = "confidence", level = 0.95
)
plot(
  N ~ depth,
  data = subset(peat, group == Group),
  pch = PCH[Group], col = COL[Group], bg = BGC[Group],
  xlab = XLAB, ylab = YLAB, main = Group,
  xlim = XLIM, ylim = YLIM
)
points(
  tabs[[Group]][["Depth"]], tabs[[Group]][["Mean"]],
  pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group]
)
abline(fit1, col = "blue4", lwd = 2)
## lower and upper confidence bounds as dashed lines
matlines(depth.grid2, pfit1[, c("lwr", "upr")], col = "blue4", lty = 2, lwd = 2)
### =================================================================
### Regression line in each site.
### =================================================================
### Perform the above calculations for other sites as well. What are the conclusions?