### ---------------------------------------------------------------------### ### Winter Term 2019/2020 ### ### NMSA407 - Linear Regression ### ### ### ### EXERCISE #2 Simple Linear Regression with Nitrogen in Peat Data ### ### ### ### ---------------------------------------------------------------------### ### Setting the working directory ### ============================= setwd("H:/nmsa407_LinRegr/") ### Loading necessary packages ### ========================== ### In K10/K11, we have to add a network directory L:/Statistika/R/library to the path ### where the R packages are looked for: ### .libPaths("L:/Statistika/R/library") ### This is not needed on most private PC's where R and all extension packages are ### installed on one place. ### If you encounter problems when loading this package in K4/K10/K11 because of a low ### version of R software (< 3.2.0), then you can run R software from the ### network disc L, where the version 3.2.2. is installed. The path is ### L:\Statistika\R\bin\x64\Rgui.exe library("mffSM") ### Everything in today's exercises will be exemplified ### on the sample data containing nitrogen concentration in various peat depth. ### The data are not included in the mffSM package and need to be downloaded separately. ### ### The command below assumes that a data file peat.csv ### has been downloaded into the subdirectory 'Data' ### of the working directory. peat <- read.csv("./Data/peat.csv", header = TRUE, stringsAsFactors = TRUE) ### Alternatively, the data file can by loaded directly, if the computer is connected to the internet. peat <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/peat.csv", header=T, stringsAsFactors = TRUE) head(peat) summary(peat) ### =========================================================== ### Insight into the data ### =========================================================== ### In the following, we would like to investigate the nitrogen concentration using the information ### the information available in the data and the linear regression framework. PCH <- c(21, 22, 25, 25) COL <- heat_hcl(4) BGC <- diverge_hcl(4) names(PCH) <- names(COL) <- names(BGC) <- levels(peat[, "group"]) XLAB <- "Depth [cm]" YLAB <- "Nitrogen concentration [weight %]" XLIM <- range(peat[, "depth"]) YLIM <- range(peat[, "N"]) ### All information from the data in one single plot: plot(N ~ depth, data = peat, pch = 21, bg = "lightblue", xlab = XLAB, ylab = YLAB, xlim = XLIM, ylim = YLIM) ### But it can be done better: par(mfrow = c(2, 2), bty = "o") ## 2x2 figures per plot for (g in 1:4){ Group <- levels(peat[, "group"])[g] xx <- subset(peat, group == Group)[, "depth"] yy <- subset(peat, group == Group)[, "N"] plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) } par(mfrow = c(1, 1)) ## again 1x1 figure per plot ### Alternatively plot(N ~ depth, data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB) legend(1, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL, pt.bg = BGC, y.intersp = 1.2) ### How many different values of depth are contained in the data? table(peat[, "depth"]) ### Let us create a jittered version of the depth (a random shift from uniform distribution on [-.5,+.5]) set.seed(20010911) peat[, "jdepth"] <- peat[, "depth"] + runif(nrow(peat), -0.5, 0.5) summary(peat) ### Another plot, this time with the jittered depth plot(N ~ jdepth, data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB, xaxt = "n") axis(1, at = seq(0, 14, by = 2)) abline(v = seq(1, 13, by = 2), col = "lightblue", lty = 5, lwd = 2) legend(1.3, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL, pt.bg = BGC, y.intersp = 1.2) ### Similar plot using an R function plot(N~jitter(depth), data=peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB, xaxt = "n") ### ================================================================= ### Conditional empirical characteristics by groups ### ================================================================= Groups <- levels(peat[, "group"]) print(Groups) ### Averages, standard deviations, numbers of observations for the depths in individual groups tabs <- list() ### try to perform one loop by yourself by setting g <- 1 for (g in 1:length(Groups)){ subdata <- subset(peat, group == Groups[g]) ## data subset containing only the g-th group Mean <- with(subdata, tapply(N, depth, mean)) ## averages for particular depths StdDev <- with(subdata, tapply(N, depth, sd)) ## standard deviations for particular depths ns <- with(subdata, tapply(N, depth, length)) ## numbers of observations for particular depths cat("\n",Groups[g],"\n") print(Mean) print(StdDev) print(ns) tabs[[g]] <- data.frame(Depth = as.numeric(names(Mean)), Mean = Mean, Std.Dev = StdDev, n = ns) rownames(tabs[[g]]) <- 1:nrow(tabs[[g]]) } names(tabs) <- Groups print(tabs) ### plot (site by site) with group averages par(mfrow = c(2, 2)) for (g in 1:4){ Group <- Groups[g] xx <- subset(peat, group == Group)[, "depth"] yy <- subset(peat, group == Group)[, "N"] plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) points(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[g]) } par(mfrow = c(1, 1)) ## again 1x1 figure per plot ### Do you agree that a linear regression line is (for each separate group) suitable ### to describe the nitrogen-depth relationship? ### ================================================================= ### Regression line for a selected group ### ================================================================= ### Estimate of the regression line for the CB-VJJ group Group <- "CB-VJJ" ### Change this into anything else if you want to analyze another group g <- 2 fit1 <- lm(N ~ depth, data = peat, subset = (group == Group)) print(fit1) ### --> only estimated regression coefficients (by LS) ### Object fit1 contains many interesting quantities... ### Check, what's inside. names(fit1) ### fit1 is in fact a list, its components ### are accessible either as fit1[["COMPONENT"]] or as fit1$COMPONENT fit1[["coefficients"]] fit1$coefficients coef(fit1) ## hat{beta} fit1[["fitted.values"]] fitted(fit1) ## hat{Y} # how is fitted computed? rownames(subset(peat, group == Group)) Yvalues <- peat$N[peat$group == Group] Xvalues <- peat$depth[peat$group == Group] # Yhat = cbind(1,Xvalues)%*%coef(fit1) all.equal(as.numeric(Yhat),unname(fitted(fit1))) # exactly the same as fitted(fit1) # plot(Yvalues~Xvalues,pch = PCH[Group], col = COL[Group], bg = BGC[Group], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) points(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[g]) points(fitted(fit1)~Xvalues, pch = 16, col = "red", bg = BGC[2], cex = 1.5) abline(a=coef(fit1)[1],b=coef(fit1)[2]) abline(fit1, col = BGC[Group], lwd = 2) fit1[["residuals"]] residuals(fit1) ## U = Y - hat{Y} (Res = c(Yvalues - Yhat)) fit1[["rank"]] ## r fit1[["df.residual"]] ## n - r (r = qr(cbind(1,Xvalues))$rank) n = length(Yvalues) n - r ### Many other things can be extracted from the object returned by summary(fit1). summary(fit1) ## --> all basic inferential quantities # summary statistics of the residuals summary(Res) ### Residual sum of squares deviance(fit1) ## SS_e ### Manually reconstructed: Yvalues <- peat$N[peat$group == Group] Xvalues <- peat$depth[peat$group == Group] (SSe <- sum((Yvalues - (coef(fit1)[1] + coef(fit1)[2] * Xvalues))^2)) sum(residuals(fit1)^2) sum(Res^2) ## --> can you reconstruct the other quantities by yourself? ### What is this good for? fit0 <- lm(N ~ 1, data = peat, subset = (group == Group)) summary(fit0) mean(Yvalues) # beta coefficient sd(Yvalues) # residual standard error sd(Yvalues)/sqrt(n) # estimated sd of beta deviance(fit0) ## SS_T ###--> which can be obtained as sum((Yvalues - mean(Yvalues))^2) ### Overall F-test (once more) summary(fit1) anova(fit0, fit1) ### Matrix MS_e * (X'X)^{-1} vcov(fit1) ### Correlation matrix derived from vcov(fit1) cov2cor(vcov(fit1)) ### Confidence intervals for regression coefficients confint(fit1, level= 0.95) ### A plot with the conditional averages and the regression line xx <- subset(peat, group == Group)[, "depth"] yy <- subset(peat, group == Group)[, "N"] plot(xx, yy, pch = PCH[Group], col = COL[Group], bg = BGC[Group], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) points(tabs[[Group]][, "Depth"], tabs[[Group]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group]) abline(fit1, col = BGC[Group], lwd = 2) ### Does the line seem to be a suitable model for estimating the nitrogen-depth relationship? ### compare with the model where each depth level is considered separately (fit2 = lm(N~as.factor(depth), data = peat, subset = (group == Group))) points(subset(peat, group==Group)$depth, fitted(fit2), pch=20, cex=2, col="red") # or for a single depth level depthlevel = 0 (fit3 = lm(N~1, data=peat, subset = ((group==Group) & (depth==depthlevel)))) points(subset(peat, (group==Group) & (depth==depthlevel))$depth, fitted(fit3), pch=18, cex=2, col=1) # note the differences in Std. Error summary(fit2) summary(fit3) ### Is it possible to judge (just visually) from the third column ### whether one assumption of the classical linear model is satisfied? Which assumption? print(tabs[[Group]]) ### Regression line estimate once again fit1 <- lm(N ~ depth, data = peat, subset = (group == Group), x = TRUE) ## it stores the model matrix in fit1 object this time ### Element 'x' is additionally here names(fit1) ### model matrix, response etc. X <- fit1[["x"]] print(X) y <- subset(peat, group == Group)[, "N"] print(y) (XtX <- t(X) %*% X) ### What are the elements of XtX? (Xty <- t(X) %*% y) ### What are the elements of Xty? (b <- solve(XtX, Xty)) ### What is this? coef(fit1) ### projection matrix H <- X %*% solve(XtX) %*% t(X) ### What is its purpose? dim(H) all.equal(c(H%*%y),unname(fitted(fit1))) ### better way how to get it (using the QR decomposition) Q <- qr.Q(fit1[["qr"]]) ## Q matrix (orthogonal - columns are orthogonal vectors) H <- qr.Q(fit1[["qr"]]) %*% t(qr.Q(fit1[["qr"]])) dim(H) H[1:10, 1:10] ## part of H matrix summary(c(abs(X %*% solve(XtX) %*% t(X) - H))) # numerically zero ### fitted values yhat <- H %*% y yhat2 <- X %*% b summary(yhat - yhat2) ### Numerically only zeros. Is it surprising? ### residuals mean(y - yhat) ### Numerically zero. Is it surprising? ### complement of the projection matrix M <- diag(nrow(H)) - H ### What is its purpose? ### M is the projection matrix into the space of residuals all.equal(c(M%*%y),unname(fit1$residuals)) ### SS_e (residual sum of squares) deviance(fit1) (SSe <- as.numeric(t(y) %*% M %*% y)) (SSe2 <- as.numeric(crossprod(y - yhat))) ## The same. Is it surprising? ### Residual mean square (and its square root) (df <- length(y) - 2) (s2 <- SSe / df) (s <- sqrt(s2)) summary(fit1) ### What are we going to calculate by this? What is its purpose? deviance(fit0) (SST <- as.numeric(crossprod(y - mean(y)))) (WhatIsIt <- 1 - SSe / SST) summary(fit1) ### Is the number from above somewhere here? cor(Xvalues,Yvalues)^2 # coefficient of determination - square of the coefficient of (multiple) correlation between Y and the regressors ### Variance and standard deviations of the regression coefficients' estimates. Why is it important or useful? (varb <- s2 * solve(XtX)) (sb1 <- sqrt(varb[1, 1])) (sb2 <- sqrt(varb[2, 2])) summary(fit1) ### Are the numbers from above somewhere here? ### Does the nitrogen concentration depend on the depth significantly? (T <- (b[2] - 0) / sb2) ### Is there a connection with the test statistic of the one-sample t-test? ### critical value alpha <- 0.05 (K <- qt(1 - alpha/2, df=df)) ### What is our decision? ### p-value (Pval <- 2 * pt(-abs(T), df = df)) ### Is it correct and why? ### What is our decision? ### Is the test from above somewhere here? summary(fit1) ### Determine the confidence interval for the expected increase/decrease ### of the nitrogen concentration if the depth is increased by 1cm. (delta.beta2 <- K * sb2) ### What is this? (CI.beta2 <- b[2] + c(-1, 1) * delta.beta2) ### Does this remind you the confidence interval ### for the mean of the normally distributed random vector? confint(fit1, conf.level = 1 - alpha) LSest(fit1, L = c(0, 1), conf.level = 0.95) ### function from mffSM package ### - provides inference for theta = L*beta ### (assuming it is estimable) ### Also one-sided tests/confidence intervals can be calculated ### with the LSest function. LSest(fit1, L = c(0, 1), alternative = "greater") ## What is this saying? # H0: beta = 0 # H1: beta > 0 ### Also tests against other than zero value can be calculated. LSest(fit1, L = c(0, 1), theta0 = -0.02) ## What is this saying? ### Construct the confidence interval for the expected change ### of the nitrogen concentration if the depth is increased by 10cm. ### Is it significant that the concentration DECREASES with the increasing depth? ### Formulate the hypothesis with the corresponding alternative and perform the test. ### Determine the p-value of the test. LSest(fit1, L = c(0, 1), alternative = "less") ### Finally, model-based estimated mean concentration for a sequence of depths ### including the 95% confidence intervals depth.grid <- 0:14 pdata <- data.frame(depth = depth.grid) rownames(pdata) <- paste("depth =", depth.grid) print(pdata) predict(fit1, newdata = pdata) ## only point estimates predict(fit1, newdata = pdata, se.fit = TRUE) ## including standard errors ## What is the "residual.scale"? predict(fit1, newdata = pdata, interval = "confidence", level = 0.95) ## including the confidence intervals ### Plot depth.grid2 <- seq(0, 14, length = 100) ## denser grid than before pfit1 <- predict(fit1, newdata = data.frame(depth = depth.grid2), interval = "confidence", level = 0.95) plot(N ~ depth, data = subset(peat, group == Group), pch = PCH[Group], col = COL[Group], bg = BGC[Group], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) points(tabs[[Group]][, "Depth"], tabs[[Group]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group]) abline(fit1, col = "blue4", lwd = 2) lines(depth.grid2, pfit1[, "lwr"], col = "blue4", lty = 2, lwd = 2) lines(depth.grid2, pfit1[, "upr"], col = "blue4", lty = 2, lwd = 2) ### ================================================================= ### Regression line in each site. ### ================================================================= ### Perform the above calculations for other sites as well. What are the conclusions?