### ---------------------------------------------------------------------###
### Winter Term 2019/2020                                                ###
### NMSA407 - Linear Regression                                          ###
###                                                                      ###
### EXERCISE #2  Simple Linear Regression with Nitrogen in Peat Data     ###
###                                                                      ###
### ---------------------------------------------------------------------###

### Setting the working directory
### =============================

### NOTE: this path is machine-specific; point it to the directory that
### contains the 'Data' subdirectory with peat.csv before running the script.
setwd("H:/nmsa407_LinRegr/")

### Loading necessary packages
### ==========================

### In K10/K11, we have to add a network directory L:/Statistika/R/library to the path
### where the R packages are looked for:
### .libPaths("L:/Statistika/R/library")      

### This is not needed on most private PCs, where R and all extension packages are
### installed in one place.

### If you encounter problems when loading this package in K4/K10/K11 because of a low 
### version of R software (< 3.2.0), then you can run R software from the 
### network disc L, where the version 3.2.2 is installed. The path is
###  L:\Statistika\R\bin\x64\Rgui.exe                                           

library("mffSM")

### Everything in today's exercises will be exemplified
### on the sample data containing nitrogen concentration at various peat depths.
### The data are not included in the mffSM package and need to be downloaded separately.
###
### The command below assumes that a data file peat.csv
### has been downloaded into the subdirectory 'Data'
### of the working directory.

### Read the peat data from the local copy (./Data/peat.csv relative to the
### working directory); character columns are converted to factors.
peat <- read.csv("./Data/peat.csv", header = TRUE, stringsAsFactors = TRUE)

### Alternatively, the data file can be loaded directly if the computer is connected to the internet.
### TRUE is spelled out (instead of the reassignable shorthand T) so that the
### call keeps working even after a variable named T exists in the workspace.

peat <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/peat.csv", header = TRUE, stringsAsFactors = TRUE)

### First rows and a column-wise summary of the loaded data
head(peat)
summary(peat)

### ===========================================================
### Insight into the data
### ===========================================================

### In the following, we would like to investigate the nitrogen concentration using
### the information available in the data and the linear regression framework. 

### Plotting symbols, border colours and fill colours: one entry per level of
### the 'group' factor, named by that level, so that each vector can be indexed
### either by position or by group name.
PCH <- setNames(c(21, 22, 25, 25), levels(peat[["group"]]))
COL <- setNames(heat_hcl(4), levels(peat[["group"]]))
BGC <- setNames(diverge_hcl(4), levels(peat[["group"]]))

### Axis labels shared by all plots
XLAB <- "Depth [cm]"
YLAB <- "Nitrogen concentration [weight %]"

### Axis limits covering the whole data set (shared by the per-group panels)
XLIM <- range(peat[["depth"]])
YLIM <- range(peat[["N"]])

### All observations in one scatter plot (groups are not distinguished):
plot(N ~ depth, data = peat, pch = 21, bg = "lightblue",
     xlab = XLAB, ylab = YLAB, xlim = XLIM, ylim = YLIM)

### But it can be done better: one panel per group on a 2x2 grid,
### all panels sharing the same axis limits.
par(mfrow = c(2, 2), bty = "o")
for (g in 1:4) {
  Group <- levels(peat[["group"]])[g]
  ## depths and nitrogen concentrations of the g-th group only
  xx <- with(peat, depth[which(group == Group)])
  yy <- with(peat, N[which(group == Group)])
  plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g],
       xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
}
## back to a single figure per plot
par(mfrow = c(1, 1))


### Alternatively: all groups in one plot, distinguished by symbol and colour
plot(N ~ depth, data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB)
legend(1, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL, pt.bg = BGC, y.intersp = 1.2)


### How many different values of depth are contained in the data?
table(peat[, "depth"])


### Let us create a jittered version of the depth (a random shift from uniform distribution on [-.5,+.5])
### The seed is fixed so that the jittered values are reproducible.
set.seed(20010911)
peat[, "jdepth"] <- peat[, "depth"] + runif(nrow(peat), -0.5, 0.5)
summary(peat)


### Another plot, this time with the jittered depth.
### The default x axis is suppressed and replaced by ticks at even depths;
### dashed vertical lines are drawn at the odd depths.
plot(N ~ jdepth, data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB, xaxt = "n")
axis(1, at = seq(0, 14, by = 2))
abline(v = seq(1, 13, by = 2), col = "lightblue", lty = 5, lwd = 2)
legend(1.3, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL, pt.bg = BGC, y.intersp = 1.2)

### Similar plot using an R function (jitter() picks the amount of noise itself).
### The x axis is suppressed by xaxt = "n", so it has to be drawn explicitly,
### exactly as for the previous plot; otherwise the plot has no x axis at all.
plot(N ~ jitter(depth), data = peat, pch = PCH[group], col = COL[group], bg = BGC[group], xlab = XLAB, ylab = YLAB, xaxt = "n")
axis(1, at = seq(0, 14, by = 2))

### =================================================================
### Conditional empirical characteristics by groups
### =================================================================
Groups <- levels(peat[, "group"])
print(Groups)


### Averages, standard deviations, numbers of observations for the depths in individual groups.
### The result is a list with one data frame per group; it is preallocated and
### named by group up front instead of being grown inside the loop.
tabs <- vector("list", length(Groups))
names(tabs) <- Groups

### try to perform one loop by yourself by setting g <- 1
### (seq_along() is used so that the loop is skipped for an empty 'Groups')
for (g in seq_along(Groups)) {
  subdata <- subset(peat, group == Groups[g])
  ## data subset containing only the g-th group
  Mean    <- with(subdata, tapply(N, depth, mean))
  ## averages for particular depths
  StdDev  <- with(subdata, tapply(N, depth, sd))
  ## standard deviations for particular depths
  ns      <- with(subdata, tapply(N, depth, length))
  ## numbers of observations for particular depths

  cat("\n", Groups[g], "\n")
  print(Mean)
  print(StdDev)
  print(ns)

  ## the depth values are carried by the names of the tapply() results
  tabs[[g]] <- data.frame(Depth = as.numeric(names(Mean)), Mean = Mean, Std.Dev = StdDev, n = ns)
  rownames(tabs[[g]]) <- seq_len(nrow(tabs[[g]]))
}
print(tabs)


### Site-by-site panels (2x2 grid): raw observations together with the
### per-depth averages stored in 'tabs' (drawn as large diamonds)
par(mfrow = c(2, 2))
for (g in 1:4) {
  Group <- Groups[g]
  ## observations of the g-th group only
  xx <- with(peat, depth[which(group == Group)])
  yy <- with(peat, N[which(group == Group)])
  plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g],
       xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
  ## conditional averages of N at each depth
  with(tabs[[g]],
       points(Depth, Mean, pch = 23, cex = 2, col = "cadetblue", bg = BGC[g]))
}
## back to a single figure per plot
par(mfrow = c(1, 1))

### Do you agree that a linear regression line is (for each separate group) suitable 
### to describe the nitrogen-depth relationship?


### =================================================================
### Regression line for a selected group
### =================================================================

### Estimate of the regression line for the CB-VJJ group
Group <- "CB-VJJ"   ### Change this into anything else if you want to analyze another group

### Position of the selected group among the factor levels. It is derived from
### 'Group' (instead of being hard-coded) so that the two always agree, and the
### script stops right here if 'Group' is not one of the levels.
g <- match(Group, Groups)
stopifnot(!is.na(g))

fit1 <- lm(N ~ depth, data = peat, subset = (group == Group))
print(fit1)
### --> only estimated regression coefficients (by LS)

### Object fit1 contains many interesting quantities...
### Check, what's inside.
names(fit1)

### fit1 is in fact a list, its components
### are accessible either as fit1[["COMPONENT"]] or as fit1$COMPONENT

fit1[["coefficients"]]
fit1$coefficients
coef(fit1)                   ## hat{beta}

fit1[["fitted.values"]]
fitted(fit1)                 ## hat{Y}

# how is fitted computed?
rownames(subset(peat, group == Group))
Yvalues <- peat$N[peat$group == Group]
Xvalues <- peat$depth[peat$group == Group]
# fitted values = model matrix (column of ones, depth) times the LS coefficients
Yhat <- cbind(1, Xvalues) %*% coef(fit1)
all.equal(as.numeric(Yhat), unname(fitted(fit1)))  # exactly the same as fitted(fit1)
# Plot of the selected group. All graphical parameters and the table of
# conditional averages are indexed by the group NAME, so the plot stays
# consistent when 'Group' is changed to another group.
plot(Yvalues ~ Xvalues, pch = PCH[Group], col = COL[Group], bg = BGC[Group], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
points(tabs[[Group]][, "Depth"], tabs[[Group]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group])
points(fitted(fit1) ~ Xvalues, pch = 16, col = "red", bg = BGC[Group], cex = 1.5)
# the regression line drawn from the coefficients ...
abline(a = coef(fit1)[1], b = coef(fit1)[2])
# ... and the same line drawn directly from the fitted model object
abline(fit1, col = BGC[Group], lwd = 2)


### Residuals U = Y - hat{Y}: stored component, extractor function, by hand
fit1[["residuals"]]
residuals(fit1)
(Res <- as.vector(Yvalues - Yhat))

### Rank r of the model matrix and the residual degrees of freedom n - r,
### first taken from the fitted object, then recomputed from scratch
fit1[["rank"]]
fit1[["df.residual"]]
(r <- qr(cbind(1, Xvalues))[["rank"]])
n <- length(Yvalues)
n - r

### Many other things can be extracted from the object returned by summary(fit1).
### --> all basic inferential quantities
summary(fit1)

### Summary statistics of the residuals
summary(Res)

### Residual sum of squares SS_e
deviance(fit1)

### Manually reconstructed:
Yvalues <- peat[["N"]][peat[["group"]] == Group]
Xvalues <- peat[["depth"]][peat[["group"]] == Group]

(SSe <- sum(
  (Yvalues - (coef(fit1)[1] + coef(fit1)[2] * Xvalues))^2
))
sum(residuals(fit1)^2)
sum(Res^2)
## --> can you reconstruct the other quantities by yourself? 
## --> can you reconstruct the other quantities by yourself? 


### What is this good for?
### Intercept-only model for the selected group
fit0 <- lm(N ~ 1, data = peat, subset = (group == Group))
summary(fit0)

### The quantities reported by summary(fit0), reconstructed by hand:
mean(Yvalues)              ## beta coefficient
sd(Yvalues)                ## residual standard error
sd(Yvalues) / sqrt(n)      ## estimated sd of beta

### Total sum of squares SS_T ...
deviance(fit0)
### ... which can be obtained as
sum((Yvalues - mean(Yvalues))^2)

### Overall F-test (once more): regression line against the intercept-only model
summary(fit1)
anova(fit0, fit1)

### Matrix MS_e * (X'X)^{-1}
vcov(fit1)

### Correlation matrix derived from vcov(fit1)
cov2cor(vcov(fit1))

### Confidence intervals for regression coefficients
confint(fit1, level = 0.95)


### A plot with the conditional averages and the regression line
xx <- with(peat, depth[which(group == Group)])
yy <- with(peat, N[which(group == Group)])
plot(xx, yy, pch = PCH[Group], col = COL[Group], bg = BGC[Group],
     xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
with(tabs[[Group]],
     points(Depth, Mean, pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group]))
abline(fit1, col = BGC[Group], lwd = 2)
### Does the line seem to be a suitable model for estimating the nitrogen-depth relationship?

### Compare with the model where each depth level is considered separately
### (depth enters as a factor, i.e., one mean per depth level)
(fit2 <- lm(N ~ as.factor(depth), data = peat, subset = (group == Group)))
points(xx, fitted(fit2), pch = 20, cex = 2, col = "red")

### ... or with an intercept-only model for a single depth level
depthlevel <- 0
(fit3 <- lm(N ~ 1, data = peat, subset = ((group == Group) & (depth == depthlevel))))
points(with(peat, depth[which((group == Group) & (depth == depthlevel))]),
       fitted(fit3), pch = 18, cex = 2, col = 1)

### Note the differences in Std. Error
summary(fit2)
summary(fit3)


### Is it possible to judge (just visually) from the third column 
### whether one assumption of the classical linear model is satisfied? Which assumption?
print(tabs[[Group]])


### Regression line estimate once again
fit1 <- lm(N ~ depth, data = peat, subset = (group == Group), x = TRUE)
## it stores the model matrix in fit1 object this time

### Element 'x' is additionally here
names(fit1)

### model matrix, response etc.
X <- fit1[["x"]]
print(X)

y <- subset(peat, group == Group)[, "N"]
print(y)

(XtX <- t(X) %*% X)              ### What are the elements of XtX?
(Xty <- t(X) %*% y)              ### What are the elements of Xty?
(b <- solve(XtX, Xty))           ### What is this?

coef(fit1)

### projection matrix
H <- X %*% solve(XtX) %*% t(X)    ### What is its purpose?
dim(H)

all.equal(c(H %*% y), unname(fitted(fit1)))

### A better way to get it (using the QR decomposition stored in fit1)
Q <- qr.Q(fit1[["qr"]])  ## Q matrix (orthogonal - columns are orthogonal vectors)

### Q is reused here instead of calling qr.Q() again for each factor
H <- Q %*% t(Q)

dim(H)
### Part of the H matrix: at most the leading 10x10 block, so that this also
### works for a group with fewer than 10 observations
H[seq_len(min(10, nrow(H))), seq_len(min(10, ncol(H)))]

summary(c(abs(X %*% solve(XtX) %*% t(X) - H))) # numerically zero


### Fitted values, obtained by projection and from the LS coefficients
yhat  <- H %*% y
yhat2 <- X %*% b
summary(yhat - yhat2)             ### Numerically only zeros. Is it surprising?


### Residuals
mean(y - yhat)                    ### Numerically zero. Is it surprising?


### Complement of the projection matrix
M <- diag(1, nrow(H)) - H         ### What is its purpose?

### M is the projection matrix into the space of residuals
all.equal(c(M %*% y), unname(fit1[["residuals"]]))

### SS_e (residual sum of squares): from the fitted object, as a quadratic
### form in M, and as the squared norm of the residual vector
deviance(fit1)

(SSe  <- drop(t(y) %*% M %*% y))
(SSe2 <- drop(crossprod(y - yhat)))  ## The same. Is it surprising?


### Residual mean square (and its square root)
(df <- length(y) - 2)
(s2 <- SSe / df)
(s <- sqrt(s2))

summary(fit1)

### What are we going to calculate by this? What is its purpose?
deviance(fit0)

(SST <- drop(crossprod(y - mean(y))))
(WhatIsIt <- 1 - SSe / SST)

summary(fit1)              ### Is the number from above somewhere here?

### Coefficient of determination - square of the coefficient of (multiple)
### correlation between Y and the regressors
cor(Xvalues, Yvalues)^2

### Variance and standard deviations of the regression coefficients' estimates. Why is it important or useful?
(varb <- s2 * solve(XtX))
(sb1 <- sqrt(varb[1, 1]))
(sb2 <- sqrt(varb[2, 2]))

summary(fit1)              ### Are the numbers from above somewhere here?

### Does the nitrogen concentration depend on the depth significantly?
### The test statistic is stored as 'Tstat' rather than 'T': assigning to T
### would mask R's shorthand for TRUE and break any later call that uses it.
(Tstat <- (b[2] - 0) / sb2)                 ### Is there a connection with the test statistic of the one-sample t-test?

### critical value (two-sided test at level alpha)
alpha <- 0.05
(K <- qt(1 - alpha / 2, df = df))           ### What is our decision?

### p-value (two-sided)
(Pval <- 2 * pt(-abs(Tstat), df = df))      ### Is it correct and why?
                                            ### What is our decision?

### Is the test from above somewhere here?
summary(fit1)


### Determine the confidence interval for the expected increase/decrease
### of the nitrogen concentration if the depth is increased by 1cm.
(delta.beta2 <- K * sb2)                       ### What is this?
(CI.beta2 <- b[2] + c(-1, 1) * delta.beta2)    ### Does this remind you of the confidence interval
                                               ### for the mean of the normally distributed random vector?

### The coverage argument of confint() is called 'level'. An argument named
### 'conf.level' would be silently ignored and the default 0.95 used instead.
confint(fit1, level = 1 - alpha)
LSest(fit1, L = c(0, 1), conf.level = 0.95)    ### function from mffSM package
                                               ### - provides inference for theta = L*beta
                                               ###   (assuming it is estimable)

### Also one-sided tests/confidence intervals can be calculated
### with the LSest function.
LSest(fit1, L = c(0, 1), alternative = "greater")   ## What is this saying?

# H0: beta = 0
# H1: beta > 0

### Also tests against other than zero value can be calculated.
LSest(fit1, L = c(0, 1), theta0 = -0.02)            ## What is this saying?


### Construct the confidence interval for the expected change
### of the nitrogen concentration if the depth is increased by 10cm.   


### Is it significant that the concentration DECREASES with the increasing depth?
### Formulate the hypothesis with the corresponding alternative and perform the test.
### Determine the p-value of the test.

LSest(fit1, L = c(0, 1), alternative = "less")

### Finally, model-based estimated mean concentration for a sequence of depths
### including the 95% confidence intervals
depth.grid <- 0:14
pdata <- data.frame(depth = depth.grid)
rownames(pdata) <- paste("depth =", depth.grid)
print(pdata)

predict(fit1, newdata = pdata)                ## only point estimates
predict(fit1, newdata = pdata, se.fit = TRUE) ## including standard errors
                                              ## What is the "residual.scale"?
predict(fit1, newdata = pdata, interval = "confidence", level = 0.95)  ## including the confidence intervals

### Plot
### Denser grid than before. The argument name 'length.out' is spelled out in
### full instead of relying on partial matching of 'length'.
depth.grid2 <- seq(0, 14, length.out = 100)
pfit1 <- predict(fit1, newdata = data.frame(depth = depth.grid2), interval = "confidence", level = 0.95)

### Observations of the selected group, conditional averages, fitted line
### (solid) and the pointwise 95% confidence band for the mean (dashed)
plot(N ~ depth, data = subset(peat, group == Group), pch = PCH[Group], col = COL[Group], bg = BGC[Group], xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
points(tabs[[Group]][, "Depth"], tabs[[Group]][, "Mean"], pch = 23, cex = 2, col = "cadetblue", bg = BGC[Group])
abline(fit1, col = "blue4", lwd = 2)
lines(depth.grid2, pfit1[, "lwr"], col = "blue4", lty = 2, lwd = 2)
lines(depth.grid2, pfit1[, "upr"], col = "blue4", lty = 2, lwd = 2)


### =================================================================
### Regression line in each site.
### =================================================================

### Perform the above calculations for other sites as well. What are the conclusions?