### -------------------------------------------------------------------------###
###  Winter Term 2021/2022                                                    ###
###  NMSA407 - Linear Regression                                              ###
###                                                                           ###
###  EXERCISE #10: Confounders and partial residuals                          ###
###                                                                           ###
### -------------------------------------------------------------------------###

rm(list = ls())

library(mffSM)
library(lmtest)
library(car)         # scatterplotMatrix, confidenceEllipse, vif, Anova, durbinWatsonTest
library(colorspace)  # diverge_hcl (colour palette used below)
library(rgl)         # 3D visualisations

### -------------------------------------------------------------------------###
###  Block 1: TEACHING MATERIAL                                               ###
### -------------------------------------------------------------------------###

?Policie
data(Policie, package = "mffSM")
head(Policie)
dim(Policie)

### Our interest is in how 'fat' [percentage of the applicant's body fat]
### is related to the 'height' of the applicant.

### Basic scatterplots to start
scatterplotMatrix(~ fat + height + weight, smooth = FALSE, regLine = FALSE,
                  data = Policie, pch = 16)

### 3D visualisation
with(Policie, plot3d(height, weight, fat, main = "Policie data", size = 5, col = 2))

###
### Three linear models: Policie
###

### Model 1: marginal model 'fat' ~ 'height'
m.marg <- lm(fat ~ height, data = Policie)
summary(m.marg)

plot(fat ~ height, data = Policie,
     main = "Percentage of fat vs. height of the applicant")
lines(lowess(Policie$fat ~ Policie$height), lty = "dashed", lwd = 2)
abline(a = m.marg$coef[1], b = m.marg$coef[2], col = "blue", lwd = 2)
legend("bottomright", lty = c("solid", "dashed"), col = c("blue", "black"),
       legend = c("linear fit", "lowess"), lwd = 2)

### Interpret the effect of height on the percentage of body fat.
### Is this effect in agreement with your experience?

par(mfrow = c(1, 2))
plot(fat ~ weight, data = Policie, main = "Fat vs. weight")
plot(height ~ weight, data = Policie, main = "Height vs. weight")
par(mfrow = c(1, 1))

###
### Model 2: Additive model
### Explain the effect of 'height' on 'fat' in the following model [m.addit].
### Is 'weight' a confounder of this relationship?
m.addit <- lm(fat ~ height + weight, data = Policie)
summary(m.addit)

###
### Model 3: Full model
### Is 'weight' an effect modifier of the effect of 'height' on 'fat'?
m.full <- lm(fat ~ height + weight + height:weight, data = Policie)
summary(m.full)

### Plot of the effect of [height] on [fat] as estimated by the three models
plot(Policie$weight,
     coef(m.full)['height'] + coef(m.full)['height:weight'] * Policie$weight,
     xlab = "weight", ylab = "Effect of increase in height by 1",
     main = "Effect of height on fat", type = "l", ylim = c(-.5, .4))
### The effect of height on fat in model m.addit
abline(h = coef(m.addit)['height'], col = 2, lwd = 2, lty = 2)
abline(h = 0, lty = 3)
abline(h = coef(m.marg)['height'], col = "orange", lwd = 2)
legend("bottomleft", c("Marginal model", "Interaction model", "Additive model"),
       lwd = c(2, 1, 2), lty = c(1, 1, 2), col = c("orange", 1:2))

### Is 'weight' an effect modifier in the relation of 'height' and 'fat'?
anova(m.addit, m.full)
summary(m.full)

### Does, conditionally on 'height', variable 'fat' depend on 'weight'?
summary(m.full)
anova(m.marg, m.full)

### How is it possible that 'weight' contributes significantly to model m.full,
### yet both coefficients involving 'weight' ('weight' and 'height:weight')
### are non-significant in m.full?

### Look at the confidence regions
confidenceEllipse(m.full, which.coef = c(3, 4))
title(main = "Confidence regions for terms involving weight")
points(0, 0, col = 3, pch = 16, cex = 1.5)
(CI <- confint(m.full))
abline(v = CI[3, ], lwd = 1, lty = 2)
abline(h = CI[4, ], lwd = 1, lty = 2)
rect(CI[3, 1], CI[4, 1], CI[3, 2], CI[4, 2], lwd = 2)

### Multicollinearity!
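### Side check (not part of the original exercise): the elongated, tilted
### ellipse reflects the near-linear relationship between the two model-matrix
### columns that involve 'weight'; their sample correlation shows this
### directly. (X.full is a helper object introduced only for this check.)
X.full <- model.matrix(m.full)
cor(X.full[, "weight"], X.full[, "height:weight"])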
### Compare the following outputs:
vif(m.addit)
vif(m.full)
### Is it surprising that vif is extremely high for the interaction term?

### Bonferroni correction
(CI <- confint(m.full, level = 1 - 0.05/2))
abline(v = CI[3, ], lwd = 1, lty = 2, col = "brown")
abline(h = CI[4, ], lwd = 1, lty = 2, col = "brown")
rect(CI[3, 1], CI[4, 1], CI[3, 2], CI[4, 2], lwd = 3, border = "brown")
legend("bottomright",
       c("conf. region for both coefficients",
         "marginal confidence intervals",
         "confidence rectangle for both coefficients (Bonferroni)"),
       lty = c(1, 2, 1), lwd = c(2, 1, 3), col = c(4, 1, "brown"))

### What about reducing to other models?
anova(m.addit, m.full)
summary(m.weight <- lm(fat ~ weight, data = Policie))
anova(m.weight, m.full)

### A good model appears to be the additive model m.addit

###
### Visualisation
###

### 3D visualisation of the model m.marg (which ignores weight)
with(Policie, plot3d(height, weight, fat, main = "Model without weight", size = 5))
hgrid <- seq(min(Policie$height), max(Policie$height), length = 100)
wgrid <- seq(min(Policie$weight), max(Policie$weight), length = 100)
fgrid <- outer(hgrid, wgrid,
               function(x, y) coef(m.marg)["(Intercept)"] + x * coef(m.marg)["height"])
surface3d(hgrid, wgrid, fgrid, alpha = .4)
points3d(Policie$height, Policie$weight, fitted(m.marg), col = 2, size = 3)
segments3d(rep(Policie$height, each = 2), rep(Policie$weight, each = 2),
           rbind(fitted(m.marg), Policie$fat), col = 2, alpha = .4)

## Function used for 3D visualisation of models with at most two regressors
plotlm.3d <- function(x, y, z, m){
  plot3d(x, y, z, xlab = names(coef(m))[2], ylab = names(coef(m))[3],
         zlab = "Response", size = 5)
  xgrid <- seq(min(x), max(x), length = 100)
  ygrid <- seq(min(y), max(y), length = 100)
  nc <- length(coef(m))
  if(nc == 1) zgrid <- outer(xgrid, ygrid, function(a, b) coef(m)[1])
  if(nc == 2) zgrid <- outer(xgrid, ygrid, function(a, b) coef(m)[1] + a*coef(m)[2])
  if(nc == 3) zgrid <- outer(xgrid, ygrid, function(a, b)
    coef(m)[1] + a*coef(m)[2] + b*coef(m)[3])
  if(nc == 4) zgrid <- outer(xgrid, ygrid, function(a, b)
    coef(m)[1] + a*coef(m)[2] + b*coef(m)[3] + a*b*coef(m)[4])
  surface3d(xgrid, ygrid, zgrid, alpha = .4)
  points3d(x, y, fitted(m), col = 2, size = 3)
  segments3d(rep(x, each = 2), rep(y, each = 2), rbind(fitted(m), z),
             col = 2, alpha = .4)
}

### Visualisation: additive model
with(Policie, plotlm.3d(height, weight, fat, m.addit))

### Visualisation: model with interactions
with(Policie, plotlm.3d(height, weight, fat, m.full))
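### Side note (not part of the original exercise): for the additive model the
### fitted surface is a plane, so it can also be drawn directly with rgl's
### planes3d() instead of evaluating the fit on a grid. A minimal sketch:
with(Policie, plot3d(height, weight, fat, main = "Additive model via planes3d",
                     size = 5))
planes3d(coef(m.addit)["height"], coef(m.addit)["weight"], -1,
         coef(m.addit)["(Intercept)"], alpha = .4)
### (the plane b1*height + b2*weight - fat + b0 = 0 is exactly the fit of m.addit)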
###
### Alternative visualisation
###
summary(Policie[, "weight"])

### For the following weights, the relationship between fat and height
### will be shown more explicitly
weight.show <- c(70, 80, 90)   ### approx. quartiles + median

Policie <- transform(Policie,
                     fweight = cut(weight, breaks = c(50, 75, 85, 120),
                                   labels = c("<=75", "75-85", ">85")))

PALwe <- diverge_hcl(length(weight.show))
LINwe <- c("blue", "grey50", "red")
names(PALwe) <- names(LINwe) <- levels(Policie[, "fweight"])

PCH <- 23
BTY <- "o"
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(fat ~ height, data = Policie, pch = PCH, col = LINwe[fweight],
     bg = PALwe[fweight], xlab = "Height [cm]", ylab = "Body fat [%]")
for (i in 1:length(weight.show)){
  ### in the additive model
  abline(a = coef(m.addit)["(Intercept)"] + coef(m.addit)["weight"] * weight.show[i],
         b = coef(m.addit)["height"],
         col = LINwe[i], lwd = 2)
  ### in the model with interactions change to the following
  # abline(a = coef(m.full)["(Intercept)"] + coef(m.full)["weight"] * weight.show[i],
  #        b = coef(m.full)["height"] + coef(m.full)["height:weight"] * weight.show[i],
  #        col = LINwe[i], lwd = 2)
}
abline(m.marg, col = "orange", lwd = 2)
legend(184, 27, legend = levels(Policie[, "fweight"]), pch = PCH,
       col = LINwe, pt.bg = PALwe, bty = "n")
legend(188, 27, legend = weight.show, col = LINwe, lty = 1, lwd = 2, bty = "n")
text(184, 27.5, labels = "Weight [kg]", pos = 4)
legend(188, 23.5, legend = "Marginal", col = "orange", lty = 1, lwd = 2, bty = "n")

###
### Partial residuals:
### visualisation of the net effect of a regressor on the response
###

### To visualise whether the effect of height is only linear in the model
### 'm.addit: fat ~ height + weight', one can use the partial residuals
head(residuals(m.addit, type = "partial"))

### 'Manual' computation
### Using Definition 9.1 of partial residuals
part.res <- residuals(m.addit) +
  matrix(coef(m.addit)[2:3], nrow = nrow(Policie), ncol = 2, byrow = TRUE) *
  model.matrix(m.addit)[, 2:3]
head(part.res)

### R computes zero-mean partial residuals (the regressor columns are centred first)
X <- model.matrix(m.addit)
X.c <- t(t(X) - colMeans(X))   # centred model matrix
round(colMeans(X.c), 10)
part.res.zero <- residuals(m.addit) +
  matrix(coef(m.addit)[2:3], nrow = nrow(Policie), ncol = 2, byrow = TRUE) * X.c[, 2:3]
head(part.res.zero)
max(abs(part.res.zero - residuals(m.addit, type = "partial")))
# the partial residuals computed by R are indeed zero-mean
round(colSums(part.res.zero), 10)

### Lemma 9.2: In the linear model part.res ~ regressor, the estimated
### slope coincides with the estimated coefficient of the regressor
### in the original model

### Partial residuals for height
coef(lm(part.res[, 1] ~ Policie$height))       # with partial residuals
coef(lm(part.res.zero[, 1] ~ Policie$height))  # with zero-mean partial residuals
c(-colMeans(X)[2] * coef(m.addit)[2], coef(m.addit)[2])  # intercept and slope implied by the centring

part.res.height <- part.res[, 1]
plot(Policie$height, part.res.height, xlab = "height",
     ylab = "Partial residuals for 'height'",
     main = "Partial residuals [height] - model m.addit")
abline(a = 0, b = m.addit$coef[2], col = "blue", lwd = 2)
lines(lowess(part.res.height ~ Policie$height), lty = "dashed", lwd = 2)
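### Side note (not part of the original exercise): the zero-mean partial
### residuals can also be obtained from predict(..., type = "terms"), which
### returns the centred contribution of each term; adding the ordinary
### residuals reproduces residuals(m.addit, type = "partial").
### (part.res.alt is introduced here only for this check.)
part.res.alt <- residuals(m.addit) + predict(m.addit, type = "terms")
max(abs(part.res.alt - residuals(m.addit, type = "partial")))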
### Compare the plots of height against residuals and partial residuals, resp.
par(mfrow = c(1, 2))
plot(residuals(m.addit) ~ height, data = Policie, main = "Residuals ~ height",
     ylab = "Residuals")
abline(a = 0, b = 0, lwd = 2, col = 2)
# (recall that b = 0 in the plot above is appropriate because)
lm(residuals(m.addit) ~ height, data = Policie)
# Why are both estimated coefficients in the model above zero?

plot(residuals(m.addit) + coef(m.addit)["height"] * height ~ height, data = Policie,
     main = "Partial residuals ~ height", ylab = "Partial residuals")
abline(a = 0, b = coef(m.addit)["height"], lwd = 2, col = 2)
par(mfrow = c(1, 1))

### The plot with partial residuals using the function termplot
termplot(m.addit, terms = "height", partial.resid = TRUE, smooth = panel.smooth,
         lwd.term = 2, col.res = 1)

### -------------------------------------------------------------------------###
###  Block 2: INDIVIDUAL WORK                                                 ###
### -------------------------------------------------------------------------###

###
### Soil data
###

# load("Data/SoilData.RData")
(load(url("http://www.karlin.mff.cuni.cz/~nagy/NMSA407/Data/SoilData.RData")))

# The data come from an experiment focused on the evaluation of the
# phosphorus concentration in some specific types of soil after adding
# one of two available ash-based fertilizers.
head(soilData)
summary(soilData)

### Variables of interest for this exercise class
# P        ... phosphorus concentration in a soil sample [ug per kg]
#              after adding an ash-based fertilizer;
# SoilP    ... initial phosphorus concentration in a soil sample [ug per kg];
# SoilpH   ... pH acidity of the soil sample;
# SoilType ... factor variable with four levels for four different soil types;

DATA <- soilData

# Considering SoilType as a factor
DATA$fSoilType <- factor(DATA$SoilType)

# Response: increase in the concentration of phosphorus (on the log scale)
DATA$Y <- log(DATA$P / DATA$SoilP)

# Suppose we are interested in the relation of the response to the pH of the soil
summary(m0 <- lm(Y ~ SoilpH, data = DATA))

plot(Y ~ SoilpH, data = DATA, xlab = "SoilpH", ylab = "Y",
     main = "log(P/SoilP) vs. SoilpH")
lines(lowess(DATA$Y ~ DATA$SoilpH), lty = "dashed", lwd = 2)
abline(reg = m0, col = "blue", lwd = 2)
# or equivalently: abline(a = m0$coef[1], b = m0$coef[2], col = "blue")
legend(4.75, -4, lty = c("solid", "dashed"), col = c("blue", "black"),
       legend = c("linear fit", "lowess"), lwd = 2)

# Taking fSoilType into consideration
summary(m1 <- lm(Y ~ SoilpH + fSoilType, data = DATA))

# Is the type of soil (SoilType) an effect modifier of the relationship
# between the response and pH?
anova(m2 <- lm(Y ~ SoilpH * fSoilType, data = DATA))

# Can SoilpH be removed from the model altogether?
Anova(m2)

# How is such a result possible?
plot(Y ~ SoilpH, data = DATA, xlab = "SoilpH", ylab = "Y",
     main = "log(P/SoilP) vs. SoilpH", pch = SoilType, col = SoilType, lwd = 2)
i <- 0
for(f in levels(DATA$fSoilType)){
  i <- i + 1
  qqq <- as.logical(DATA$SoilType == f)
  lines(lowess(DATA$Y[qqq] ~ DATA$SoilpH[qqq]), lty = "dashed", col = i, lwd = 2)
  b <- coef(lm(fitted(m2)[qqq] ~ DATA$SoilpH[qqq]))[2]
  # the sign of the coefficient determines the orientation of the fitted segment
  if(b > 0){
    yrange <- range(fitted(m2)[qqq])
  } else{
    yrange <- (range(fitted(m2)[qqq]))[c(2, 1)]
  }
  lines(range(DATA$SoilpH[qqq]), yrange, col = i, lwd = 2)
}
legend(x = 5, y = -4, pch = 1:4, col = 1:4,
       legend = paste("Soil type", levels(DATA$fSoilType)), lwd = 2)
legend(x = 5, y = -4.72, lty = c("solid", "dashed"),
       legend = c("linear fit [m2]", "lowess"), lwd = 2)

### Model diagnostics: Durbin-Watson test of correlation of the errors
### (see Section 9.5.1 of the lecture notes)

# p-value based on an asymptotic approximation
dwtest(m2)

# alternatively, a p-value obtained with the help of the bootstrap
# (a simulation method - details will be provided in
#  NMST434: Modern statistical methods)
library(car)
durbinWatsonTest(m2)

### Why does the DW test reject the null hypothesis?
plot(residuals(m2), main = "Residuals vs index: observed data")
### Is any pattern visible in the residuals taken as a time series?
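### Side note (not part of the original exercise): the serial dependence that
### the Durbin-Watson test picks up can also be inspected with the sample
### autocorrelation function of the residuals.
acf(residuals(m2), main = "ACF of the residuals of m2")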
### What if the rows of the data were randomly shuffled?
DATA2 <- DATA[sample.int(nrow(DATA)), ]
m2shuf <- lm(Y ~ SoilpH * fSoilType, data = DATA2)
dwtest(m2shuf)
durbinWatsonTest(m2shuf)
# and again the same test, now with a (slightly) different bootstrap p-value
durbinWatsonTest(m2shuf)
plot(residuals(m2shuf), main = "Residuals vs index: random shuffle")
### Clearly no visible pattern here

### In the other extreme case: what if the rows are ordered according to
### the values of the residuals?
DATA3 <- DATA[order(residuals(m2)), ]
m2ord <- lm(Y ~ SoilpH * fSoilType, data = DATA3)
dwtest(m2ord)
plot(residuals(m2ord), main = "Residuals vs index: ordered residuals")
### This is essentially the quantile function (inverse CDF) of the residuals

### Reason for rejecting the null hypothesis in the DW test:
### observed values of Y as a function of the index of the observations
plot(DATA$Y, col = DATA$SoilType, pch = DATA$SoilType, lwd = 2, main = "Y vs index")
legend("bottomright", paste("soil type ", 1:4, sep = ""), col = 1:4, pch = 1:4,
       lty = 0, lwd = 2)
### The observations were taken sequentially, most likely by repeated probing
### in the same soil sample!

### This problem with dependence in the dataset is not visible from the
### basic diagnostic plots
plotLM(m2)
### Nothing terribly wrong here...

### Maybe a heteroscedasticity issue?
bptest(m2, ~ fitted(m2))
bptest(m2)
### Residuals appear to have different variances in different soil types
boxplot(residuals(m2) ~ DATA$SoilType)

### -------------------------------------------------------------------------###
###  Block 3: ADDITIONAL MATERIAL                                             ###
### -------------------------------------------------------------------------###

### Durbin-Watson test
dwtest(m2)
durbinWatsonTest(m2)

### 'Manual' calculation of the test statistic of the Durbin-Watson test
U <- residuals(m2)
n <- length(U)
(DW <- sum((U[-1] - U[-n])^2) / sum(U^2))

### Interpretation: shifted residuals appear to depend on the value of the
### unshifted residuals
plot(U[-n], U[-1], xlab = expression(U[i]), ylab = expression(U[i+1]),
     main = paste("Autocorrelation with lag = 1:", round(cor(U[-n], U[-1]), 2)))
abline(reg = lm(U[-1] ~ U[-n]), lwd = 2)
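### Side note (not part of the original exercise): the Durbin-Watson statistic
### is approximately 2 * (1 - r1), where r1 is the lag-1 sample autocorrelation
### of the residuals; a quick numerical check:
r1 <- cor(U[-n], U[-1])
c(DW = DW, "2*(1 - r1)" = 2 * (1 - r1))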