### -------------------------------------------------------------------------###
###  Winter Term 2021/2022                                                    ###
###  NMSA407 - Linear Regression                                              ###
###                                                                           ###
###  EXERCISE #10: Confounders and partial residuals                          ###
###                                                                           ###
### -------------------------------------------------------------------------###

rm(list = ls())

library(mffSM)
library(lmtest)
library(car)         # scatterplotMatrix, confidenceEllipse, vif, Anova, durbinWatsonTest
library(colorspace)  # diverge_hcl (colour palette used below)
library(rgl)         # 3D visualisations

### -------------------------------------------------------------------------###
###  Block 1: TEACHING MATERIAL                                               ###
### -------------------------------------------------------------------------###

?Policie
data(Policie, package = "mffSM")
head(Policie)
dim(Policie)

### Our interest is in how 'fat' [percentage of the applicant's body fat]
### is related to the 'height' of the applicant.

### Basic scatterplots to start
scatterplotMatrix(~ fat + height + weight, smooth = FALSE, regLine = FALSE,
                  data = Policie, pch = 16)

### 3D visualisation
with(Policie, plot3d(height, weight, fat, main = "Policie data", size = 5, col = 2))

###
### Three linear models: Policie
###

### Model 1: marginal model 'fat' ~ 'height'
m.marg <- lm(fat ~ height, data = Policie)
summary(m.marg)

plot(fat ~ height, data = Policie,
     main = "Percentage of fat vs. height of the applicant")
lines(lowess(Policie$fat ~ Policie$height), lty = "dashed", lwd = 2)
abline(a = m.marg$coef[1], b = m.marg$coef[2], col = "blue", lwd = 2)
legend("bottomright", lty = c("solid", "dashed"), col = c("blue", "black"),
       legend = c("linear fit", "lowess"), lwd = 2)

### Interpret the effect of height on the percentage of body fat.
### Is this effect in agreement with your experience?

par(mfrow = c(1, 2))
plot(fat ~ weight, data = Policie, main = "Fat vs. weight")
plot(height ~ weight, data = Policie, main = "Height vs. weight")
par(mfrow = c(1, 1))

###
### Model 2: Additive model
### Explain the effect of 'height' on 'fat' in the following model [m.addit].
### Is 'weight' a confounder of this relationship?
m.addit <- lm(fat ~ height + weight, data = Policie)
summary(m.addit)

###
### Model 3: Full model
### Is 'weight' an effect modifier of the effect of 'height' on 'fat'?
m.full <- lm(fat ~ height + weight + height:weight, data = Policie)
summary(m.full)

### Plot of the effect of [height] on [fat] as estimated by the three models
plot(Policie$weight,
     coef(m.full)['height'] + coef(m.full)['height:weight'] * Policie$weight,
     xlab = "weight", ylab = "Effect of increase in height by 1",
     main = "Effect of height on fat", type = "l", ylim = c(-.5, .4))
### The effect of height on fat in model m.addit
abline(h = coef(m.addit)['height'], col = 2, lwd = 2, lty = 2)
abline(h = 0, lty = 3)
abline(h = coef(m.marg)['height'], col = "orange", lwd = 2)
legend("bottomleft", c("Marginal model", "Interaction model", "Additive model"),
       lwd = c(2, 1, 2), lty = c(1, 1, 2), col = c("orange", 1:2))

### Is 'weight' an effect modifier in the relation of 'height' and 'fat'?
anova(m.addit, m.full)
summary(m.full)

### Does, conditionally on 'height', variable 'fat' depend on 'weight'?
summary(m.full)
anova(m.marg, m.full)

### How is it possible that 'weight' contributes significantly to model m.full,
### yet both coefficients involving 'weight' ('weight' and 'height:weight')
### are non-significant in m.full?

### Look at the confidence regions
confidenceEllipse(m.full, which.coef = c(3, 4))
title(main = "Confidence regions for terms involving weight")
points(0, 0, col = 3, pch = 16, cex = 1.5)
(CI <- confint(m.full))
abline(v = CI[3, ], lwd = 1, lty = 2)
abline(h = CI[4, ], lwd = 1, lty = 2)
rect(CI[3, 1], CI[4, 1], CI[3, 2], CI[4, 2], lwd = 2)

### Multicollinearity!
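### Side check (not part of the original exercise): the elongated, tilted
### ellipse reflects the near-linear relationship between the two model-matrix
### columns that involve 'weight'; their sample correlation shows this
### directly. (X.full is a helper object introduced only for this check.)
X.full <- model.matrix(m.full)
cor(X.full[, "weight"], X.full[, "height:weight"])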
### Compare the following outputs:
vif(m.addit)
vif(m.full)
### Is it surprising that vif is extremely high for the interaction term?

### Bonferroni correction
(CI <- confint(m.full, level = 1 - 0.05/2))
abline(v = CI[3, ], lwd = 1, lty = 2, col = "brown")
abline(h = CI[4, ], lwd = 1, lty = 2, col = "brown")
rect(CI[3, 1], CI[4, 1], CI[3, 2], CI[4, 2], lwd = 3, border = "brown")
legend("bottomright",
       c("conf. region for both coefficients",
         "marginal confidence intervals",
         "confidence rectangle for both coefficients (Bonferroni)"),
       lty = c(1, 2, 1), lwd = c(2, 1, 3), col = c(4, 1, "brown"))

### What about reducing to other models?
anova(m.addit, m.full)
summary(m.weight <- lm(fat ~ weight, data = Policie))
anova(m.weight, m.full)

### A good model appears to be the additive model m.addit

###
### Visualisation
###

### 3D visualisation of the model m.marg (which ignores weight)
with(Policie, plot3d(height, weight, fat, main = "Model without weight", size = 5))
hgrid <- seq(min(Policie$height), max(Policie$height), length = 100)
wgrid <- seq(min(Policie$weight), max(Policie$weight), length = 100)
fgrid <- outer(hgrid, wgrid,
               function(x, y) coef(m.marg)["(Intercept)"] + x * coef(m.marg)["height"])
surface3d(hgrid, wgrid, fgrid, alpha = .4)
points3d(Policie$height, Policie$weight, fitted(m.marg), col = 2, size = 3)
segments3d(rep(Policie$height, each = 2), rep(Policie$weight, each = 2),
           rbind(fitted(m.marg), Policie$fat), col = 2, alpha = .4)

## Function used for 3D visualisation of models with at most two regressors
plotlm.3d <- function(x, y, z, m){
  plot3d(x, y, z, xlab = names(coef(m))[2], ylab = names(coef(m))[3],
         zlab = "Response", size = 5)
  xgrid <- seq(min(x), max(x), length = 100)
  ygrid <- seq(min(y), max(y), length = 100)
  nc <- length(coef(m))
  if(nc == 1) zgrid <- outer(xgrid, ygrid, function(a, b) coef(m)[1])
  if(nc == 2) zgrid <- outer(xgrid, ygrid, function(a, b) coef(m)[1] + a*coef(m)[2])
  if(nc == 3) zgrid <- outer(xgrid, ygrid, function(a, b)
    coef(m)[1] + a*coef(m)[2] + b*coef(m)[3])
  if(nc == 4) zgrid <- outer(xgrid, ygrid, function(a, b)
    coef(m)[1] + a*coef(m)[2] + b*coef(m)[3] + a*b*coef(m)[4])
  surface3d(xgrid, ygrid, zgrid, alpha = .4)
  points3d(x, y, fitted(m), col = 2, size = 3)
  segments3d(rep(x, each = 2), rep(y, each = 2), rbind(fitted(m), z),
             col = 2, alpha = .4)
}

### Visualisation: additive model
with(Policie, plotlm.3d(height, weight, fat, m.addit))

### Visualisation: model with interactions
with(Policie, plotlm.3d(height, weight, fat, m.full))
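### Side note (not part of the original exercise): for the additive model the
### fitted surface is a plane, so it can also be drawn directly with rgl's
### planes3d() instead of evaluating the fit on a grid. A minimal sketch:
with(Policie, plot3d(height, weight, fat, main = "Additive model via planes3d",
                     size = 5))
planes3d(coef(m.addit)["height"], coef(m.addit)["weight"], -1,
         coef(m.addit)["(Intercept)"], alpha = .4)
### (the plane b1*height + b2*weight - fat + b0 = 0 is exactly the fit of m.addit)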
###
### Alternative visualisation
###
summary(Policie[, "weight"])

### For the following weights, the relationship between fat and height
### will be shown more explicitly
weight.show <- c(70, 80, 90)   ### approx. quartiles + median

Policie <- transform(Policie,
                     fweight = cut(weight, breaks = c(50, 75, 85, 120),
                                   labels = c("<=75", "75-85", ">85")))

PALwe <- diverge_hcl(length(weight.show))
LINwe <- c("blue", "grey50", "red")
names(PALwe) <- names(LINwe) <- levels(Policie[, "fweight"])

PCH <- 23
BTY <- "o"
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(fat ~ height, data = Policie, pch = PCH, col = LINwe[fweight],
     bg = PALwe[fweight], xlab = "Height [cm]", ylab = "Body fat [%]")
for (i in 1:length(weight.show)){
  ### in the additive model
  abline(a = coef(m.addit)["(Intercept)"] + coef(m.addit)["weight"] * weight.show[i],
         b = coef(m.addit)["height"],
         col = LINwe[i], lwd = 2)
  ### in the model with interactions change to the following
  # abline(a = coef(m.full)["(Intercept)"] + coef(m.full)["weight"] * weight.show[i],
  #        b = coef(m.full)["height"] + coef(m.full)["height:weight"] * weight.show[i],
  #        col = LINwe[i], lwd = 2)
}
abline(m.marg, col = "orange", lwd = 2)
legend(184, 27, legend = levels(Policie[, "fweight"]), pch = PCH,
       col = LINwe, pt.bg = PALwe, bty = "n")
legend(188, 27, legend = weight.show, col = LINwe, lty = 1, lwd = 2, bty = "n")
text(184, 27.5, labels = "Weight [kg]", pos = 4)
legend(188, 23.5, legend = "Marginal", col = "orange", lty = 1, lwd = 2, bty = "n")

###
### Partial residuals:
### visualisation of the net effect of a regressor on the response
###

### To visualise whether the effect of height is only linear in the model
### 'm.addit: fat ~ height + weight', one can use the partial residuals
head(residuals(m.addit, type = "partial"))

### 'Manual' computation
### Using Definition 9.1 of partial residuals
part.res <- residuals(m.addit) +
  matrix(coef(m.addit)[2:3], nrow = nrow(Policie), ncol = 2, byrow = TRUE) *
  model.matrix(m.addit)[, 2:3]
head(part.res)

### R computes zero-mean partial residuals (the regressor columns are centred first)
X <- model.matrix(m.addit)
X.c <- t(t(X) - colMeans(X))   # centred model matrix
round(colMeans(X.c), 10)
part.res.zero <- residuals(m.addit) +
  matrix(coef(m.addit)[2:3], nrow = nrow(Policie), ncol = 2, byrow = TRUE) * X.c[, 2:3]
head(part.res.zero)
max(abs(part.res.zero - residuals(m.addit, type = "partial")))
# the partial residuals computed by R are indeed zero-mean
round(colSums(part.res.zero), 10)

### Lemma 9.2: In the linear model part.res ~ regressor, the estimated
### slope coincides with the estimated coefficient of the regressor
### in the original model

### Partial residuals for height
coef(lm(part.res[, 1] ~ Policie$height))       # with partial residuals
coef(lm(part.res.zero[, 1] ~ Policie$height))  # with zero-mean partial residuals
c(-colMeans(X)[2] * coef(m.addit)[2], coef(m.addit)[2])  # intercept and slope implied by the centring

part.res.height <- part.res[, 1]
plot(Policie$height, part.res.height, xlab = "height",
     ylab = "Partial residuals for 'height'",
     main = "Partial residuals [height] - model m.addit")
abline(a = 0, b = m.addit$coef[2], col = "blue", lwd = 2)
lines(lowess(part.res.height ~ Policie$height), lty = "dashed", lwd = 2)
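### Side note (not part of the original exercise): the zero-mean partial
### residuals can also be obtained from predict(..., type = "terms"), which
### returns the centred contribution of each term; adding the ordinary
### residuals reproduces residuals(m.addit, type = "partial").
### (part.res.alt is introduced here only for this check.)
part.res.alt <- residuals(m.addit) + predict(m.addit, type = "terms")
max(abs(part.res.alt - residuals(m.addit, type = "partial")))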
### Compare the plots of height against residuals and partial residuals, resp.
par(mfrow = c(1, 2))
plot(residuals(m.addit) ~ height, data = Policie, main = "Residuals ~ height",
     ylab = "Residuals")
abline(a = 0, b = 0, lwd = 2, col = 2)
# (recall that b = 0 in the plot above is appropriate because)
lm(residuals(m.addit) ~ height, data = Policie)
# Why are both estimated coefficients in the model above zero?

plot(residuals(m.addit) + coef(m.addit)["height"] * height ~ height, data = Policie,
     main = "Partial residuals ~ height", ylab = "Partial residuals")
abline(a = 0, b = coef(m.addit)["height"], lwd = 2, col = 2)
par(mfrow = c(1, 1))

### The plot with partial residuals using the function termplot
termplot(m.addit, terms = "height", partial.resid = TRUE, smooth = panel.smooth,
         lwd.term = 2, col.res = 1)

### -------------------------------------------------------------------------###
###  Block 2: INDIVIDUAL WORK                                                 ###
### -------------------------------------------------------------------------###

###
### Soil data
###

# load("Data/SoilData.RData")
(load(url("http://www.karlin.mff.cuni.cz/~nagy/NMSA407/Data/SoilData.RData")))

# The data come from an experiment focused on the evaluation of the
# phosphorus concentration in some specific types of soil after adding
# one of two available ash-based fertilizers.
head(soilData)
summary(soilData)

### Variables of interest for this exercise class
# P        ... phosphorus concentration in a soil sample [ug per kg]
#              after adding an ash-based fertilizer;
# SoilP    ... initial phosphorus concentration in a soil sample [ug per kg];
# SoilpH   ... pH acidity of the soil sample;
# SoilType ... factor variable with four levels for four different soil types;

DATA <- soilData

# Considering SoilType as a factor
DATA$fSoilType <- factor(DATA$SoilType)

# Response: increase in the concentration of phosphorus (on the log scale)
DATA$Y <- log(DATA$P / DATA$SoilP)

# Suppose we are interested in the relation of the response to the pH of the soil
summary(m0 <- lm(Y ~ SoilpH, data = DATA))

plot(Y ~ SoilpH, data = DATA, xlab = "SoilpH", ylab = "Y",
     main = "log(P/SoilP) vs. SoilpH")
lines(lowess(DATA$Y ~ DATA$SoilpH), lty = "dashed", lwd = 2)
abline(reg = m0, col = "blue", lwd = 2)
# or equivalently: abline(a = m0$coef[1], b = m0$coef[2], col = "blue")
legend(4.75, -4, lty = c("solid", "dashed"), col = c("blue", "black"),
       legend = c("linear fit", "lowess"), lwd = 2)

# Taking fSoilType into consideration
summary(m1 <- lm(Y ~ SoilpH + fSoilType, data = DATA))

# Is the type of soil (SoilType) an effect modifier of the relationship
# between the response and pH?
anova(m2 <- lm(Y ~ SoilpH * fSoilType, data = DATA))

# Can SoilpH be removed from the model altogether?
Anova(m2)

# How is such a result possible?
plot(Y ~ SoilpH, data = DATA, xlab = "SoilpH", ylab = "Y",
     main = "log(P/SoilP) vs. SoilpH", pch = SoilType, col = SoilType, lwd = 2)
i <- 0
for(f in levels(DATA$fSoilType)){
  i <- i + 1
  qqq <- as.logical(DATA$SoilType == f)
  lines(lowess(DATA$Y[qqq] ~ DATA$SoilpH[qqq]), lty = "dashed", col = i, lwd = 2)
  b <- coef(lm(fitted(m2)[qqq] ~ DATA$SoilpH[qqq]))[2]
  # the sign of the coefficient determines the orientation of the fitted segment
  if(b > 0){
    yrange <- range(fitted(m2)[qqq])
  } else{
    yrange <- (range(fitted(m2)[qqq]))[c(2, 1)]
  }
  lines(range(DATA$SoilpH[qqq]), yrange, col = i, lwd = 2)
}
legend(x = 5, y = -4, pch = 1:4, col = 1:4,
       legend = paste("Soil type", levels(DATA$fSoilType)), lwd = 2)
legend(x = 5, y = -4.72, lty = c("solid", "dashed"),
       legend = c("linear fit [m2]", "lowess"), lwd = 2)

### Model diagnostics: Durbin-Watson test of correlation of the errors
### (see Section 9.5.1 of the lecture notes)

# p-value based on an asymptotic approximation
dwtest(m2)

# alternatively, a p-value obtained with the help of the bootstrap
# (a simulation method - details will be provided in
#  NMST434: Modern statistical methods)
library(car)
durbinWatsonTest(m2)

### Why does the DW test reject the null hypothesis?
plot(residuals(m2), main = "Residuals vs index: observed data")
### Is any pattern visible in the residuals taken as a time series?
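### Side note (not part of the original exercise): the serial dependence that
### the Durbin-Watson test picks up can also be inspected with the sample
### autocorrelation function of the residuals.
acf(residuals(m2), main = "ACF of the residuals of m2")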
### What if the rows of the data were randomly shuffled?
DATA2 <- DATA[sample.int(nrow(DATA)), ]
m2shuf <- lm(Y ~ SoilpH * fSoilType, data = DATA2)
dwtest(m2shuf)
durbinWatsonTest(m2shuf)
# and again the same test, now with a (slightly) different bootstrap p-value
durbinWatsonTest(m2shuf)
plot(residuals(m2shuf), main = "Residuals vs index: random shuffle")
### Clearly no visible pattern here

### In the other extreme case: what if the rows are ordered according to
### the values of the residuals?
DATA3 <- DATA[order(residuals(m2)), ]
m2ord <- lm(Y ~ SoilpH * fSoilType, data = DATA3)
dwtest(m2ord)
plot(residuals(m2ord), main = "Residuals vs index: ordered residuals")
### This is essentially the quantile function (inverse CDF) of the residuals

### Reason for rejecting the null hypothesis in the DW test:
### observed values of Y as a function of the index of the observations
plot(DATA$Y, col = DATA$SoilType, pch = DATA$SoilType, lwd = 2, main = "Y vs index")
legend("bottomright", paste("soil type ", 1:4, sep = ""), col = 1:4, pch = 1:4,
       lty = 0, lwd = 2)
### The observations were taken sequentially, most likely by repeated probing
### in the same soil sample!

### This problem with dependence in the dataset is not visible from the
### basic diagnostic plots
plotLM(m2)
### Nothing terribly wrong here...

### Maybe a heteroscedasticity issue?
bptest(m2, ~ fitted(m2))
bptest(m2)
### Residuals appear to have different variances in different soil types
boxplot(residuals(m2) ~ DATA$SoilType)

### -------------------------------------------------------------------------###
###  Block 3: ADDITIONAL MATERIAL                                             ###
### -------------------------------------------------------------------------###

### Durbin-Watson test
dwtest(m2)
durbinWatsonTest(m2)

### 'Manual' calculation of the test statistic of the Durbin-Watson test
U <- residuals(m2)
n <- length(U)
(DW <- sum((U[-1] - U[-n])^2) / sum(U^2))

### Interpretation: shifted residuals appear to depend on the value of the
### unshifted residuals
plot(U[-n], U[-1], xlab = expression(U[i]), ylab = expression(U[i+1]),
     main = paste("Autocorrelation with lag = 1:", round(cor(U[-n], U[-1]), 2)))
abline(reg = lm(U[-1] ~ U[-n]), lwd = 2)
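### Side note (not part of the original exercise): the Durbin-Watson statistic
### is approximately 2 * (1 - r1), where r1 is the lag-1 sample autocorrelation
### of the residuals; a quick numerical check:
r1 <- cor(U[-n], U[-1])
c(DW = DW, "2*(1 - r1)" = 2 * (1 - r1))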