### --------------------------------------------------------------------------------------### ### Winter Term 2018/2019 ### ### NMSA407 - Linear Regression ### ### ### ### EXERCISE #12 Confounders and effect modifier ### ### ### ### --------------------------------------------------------------------------------------### ### Working directory rm(list=ls()) setwd("H:/nmsa407_LinRegr/") ### Load needed packages library("mffSM") library("splines") library("colorspace") library("lmtest") ### Loading the data file data(Policie, package = "mffSM") head(Policie) dim(Policie) summary(Policie) ### Our basic interest is how 'fat' [percentage of the applicant's body fat] ### is related to the 'height' of the applicant. ### Basic scatterplots to start ### (scatterplotMatrix -> function from package car) palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5))) scatterplotMatrix(~fat + height + weight, reg.line = lm, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = Policie, pch = 16) ### Three models m1 <- lm(fat ~ height, data = Policie) summary(m1) plot(fat ~ height, data = Policie, main="Percent of fat vs. height of the applicant") lines(lowess(Policie$fat ~ Policie$height), lty="dashed") abline(a=m1$coef[1], b=m1$coef[2], col="blue") legend(188,6, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess")) ### QUESTIONS: ### Interpret the effect of height on the percentage of body fat. ### Is this effect in agreement with your experience? par(mfrow = c(1,2)); plot(fat~weight, data=Policie, main="Fat vs. weight"); plot(height~weight, data=Policie, main="Height vs. weight"); par(mfrow = c(1,1)); ### QUESTIONS: ### Explain the effect of 'height' on 'fat' in the following model [m2]. ### Is 'weight' a confounder of this relationship? m2 <- lm(fat ~ height + weight, data = Policie) summary(m2) ### QUESTIONS: ### Interpret the model below and the corresponding parameter estimates. ### Is 'weight' an effect modifier of the effect of 'height' on 'fat'? m3 <- lm(fat ~ height + weight + height:weight, data = Policie) summary(m3) ### Visualization (of models m1 and m2) summary(Policie[, "weight"]) ### For the following weights the relationship between the ### fat and height will be shown more explicitly weight.show <- c(70, 80, 90) ### approx. quartiles + median Policie <- transform(Policie, fweight = cut(weight, breaks = c(50, 75, 85, 120), labels = c("<=75", "75-85", ">85"))) PALwe <- diverge_hcl(length(weight.show)) LINwe <- c("blue", "grey50", "red") names(PALwe) <- names(LINwe) <- levels(Policie[, "fweight"]) ### PCH <- 23 PCH <- c(1,2,3) BTY <- "o"; par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1) plot(fat ~ height, data = Policie, pch = PCH, col = LINwe, bg = PALwe[fweight], xlab = "Height [cm]", ylab = "Body fat [%]") for (i in 1:length(weight.show)){ abline(a = coef(m2)["(Intercept)"] + coef(m2)["weight"] * weight.show[i], b = coef(m2)["height"], col = LINwe[i], lwd = 2) } abline(m1, col = "orange", lwd = 2) legend(185, 27, legend = levels(Policie[, "fweight"]), pch = PCH, col = LINwe, pt.bg = PALwe, bty = "n") legend(188, 27, legend = weight.show, col = LINwe, lty = 1, lwd = 2, bty = "n") text(185, 27.5, labels = "Weight [kg]", pos = 4) legend(188, 23.5, legend = "Marginal", col = "orange", lty = 1, lwd = 2, bty = "n") ### Data set #2 ### Pribram dataset ### ### The sampling area (about 5 km2) was located in the vicinity of ### a lead smelter (pec na olovo) in Pribram, Czech Republic. The smelter ### has been operating since 1786. The primary smelting of lead ores ceased ### in 1972 and was replaced by industry processing of secondary lead ### sources such as car batteries and other lead-bearing materials. ### Soil samples were taken from the 121 sites selected at the studied area ### surrounding the smelter. ### The dataset contains measurements of the concentration of several metals: ### As, Pb, Zn (these measurements are already logarithmic transformated ### with the base 2 and then centered by the sample mean) ### Measurements of some parameters of organic activity (also already log2 transformed ### but not centered): ### dehydro ... dehydrogenase activity ### respir ... respiratory activity ### biomass ### Measurements of some other characteristics of the soil ### cox .... Soil microbial carbon (log2 transformed) ### pH .... pH-factor of the soil ### dist ... distance from the smelter ### The primary interest is in the effect of the metals on the organic activity . rm(list = ls()) DATA <- read.table("pribram.csv", sep=";", header=TRUE); ### ALTERNATIVELY (directly from the course website) DATA <- read.table("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/pribram.csv", sep=";", header=TRUE); head(DATA); summary(DATA); ### Consider first the relationship of dehydrogenase activity (dehydro) and Zn summary(m0 <- lm(dehydro ~ Zn, data=DATA)) plot(dehydro ~ Zn, data=DATA, xlab="Zn", ylab="dehydro", main="Dehydro vs. Zn"); lines(lowess(DATA$dehydro~DATA$Zn), lty="dashed") abline(a=m0$coef[1], b=m0$coef[2], col="blue") legend(9,6, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess")) ### QUESTIONS: ### The relationship of Zn and dehydro is not as expected... ### The problem might be in the fact that this is an OBSERVATIONAL STUDY ### and not a DESIGNED EXPERIMENT. In observational studies one has to ### be careful about possible confounding factors (confounders). ### In this study the problem was that due to the contamination the soil near ### the smelter was less used which resulted in higher values of the organic ### activity. par(mfrow = c(1,2)); plot(DATA$dist, DATA$Zn, xlab="Distance from the smelter", ylab="Zn", main="Zn vs. distance"); lines(lowess(DATA$Zn~DATA$dist), lty="dashed") plot(DATA$dist, DATA$dehydro, xlab="Distance from the smelter", ylab="dehydrogenase activity", main="Dehydro vs. distance"); lines(lowess(DATA$dehydro~DATA$dist), lty="dashed") par(mfrow = c(1,1)); ### Thus we can try to include distance from the smelter as a covariate summary(lm(dehydro ~ Zn + dist, data=DATA)) ### QUESTION: ### Explain the effect of Zn on dehydro estimated by this model ### One can also try... summary(lm(dehydro ~ Zn*dist, data=DATA)) ### The problem is that although the distance from the smelter seems to be an important ### variable, it is from the biological point of view not the most appropriate variable ### to control for. ### After discussion with the researchers we decided to control for ### cox (soil microbial carbon) as it is known that organic activity depends on the ### amount of the available carbon in the soil. par(mfrow = c(1,2)); plot(DATA$cox, DATA$Zn, xlab="Cox", ylab="Zn", main="Zn vs. cox"); lines(lowess(DATA$Zn~DATA$cox), lty="dashed") plot(DATA$cox, DATA$dehydro, xlab="Cox", ylab="dehydrogenase activity", main="Dehydro vs. cox"); lines(lowess(DATA$dehydro~DATA$cox), lty="dashed") par(mfrow = c(1,1)); summary(m1 <- lm(dehydro ~ Zn + cox, data=DATA)) ### QUESTION: ### Interpret the effect of Zn on dehydro in this model. ### Compare this effect with the effect of Zn in a model m0 ### (without cox). summary(m0) ### QUESTION: ### Is cox a confounder for the relationship of dehydro vs Zn? ### One can be also interested if cox is even an effect modifier of the relationship ### of dehydro vs Zn. summary(m2 <- lm(dehydro ~ Zn*cox, data=DATA)) anova(m2) ### Plot of effect of the Zn on dehydro as estimated by model m2 plot(DATA$cox, coef(m2)['Zn'] + coef(m2)['Zn:cox']*DATA$cox, xlab="cox", main="Effect of the Zn on dehydro [m2]", type="l"); abline(h=0, lty="dotted") ### Sometimes the effect modification disappear when we use a more flexible function of ### the variable that we need to control. ### To visualize if the effect of cox is only linear in model 'm1: dehydro~Zn + cox', ### one can use the partial residuals head(residuals(m1, type="partial")) parc.res.cox <- residuals(m1, type="partial")[,2]; plot(DATA$cox, parc.res.cox, xlab = "Cox", ylab="The zero-mean partial residuals for 'cox'", main="Partial residuals [cox] - model m1"); abline(a=0, b=m1$coef[3], col="blue"); lines(lowess(parc.res.cox~DATA$cox), lty="dashed") ### Let us consider the following relatively simple spline parametrization of cox DATA$cox.sp <- bs(DATA$cox, knots = 0, degree = 2); qqq <- order(DATA$cox); plot(DATA$cox[qqq], DATA$cox.sp[qqq,1], type="l", ylim=range(DATA$cox.sp)); lines(DATA$cox[qqq], DATA$cox.sp[qqq,2], type="l", col="blue"); lines(DATA$cox[qqq], DATA$cox.sp[qqq,3], type="l", col="red"); ### summary(m3 <- lm(dehydro ~ Zn+cox.sp, data=DATA)) anova(m1, m3) ### What is the effect of Zn on dehydro as estimated by m3? ### Do we still need to have an interaction? anova(lm(dehydro ~ Zn * cox.sp, data=DATA)) ### Let us take m3 as the final model mFin <- m3; ### Basic diagnostic plot plotLM(mFin) ### Test of the normality of the errors shapiro.test(mFin$residuals) ### Test of homoscedasticity - Koenker's studentized version bptest(mFin, ~fitted(mFin)) bptest(mFin) ### Durbin-Watson test of correlation of errors ### p-value based on some asymptotic approximations dwtest(mFin) ### p-value with the help of bootstrap library(car) durbinWatsonTest(mFin) ### The 'manual' calculation of the test statistic of Durbin-Watson test U <- residuals(mFin); n <- length(U) (DW <- sum((U[-1] - U[-n])^2)/sum(U^2)); plot(U[-n], U[-1], xlab=expression(U[i]), ylab=expression(U[i+1]), main=paste("Autocorrelation with lag=1:", round(cor(U[-n], U[-1]), 2))); abline(reg=lm(U[-1]~U[-n])) #3# Do you have any idea why does the DW-test give a significant result? plot(DATA$dist) ### The results about homoscedasticity are borderline and the sample size ### is moderate. Sandwich estimator might be of interest library("sandwich") ### Estimate of the variance matrix of the regression coefficients (VHC <- vcovHC(mFin, type = "HC3")); ### type = "HC3" It uses residuals(mFin)^2/(1 - hatvalues(mFin))^2 ### on the diagonal of the "meat" matrix of the sandwich. ### Manual calculation of the sandwich estimator - start X <- model.matrix(mFin); Bread <- solve(t(X) %*% X) %*% t(X) Meat <- diag(residuals(mFin)^2 / (1 - hatvalues(mFin))^2) Bread %*% Meat %*% t(Bread) Bread %*% Meat %*% t(Bread) - VHC ### Manual calculation of the sandwich estimator - end ### Summary table using the sandwich estimator of variance coeftest(mFin, vcov = VHC) ### Summary table using the standard estimator of variance that assumes homoscedasticity summary(mFin) ### Data set #3 rm(list = ls()) load("SoilData.RData") ### EQUIVALENTLY (directly from the course website) ### load(url("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/SoilData.RData")) # The data comes from an experiment focusing on evaluating the concentration # of phosphorus in some specific types of soil after # adding one of two available ash-based fertilizers. head(soilData); summary(soilData); ### Variables of interest for this exercise class ### P ... phosphor concentration in a soil sample measured in μg per kg after adding an ash-based fertilizer; ### SoilP ... initial phosphor concentration in a soil sample measured in μg per kg; ### SoilpH ... pH acidity of the soil sample; ### SoilType ... factor variable with four levels for four different soil types; DATA <- soilData; ### Considering Soiltype as a factor DATA$fSoilType <- factor(DATA$SoilType); ### Response - Concentration of phosphor DATA$Y <- log(DATA$P/DATA$SoilP) ### Suppose we are interested in the relationship of the response and pH of the soil summary(m0 <- lm(Y ~ SoilpH, data=DATA)) plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y", main="log(P/SoilP) vs. SoilpH"); lines(lowess(DATA$Y~DATA$SoilpH), lty="dashed") abline(reg=m0, col="blue") ### or equivalently: abline(a=m0$coef[1], b=m0$coef[2], col="blue") legend(4.75,-4, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess")) ### Taking fSoilType into consideration summary(m1 <- lm(Y ~ SoilpH+fSoilType, data=DATA)) ### Is type of the soil (SoilType) an effect modifier of the relationship of the response and pH? anova(m2 <- lm(Y ~ SoilpH*fSoilType, data=DATA)) plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y", main="log(P/SoilP) vs. SoilpH", pch=SoilType, col=SoilType); i <- 0; for(f in levels(DATA$fSoilType)){ i <- i+1; qqq <- as.logical(DATA$SoilType == f); lines(lowess(DATA$Y[qqq]~DATA$SoilpH[qqq]), lty="dashed", col=i) b <- coef(lm(fitted(m2)[qqq]~DATA$SoilpH[qqq]))[2]; # Sign of the coefficient if(b > 0){ yrange <- range(fitted(m2)[qqq]) } else{ yrange <- (range(fitted(m2)[qqq]))[c(2,1)] } lines(range(DATA$SoilpH[qqq]), yrange, col=i) } legend(x=5, y=-4, pch=1:4, col=1:4, legend=paste("Soil type", levels(DATA$fSoilType))) legend(x=5, y=-4.72, lty=c("solid", "dashed"), legend=c("linear fit [m2]", "lowess")) ### Do you have any idea why does the DW test give a significant result? dwtest(m2); # using asymptotic approximation durbinWatsonTest(m2); # using bootstrap ### plot(DATA$Y, col=DATA$SoilType, pch=DATA$SoilType) plotLM(m2) bptest(m2, ~fitted(m2)) bptest(m2) boxplot(residuals(m2)~DATA$SoilType)