### --------------------------------------------------------------------------------------###
### Winter Term 2019/2020 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #12 Confounders and effect modifier ###
### ###
### --------------------------------------------------------------------------------------###
### Working directory
rm(list=ls())
setwd("H:/nmsa407_LinRegr/")
### Load needed packages
library("mffSM")
library("splines")
library("colorspace")
library("lmtest")
### Loading the data file
### load(url("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/policie.RData"))
data(Policie, package = "mffSM")
head(Policie)
dim(Policie)
summary(Policie)
### Our basic interest is how 'fat' [percentage of the applicant's body fat]
### is related to the 'height' of the applicant.
### Basic scatterplots to start
### (scatterplotMatrix -> function from package car)
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~fat + height + weight, reg.line = lm, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = Policie, pch = 16)
### Three models
m1 <- lm(fat ~ height, data = Policie)
summary(m1)
plot(fat ~ height, data = Policie, main="Percent of fat vs. height of the applicant")
lines(lowess(Policie$fat ~ Policie$height), lty="dashed")
abline(a=m1$coef[1], b=m1$coef[2], col="blue")
legend(188,6, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess"))
### QUESTIONS:
### Interpret the effect of height on the percentage of body fat.
### Is this effect in agreement with your experience?
par(mfrow = c(1,2));
plot(fat~weight, data=Policie, main="Fat vs. weight");
plot(height~weight, data=Policie, main="Height vs. weight");
par(mfrow = c(1,1));
### QUESTIONS:
### Explain the effect of 'height' on 'fat' in the following model [m2].
### Is 'weight' a confounder of this relationship?
m2 <- lm(fat ~ height + weight, data = Policie)
summary(m2)
### QUESTIONS:
### Interpret the model below and the corresponding parameter estimates.
### Is 'weight' an effect modifier of the effect of 'height' on 'fat'?
m3 <- lm(fat ~ height + weight + height:weight, data = Policie)
summary(m3)
### Visualization (of models m1 and m2)
summary(Policie[, "weight"])
### For the following weights the relationship between the
### fat and height will be shown more explicitly
weight.show <- c(70, 80, 90) ### approx. quartiles + median
Policie <- transform(Policie, fweight = cut(weight, breaks = c(50, 75, 85, 120), labels = c("<=75", "75-85", ">85")))
PALwe <- diverge_hcl(length(weight.show))
LINwe <- c("blue", "grey50", "red")
names(PALwe) <- names(LINwe) <- levels(Policie[, "fweight"])
### PCH <- 23
PCH <- c(1,2,3)
BTY <- "o";
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(fat ~ height, data = Policie, pch = PCH, col = LINwe, bg = PALwe[fweight], xlab = "Height [cm]", ylab = "Body fat [%]")
for (i in 1:length(weight.show)){
abline(a = coef(m2)["(Intercept)"] + coef(m2)["weight"] * weight.show[i], b = coef(m2)["height"], col = LINwe[i], lwd = 2)
}
abline(m1, col = "orange", lwd = 2)
legend(185, 27, legend = levels(Policie[, "fweight"]), pch = PCH, col = LINwe, pt.bg = PALwe, bty = "n")
legend(188, 27, legend = weight.show, col = LINwe, lty = 1, lwd = 2, bty = "n")
text(185, 27.5, labels = "Weight [kg]", pos = 4)
legend(188, 23.5, legend = "Marginal", col = "orange", lty = 1, lwd = 2, bty = "n")
### Data set #2
### Pribram dataset
###
### The sampling area (about 5 km2) was located in the vicinity of
### a lead smelter (pec na olovo) in Pribram, Czech Republic. The smelter
### has been operating since 1786. The primary smelting of lead ores ceased
### in 1972 and was replaced by industry processing of secondary lead
### sources such as car batteries and other lead-bearing materials.
### Soil samples were taken from the 121 sites selected at the studied area
### surrounding the smelter.
### The dataset contains measurements of the concentration of several metals:
### As, Pb, Zn (these measurements are already logarithmic transformated
### with the base 2 and then centered by the sample mean)
### Measurements of some parameters of organic activity (also already log2 transformed
### but not centered):
### dehydro ... dehydrogenase activity
### respir ... respiratory activity
### biomass
### Measurements of some other characteristics of the soil
### cox .... Soil microbial carbon (log2 transformed)
### pH .... pH-factor of the soil
### dist ... distance from the smelter
### The primary interest is in the effect of the metals on the organic activity .
rm(list = ls())
DATA <- read.table("pribram.csv", sep=";", header=TRUE);
### ALTERNATIVELY (directly from the course website)
DATA <- read.table("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/pribram.csv", sep=";", header=TRUE);
head(DATA);
summary(DATA);
### Consider first the relationship of dehydrogenase activity (dehydro) and Zn
summary(m0 <- lm(dehydro ~ Zn, data=DATA))
plot(dehydro ~ Zn, data=DATA, xlab="Zn", ylab="dehydro", main="Dehydro vs. Zn");
lines(lowess(DATA$dehydro~DATA$Zn), lty="dashed")
abline(a=m0$coef[1], b=m0$coef[2], col="blue")
legend(9,6, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess"))
### QUESTIONS:
### The relationship of Zn and dehydro is not as expected...
### The problem might be in the fact that this is an OBSERVATIONAL STUDY
### and not a DESIGNED EXPERIMENT. In observational studies one has to
### be careful about possible confounding factors (confounders).
### In this study the problem was that due to the contamination the soil near
### the smelter was less used which resulted in higher values of the organic
### activity.
par(mfrow = c(1,2));
plot(DATA$dist, DATA$Zn, xlab="Distance from the smelter", ylab="Zn", main="Zn vs. distance");
lines(lowess(DATA$Zn~DATA$dist), lty="dashed")
plot(DATA$dist, DATA$dehydro, xlab="Distance from the smelter", ylab="dehydrogenase activity",
main="Dehydro vs. distance");
lines(lowess(DATA$dehydro~DATA$dist), lty="dashed")
par(mfrow = c(1,1));
### Thus we can try to include distance from the smelter as a covariate
summary(lm(dehydro ~ Zn + dist, data=DATA))
### QUESTION:
### Explain the effect of Zn on dehydro estimated by this model
### One can also try...
summary(lm(dehydro ~ Zn*dist, data=DATA))
### The problem is that although the distance from the smelter seems to be an important
### variable, it is from the biological point of view not the most appropriate variable
### to control for.
### After discussion with the researchers we decided to control for
### cox (soil microbial carbon) as it is known that organic activity depends on the
### amount of the available carbon in the soil.
par(mfrow = c(1,2));
plot(DATA$cox, DATA$Zn, xlab="Cox", ylab="Zn", main="Zn vs. cox");
lines(lowess(DATA$Zn~DATA$cox), lty="dashed")
plot(DATA$cox, DATA$dehydro, xlab="Cox", ylab="dehydrogenase activity",
main="Dehydro vs. cox");
lines(lowess(DATA$dehydro~DATA$cox), lty="dashed")
par(mfrow = c(1,1));
summary(m1 <- lm(dehydro ~ Zn + cox, data=DATA))
### QUESTION:
### Interpret the effect of Zn on dehydro in this model.
### Compare this effect with the effect of Zn in a model m0
### (without cox).
summary(m0)
### QUESTION:
### Is cox a confounder for the relationship of dehydro vs Zn?
### One can be also interested if cox is even an effect modifier of the relationship
### of dehydro vs Zn.
summary(m2 <- lm(dehydro ~ Zn*cox, data=DATA))
anova(m2)
### Plot of effect of the Zn on dehydro as estimated by model m2
plot(DATA$cox, coef(m2)['Zn'] + coef(m2)['Zn:cox']*DATA$cox, xlab="cox", main="Effect of the Zn on dehydro [m2]", type="l");
abline(h=0, lty="dotted")
### Sometimes the effect modification disappear when we use a more flexible function of
### the variable that we need to control.
### To visualize if the effect of cox is only linear in model 'm1: dehydro~Zn + cox',
### one can use the partial residuals
head(residuals(m1, type="partial"))
parc.res.cox <- residuals(m1, type="partial")[,2];
plot(DATA$cox, parc.res.cox, xlab = "Cox", ylab="The zero-mean partial residuals for 'cox'", main="Partial residuals [cox] - model m1");
abline(a=0, b=m1$coef[3], col="blue");
lines(lowess(parc.res.cox~DATA$cox), lty="dashed")
### Let us consider the following relatively simple spline parametrization of cox
DATA$cox.sp <- bs(DATA$cox, knots = 0, degree = 2);
qqq <- order(DATA$cox);
plot(DATA$cox[qqq], DATA$cox.sp[qqq,1], type="l", ylim=range(DATA$cox.sp));
lines(DATA$cox[qqq], DATA$cox.sp[qqq,2], type="l", col="blue");
lines(DATA$cox[qqq], DATA$cox.sp[qqq,3], type="l", col="red");
###
summary(m3 <- lm(dehydro ~ Zn+cox.sp, data=DATA))
anova(m1, m3)
### What is the effect of Zn on dehydro as estimated by m3?
### Do we still need to have an interaction?
anova(lm(dehydro ~ Zn * cox.sp, data=DATA))
### Let us take m3 as the final model
mFin <- m3;
### Basic diagnostic plot
plotLM(mFin)
### Test of the normality of the errors
shapiro.test(mFin$residuals)
### Test of homoscedasticity - Koenker's studentized version
bptest(mFin, ~fitted(mFin))
bptest(mFin)
### Durbin-Watson test of correlation of errors
### p-value based on some asymptotic approximations
dwtest(mFin)
### p-value with the help of bootstrap
library(car)
durbinWatsonTest(mFin)
### The 'manual' calculation of the test statistic of Durbin-Watson test
U <- residuals(mFin);
n <- length(U)
(DW <- sum((U[-1] - U[-n])^2)/sum(U^2));
plot(U[-n], U[-1], xlab=expression(U[i]), ylab=expression(U[i+1]), main=paste("Autocorrelation with lag=1:", round(cor(U[-n], U[-1]), 2)));
abline(reg=lm(U[-1]~U[-n]))
#3# Do you have any idea why does the DW-test give a significant result?
plot(DATA$dist)
### The results about homoscedasticity are borderline and the sample size
### is moderate. Sandwich estimator might be of interest
library("sandwich")
### Estimate of the variance matrix of the regression coefficients
(VHC <- vcovHC(mFin, type = "HC3"));
### type = "HC3" It uses residuals(mFin)^2/(1 - hatvalues(mFin))^2
### on the diagonal of the "meat" matrix of the sandwich.
### Manual calculation of the sandwich estimator - start
X <- model.matrix(mFin);
Bread <- solve(t(X) %*% X) %*% t(X)
Meat <- diag(residuals(mFin)^2 / (1 - hatvalues(mFin))^2)
Bread %*% Meat %*% t(Bread)
Bread %*% Meat %*% t(Bread) - VHC
### Manual calculation of the sandwich estimator - end
### Summary table using the sandwich estimator of variance
coeftest(mFin, vcov = VHC)
### Summary table using the standard estimator of variance that assumes homoscedasticity
summary(mFin)
### Data set #3
rm(list = ls())
load("SoilData.RData")
### EQUIVALENTLY (directly from the course website)
### load(url("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/SoilData.RData"))
# The data comes from an experiment focusing on evaluating the concentration
# of phosphorus in some specific types of soil after
# adding one of two available ash-based fertilizers.
head(soilData);
summary(soilData);
### Variables of interest for this exercise class
### P ... phosphor concentration in a soil sample measured in μg per kg after adding an ash-based fertilizer;
### SoilP ... initial phosphor concentration in a soil sample measured in μg per kg;
### SoilpH ... pH acidity of the soil sample;
### SoilType ... factor variable with four levels for four different soil types;
DATA <- soilData;
### Considering Soiltype as a factor
DATA$fSoilType <- factor(DATA$SoilType);
### Response - Concentration of phosphor
DATA$Y <- log(DATA$P/DATA$SoilP)
### Suppose we are interested in the relationship of the response and pH of the soil
summary(m0 <- lm(Y ~ SoilpH, data=DATA))
plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y", main="log(P/SoilP) vs. SoilpH");
lines(lowess(DATA$Y~DATA$SoilpH), lty="dashed")
abline(reg=m0, col="blue")
### or equivalently: abline(a=m0$coef[1], b=m0$coef[2], col="blue")
legend(4.75,-4, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess"))
### Taking fSoilType into consideration
summary(m1 <- lm(Y ~ SoilpH+fSoilType, data=DATA))
### Is type of the soil (SoilType) an effect modifier of the relationship of the response and pH?
anova(m2 <- lm(Y ~ SoilpH*fSoilType, data=DATA))
plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y", main="log(P/SoilP) vs. SoilpH", pch=SoilType, col=SoilType);
i <- 0;
for(f in levels(DATA$fSoilType)){
i <- i+1;
qqq <- as.logical(DATA$SoilType == f);
lines(lowess(DATA$Y[qqq]~DATA$SoilpH[qqq]), lty="dashed", col=i)
b <- coef(lm(fitted(m2)[qqq]~DATA$SoilpH[qqq]))[2]; # Sign of the coefficient
if(b > 0){
yrange <- range(fitted(m2)[qqq])
}
else{
yrange <- (range(fitted(m2)[qqq]))[c(2,1)]
}
lines(range(DATA$SoilpH[qqq]), yrange, col=i)
}
legend(x=5, y=-4, pch=1:4, col=1:4, legend=paste("Soil type", levels(DATA$fSoilType)))
legend(x=5, y=-4.72, lty=c("solid", "dashed"), legend=c("linear fit [m2]", "lowess"))
### Do you have any idea why does the DW test give a significant result?
dwtest(m2); # using asymptotic approximation
durbinWatsonTest(m2); # using bootstrap
###
plot(DATA$Y, col=DATA$SoilType, pch=DATA$SoilType)
plotLM(m2)
bptest(m2, ~fitted(m2))
bptest(m2)
boxplot(residuals(m2)~DATA$SoilType)