### --------------------------------------------------------------------------------------### ### Winter Term 2017/2018 ### ### NMSA407 - Linear Regression ### ### ### ### EXERCISE #9 Two-way ANOVA ### ### ### ### --------------------------------------------------------------------------------------### ### Working directory setwd("H:/nmsa407_LinRegr/") ## To be changed to something which exists on your PC. ### Loading necessary packages ### ====================== library("mffSM") library("car") ### Archaeological data on skulls. ### ### Original data are described in a paper ### Howells, W. W. (1996). ### Howell's craniometric data on the internet. ### American Journal of Physical Anthropology, 101(3), 441-442, doi: 10.1002/ajpa.1331010302. ### ### Two measurements are provided here: ### (i) occipital angle (tylni uhel in Czech) ### (see http://http://www.karlin.mff.cuni.cz/~maciak/occipital.jpg) ### ### (ii) glabello-occipital length (nejvetsi delka mozkovny in Czech) ### (see http://http://www.karlin.mff.cuni.cz/~maciak/glabella.jpg) ### ### and data from three sites: Berg in Austria, Australia and Burjati in Siberia. ### ### Two data frames are provided: HowellsAll which contains all measurements reported by Howells ### and data.frame Howells which is a balanced (with respect to gender and site) random subset of original data. ### Data Howells (all) data(HowellsAll, package = "mffSM") head(HowellsAll) dim(HowellsAll) summary(HowellsAll) ### Data Howells (balanced subset - artificially created for pedagocial reasons) data(Howells, package = "mffSM") head(Howells) dim(Howells) summary(Howells) ### Colors definition (used in the plots below) genCOL <- rainbow_hcl(2, start = 10, end = 280) genCOL2 <- c("red3", "darkblue") names(genCOL) <- names(genCOL2) <- levels(HowellsAll[, "fgender"]) print(genCOL) print(genCOL2) popCOL <- rainbow_hcl(3, start = 50, end = 250) popCOL2 <- c("red", "darkgreen", "blue") names(popCOL) <- names(popCOL2) <- levels(HowellsAll[, "fpopul"]) print(popCOL) print(popCOL2) ### Frequency tables> ### 1. All data with(HowellsAll, table(fpopul, fgender)) with(HowellsAll, table(fpop.gen)) with(HowellsAll, table(fgen.pop)) margin.table(with(HowellsAll, table(fpopul, fgender)), margin = 1) margin.table(with(HowellsAll, table(fpopul, fgender)), margin = 2) nrow(HowellsAll) ### 2. Balanced subset with(Howells, table(fpopul, fgender)) with(Howells, table(fpop.gen)) with(Howells, table(fgen.pop)) margin.table(with(Howells, table(fpopul, fgender)), margin = 1) margin.table(with(Howells, table(fpopul, fgender)), margin = 2) nrow(Howells) ### Group means ### 1. All data ### OCA with(HowellsAll, tapply(oca, list(fpopul, fgender), mean)) round(with(HowellsAll, tapply(oca, list(fpopul, fgender), mean)), 1) round(with(HowellsAll, tapply(oca, fpopul, mean)), 1) round(with(HowellsAll, tapply(oca, fgender, mean)), 1) round(with(HowellsAll, mean(oca)), 1) ### GOL with(HowellsAll, tapply(gol, list(fpopul, fgender), mean)) round(with(HowellsAll, tapply(gol, list(fpopul, fgender), mean)), 1) round(with(HowellsAll, tapply(gol, fpopul, mean)), 1) round(with(HowellsAll, tapply(gol, fgender, mean)), 1) round(with(HowellsAll, mean(gol)), 1) ### 2. Balanced subset ### OCA with(Howells, tapply(oca, list(fpopul, fgender), mean)) round(with(Howells, tapply(oca, list(fpopul, fgender), mean)), 1) round(with(Howells, tapply(oca, fpopul, mean)), 1) round(with(Howells, tapply(oca, fgender, mean)), 1) round(with(Howells, mean(oca)), 1) ### GOL with(Howells, tapply(gol, list(fpopul, fgender), mean)) round(with(Howells, tapply(gol, list(fpopul, fgender), mean)), 1) round(with(Howells, tapply(gol, fpopul, mean)), 1) round(with(Howells, tapply(gol, fgender, mean)), 1) round(with(Howells, mean(gol)), 1) ### Basic boxplots ### 1. All data ### OCA par(mfrow=c(1,2)) plot(oca ~ fpop.gen, data = HowellsAll, col = genCOL, xlab="Population x gender") plot(oca ~ fgen.pop, data = HowellsAll, col = popCOL, xlab="Gender x population") par(mfrow=c(1,1)) ### GOL par(mfrow=c(1,2)) plot(gol ~ fpop.gen, data = HowellsAll, col = genCOL, xlab="Population x gender") plot(gol ~ fgen.pop, data = HowellsAll, col = popCOL, xlab="Gender x population") par(mfrow=c(1,1)) ### Basic spaghetti plots (for all data) par(mfrow=c(1,2)) with(HowellsAll, interaction.plot(fgender, fpopul, oca, col = popCOL2, main="OCA", lwd=2, xlab="Gender")) with(HowellsAll, interaction.plot(fgender, fpopul, gol, col = popCOL2, main="GOL", lwd=2, xlab="Gender")) par(mfrow=c(1,1)) par(mfrow=c(1,2)) with(HowellsAll, interaction.plot(fpopul, fgender, gol, col = genCOL2, main="OCA", lwd=2, xlab="Population")) with(HowellsAll, interaction.plot(fpopul, fgender,oca, col = genCOL2, main="GOL", lwd=2, xlab="Population")) par(mfrow=c(1,1)) ### Nicer spaghetti plots (for all data) with(HowellsAll, interaction.plot(fgender, fpopul, oca, col = popCOL2, type = "b", lwd = 2, lty = 1, xlab = "Gender", trace.label = "Population", ylab = "Mean of OCA")) with(HowellsAll, interaction.plot(fpopul, fgender, oca, col = genCOL2, type = "b", lwd = 2, lty = 1, xlab = "Population", trace.label = "Gender", ylab = "Mean of OCA")) with(HowellsAll, interaction.plot(fgender, fpopul, gol, col = popCOL2, type = "b", lwd = 2, lty = 1, xlab = "Gender", trace.label = "Population", ylab = "Mean of GOL")) with(HowellsAll, interaction.plot(fpopul, fgender, gol, col = genCOL2, type = "b", lwd = 2, lty = 1, xlab = "Population", trace.label = "Gender", ylab = "Mean of GOL")) ### QUESTIONS: ### Does it seem that a constant residual variance can be assumed? ### For which variable a significant interaction is to be expected? ### Use analogous plots for the balanced dataset. Are the conclussions the same? ### Linear Regresion Models ### F-tests and ANOVA tables ### 1. Regression model for all data ### OCA oca0All <- lm(oca ~ 1, data = HowellsAll) ocaXAll <- lm(oca ~ fpopul, data = HowellsAll) ocaZAll <- lm(oca ~ fgender, data = HowellsAll) ocaXplusZAll <- lm(oca ~ fpopul + fgender, data = HowellsAll) ocaXZAll <- lm(oca ~ fpopul * fgender, data = HowellsAll) ocaXZAll2 <- lm(oca ~ fgender * fpopul, data = HowellsAll) ### ANOVA tables for model ocaXZAll anova(ocaXZAll) ## type I anova(ocaXZAll2) ### QUESTIONS: ### What are the corresponding tests on each line of the output? ### Do you see an estimate of the residual variance here? Anova(ocaXZAll, type = "II") ## type II ### QUESTIONS: ### How to interpret rows fpopul and fgender? ### What would be the output for the Anova II type table for model 'ocaXZAll2'? Anova(ocaXZAll, type = "III") ## type III, ### QUESTIONS: ### How to interpret rows fpopul and fgender? ### What would be the output for the Anova III type table for model 'ocaXZAll2'? summary(ocaXZAll) ### GOL gol0All <- lm(gol ~ 1, data = HowellsAll) golXAll <- lm(gol ~ fpopul, data = HowellsAll) golZAll <- lm(gol ~ fgender, data = HowellsAll) golXplusZAll <- lm(gol ~ fpopul + fgender, data = HowellsAll) golXZAll <- lm(gol ~ fpopul * fgender, data = HowellsAll) golXZAll2 <- lm(gol ~ fgender * fpopul, data = HowellsAll) ### ANOVA tables anova(golXZAll) ## type I anova(golXZAll2) Anova(golXZAll, type = "II") ## type II Anova(golXZAll, type = "III") ## type III ### 2. Regression model for the balanced dataset ### OCA oca0 <- lm(oca ~ 1, data = Howells) ocaX <- lm(oca ~ fpopul, data = Howells) ocaZ <- lm(oca ~ fgender, data = Howells) ocaXplusZ <- lm(oca ~ fpopul + fgender, data = Howells) ocaXZ <- lm(oca ~ fpopul * fgender, data = Howells) ocaXZ2 <- lm(oca ~ fgender * fpopul, data = Howells) summary(ocaXplusZ); summary(ocaX); summary(ocaZ); ### ANOVA tables for model ocaXZ anova(ocaXZ) ## type I anova(ocaXZ2) ### QUESTIONS: ### What are the corresponding tests on each line of the output above? ### Surprised that the corresponding rows are the same now? Anova(ocaXZ, type = "II") ## type II Anova(ocaXZ, type = "III") ## type III, ### GOL gol0 <- lm(gol ~ 1, data = Howells) golX <- lm(gol ~ fpopul, data = Howells) golZ <- lm(gol ~ fgender, data = Howells) golXplusZ <- lm(gol ~ fpopul + fgender, data = Howells) golXZ <- lm(gol ~ fpopul * fgender, data = Howells) golXZ2 <- lm(gol ~ fgender * fpopul, data = Howells) ### ANOVA tables for model golXZ anova(golXZ) ## type I anova(golXZ2) Anova(golXZ, type = "II") ## type II Anova(golXZ, type = "III") ## type III ### Different parametrizations ### Levels of the two factors (lpop <- levels(HowellsAll[, "fpopul"])) (lgen <- levels(HowellsAll[, "fgender"])) outer(lpop, lgen, paste, sep = ":") (lpopgen <- as.character(outer(lpop, lgen, paste, sep = ":"))) ### contr.treatment ### Pseudocontrast matrices CX.trt <- contr.treatment(3) # pseduocontrast for populations CZ.trt <- contr.treatment(2) # pseduocontrast for gender CZX.trt <- kronecker(CZ.trt, CX.trt) rownames(CX.trt) <- lpop rownames(CZ.trt) <- lgen rownames(CZX.trt) <- lpopgen print(CX.trt) print(CZ.trt) print(CZX.trt) D.trt <- cbind(kronecker(rep(1, 2), CX.trt), kronecker(CZ.trt, rep(1, 3)), CZX.trt) colnames(D.trt) <- names(coef(golXZ))[-1] D.trt ### OCA (all data) ### Interaction model ocaXZAll.trt <- lm(oca ~ fpopul * fgender, data = HowellsAll) summary(ocaXZAll.trt) drop1(ocaXZAll.trt, test = "F") ### QUESTIONS: ### Does it make sense to ask for partial effects of fpopul or fgender? ### Does it make sense to estimate the additive model? ### What can be concluded from the plot below? plotLM(ocaXZAll.trt) ### Alternatively plot(residuals(ocaXZAll.trt) ~ fpop.gen, data = HowellsAll, col = genCOL, xlab="Population x gender", main="Residuals") abline(h=0, lty="dotted") ### GOL (all data) ### Interaction model golXZAll.trt <- lm(gol ~ fpopul * fgender, data = HowellsAll) summary(golXZAll.trt) drop1(golXZAll.trt, test = "F") ### QUESTIONS: ### Does it make sense to ask for partial effects of fpopul or fgender? ### Does it make sense to estimate the additive model? ### What can be concluded from the plot below? ### Additive model golXplusZAll.trt <- lm(gol ~ fpopul + fgender, data = HowellsAll) summary(golXplusZAll.trt) drop1(golXplusZAll.trt, test = "F") plotLM(golXplusZAll.trt) ### What can be concluded from here? ### Alternatively plot(residuals(golXplusZAll.trt) ~ fpop.gen, data = HowellsAll, col = genCOL, xlab="Population x gender", main="Residuals") abline(h=0, lty="dotted") ### Estimates of all group means based on the ADDITIVE model L.meanAdd.trt <- cbind(1, kronecker(rep(1, 2), CX.trt), kronecker(CZ.trt, rep(1, 3))) colnames(L.meanAdd.trt) <- names(coef(golXplusZAll.trt)) rownames(L.meanAdd.trt) <- lpopgen print(L.meanAdd.trt) LSest(golXplusZAll.trt, L = L.meanAdd.trt) with(HowellsAll, tapply(gol, fgen.pop, mean)) ### Sample group means are (of course) slightly different ### Estimates of all partial effects of fpopul given fgender in the additive model print(L.meanAdd.trt) L.effX.trt <- cbind(0, rbind(CX.trt[2,] - CX.trt[1,], CX.trt[3,] - CX.trt[1,], CX.trt[3,] - CX.trt[2,]), 0); colnames(L.effX.trt) <- names(coef(golXplusZAll.trt)) rownames(L.effX.trt) <- paste(lpop[c(2, 3, 3)], "-", lpop[c(1, 1, 2)], sep = "") print(L.effX.trt) LSest(golXplusZAll.trt, L = L.effX.trt) ### QUESTIONS: ### Which populations are significantly different if simple Bonferroni correction is used? ### Is the simultaneous coverage of the three conf. intervals at least 95%? LSest(golXplusZAll.trt, L = L.effX.trt, conf.level = 1 - 0.05/3) ### Bonferroni P-values: LSgolXplusZAll.trt <- LSest(golXplusZAll.trt, L = L.effX.trt) PvBonf <- format.pval(LSgolXplusZAll.trt[["P value"]] * 3, digits = 3, eps = 1e-4) names(PvBonf) <- attr(LSgolXplusZAll.trt, "row.names") print(PvBonf) ### Estimates of all partial effects of fgender given fpopul in the additive model print(L.meanAdd.trt) L.effZ.trt <- cbind(0, 0, 0, CZ.trt[2,] - CZ.trt[1,]) colnames(L.effZ.trt) <- names(coef(golXplusZAll.trt)) rownames(L.effZ.trt) <- paste(lgen[2], "-", lgen[1], sep = "") print(L.effZ.trt) LSest(golXplusZAll.trt, L = L.effZ.trt) ### QUESTIONS: ### Is there a significant difference between male and female given the population? ### Compare the above output with. t.test(gol ~ fgender, data = HowellsAll) ### contr.sum ### Contrast matrices (CX.sum <- contr.sum(3)) (CZ.sum <- contr.sum(2)) (CZX.sum <- kronecker(CZ.sum, CX.sum)) rownames(CX.sum) <- lpop rownames(CZ.sum) <- lgen rownames(CZX.sum) <- lpopgen print(CX.sum) print(CZ.sum) print(CZX.sum) D.sum <- cbind(kronecker(rep(1, 2), CX.sum), kronecker(CZ.sum, rep(1, 3)), CZX.sum) XX.sum <- model.matrix(oca ~ fpopul * fgender, data = HowellsAll, contrasts = list(fpopul = "contr.sum", fgender = "contr.sum")) colnames(D.sum) <- colnames(XX.sum)[-1] print(D.sum) ### OCA (all data) ### Interaction model ocaXZAll.sum <- lm(oca ~ fpopul * fgender, data = HowellsAll, contrasts = list(fpopul = "contr.sum", fgender = "contr.sum")) summary(ocaXZAll.sum) drop1(ocaXZAll.sum, test = "F") plotLM(ocaXZAll.sum) ## What can be concluded from here? ### Estimates of all "alpha^X" L.alphaX <- cbind(0, CX.sum, 0, 0, 0) colnames(L.alphaX) <- names(coef(ocaXZAll.sum)) rownames(L.alphaX) <- lpop print(L.alphaX) LSest(ocaXZAll.sum, L = L.alphaX) alphaX <- LSest(ocaXZAll.sum, L = L.alphaX)[["Estimate"]] names(alphaX) <- lpop print(alphaX) ### What is their interpretation? ### Estimates of all "alpha^Z" L.alphaZ <- cbind(0, 0, 0, CZ.sum, 0, 0) colnames(L.alphaZ) <- names(coef(ocaXZAll.sum)) rownames(L.alphaZ) <- lgen print(L.alphaZ) LSest(ocaXZAll.sum, L = L.alphaZ) alphaZ <- LSest(ocaXZAll.sum, L = L.alphaZ)[["Estimate"]] names(alphaZ) <- lgen print(alphaZ) ### What is their interpretation? ### Estimates of all "alpha^XZ" L.alphaXZ <- cbind(0, 0, 0, 0, CZX.sum) colnames(L.alphaXZ) <- names(coef(ocaXZAll.sum)) rownames(L.alphaXZ) <- lpopgen print(L.alphaXZ) LSest(ocaXZAll.sum, L = L.alphaXZ) alphaXZ <- LSest(ocaXZAll.sum, L = L.alphaXZ)[["Estimate"]] names(alphaXZ) <- lpopgen print(alphaXZ) ### alpha^XZ stored in a matrix (alphaXZm <- matrix(alphaXZ, nrow = length(lpop), ncol = length(lgen))) rownames(alphaXZm) <- lpop colnames(alphaXZm) <- lgen print(alphaXZm) ### Each row and each column sums in 0. ### Reconstrucing group means from the regression parameters coef(ocaXZAll.sum)["(Intercept)"] + alphaXZm + cbind(alphaX, alphaX) + rbind(alphaZ, alphaZ, alphaZ) (ocaMean <- with(HowellsAll, tapply(oca, list(fpopul, fgender), mean))) ### Sample group means are now (of course) the same as those estimated from the interaction model. ### Estimates of all group means again L.meanInt.sum <- cbind(1, D.sum) colnames(L.meanInt.sum) <- names(coef(ocaXZAll.sum)) rownames(L.meanInt.sum) <- lpopgen print(L.meanInt.sum) LSest(ocaXZAll.sum, L = L.meanInt.sum) with(HowellsAll, tapply(oca, fgen.pop, mean)) ### If simultaneous confidence intervals for all 6 group means of interest, simple Bonferroni procedure can again be used. LSest(ocaXZAll.sum, L = L.meanInt.sum, conf.level = 1 - 0.05/6) ### Estimates of the differences between the means of the means of groups by fpopul = differences in alpha^X's print(L.meanInt.sum) L.effX.sumXZ <- cbind(0, rbind(CX.sum[2,] - CX.sum[1,], CX.sum[3,] - CX.sum[1,], CX.sum[3,] - CX.sum[2,]), 0, 0, 0) colnames(L.effX.sumXZ) <- names(coef(ocaXZAll.sum)) rownames(L.effX.sumXZ) <- paste(lpop[c(2, 3, 3)], "-", lpop[c(1, 1, 2)], sep = "") print(L.effX.sumXZ) LSest(ocaXZAll.sum, L = L.effX.sumXZ) ### mu_{2.} - mu_{1.} ### mu_{3.} - mu_{1.} ### mu_{3.} - mu_{2.} ### Here are the estimated means of the means and their differences apply(ocaMean, 1, mean) outer(apply(ocaMean, 1, mean), apply(ocaMean, 1, mean), "-") ### Estimates of the differences between the means of the means of groups by fgender = differences in alpha^Z's L.effZ.sumXZ <- cbind(0, 0, 0, CZ.sum[2,] - CZ.sum[1,], 0, 0) colnames(L.effZ.sumXZ) <- names(coef(ocaXZAll.sum)) rownames(L.effZ.sumXZ) <- paste(lgen[2], "-", lgen[1], sep = "") print(L.effZ.sumXZ) LSest(ocaXZAll.sum, L = L.effZ.sumXZ) ### mu_{.2} - mu_{.1} ### Here are the estimated means of the means and their differences apply(ocaMean, 2, mean) outer(apply(ocaMean, 2, mean), apply(ocaMean, 2, mean), "-") ### GOL (all data) ### Interaction model golXZAll.sum <- lm(gol ~ fpopul * fgender, data = HowellsAll, contrasts = list(fpopul = "contr.sum", fgender = "contr.sum")) summary(golXZAll.sum) drop1(golXZAll.sum, test = "F") ### Additive model golXplusZAll.sum <- lm(gol ~ fpopul + fgender, data = HowellsAll, contrasts = list(fpopul = "contr.sum", fgender = "contr.sum")) summary(golXplusZAll.sum) drop1(golXplusZAll.sum, test = "F") ### Estimates of all "alpha^X" in the additive model L.alphaX <- cbind(0, CX.sum, 0) colnames(L.alphaX) <- names(coef(golXplusZAll.sum)) rownames(L.alphaX) <- lpop print(L.alphaX) LSest(golXplusZAll.sum, L = L.alphaX) alphaX <- LSest(golXplusZAll.sum, L = L.alphaX)[["Estimate"]] names(alphaX) <- lpop print(alphaX) ### What is their interpretation? ### Estimates of all "alpha^Z" in the additive model L.alphaZ <- cbind(0, 0, 0, CZ.sum) colnames(L.alphaZ) <- names(coef(golXplusZAll.sum)) rownames(L.alphaZ) <- lgen print(L.alphaZ) LSest(golXplusZAll.sum, L = L.alphaZ) alphaZ <- LSest(golXplusZAll.sum, L = L.alphaZ)[["Estimate"]] names(alphaZ) <- lgen print(alphaZ) ### What is their interpretation? ### Group means (model-based) (golModelMean <- coef(golXplusZAll.sum)["(Intercept)"] + cbind(alphaX, alphaX) + rbind(alphaZ, alphaZ, alphaZ)) colnames(golModelMean) <- names(alphaZ) print(golModelMean) ### Group means (observed) (golMean <- with(HowellsAll, tapply(gol, list(fpopul, fgender), mean))) ### Again, not surprising that those are slightly different. ### Estimates of the differences between the means of the means of groups by fpopul ### = partial effects of fpopul given fgender (when the additive model can be assumed) ### = differences in alpha^X's L.peffX.sumXZ <- cbind(0, rbind(CX.sum[2,] - CX.sum[1,], CX.sum[3,] - CX.sum[1,], CX.sum[3,] - CX.sum[2,]), 0) colnames(L.peffX.sumXZ) <- names(coef(golXplusZAll.sum)) rownames(L.peffX.sumXZ) <- paste(lpop[c(2, 3, 3)], "-", lpop[c(1, 1, 2)], sep = "") print(L.peffX.sumXZ) LSest(golXplusZAll.sum, L = L.peffX.sumXZ) ### mu_{2.} - mu_{1.} ### mu_{3.} - mu_{1.} ### mu_{3.} - mu_{2.} ### Here are the estimated means of the means (model-based) and their differences apply(golModelMean, 1, mean) outer(apply(golModelMean, 1, mean), apply(golModelMean, 1, mean), "-") ### Estimates of the differences between ### the means of the means of groups ### by fgender ### = partial effects of fgender given fpopul ### (when the additive model can be assumed) ### = differences in alpha^Z's L.peffZ.sumXZ <- cbind(0, 0, 0, CZ.sum[2,] - CZ.sum[1,]) colnames(L.peffZ.sumXZ) <- names(coef(golXplusZAll.sum)) rownames(L.peffZ.sumXZ) <- paste(lgen[2], "-", lgen[1], sep = "") print(L.peffZ.sumXZ) LSest(golXplusZAll.sum, L = L.peffZ.sumXZ) ### mu_{.2} - mu_{.1} ### Here are the estimated means of the means (model-based) and their differences apply(golModelMean, 2, mean) outer(apply(golModelMean, 2, mean), apply(golModelMean, 2, mean), "-")