### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression                                               ###
### Winter Term 2021/2022                                                     ###
###                                                                           ###
### EXERCISE #12: Two-way ANOVA and Generalized Least Squares                 ###
### -------------------------------------------------------------------------###

### -------------------------------------------------------------------------###
### Block 1: TEACHING MATERIAL                                                ###
### -------------------------------------------------------------------------###
library("mffSM")
### (rainbow_hcl(), diverge_hcl(), and heat_hcl() come from package colorspace,
### Anova() from package car; both should be available through mffSM's
### dependencies -- if not, attach them explicitly:)
## library("colorspace"); library("car")

### Archaeological data on skulls.
###
### Original data are described in the paper:
### Howells, W. W. (1996). Howell's craniometric data on the internet.
### American Journal of Physical Anthropology, 101(3), 441-442,
### doi: 10.1002/ajpa.1331010302.
###
### Two measurements are provided here:
### (i)  occipital angle ('tylni uhel' in Czech)
###      (see http://www.karlin.mff.cuni.cz/~maciak/occipital.jpg)
### (ii) glabello-occipital length ('nejvetsi delka mozkovny' in Czech)
###      (see http://www.karlin.mff.cuni.cz/~maciak/glabella.jpg)
###
### Moreover:
### (iii) skulls from three archaeological sites are considered:
###       Berg (Austria), Australia, and Burjati in Siberia;
### (iv)  two data frames are provided: HowellsAll, which contains all reported
###       measurements, and Howells, which is a balanced (with respect to
###       gender and site) random subset of the original data.

### Data HowellsAll (all measurements)
data(HowellsAll, package = "mffSM")
head(HowellsAll)
dim(HowellsAll)
summary(HowellsAll)

### Data Howells
### (balanced subset - artificially created for educational reasons)
data(Howells, package = "mffSM")
head(Howells)
dim(Howells)
summary(Howells)

### ... just some color definitions (used in the plots below)
genCOL  <- rainbow_hcl(2, start = 10, end = 280)
genCOL2 <- c("red3", "darkblue")
names(genCOL) <- names(genCOL2) <- levels(HowellsAll[, "fgender"])
print(genCOL)
print(genCOL2)

popCOL  <- rainbow_hcl(3, start = 50, end = 250)
popCOL2 <- c("red", "darkgreen", "blue")
names(popCOL) <- names(popCOL2) <- levels(HowellsAll[, "fpopul"])
print(popCOL)
print(popCOL2)

### A brief exploratory analysis

### 1a. Frequency tables: All data
with(HowellsAll, table(fpopul, fgender))
with(HowellsAll, table(fpop.gen))
with(HowellsAll, table(fgen.pop))
margin.table(with(HowellsAll, table(fpopul, fgender)), margin = 1)
margin.table(with(HowellsAll, table(fpopul, fgender)), margin = 2)
nrow(HowellsAll)

### 1b. Frequency tables: Balanced subset
with(Howells, table(fpopul, fgender))
with(Howells, table(fpop.gen))
with(Howells, table(fgen.pop))
margin.table(with(Howells, table(fpopul, fgender)), margin = 1)
margin.table(with(Howells, table(fpopul, fgender)), margin = 2)
nrow(Howells)

### 2a. Group specific means: All data
### OCA
with(HowellsAll, tapply(oca, list(fpopul, fgender), mean))
round(with(HowellsAll, tapply(oca, list(fpopul, fgender), mean)), 1)
round(with(HowellsAll, tapply(oca, fpopul, mean)), 1)
round(with(HowellsAll, tapply(oca, fgender, mean)), 1)
round(with(HowellsAll, mean(oca)), 1)

### GOL
with(HowellsAll, tapply(gol, list(fpopul, fgender), mean))
round(with(HowellsAll, tapply(gol, list(fpopul, fgender), mean)), 1)
round(with(HowellsAll, tapply(gol, fpopul, mean)), 1)
round(with(HowellsAll, tapply(gol, fgender, mean)), 1)
round(with(HowellsAll, mean(gol)), 1)
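### Group-specific standard deviations complement the means above: they give
### a first, informal impression of whether a constant residual variance is
### plausible (cf. the question below the spaghetti plots)
round(with(HowellsAll, tapply(oca, list(fpopul, fgender), sd)), 1)
round(with(HowellsAll, tapply(gol, list(fpopul, fgender), sd)), 1)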
### 2b. Group specific means: Balanced subset
### OCA
with(Howells, tapply(oca, list(fpopul, fgender), mean))
round(with(Howells, tapply(oca, list(fpopul, fgender), mean)), 1)
round(with(Howells, tapply(oca, fpopul, mean)), 1)
round(with(Howells, tapply(oca, fgender, mean)), 1)
round(with(Howells, mean(oca)), 1)

### GOL
with(Howells, tapply(gol, list(fpopul, fgender), mean))
round(with(Howells, tapply(gol, list(fpopul, fgender), mean)), 1)
round(with(Howells, tapply(gol, fpopul, mean)), 1)
round(with(Howells, tapply(gol, fgender, mean)), 1)
round(with(Howells, mean(gol)), 1)

### 3a. Basic boxplots: All data
### OCA
par(mfrow = c(1, 2))
plot(oca ~ fpop.gen, data = HowellsAll, col = genCOL, xlab = "Population x gender")
plot(oca ~ fgen.pop, data = HowellsAll, col = popCOL, xlab = "Gender x population")
par(mfrow = c(1, 1))

### GOL
par(mfrow = c(1, 2))
plot(gol ~ fpop.gen, data = HowellsAll, col = genCOL, xlab = "Population x gender")
plot(gol ~ fgen.pop, data = HowellsAll, col = popCOL, xlab = "Gender x population")
par(mfrow = c(1, 1))

### 3b. Basic boxplots: Balanced subset
### OCA
par(mfrow = c(1, 2))
plot(oca ~ fpop.gen, data = Howells, col = genCOL, xlab = "Population x gender")
plot(oca ~ fgen.pop, data = Howells, col = popCOL, xlab = "Gender x population")
par(mfrow = c(1, 1))

### GOL
par(mfrow = c(1, 2))
plot(gol ~ fpop.gen, data = Howells, col = genCOL, xlab = "Population x gender")
plot(gol ~ fgen.pop, data = Howells, col = popCOL, xlab = "Gender x population")
par(mfrow = c(1, 1))

### 4a. Spaghetti plots: All data
par(mfrow = c(1, 2))
with(HowellsAll, interaction.plot(fgender, fpopul, oca, col = popCOL2,
                                  main = "OCA", lwd = 2, xlab = "Gender"))
with(HowellsAll, interaction.plot(fgender, fpopul, gol, col = popCOL2,
                                  main = "GOL", lwd = 2, xlab = "Gender"))
par(mfrow = c(1, 1))

par(mfrow = c(1, 2))
with(HowellsAll, interaction.plot(fpopul, fgender, oca, col = genCOL2,
                                  main = "OCA", lwd = 2, xlab = "Population"))
with(HowellsAll, interaction.plot(fpopul, fgender, gol, col = genCOL2,
                                  main = "GOL", lwd = 2, xlab = "Population"))
par(mfrow = c(1, 1))

### 4b. Spaghetti plots: Balanced subset
par(mfrow = c(1, 2))
with(Howells, interaction.plot(fgender, fpopul, oca, col = popCOL2,
                               main = "OCA", lwd = 2, xlab = "Gender"))
with(Howells, interaction.plot(fgender, fpopul, gol, col = popCOL2,
                               main = "GOL", lwd = 2, xlab = "Gender"))
par(mfrow = c(1, 1))

par(mfrow = c(1, 2))
with(Howells, interaction.plot(fpopul, fgender, oca, col = genCOL2,
                               main = "OCA", lwd = 2, xlab = "Population"))
with(Howells, interaction.plot(fpopul, fgender, gol, col = genCOL2,
                               main = "GOL", lwd = 2, xlab = "Population"))
par(mfrow = c(1, 1))

### 5a. Alternative (maybe nicer) spaghetti plots: All data
with(HowellsAll, interaction.plot(fgender, fpopul, oca, col = popCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Gender",
     trace.label = "Population", ylab = "Mean of OCA"))
with(HowellsAll, interaction.plot(fpopul, fgender, oca, col = genCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Population",
     trace.label = "Gender", ylab = "Mean of OCA"))
with(HowellsAll, interaction.plot(fgender, fpopul, gol, col = popCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Gender",
     trace.label = "Population", ylab = "Mean of GOL"))
with(HowellsAll, interaction.plot(fpopul, fgender, gol, col = genCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Population",
     trace.label = "Gender", ylab = "Mean of GOL"))
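### A numerical counterpart of the interaction plots: the gender differences
### of the group means within each population (ocaMeanAll and golMeanAll are
### just helper objects). Roughly constant differences are in line with an
### additive model; clearly non-constant differences point to an interaction.
ocaMeanAll <- with(HowellsAll, tapply(oca, list(fpopul, fgender), mean))
golMeanAll <- with(HowellsAll, tapply(gol, list(fpopul, fgender), mean))
round(ocaMeanAll[, 2] - ocaMeanAll[, 1], 1)
round(golMeanAll[, 2] - golMeanAll[, 1], 1)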
### 5b. Alternative (maybe nicer) spaghetti plots: Balanced subset
with(Howells, interaction.plot(fgender, fpopul, oca, col = popCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Gender",
     trace.label = "Population", ylab = "Mean of OCA"))
with(Howells, interaction.plot(fpopul, fgender, oca, col = genCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Population",
     trace.label = "Gender", ylab = "Mean of OCA"))
with(Howells, interaction.plot(fgender, fpopul, gol, col = popCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Gender",
     trace.label = "Population", ylab = "Mean of GOL"))
with(Howells, interaction.plot(fpopul, fgender, gol, col = genCOL2,
     type = "b", lwd = 2, lty = 1, xlab = "Population",
     trace.label = "Gender", ylab = "Mean of GOL"))

### Does it seem that a constant residual variance can be assumed?
### For which variable of interest do interactions seem to be expected?

### 6a. Some basic regression models: All data
### OCA
oca0All      <- lm(oca ~ 1, data = HowellsAll)
ocaXAll      <- lm(oca ~ fpopul, data = HowellsAll)
ocaZAll      <- lm(oca ~ fgender, data = HowellsAll)
ocaXplusZAll <- lm(oca ~ fpopul + fgender, data = HowellsAll)
ocaXZAll     <- lm(oca ~ fpopul * fgender, data = HowellsAll)
ocaXZAll2    <- lm(oca ~ fgender * fpopul, data = HowellsAll)

### Two equivalent models
summary(ocaXZAll)
summary(ocaXZAll2)

### ANOVA tables (type I) for ocaXZAll and ocaXZAll2
anova(ocaXZAll)
anova(ocaXZAll2)
### Recall the corresponding tests on each line of the output.
### What about the estimate of the residual variance and SST?

Anova(ocaXZAll, type = "II")    ## type II
Anova(ocaXZAll2, type = "II")
### How to interpret the rows denoted as fpopul and fgender?

Anova(ocaXZAll, type = "III")   ## type III
Anova(ocaXZAll2, type = "III")
### What is the interpretation of the rows denoted as fpopul and fgender?

### GOL
gol0All      <- lm(gol ~ 1, data = HowellsAll)
golXAll      <- lm(gol ~ fpopul, data = HowellsAll)
golZAll      <- lm(gol ~ fgender, data = HowellsAll)
golXplusZAll <- lm(gol ~ fpopul + fgender, data = HowellsAll)
golXZAll     <- lm(gol ~ fpopul * fgender, data = HowellsAll)
golXZAll2    <- lm(gol ~ fgender * fpopul, data = HowellsAll)

### ANOVA tables (type I)
anova(golXZAll)
anova(golXZAll2)

Anova(golXZAll, type = "II")    ## type II
Anova(golXZAll2, type = "II")

Anova(golXZAll, type = "III")   ## type III
Anova(golXZAll2, type = "III")

### 6b. Some basic regression models: Balanced dataset
### OCA
oca0      <- lm(oca ~ 1, data = Howells)
ocaX      <- lm(oca ~ fpopul, data = Howells)
ocaZ      <- lm(oca ~ fgender, data = Howells)
ocaXplusZ <- lm(oca ~ fpopul + fgender, data = Howells)
ocaXZ     <- lm(oca ~ fpopul * fgender, data = Howells)
ocaXZ2    <- lm(oca ~ fgender * fpopul, data = Howells)

### Again, equivalent models
summary(ocaXZ)
summary(ocaXZ2)

### ANOVA tables (type I) for ocaXZ and ocaXZ2 (!!!)
anova(ocaXZ)
anova(ocaXZ2)
### What are the corresponding tests on each line of the output above?
### Is it surprising that the tables are the same now? Why?
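### The lines of the type I table are F-tests between successively nested
### models; the following comparison should reproduce them (the residual mean
### square of the largest model is used as the common denominator)
anova(oca0, ocaX, ocaXplusZ, ocaXZ)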
Anova(ocaXZ, type = "II")    ## type II
Anova(ocaXZ2, type = "II")

Anova(ocaXZ, type = "III")   ## type III
Anova(ocaXZ2, type = "III")

### GOL
gol0      <- lm(gol ~ 1, data = Howells)
golX      <- lm(gol ~ fpopul, data = Howells)
golZ      <- lm(gol ~ fgender, data = Howells)
golXplusZ <- lm(gol ~ fpopul + fgender, data = Howells)
golXZ     <- lm(gol ~ fpopul * fgender, data = Howells)
golXZ2    <- lm(gol ~ fgender * fpopul, data = Howells)

### Two equivalent models
summary(golXZ)
summary(golXZ2)

### ANOVA tables (type I) for golXZ and golXZ2
anova(golXZ)
anova(golXZ2)

Anova(golXZ, type = "II")    ## type II
Anova(golXZ2, type = "II")

Anova(golXZ, type = "III")   ## type III
Anova(golXZ2, type = "III")

### Does it make sense to ask for partial effects of fpopul or fgender in the
### models above?
### Does it make sense to estimate the additive model?

### G E N E R A L I Z E D   L E A S T   S Q U A R E S

### Dataset Hlavy
###  * Application of Generalized Least Squares (GLS)
###  * Piecewise polynomial model fitted using a linear model
###    with constraints on continuity/smoothness
###
### Data on head sizes of fetuses (from ultrasound monitoring) in different
### periods of pregnancy.
### Each data value (variable y) is the average over n measurements.
###
### i: serial number
###
### t: week of pregnancy
###
### n: number of measurements used to calculate the average head size
###    given in the y variable
###
### y: the average head size obtained from the measurements
###    performed in week t of pregnancy
data(Hlavy, package = "mffSM")

### Exploratory analysis
head(Hlavy)
dim(Hlavy)
summary(Hlavy)

plot(y ~ t, data = Hlavy, pch = 23, col = "red3", bg = "orange")

### Basic scatterplot showing also the sample size used to calculate
### each value of y
with(Hlavy, quantile(n, prob = seq(0, 1, by = 0.2)))
Hlavy <- transform(Hlavy, fn = cut(n, breaks = c(0, 10, 30, 50, 60, 90),
                                   labels = c("<=10", "11-30", "31-50",
                                              "51-60", ">60")))
with(Hlavy, summary(fn))

PAL <- diverge_hcl(length(levels(Hlavy[, "fn"])))
names(PAL) <- levels(Hlavy[, "fn"])
plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)

plotData <- function(){
  PAL <- rev(heat_hcl(length(levels(Hlavy[, "fn"])), c. = c(80, 30),
                      l = c(30, 90), power = c(1/5, 1.3)))
  names(PAL) <- levels(Hlavy[, "fn"])
  plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
  legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)
  abline(v = 27, col = "lightblue", lty = 2)
}
plotData()

### MOTIVATION:
### Practitioners say that the y ~ t relationship could be described by a
### piecewise quadratic function with t = 27 being the point where the
### quadratic relationship changes its shape.
###
### From the practical point of view, the fitted curve should be CONTINUOUS at
### t = 27 and perhaps also SMOOTH (i.e., with at least the first derivative
### continuous), as it is biologically not defensible to claim that the growth
### rate jumps at t = 27.
Hlavy <- transform(Hlavy, t27 = t - 27)

### Grid to calculate the fitted regression function
tgrid <- seq(16, 42, length = 100)
pdata <- data.frame(t27 = tgrid - 27)
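### The piecewise models below all use the shifted time s = t27 = t - 27 and
### are special cases of
###
###   m(t) = beta0 + beta1*s + beta2*s^2 + beta3*s*1[s>0] + beta4*s^2*1[s>0].
###
### Both indicator terms vanish as s -> 0, so a model without a separate
### 1[s>0] intercept is continuous at t = 27 by construction. The left and
### right first derivatives at t = 27 are beta1 and beta1 + beta3, so
### smoothness there amounts to beta3 = 0.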
### Piecewise quadratic model whose regression function is
### CONTINUOUS at t = 27.
### Generalized least squares (GLS) used to calculate the estimates.
summary(mCont <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2),
                    weights = n, data = Hlavy))

### Some naive calculation
W <- diag(Hlavy$n)
Y <- Hlavy$y
X <- model.matrix(mCont)

### Estimated regression coefficients
(betahat <- solve(t(X) %*% W %*% X) %*% (t(X) %*% W %*% Y))
coef(mCont)

### Generalized fitted values
(Yg <- X %*% betahat)
fitted(mCont)

### Generalized residual sum of squares
(RSS <- t(Y - Yg) %*% W %*% (Y - Yg))

### Generalized mean square
RSS / (length(Y) - ncol(X))
(summary(mCont)$sigma)^2

### Be careful: the Multiple R-squared of 0.9968 reported in the output of
### "summary(mCont)" is not equal to sum((Yg-mean(Yg))^2)/sum((Y-mean(Y))^2).
### Instead, it equals the following quantity, which is more difficult
### to interpret.
barYw <- sum(Y * Hlavy$n) / sum(Hlavy$n)   ### weighted mean of the original
                                           ### observations
### something like a "generalized" regression sum of squares
SSR <- sum((Yg - barYw)^2 * Hlavy$n)
### something like a "generalized" total sum of squares
SST <- sum((Y - barYw)^2 * Hlavy$n)
### "generalized" R squared
SSR / SST
summary(mCont)$r.squared

### Figure for the fit
fitCont <- predict(mCont, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)

### -------------------------------------------------------------------------###
### Block 2: INDIVIDUAL WORK                                                  ###
### -------------------------------------------------------------------------###

### Recall the parametrization used in the models for OCA and GOL

### Levels of the two factors
(lpop <- levels(HowellsAll[, "fpopul"]))
(lgen <- levels(HowellsAll[, "fgender"]))
outer(lpop, lgen, paste, sep = ":")
(lpopgen <- as.character(outer(lpop, lgen, paste, sep = ":")))

### contr.treatment - pseudocontrast matrices
CX.trt <- contr.treatment(3)   # pseudocontrasts for populations
CZ.trt <- contr.treatment(2)   # pseudocontrasts for gender
CZX.trt <- kronecker(CZ.trt, CX.trt)
rownames(CX.trt)  <- lpop
rownames(CZ.trt)  <- lgen
rownames(CZX.trt) <- lpopgen
print(CX.trt)
print(CZ.trt)
print(CZX.trt)

D.trt <- cbind(kronecker(rep(1, 2), CX.trt), kronecker(CZ.trt, rep(1, 3)),
               CZX.trt)
colnames(D.trt) <- names(coef(golXZ))[-1]
D.trt
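### Sanity check: the rows of cbind(1, D.trt) are the coefficient vectors of
### the six group means in the interaction model golXZ, so LSest() should
### reproduce the sample group means exactly
LSest(golXZ, L = cbind(1, D.trt))
with(Howells, tapply(gol, list(fpopul, fgender), mean))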
### What can be concluded from the plot below? Compare with plotLM().
par(mfrow = c(2, 2))
plot(ocaXZAll$residuals ~ ocaXZAll$fitted, pch = 21, bg = "lightblue",
     xlab = "Fitted values", ylab = "Residuals", main = "OCA | All")
lines(lowess(ocaXZAll$residuals ~ ocaXZAll$fitted), col = "red")
plot(golXZAll$residuals ~ golXZAll$fitted, pch = 21, bg = "lightblue",
     xlab = "Fitted values", ylab = "Residuals", main = "GOL | All")
lines(lowess(golXZAll$residuals ~ golXZAll$fitted), col = "red")
plot(ocaXZ$residuals ~ ocaXZ$fitted, pch = 21, bg = "lightblue",
     xlab = "Fitted values", ylab = "Residuals", main = "OCA | Balanced")
lines(lowess(ocaXZ$residuals ~ ocaXZ$fitted), col = "red")
plot(golXZ$residuals ~ golXZ$fitted, pch = 21, bg = "lightblue",
     xlab = "Fitted values", ylab = "Residuals", main = "GOL | Balanced")
lines(lowess(golXZ$residuals ~ golXZ$fitted), col = "red")

### Alternatively
par(mfrow = c(2, 2))
plot(residuals(ocaXZAll) ~ fpop.gen, data = HowellsAll, col = genCOL,
     xlab = "Population x gender", main = "Residuals: OCA | All")
abline(h = 0, lty = "dotted")
plot(residuals(golXZAll) ~ fpop.gen, data = HowellsAll, col = genCOL,
     xlab = "Population x gender", main = "Residuals: GOL | All")
abline(h = 0, lty = "dotted")
plot(residuals(ocaXZ) ~ fpop.gen, data = Howells, col = genCOL,
     xlab = "Population x gender", main = "Residuals: OCA | Balanced")
abline(h = 0, lty = "dotted")
plot(residuals(golXZ) ~ fpop.gen, data = Howells, col = genCOL,
     xlab = "Population x gender", main = "Residuals: GOL | Balanced")
abline(h = 0, lty = "dotted")

### Additive model for GOL | All data
summary(golXZAll)
drop1(golXZAll, test = "F")
summary(golXplusZAll)
drop1(golXplusZAll, test = "F")

### Additive model for GOL | Balanced data
summary(golXZ)
drop1(golXZ, test = "F")
summary(golXplusZ)
drop1(golXplusZ, test = "F")

### What can be concluded from here? Again, compare with plotLM().
par(mfrow = c(2, 1))
plot(golXplusZAll$residuals ~ golXplusZAll$fitted, pch = 21, bg = "lightblue",
     xlab = "Fitted values", ylab = "Residuals", main = "All data")
lines(lowess(golXplusZAll$residuals ~ golXplusZAll$fitted), col = "red")
plot(golXplusZ$residuals ~ golXplusZ$fitted, pch = 21, bg = "lightblue",
     xlab = "Fitted values", ylab = "Residuals", main = "Balanced data")
lines(lowess(golXplusZ$residuals ~ golXplusZ$fitted), col = "red")

### Alternatively
par(mfrow = c(2, 1))
plot(residuals(golXplusZAll) ~ fpop.gen, data = HowellsAll, col = genCOL,
     xlab = "Population x gender", main = "Residuals: All data")
abline(h = 0, lty = "dotted")
plot(residuals(golXplusZ) ~ fpop.gen, data = Howells, col = genCOL,
     xlab = "Population x gender", main = "Residuals: Balanced data")
abline(h = 0, lty = "dotted")

### Estimates of all group means based on the ADDITIVE model
L.meanAdd <- cbind(1, kronecker(rep(1, 2), CX.trt), kronecker(CZ.trt, rep(1, 3)))
colnames(L.meanAdd) <- names(coef(golXplusZAll))
rownames(L.meanAdd) <- lpopgen
print(L.meanAdd)
LSest(golXplusZAll, L = L.meanAdd)
with(HowellsAll, tapply(gol, fgen.pop, mean))
### Sample group means are (of course) slightly different.
### Can you explain the differences?

### Estimates of all partial effects of fpopul given fgender
### in the additive model
print(L.meanAdd)
L.effX <- cbind(0, rbind(CX.trt[2,] - CX.trt[1,],
                         CX.trt[3,] - CX.trt[1,],
                         CX.trt[3,] - CX.trt[2,]), 0)
colnames(L.effX) <- names(coef(golXplusZAll))
rownames(L.effX) <- paste(lpop[c(2, 3, 3)], "-", lpop[c(1, 1, 2)], sep = "")
print(L.effX)
LSest(golXplusZAll, L = L.effX)
### Which populations are significantly different if the Bonferroni correction
### is used?
### Is the simultaneous coverage of the three confidence intervals
### at least 95%?
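### Besides the Bonferroni correction applied below, simultaneous pairwise
### comparisons can also be obtained by Tukey's method, e.g., via glht() from
### the multcomp package (a sketch; multcomp is not used elsewhere in this
### script and is assumed to be installed)
library("multcomp")
summary(glht(golXplusZAll, linfct = mcp(fpopul = "Tukey")))   # adjusted P-values
confint(glht(golXplusZAll, linfct = mcp(fpopul = "Tukey")))   # simultaneous CIs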
LSest(golXplusZAll, L = L.effX, conf.level = 1 - 0.05/3)

### Bonferroni P-values:
LSgolXplusZAll <- LSest(golXplusZAll, L = L.effX)
PvBonf <- format.pval(LSgolXplusZAll[["P value"]] * 3, digits = 3, eps = 1e-4)
names(PvBonf) <- attr(LSgolXplusZAll, "row.names")
print(PvBonf)

### Estimates of all partial effects of fgender given fpopul
### in the additive model
print(L.meanAdd)
L.effZ <- cbind(0, 0, 0, CZ.trt[2,] - CZ.trt[1,])
colnames(L.effZ) <- names(coef(golXplusZAll))
rownames(L.effZ) <- paste(lgen[2], "-", lgen[1], sep = "")
print(L.effZ)
LSest(golXplusZAll, L = L.effZ)
### Is there a significant difference between males and females,
### given the population?

### Compare the above output with:
t.test(gol ~ fgender, data = HowellsAll)

### Now we will use the contrast sum parametrization.
### Recall the interpretation of the estimated parameters under this
### parametrization (the means of the group means). Note that for balanced
### data the mean of the group means coincides with the overall mean.
### Therefore, we only consider the dataset HowellsAll and we will compare
### the group means in the model with the interaction term (the model for OCA)
### and the additive model with no interactions (for GOL).

### The corresponding contr.sum contrast matrices are
(CX.sum <- contr.sum(3))
(CZ.sum <- contr.sum(2))
(CZX.sum <- kronecker(CZ.sum, CX.sum))
rownames(CX.sum)  <- lpop
rownames(CZ.sum)  <- lgen
rownames(CZX.sum) <- lpopgen
print(CX.sum)
print(CZ.sum)
print(CZX.sum)

D.sum <- cbind(kronecker(rep(1, 2), CX.sum), kronecker(CZ.sum, rep(1, 3)),
               CZX.sum)
XX.sum <- model.matrix(oca ~ fpopul * fgender, data = HowellsAll,
                       contrasts = list(fpopul = "contr.sum",
                                        fgender = "contr.sum"))
colnames(D.sum) <- colnames(XX.sum)[-1]
print(D.sum)

### OCA (all data)

### Interaction model
ocaXZAll.sum <- lm(oca ~ fpopul * fgender, data = HowellsAll,
                   contrasts = list(fpopul = "contr.sum",
                                    fgender = "contr.sum"))
summary(ocaXZAll.sum)
drop1(ocaXZAll.sum, test = "F")

### Estimates of all "alpha^X"
L.alphaX <- cbind(0, CX.sum, 0, 0, 0)
colnames(L.alphaX) <- names(coef(ocaXZAll.sum))
rownames(L.alphaX) <- lpop
print(L.alphaX)
LSest(ocaXZAll.sum, L = L.alphaX)

alphaX <- LSest(ocaXZAll.sum, L = L.alphaX)[["Estimate"]]
names(alphaX) <- lpop
print(alphaX)
### What is their interpretation?

### Estimates of all "alpha^Z"
L.alphaZ <- cbind(0, 0, 0, CZ.sum, 0, 0)
colnames(L.alphaZ) <- names(coef(ocaXZAll.sum))
rownames(L.alphaZ) <- lgen
print(L.alphaZ)
LSest(ocaXZAll.sum, L = L.alphaZ)

alphaZ <- LSest(ocaXZAll.sum, L = L.alphaZ)[["Estimate"]]
names(alphaZ) <- lgen
print(alphaZ)
### What is their interpretation?

### Estimates of all "alpha^XZ"
L.alphaXZ <- cbind(0, 0, 0, 0, CZX.sum)
colnames(L.alphaXZ) <- names(coef(ocaXZAll.sum))
rownames(L.alphaXZ) <- lpopgen
print(L.alphaXZ)
LSest(ocaXZAll.sum, L = L.alphaXZ)

alphaXZ <- LSest(ocaXZAll.sum, L = L.alphaXZ)[["Estimate"]]
names(alphaXZ) <- lpopgen
print(alphaXZ)

### alpha^XZ stored in a matrix
(alphaXZm <- matrix(alphaXZ, nrow = length(lpop), ncol = length(lgen)))
rownames(alphaXZm) <- lpop
colnames(alphaXZm) <- lgen
print(alphaXZm)
### Note that each row and each column sums to zero.

### Reconstructing group means from the regression parameters
coef(ocaXZAll.sum)["(Intercept)"] + alphaXZm +
  cbind(alphaX, alphaX) + rbind(alphaZ, alphaZ, alphaZ)
(ocaMean <- with(HowellsAll, tapply(oca, list(fpopul, fgender), mean)))
### Sample group means are now (of course) the same as those estimated from
### the interaction model.
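### Sanity check: under the contrast sum parametrization, the intercept of the
### interaction model is the (unweighted) mean of the six group means
mean(ocaMean)
coef(ocaXZAll.sum)["(Intercept)"]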
### Estimates of all group means again
L.meanInt.sum <- cbind(1, D.sum)
colnames(L.meanInt.sum) <- names(coef(ocaXZAll.sum))
rownames(L.meanInt.sum) <- lpopgen
print(L.meanInt.sum)
LSest(ocaXZAll.sum, L = L.meanInt.sum)
with(HowellsAll, tapply(oca, fgen.pop, mean))

### If simultaneous confidence intervals for all 6 group means are of interest,
### a simple Bonferroni procedure can again be used.
LSest(ocaXZAll.sum, L = L.meanInt.sum, conf.level = 1 - 0.05/6)

### Estimates of the differences between the means of the group means by
### fpopul = differences in alpha^X's
print(L.meanInt.sum)
L.effX.sumXZ <- cbind(0, rbind(CX.sum[2,] - CX.sum[1,],
                               CX.sum[3,] - CX.sum[1,],
                               CX.sum[3,] - CX.sum[2,]), 0, 0, 0)
colnames(L.effX.sumXZ) <- names(coef(ocaXZAll.sum))
rownames(L.effX.sumXZ) <- paste(lpop[c(2, 3, 3)], "-", lpop[c(1, 1, 2)],
                                sep = "")
print(L.effX.sumXZ)
LSest(ocaXZAll.sum, L = L.effX.sumXZ)
### mu_{2.} - mu_{1.}
### mu_{3.} - mu_{1.}
### mu_{3.} - mu_{2.}

### Here are the estimated means of the means and their differences
apply(ocaMean, 1, mean)
outer(apply(ocaMean, 1, mean), apply(ocaMean, 1, mean), "-")

### Estimates of the differences between the means of the group means by
### fgender = differences in alpha^Z's
L.effZ.sumXZ <- cbind(0, 0, 0, CZ.sum[2,] - CZ.sum[1,], 0, 0)
colnames(L.effZ.sumXZ) <- names(coef(ocaXZAll.sum))
rownames(L.effZ.sumXZ) <- paste(lgen[2], "-", lgen[1], sep = "")
print(L.effZ.sumXZ)
LSest(ocaXZAll.sum, L = L.effZ.sumXZ)
### mu_{.2} - mu_{.1}

### Here are the estimated means of the means and their differences
apply(ocaMean, 2, mean)
outer(apply(ocaMean, 2, mean), apply(ocaMean, 2, mean), "-")

### GOL (all data)

### Interaction model
golXZAll.sum <- lm(gol ~ fpopul * fgender, data = HowellsAll,
                   contrasts = list(fpopul = "contr.sum",
                                    fgender = "contr.sum"))
summary(golXZAll.sum)
drop1(golXZAll.sum, test = "F")

### Additive model
golXplusZAll.sum <- lm(gol ~ fpopul + fgender, data = HowellsAll,
                       contrasts = list(fpopul = "contr.sum",
                                        fgender = "contr.sum"))
summary(golXplusZAll.sum)
drop1(golXplusZAll.sum, test = "F")

### Estimates of all "alpha^X" in the additive model
L.alphaX <- cbind(0, CX.sum, 0)
colnames(L.alphaX) <- names(coef(golXplusZAll.sum))
rownames(L.alphaX) <- lpop
print(L.alphaX)
LSest(golXplusZAll.sum, L = L.alphaX)

alphaX <- LSest(golXplusZAll.sum, L = L.alphaX)[["Estimate"]]
names(alphaX) <- lpop
print(alphaX)
### What is their interpretation?

### Estimates of all "alpha^Z" in the additive model
L.alphaZ <- cbind(0, 0, 0, CZ.sum)
colnames(L.alphaZ) <- names(coef(golXplusZAll.sum))
rownames(L.alphaZ) <- lgen
print(L.alphaZ)
LSest(golXplusZAll.sum, L = L.alphaZ)

alphaZ <- LSest(golXplusZAll.sum, L = L.alphaZ)[["Estimate"]]
names(alphaZ) <- lgen
print(alphaZ)
### What is their interpretation?

### Group means (model-based)
(golModelMean <- coef(golXplusZAll.sum)["(Intercept)"] +
   cbind(alphaX, alphaX) + rbind(alphaZ, alphaZ, alphaZ))
colnames(golModelMean) <- names(alphaZ)
print(golModelMean)

### Group means (observed)
(golMean <- with(HowellsAll, tapply(gol, list(fpopul, fgender), mean)))
### Again, it is not surprising that these are slightly different.
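### The discrepancies between the model-based and the observed group means
### reflect the interaction structure that the additive model ignores
round(golModelMean - golMean, 2)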
### Estimates of the differences between the means of the group means
### by fpopul
### = partial effects of fpopul given fgender (when the additive model can be
###   assumed)
### = differences in alpha^X's
L.peffX.sumXZ <- cbind(0, rbind(CX.sum[2,] - CX.sum[1,],
                                CX.sum[3,] - CX.sum[1,],
                                CX.sum[3,] - CX.sum[2,]), 0)
colnames(L.peffX.sumXZ) <- names(coef(golXplusZAll.sum))
rownames(L.peffX.sumXZ) <- paste(lpop[c(2, 3, 3)], "-", lpop[c(1, 1, 2)],
                                 sep = "")
print(L.peffX.sumXZ)
LSest(golXplusZAll.sum, L = L.peffX.sumXZ)
### mu_{2.} - mu_{1.}
### mu_{3.} - mu_{1.}
### mu_{3.} - mu_{2.}

### Here are the estimated means of the means (model-based) and
### their differences
apply(golModelMean, 1, mean)
outer(apply(golModelMean, 1, mean), apply(golModelMean, 1, mean), "-")

### Estimates of the differences between the means of the group means
### by fgender = partial effects of fgender given fpopul (when the additive
### model can be assumed) = differences in alpha^Z's
L.peffZ.sumXZ <- cbind(0, 0, 0, CZ.sum[2,] - CZ.sum[1,])
colnames(L.peffZ.sumXZ) <- names(coef(golXplusZAll.sum))
rownames(L.peffZ.sumXZ) <- paste(lgen[2], "-", lgen[1], sep = "")
print(L.peffZ.sumXZ)
LSest(golXplusZAll.sum, L = L.peffZ.sumXZ)
### mu_{.2} - mu_{.1}

### Here are the estimated means of the means (model-based) and
### their differences
apply(golModelMean, 2, mean)
outer(apply(golModelMean, 2, mean), apply(golModelMean, 2, mean), "-")

### -------------------------------------------------------------------------###
### Block 3: ADDITIONAL MATERIAL                                              ###
### -------------------------------------------------------------------------###

### Consider the dataset Hlavy and the model mCont
summary(mCont)

### Can you identify both quadratic functions in the summary output above?

### Suppose that m(t) denotes the regression function. Can you find the p-value
### of the test
###
###   H0: the first derivative of m(t) is continuous
###       (we will say that m(t) is SMOOTH)
###
### against the alternative
###
###   H1: the first derivative of m(t) is not continuous
###       (we will say that m(t) is only CONTINUOUS)?
Anova(mCont, type = "II")

### Note that the model is not hierarchically well formulated.
### A hierarchical model would make no sense in this particular scenario. Why?
### What would be the interpretation of this model?
summary(mContH <- lm(y ~ t27 + I(t27^2) + (t27 > 0) + (t27 > 0):t27 +
                       (t27 > 0):I(t27^2),
                     weights = n, data = Hlavy))

### The regression coefficients of the model
### (mCont in particular, but mContH as well)
### can also be estimated by the standard OLS method from the data
### where each row is repeated Hlavy$n times.

### Creating such an artificial dataset
iii <- rep(1:nrow(Hlavy), Hlavy$n)
Hlavy2 <- Hlavy[iii, ]   # new dataset

### Comparing OLS (with repeated observations) with GLS:
summary(mCont2 <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2),
                     data = Hlavy2))
summary(mCont)
### The OLS approach can only be used to obtain the point estimates.
### It cannot be used for inference.
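### In particular, the two fits have the same coefficients but very different
### residual degrees of freedom (nrow(Hlavy) minus the number of parameters
### vs. sum(Hlavy$n) minus the number of parameters), hence different
### estimates of the residual standard error
c(df.residual(mCont), df.residual(mCont2))
c(summary(mCont)$sigma, summary(mCont2)$sigma)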
### Just for comparison: the GLS fit versus the OLS (ordinary least squares)
### fit which does not take the weights into account
summary(mContOLS <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2),
                       data = Hlavy))

### Visual comparison of the models
fitContOLS <- predict(mContOLS, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitContOLS, col = "blue", lwd = 2)
legend(33, 7, legend = c("GLS", "OLS"), col = c("red2", "blue"),
       lwd = 2, lty = 1)
### Do you have an explanation why the red line does not follow the points
### corresponding to low values of t so tightly?

### Piecewise quadratic model whose regression function is SMOOTH at t = 27.
### Generalized least squares (GLS) used to calculate the estimates.
summary(mSmooth <- lm(y ~ t27 + I(t27^2) + (t27 > 0):I(t27^2),
                      weights = n, data = Hlavy))

fitSmooth <- predict(mSmooth, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
legend(33, 7, legend = c("Continuous", "Smooth"), col = c("red2", "blue"),
       lwd = 2, lty = 1)

### Piecewise quadratic model whose regression function is not even
### continuous at t = 27.
### Generalized least squares (GLS) used to calculate the estimates.
summary(mJump <- lm(y ~ t27 + I(t27^2) + (t27 > 0) + (t27 > 0):t27 +
                      (t27 > 0):I(t27^2),
                    weights = n, data = Hlavy))

### QUESTIONS:
### Can you find the p-value of the test
###
###   H0: the jump of m(t) at t = 27 is equal to 0
###
### against
###
###   H1: the jump of m(t) at t = 27 is not equal to 0?

fitJump <- predict(mJump, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
lines(tgrid, fitJump, col = "darkgreen", lwd = 2)
legend(33, 7, legend = c("Jump", "Continuous", "Smooth"),
       col = c("darkgreen", "red2", "blue"), lwd = 2, lty = 1)
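### One way to obtain the p-values asked for in this block: mSmooth, mCont,
### and mJump are successively nested, so the F-tests between them test the
### smoothness hypothesis and the zero-jump hypothesis, respectively.
### (Up to the residual-variance estimate used in the denominator, these
### agree with the tests from Anova(mCont, type = "II") and summary(mJump).)
anova(mSmooth, mCont, mJump)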