### --------------------------------------------------------------------------------------### ### Winter Term 2019/2020 ### ### NMSA407 - Linear Regression ### ### ### ### EXERCISE #5 Different parametrizations of one-way classified group means ### ### ### ### --------------------------------------------------------------------------------------### # In K4/K10/K11 you might not be able to load the package 'mffSM' because of a low # version of R software (< 3.2.0). In this case tyou can run R software from the # network disc L, where the version 3.2.2. is installed. The path is # L:\Statistika\R\bin\x64\Rgui.exe ### ====================== ### Working directory ### ====================== setwd("H:/nmsa407_LinRegr/") library("mffSM") data(Koreny) # "Koreny" means roots in English. help(Koreny) head(Koreny) summary(Koreny) table(Koreny$percentage) ### In the sequel it will be useful to consider the percentage as a categorical covariate Koreny$fpercentage <- factor(Koreny$percentage) summary(Koreny) ### Exploratory analysis: two figures par(mfrow=c(2, 1), bty="n") plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange") plot(weight ~ fpercentage, data=Koreny, col="sandybrown") par(mfrow=c(1, 1)) ### ================================================================ ### Different parametrization of one-way classified group means ### ### ================================================================ ### number of the groups print(G <- nlevels(Koreny$fpercentage)) ### Standard ANOVA table aov(weight~fpercentage, data=Koreny) summary(aov(weight ~ fpercentage, data=Koreny)); ### QUESTIONS ### What are the assumptions of the underlying model? ### What is the conclusion? ### 1. Full-rank parameterization without intercept ### ----------------------------------------------- aWithoutInt <- lm(weight ~ -1 + fpercentage, data=Koreny); summary(aWithoutInt) ## The model matrix of X looks like this model.matrix(aWithoutInt) ## Unique rows of the model matrix X unique(model.matrix(aWithoutInt)) ### QUESTIONS: ### Which of the numbers in the output are useful within the problem context? aggregate(Koreny$weight, list(Koreny$fpercentage), mean) aWithoutInt$coef ### Note that as the intercept is not explicitly included, many of the numbers ### in the output are useless. Among others: ### Multiple R-squared: - see help(summary.lm) ### F-statistic - it is calculated from 'Multiple R-squared' and thus it is also useless. ### R-squared in a model wihtout intercept (R2.0<-sum(fitted(aWithoutInt)^2)/sum(Koreny$weight^2)) # R-squared and a test of a submodel asub0 = lm(weight~-1,data=Koreny) anova(asub0,aWithoutInt) summary(aWithoutInt)$fstatistic # manually computed F-statistic n = nrow(Koreny) (n-G)/(G-0)*(R2.0/(1-R2.0)) ### Nevertheless as the intercept is implicitly present, it makes ### sense to calculate the multiple R-squared. One can do it ### 'manually' as ### R-squared in a model with intercept (R2<-var(fitted(aWithoutInt))/var(Koreny$weight)) ### (This is the usual calculation of R-squared for models with intercept) asub = lm(weight~1, data=Koreny) anova(asub,aWithoutInt) # manually computed F-statistic (n-G)/(G-1)*R2/(1-R2) ### Moreover the output of 'summary()' can be used to ### estimate any linear combination of the group expectations ### and the appropriate standard error and confidence interval. ### Task: Estimate the difference in expectations of the groups ### for the percentage of sugar equal to 2 and equal to 0. ### The vector to create the linear combination of regression coefficients levels(Koreny$fpercentage) l <- c(-1,1,0,0); ### The corresponding point estimate coef(aWithoutInt)[2] - coef(aWithoutInt)[1] ### or equivalently (estim <- c(l%*%coef(aWithoutInt))) ### Standard errors - in this parametrization the covariance matrix of ### the estimated coefficients is diagonal vcov(aWithoutInt) ### So, the standard error can be calculated by the formula given below SE <- summary(aWithoutInt)$coefficients[,2]; sqrt(SE[1]^2 + SE[2]^2) ### or equivalently (SE.estim <- sqrt(t(l)%*%vcov(aWithoutInt)%*%l)) ### Confidence interval DF <- aWithoutInt$df.residual; # degrees of freedom estim + c(-1,1)*c(SE.estim*qt(0.975, df=DF)) # The estimates above (and even more) can be simply calculated # with the help of function LSest from the package "mffSM" LSest(aWithoutInt, l) # manual computation of the t value and the P value estim/SE.estim 2*pt(estim/SE.estim,df=DF,lower.tail=FALSE) ### QUESTIONS: ### Let mu[j] stand for the expectation of the group with percentage = j (j=1,2,3,4) ### With the help of model 'aWithoutInt' test the null hypohesis ### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0 l <- c(1,1,-1,-1); # point estimate (estim <- c(l%*%coef(aWithoutInt))) # estimated SE sqrt(sum(SE^2)) (SE.estim <- sqrt(t(l)%*%vcov(aWithoutInt)%*%l)) # confidence interval as.numeric(estim) + c(-1,1)*c(SE.estim)*qt(0.975, df=DF) # ...or simply also LSest(aWithoutInt, l) ### ### With the help of model 'aWithoutInt' test the null hypohesis ### that 'mu[1] = mu[2] and at the same time mu[3] = mu[4]' ### against the alternative hypothesis that ### 'mu[1] is not equal to mu[2] and/or mu[3] is not equal to mu[4]'. (L <- matrix(rbind(c(1,-1,0,0), c(0,0,1,-1)), nrow=2, ncol=4)) m <- nrow(L); # point estimates (thetahat <- L%*%coef(aWithoutInt)) # test statistic - a quadratic form, F-distributed (Q <- 1/m * t(thetahat)%*%solve(L%*%vcov(aWithoutInt)%*%t(L))%*%thetahat) 1 - pf(Q, df1=m, df2=DF) # another way of testing the same hypothesis: a submodel where # mu[1] and mu[2] are considered the same regressor, and mu[3] and mu[4] are the same asub = lm(weight ~ -1 + I(percentage<3), data=Koreny); summary(asub) anova(asub,aWithoutInt) # compare with the following output LSest(aWithoutInt,L=L) # LSest considers the rows of L to correspond to two separate tests! # This is not the correct test in our situation ### 2. Contr.treatment: Reference group pseudocontrast with the reference ### given by the first group (default in R) ### ---------------------------------------------------------------------- ### Default options for contrasts options()$contrasts; a <- lm(weight ~ fpercentage, data=Koreny) summary(a) # compare with aggregate(Koreny$weight, list(Koreny$fpercentage), mean) ### (Pseudo)Contrast matrix C.treat (C.treat <- contr.treatment(G)) ### Model matrix (unique rows only) unique(model.matrix(a)) # the final model from weight~fpercentage - 1 and weight ~ fpercentage is the same all.equal(fitted(aWithoutInt), fitted(a)) ### QUESTIONS: ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 0 and equal to 4. ### # point estimate means <- tapply(Koreny$weight, Koreny$percentage, mean) names(means) <- paste(names(means), " % sugar", sep="") unname(means[3]-means[1]) a$coef[3] # the complete matrix of contrasts C = cbind(1,contr.treatment(G)) rownames(C) = levels(Koreny$fpercentage) colnames(C) = paste("b",1:4,sep="") C # its inverse (C.treat.inv = solve(C)) # the difference in means that we consider l = c(1,0,-1,0) l%*%means # and the same difference in the basis given by contrasts (lc<-l%*%C) lc%*%a$coef ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 2 and equal to 4. # the difference we consider l = c(0,1,-1,0) l%*%means # and the same in the basis of contrasts (lc<-l%*%C) lc%*%a$coef ### Comparison of the regression coefficients and sample means ### What did we obtain? print(means) data.frame(Beta=coef(a), Mu=means) ### The sample means can be calculated from the regression coefficients as Beta.treat <- coef(a) ### Sample means Beta.treat[1] + C.treat %*% Beta.treat[-1] C%*%Beta.treat ### Thus the full-rank parameterization without intercept can be recovered as LSest(a, L=C) summary(aWithoutInt)$coef # the inverse transformation (C.treat.inv) LSest(aWithoutInt, L = C.treat.inv) summary(a)$coef # the models are equivalent, but - again - be careful about the F-statistics and R-squared summary(aWithoutInt) summary(a) ### QUESTIONS: ### What are the assumptions of the model? ### Compare the following outputs: summary(a); summary(aov(weight ~ fpercentage, data=Koreny)); anova(a); ### QUESTIONS: ### Let mu[j] stand for the expectation of the group with percentage = j (j=1,2,3,4) ### With the help of model 'a' test the null hypohesis ### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0 l = c(1,1,-1,-1) l%*%means (lc<-l%*%C) LSest(a,L=lc) LSest(aWithoutInt, l) ### 3. Contr.SAS: Reference group pseudocontrast with the reference given by the last group ### --------------------------------------------------------------------------------------- aSAS <- lm(weight ~ fpercentage, data=Koreny, contrasts = list(fpercentage = contr.SAS)); summary(aSAS) ### Model matrix (unique rows only) unique(model.matrix(aSAS)) ### QUESTIONS: ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 0 and equal to 6. ### # analogously as above (C = cbind(1,contr.SAS(G))) (C.inv = solve(C)) l = c(1,0,0,-1) l%*%means (lc<-l%*%C) lc%*%aSAS$coef # reconstruct mu[4] l = c(0,0,0,1) l%*%means (lc<-l%*%C) lc%*%aSAS$coef ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 2 and equal to 4. means[2]-means[3] l = c(0,1,-1,0) l%*%means (lc<-l%*%C) lc%*%aSAS$coef ### Comparison of the regression coefficients and sample means print(means) data.frame(Beta=coef(aSAS), Mu=means) ### The sample means can be calculated from the regression coefficients as (C.SAS <- contr.SAS(G)); Beta.SAS <- coef(aSAS); ### Sample means Beta.SAS[1] + C.SAS %*% Beta.SAS[-1] C%*%Beta.SAS ### Thus the full-rank parameterization without intercept can be again recovered as LSest(aSAS, L=cbind(1, C.SAS)) summary(aWithoutInt)$coef ### 4. Contr.sum: Sum contrasts (reference = the last group) ### -------------------------------------------------------- (C.Sum <- contr.Sum(G)); aSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.sum)) summary(aSum) ### Model matrix unique(model.matrix(aSum)) ### QUESTIONS: ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 0 and equal to 4. ### C = cbind(1,contr.Sum(G)) rownames(C) = levels(Koreny$fpercentage) colnames(C) = paste("b",1:4,sep="") C (C.inv = solve(C)) ### Intercept is the average of the group means (not the average weight!) mean(means) coef(aSum) l = c(1,0,-1,0) l%*%means (lc<-l%*%C) lc%*%aSum$coef ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 0 and equal to 6. l = c(1,0,0,-1) l%*%means (lc<-l%*%C) lc%*%aSum$coef ### Comparison of the regression coefficients and sample means print(means) data.frame(Beta=coef(aSum), Mu=means) ### The sample means can be calculated from the regression coefficients as (C.Sum <- contr.Sum(G)); C Beta.Sum <- coef(aSum); ### Sample means Beta.Sum[1] + C.Sum %*% Beta.Sum[-1] C%*%Beta.Sum ### Thus the full-rank parameterization without intercept can be recovered as LSest(aSum, L=C) summary(aWithoutInt)$coef ### QUESTIONS: ### Let mu[j] stand for the expectation of the group with percentage = j ### With the help of model 'aSum' test the null hypothesis ### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0 l = c(1,1,-1,-1) l%*%means (lc<-l%*%C) LSest(aSum,L=lc) LSest(aWithoutInt, l) ### 5. Weighted sum contrasts (reference = the last group) ### ------------------------------------------------------ ### Note that this method is not readily available in the standard R-software instalation, ### therefore, we need to prepare the (pseudo)contrast matrix by ourselves. ### See Section 8.4 of the course notes (page 170) ### Group sample sizes (ng <- table(Koreny$percentage)); # formula (8.28) C.wSum <- rbind(diag(G - 1), - ng[-G] / ng[G]) rownames(C.wSum) <- levels(Koreny[["fpercentage"]]) print(C.wSum) # custom contrast specification awSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=C.wSum)) summary(awSum) ### Model matrix unique(model.matrix(awSum)) ### Intercept is now the mean weight (of all responses) awSum$coef mean(Koreny$weight) ### QUESTIONS: ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 0 and equal to 4. ### C = cbind(1,C.wSum) rownames(C) = levels(Koreny$fpercentage) colnames(C) = paste("b",1:4,sep="") C (C.inv = solve(C)) l = c(1,0,-1,0) l%*%means (lc<-l%*%C) lc%*%aSum$coef ### Based on the above output estimate the difference in expectations of the groups ### for the percentage of sugar equal to 0 and equal to 6. l = c(1,0,0,-1) l%*%means (lc<-l%*%C) lc%*%awSum$coef ### Comparison of the regression coefficients and sample means print(means) data.frame(Beta=coef(awSum), Mu=means) ### The sample means can be calculated from the regression coefficients as Beta.wSum <- coef(awSum); ### Sample means Beta.wSum[1] + C.wSum %*% Beta.wSum[-1] C%*%Beta.wSum ### And thus, the full-rank parameterization without intercept can be recovered as LSest(awSum, L=cbind(1, C.wSum)) summary(aWithoutInt) ### 6. Orthonormal polynomial contrasts ### ------------------------------------ (C.poly <- contr.poly(G)); ### Check orthogonality -- the columns of the matrix are orthonormal round(t(C.poly)%*%C.poly,10) ### Figure illustrating C.poly plot(levels(Koreny$fpercentage), C.poly[,1], ylab="", col="blue", xlab="sugar percentage", cex=2, main="Orthonormal polynomial contrasts"); lines(levels(Koreny$fpercentage), C.poly[,1], col="blue", lwd=2); # points(levels(Koreny$fpercentage), C.poly[,2], col="red3", pch=2); lines(levels(Koreny$fpercentage), C.poly[,2], col="red3", lwd=2); # points(levels(Koreny$fpercentage), C.poly[,3], col="darkgreen", pch=3); lines(levels(Koreny$fpercentage), C.poly[,3], col="darkgreen", lwd=2); # legend(x=2.5, y=0.65, pch=1:3, col=c("blue", "red3", "darkgreen"), lty=rep("solid", 3), legend=c("first column (.L)", "second column (.Q)", "third column (.C)")) ### Model matrix unique(model.matrix(aPoly)) aPoly <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.poly)) summary(aPoly) ### Model aPoly suggests that the percentage can be considered as ### an equidistant numerical covariate and the evolution of the group means can be ### described by a quadratic function. plot(weight ~ fpercentage, data=Koreny, col="sandybrown"); ### A formal test for H_0: beta_3 = 0 l <- c(0,0,0,1); LSest(aPoly, l) ### A formal test for H_0: beta_2 = 0 & beta_3 = 0 DF <- aPoly$df.residual; L <- matrix(rbind(c(0,0,1,0), c(0,0,0,1)), nrow=2, ncol=4) m <- nrow(L); thetahat <- L%*%coef(aPoly); ### test-statistic (Q <- 1/m * t(thetahat)%*%solve(L%*%vcov(aPoly)%*%t(L))%*%thetahat) ### p-value 1 - pf(Q, df1=m, df2=DF) ### A formal test for H_0: beta_1 = 0 & beta_2 = 0 & beta_3 = 0 DF <- aPoly$df.residual; L <- matrix(rbind(c(0,1,0,0), c(0,0,1,0), c(0,0,0,1)), nrow=3, ncol=4) m <- nrow(L); thetahat <- L%*%coef(aPoly); ### test-statistic (Q <- 1/m * t(thetahat)%*%solve(L%*%vcov(aPoly)%*%t(L))%*%thetahat) ### p-value 1 - pf(Q, df1=m, df2=DF) ### The sample means can be calculated from the regression coefficients as Beta.Poly <- coef(aPoly); ###Sample means Beta.Poly[1] + C.poly %*% Beta.Poly[-1] ### Thus the full-rank parameterization without intercept can be recovered as LSest(aPoly, L=cbind(1, C.poly)) summary(aWithoutInt)$coef ### Note: ### In practice it might be of interest to take into the account the issue of multiple testing ### and sometimes also to make all the pairwise comparisons. This will be ### discussed in the second half of the semester. ### ============================================================================ ### ### The linear model with the percentage of sugar taken as a numeric covariate ### ### ============================================================================ ### ### Assuming independence summary(a0 <- lm(weight ~ 1, data=Koreny)) ### Assuming linear dependence summary(a1 <- lm(weight ~ percentage, data=Koreny)) par(mfrow=c(1, 1), bty="n") plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange") abline(a1, col="darkblue", lwd=3) ### Assuming quadratic model summary(a2 <- lm(weight ~ percentage + I(percentage^2), data=Koreny)) pKoreny <- data.frame(percentage=seq(0, 6, length=101)) fit2 <- predict(a2, newdata=pKoreny) ### Figure par(mfrow=c(1, 1), bty="n") plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange") lines(fit2 ~ percentage, col="darkgreen", data=pKoreny, lwd=3) abline(a1, col="darkblue", lwd=3) legend(1.5, 0.2, legend=c("Linear", "Quadratic"), lty=1, col=c("darkblue", "darkgreen"), lwd=3) ### QUESTIONS: ### Estimate the expectation of weight for percentage of sugar = 1. ### Is there any problem with such a prediction? (pred1 = predict(a2, newdata=data.frame(percentage=1))) points(1,pred1,col="seagreen",bg="darkgreen",pch=23,cex=1.5) ### Assuming a cubic model summary(a3 <- lm(weight ~ percentage + I(percentage^2) + I(percentage^3), data=Koreny)) fit3 <- predict(a3, newdata=pKoreny) ### Figure means <- tapply(Koreny$weight, Koreny$percentage, mean) BARVY <- c(linear="darkblue", kvadr="darkgreen", kubic="red") par(mfrow=c(1, 1), bty="n") plot(weight ~ percentage, data=Koreny, col="grey50") abline(a1, col=BARVY["linear"], lwd=3) lines(fit2 ~ percentage, col=BARVY["kvadr"], data=pKoreny, lwd=3) lines(fit3 ~ percentage, col=BARVY["kubic"], data=pKoreny, lwd=3) points(means ~ seq(0, 6, by=2), pch=15, col="orange", cex=1.5) legend(1, 0.2, legend=c("Linear", "Quadratic", "Cubic"), lty=1, col=BARVY, lwd=3) ### predicted means are the same as the observed means now predict(a3,newdata = data.frame(percentage=seq(0, 6, length=4))) means ### model with a fourth degree polynomial lm(weight ~ poly(percentage,degree=4), data=Koreny) a4<-lm(weight ~ percentage + I(percentage^2) + I(percentage^3)+I(percentage^4), data=Koreny) fit4 <- predict(a4, newdata=pKoreny) lines(fit4 ~ percentage, col="deeppink3", data=pKoreny, lwd=3) ### Recall the contr.treatment() parametrization summary(a) ### QUESTIONS: ### Explain why models a3, a and aPoly are equivalent. ### Explain why model a2 is a submodel of a. ### Explain why model a1 is a submodel of a. summary(aPoly)$coef aPoly2 = lm(weight ~ poly(percentage,degree=3), data=Koreny) summary(aPoly2)$coef fitted.all = cbind(fitted(a),fitted(a3),fitted(aPoly),fitted(aPoly2)) colnames(fitted.all) = c("~fpercentage","~poly(percentage,3,raw)","~contr.poly(fpercentage)","~poly(percentage,3)") unique(fitted.all) anova(a2,a) anova(a1,a)