### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression                                              ###
### Winter Term 2021/2022                                                    ###
###                                                                          ###
### EXERCISE #5 Various parametrizations for factor covariates/group means   ###
### -------------------------------------------------------------------------###

### -------------------------------------------------------------------------###
### Block 1: TEACHING MATERIAL                                               ###
### -------------------------------------------------------------------------###

library("mffSM")

data(Koreny)   ### "Koreny" means roots in English.
help(Koreny)

dim(Koreny)
head(Koreny)
summary(Koreny)

table(Koreny$percentage)

### the percentage variable will be considered as a categorical covariate
Koreny$fpercentage <- factor(Koreny$percentage)
summary(Koreny)

### Exploratory analysis
par(mfrow=c(2, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
plot(weight ~ fpercentage, data=Koreny, col="sandybrown")
par(mfrow=c(1, 1))

### number of groups
print(G <- nlevels(Koreny$fpercentage))

### Standard ANOVA
aov(weight ~ fpercentage, data=Koreny)
summary(aov(weight ~ fpercentage, data=Koreny))
### What are the assumptions of the underlying model? What is the conclusion?

### 1. FULL-RANK PARAMETRIZATION WITHOUT INTERCEPT
aWithoutInt <- lm(weight ~ -1 + fpercentage, data=Koreny)
summary(aWithoutInt)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a * (x == 0) + b * (x == 2) + c * (x == 4) + d * (x == 6))
}
### What are the limitations of this function?

### The model matrix
model.matrix(aWithoutInt)
### with unique rows only
unique(model.matrix(aWithoutInt))

aggregate(Koreny$weight, list(Koreny$fpercentage), mean)
aWithoutInt$coef

### Note that as the intercept is not explicitly included, many of the numbers
### in the output are useless. Among others:

### Multiple R-squared:
(1 - (aWithoutInt$df.residual * (summary(aWithoutInt)$sigma^2)) / sum(Koreny$weight^2))
(1 - (sum(aWithoutInt$residuals^2)) / sum(Koreny$weight^2))
(R2 <- sum(fitted(aWithoutInt)^2) / sum(Koreny$weight^2))

### F-statistic (calculated from 'Multiple R-squared') is also useless:
((sum(Koreny$weight^2) - sum(aWithoutInt$residuals^2)) / G) /
  (sum(aWithoutInt$residuals^2) / (nrow(Koreny) - G))
(nrow(Koreny) - G) / (G - 0) * (R2 / (1 - R2))

### Nevertheless, as the intercept is implicitly present, it makes
### sense to calculate the multiple R-squared - manually:

### R-squared in a model with intercept
(1 - (sum(aWithoutInt$residuals^2)) / sum((Koreny$weight - mean(Koreny$weight))^2))
(R2 <- sum((fitted(aWithoutInt) - mean(fitted(aWithoutInt)))^2) /
       sum((Koreny$weight - mean(Koreny$weight))^2))

### manually computed F-statistic (here 3 = G - 1 and 50 = n - G)
((sum((Koreny$weight - mean(Koreny$weight))^2) - sum(aWithoutInt$residuals^2)) / 3) /
  (sum(aWithoutInt$residuals^2) / 50)
(nrow(Koreny) - G) / (G - 1) * R2 / (1 - R2)

### Moreover, the output of the 'summary()' table can be used to estimate any
### linear combination of the group expectations together with the appropriate
### standard error and confidence interval.

### Task: Estimate the difference in expectations of the groups for the
### percentage of sugar equal to 2 and equal to 0.
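### A general remark before the computation (a small sketch; the helper
### 'lincomb' below is NOT part of the course material and only mirrors what
### LSest() from "mffSM" does): for a vector l, the estimate of l'beta is
### l'betahat, its standard error is sqrt(l' vcov(betahat) l), and the
### confidence interval uses the t-quantile with the residual degrees of freedom.
lincomb <- function(model, l, level = 0.95) {
  est <- sum(l * coef(model))                              # point estimate l'betahat
  se  <- sqrt(drop(t(l) %*% vcov(model) %*% l))            # standard error
  tq  <- qt(1 - (1 - level) / 2, df = model$df.residual)   # t-quantile
  c(estimate = est, std.error = se, lower = est - tq * se, upper = est + tq * se)
}
### e.g. lincomb(aWithoutInt, c(-1, 1, 0, 0))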
### The vector to create the linear combination of regression coefficients
levels(Koreny$fpercentage)
l <- c(-1, 1, 0, 0)

### The corresponding point estimate
coef(aWithoutInt)[2] - coef(aWithoutInt)[1]
### or equivalently
(estim <- c(l %*% coef(aWithoutInt)))

### Standard errors - in this parametrization the covariance matrix of
### the estimated coefficients is diagonal - can you explain why?
vcov(aWithoutInt)

### So, the standard error can be calculated by the formula given below
SE <- summary(aWithoutInt)$coefficients[, 2]
sqrt(SE[1]^2 + SE[2]^2)
### or equivalently
(SE.estim <- sqrt(t(l) %*% vcov(aWithoutInt) %*% l))

### Confidence interval
DF <- aWithoutInt$df.residual   # degrees of freedom
estim + c(-1, 1) * c(SE.estim * qt(0.975, df=DF))

### or equivalently, using the function LSest from the package "mffSM"
LSest(aWithoutInt, l)

### QUESTIONS:
### Let mu[j] stand for the expectation of the group with percentage = j for
### j = 1, 2, 3, 4. With the help of model 'aWithoutInt' test the null hypothesis
### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0
l <- c(1, 1, -1, -1)
### point estimate
(estim <- c(l %*% coef(aWithoutInt)))
### estimated SE
sqrt(sum(SE^2))
(SE.estim <- sqrt(t(l) %*% vcov(aWithoutInt) %*% l))
### confidence interval
as.numeric(estim) + c(-1, 1) * c(SE.estim) * qt(0.975, df=DF)
### or, simply (and equivalently)
LSest(aWithoutInt, l)

### With the help of model 'aWithoutInt' test the null hypothesis
### that 'mu[1] = mu[2] and at the same time mu[3] = mu[4]'
### against the alternative hypothesis that
### 'mu[1] is not equal to mu[2] and/or mu[3] is not equal to mu[4]'.
(L <- matrix(rbind(c(1, -1, 0,  0),
                   c(0,  0, 1, -1)), nrow=2, ncol=4))
m <- nrow(L)

### point estimates
(thetahat <- L %*% coef(aWithoutInt))

### test statistic - a quadratic form, F-distributed under the null hypothesis
(Q <- 1/m * t(thetahat) %*% solve(L %*% vcov(aWithoutInt) %*% t(L)) %*% thetahat)
1 - pf(Q, df1=m, df2=DF)

### another way of testing the same hypothesis: a submodel where the groups with
### mu[1] and mu[2] share one regressor and the groups with mu[3] and mu[4]
### share another
asub <- lm(weight ~ -1 + I(percentage < 3), data=Koreny)
summary(asub)
anova(asub, aWithoutInt)

### compare with the following output, where LSest considers the rows of L to
### correspond to two separate tests, which is not the correct test in this
### situation
LSest(aWithoutInt, L=L)

### 2. CONTR.TREATMENT PARAMETRIZATION | REFERENCE GROUP: THE FIRST GROUP

### Default options for contrasts
options()$contrasts

### model with intercept
a <- lm(weight ~ fpercentage, data=Koreny)
summary(a)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 2) + c * (x == 4) + d * (x == 6))
}
# compare with
aggregate(Koreny$weight, list(Koreny$fpercentage), mean)

### (Pseudo)Contrast matrix C.treat
(C.treat <- contr.treatment(G))

### Model matrix (unique rows only)
unique(model.matrix(a))

### the fitted values from weight ~ fpercentage - 1 and weight ~ fpercentage
### are the same
all.equal(fitted(aWithoutInt), fitted(a))

### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 0 and equal to 4.
means <- tapply(Koreny$weight, Koreny$percentage, mean)
names(means) <- paste(names(means), " % sugar", sep="")
unname(means[3] - means[1])
### or equivalently
a$coef[3]

### the complete matrix of contrasts
C <- cbind(1, contr.treatment(G))
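### A small check (a sketch added here, not part of the original assignment):
### the matrix C maps the treatment-contrast coefficients back to the group
### means, since mu[1] = beta_0 and mu[j] = beta_0 + beta_j for j > 1.
all.equal(unname(drop(C %*% coef(a))), unname(means))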
### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 2 and equal to 4.

# the difference we consider
l <- c(0, 1, -1, 0)
l %*% means

### Thus the full-rank parametrization without intercept can be recovered as
LSest(a, L=C)
summary(aWithoutInt)$coef
### or vice versa (using the inverse of C)
LSest(aWithoutInt, L=solve(C))
summary(a)$coef

### thus, the models are equivalent, but one must be careful about the
### F-statistics and R-squared, which are generally different
summary(aWithoutInt)
summary(a)

### 3. CONTR.SUM: SUM CONTRASTS
aSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.sum))
summary(aSum)

C <- cbind(1, contr.sum(G))

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 0) + c * (x == 2) + d * (x == 4) - (b + c + d) * (x == 6))
}

### Model matrix
unique(model.matrix(aSum))

### Intercept is now the average of the group means (not the average weight!)
mean(Koreny$weight)
mean(means)
coef(aSum)

### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 0 and equal to 6.
means[1] - means[4]
l <- c(1, 0, 0, -1)
l %*% means
(lc <- l %*% C)
lc %*% aSum$coef
LSest(aSum, lc)

### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(aSum), Mu=means)

### Thus the full-rank parametrization without intercept can be recovered as
LSest(aSum, L=C)
summary(aWithoutInt)$coef
### or vice versa again
LSest(aWithoutInt, L=solve(C))
summary(aSum)$coef

### 4. WEIGHTED SUM CONTRASTS
### Not available in the standard R installation; therefore, we need to
### prepare the (pseudo)contrast matrix ourselves.

### Group sample sizes
(ng <- table(Koreny$percentage))

### (pseudo)contrast matrix
C.wSum <- rbind(diag(G - 1), -ng[-G] / ng[G])
rownames(C.wSum) <- levels(Koreny[["fpercentage"]])
print(C.wSum)

### custom contrast specification
awSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=C.wSum))
summary(awSum)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 0) + c * (x == 2) + d * (x == 4) -
           (1.25 * b + c + 1.25 * d) * (x == 6))
}

### Model matrix - what is it good for?
unique(model.matrix(awSum))

### Intercept is now the mean weight (of all responses - not the mean of the
### group means - as it was before)
mean(means)
mean(Koreny$weight)
awSum$coef[1]
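### Why (a small check, not in the original material): with the weighted-sum
### contrasts the intercept equals the weighted average of the group means,
### with weights n_g / n, which is exactly the overall sample mean.
sum(ng * means) / sum(ng)
mean(Koreny$weight)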
### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 0 and equal to 4.
C <- cbind(1, C.wSum)
rownames(C) <- levels(Koreny$fpercentage)
means[1] - means[3]
l <- c(1, 0, -1, 0)
l %*% means
(lc <- l %*% C)
lc %*% awSum$coef
LSest(awSum, lc)

### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(awSum), Mu=means)

### Full-rank parametrization without intercept recovery
LSest(awSum, L=cbind(1, C.wSum))
summary(aWithoutInt)
### and again, vice versa
LSest(aWithoutInt, L=solve(cbind(1, C.wSum)))
summary(awSum)

### LINEAR MODEL WITH PERCENTAGES TAKEN AS A NUMERICAL COVARIATE

### Assuming independence
summary(a0 <- lm(weight ~ 1, data=Koreny))
par(mfrow=c(1, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
abline(h=coef(a0), col="red")

### Assuming linear dependence
summary(a1 <- lm(weight ~ percentage, data=Koreny))
abline(a1, col="darkblue", lwd=3)

### Assuming quadratic dependence (but still a linear model)
summary(a2 <- lm(weight ~ percentage + I(percentage^2), data=Koreny))
xSeq <- seq(0, 6, length=100)
ySeq <- cbind(rep(1, 100), xSeq, xSeq^2) %*% a2$coeff
lines(ySeq ~ xSeq, col="green", lwd=3)

### Assuming cubic dependence (but still a linear model!!!)
summary(a3 <- lm(weight ~ percentage + I(percentage^2) + I(percentage^3), data=Koreny))
### recall that there are four parameters in the model now
### (and four unique x values/groups too) - what does it mean?
ySeq2 <- cbind(rep(1, 100), xSeq, xSeq^2, xSeq^3) %*% a3$coeff
lines(ySeq2 ~ xSeq, col="orange", lwd=3)

### how do you explain the following?
points(means ~ c(0, 2, 4, 6), pch=21, bg="red", cex=2)

### The models with the percentage being a four-level factor are all equivalent
### in terms of the fitted values (for groups with the percentages 0, 2, 4,
### and 6) to the model where the percentages are considered to be numerical
### values and a cubic curve is used to fit the data (thus, four parameters
### are used)!
(predict(a3, newdata=data.frame(percentage=c(0, 2, 4, 6))))
### and compare with
(means)
(aWithoutInt$coeff)

### or in more detail
p <- c(0, 2, 4, 6)
L <- matrix(c(p^0, p, p^2, p^3), nrow=4, byrow=FALSE)
LSest(a3, L)
### versus
summary(aWithoutInt)

### but this is not (generally) achieved for other (sub)models
LSest(a2, L[, 1:3])
LSest(a1, L[, 1:2])

### Estimate the expectation of weight for percentage of sugar = 1.
### Is there any problem with such a prediction?
(pred1 <- predict(a2, newdata=data.frame(percentage=1)))
points(1, pred1, col="black", bg="darkgreen", pch=23, cex=1.5)
(pred2 <- predict(a3, newdata=data.frame(percentage=1)))
points(1, pred2, col="black", bg="darkorange", pch=23, cex=1.5)

### -------------------------------------------------------------------------###
### Block 2: INDIVIDUAL WORK                                                 ###
### -------------------------------------------------------------------------###

### What are the assumptions of the model 'a'? Compare the following outputs:
summary(a)
summary(aov(weight ~ fpercentage, data=Koreny))
anova(a)
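### A short check (a sketch, not part of the original questions): the overall
### F-test printed by summary(a) is the same test as the one reported by
### anova(a) and by the aov() table above.
unname(summary(a)$fstatistic["value"])
anova(a)[1, "F value"]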
### 5. CONTR.SAS PARAMETRIZATION | REFERENCE GROUP: THE LAST GROUP
### An alternative parametrization with a reference group, but in this
### case the reference category is the last group
aSAS <- lm(weight ~ fpercentage, data=Koreny,
           contrasts=list(fpercentage=contr.SAS))
summary(aSAS)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 0) + c * (x == 2) + d * (x == 4))
}

### Model matrix (unique rows only)
(C <- unique(model.matrix(aSAS)))

### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 2 and equal to 4.
means[2] - means[3]
l <- c(0, 1, -1, 0)
l %*% means
(lc <- l %*% C)
lc %*% aSAS$coef
LSest(aSAS, lc)

### Again, the sample means can be calculated from the regression coefficients
LSest(aSAS, L=C)
summary(aWithoutInt)$coef

### 6. ORTHOGONAL CONTRASTS
(C.poly <- contr.poly(G))

### Check orthogonality -- the columns of the matrix are orthonormal
round(t(C.poly) %*% C.poly, 10)

### Illustration of C.poly - recall that again four parameters are used
plot(levels(Koreny$fpercentage), C.poly[, 1], ylab="", col="blue",
     xlab="sugar percentage", cex=2, main="Orthonormal polynomial contrasts")
lines(levels(Koreny$fpercentage), C.poly[, 1], col="blue", lwd=2)
#
points(levels(Koreny$fpercentage), C.poly[, 2], col="red3", pch=2)
lines(levels(Koreny$fpercentage), C.poly[, 2], col="red3", lwd=2)
#
points(levels(Koreny$fpercentage), C.poly[, 3], col="darkgreen", pch=3)
lines(levels(Koreny$fpercentage), C.poly[, 3], col="darkgreen", lwd=2)
#
legend(x=2.5, y=0.65, pch=1:3, col=c("blue", "red3", "darkgreen"),
       lty=rep("solid", 3),
       legend=c("first column (.L)", "second column (.Q)", "third column (.C)"))

### the model with the orthogonal contrasts
aPoly <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.poly))
summary(aPoly)

### Model aPoly suggests that the percentage can be considered as
### an equidistant numerical covariate and the evolution of the group means can
### be described by a cubic function.
ySeq <- means
plot(weight ~ fpercentage, data=Koreny, col="sandybrown")
lines(ySeq ~ c(1, 2, 3, 4), col="blue", lwd=2)
points(means ~ c(1, 2, 3, 4), pch=22, bg="red", cex=2)

### this, however, automatically gives an estimate for another level - the
### percentage of 8, for instance (if there were such observations).
X <- NULL
for (i in c(1, 2, 3, 4, 5)){
  X <- rbind(X, c(i^0, i, i^2, i^3))
}
linCoeff  <- solve(X[1:2, 1:2], contr.poly(G)[1:2, 1])
quadCoeff <- solve(X[1:3, 1:3], contr.poly(G)[1:3, 2])
cubCoeff  <- solve(X[1:4, 1:4], contr.poly(G)[, 3])

### verify that you obtained the analytic coefficients for the polynomials in C.poly
cbind(X[1:4, 1:2] %*% linCoeff,  C.poly[, 1])   ## linear term
cbind(X[1:4, 1:3] %*% quadCoeff, C.poly[, 2])   ## quadratic term
cbind(X[1:4, 1:4] %*% cubCoeff,  C.poly[, 3])   ## cubic term

### plot again - with some room for the percentage of 8
plot(weight ~ fpercentage, data=Koreny, col="sandybrown", xlim=c(0.5, 6))
lines(ySeq ~ c(1, 2, 3, 4), col="blue", lwd=2)
points(means ~ c(1, 2, 3, 4), pch=22, bg="red", cex=2)

### the predicted mean for percentage 8 can be obtained by using the linear
### combination as in the underlying model
### (extrapolation using the model with the factor covariate for another level)
percEight <- c(1, X[5, 1:2] %*% linCoeff, X[5, 1:3] %*% quadCoeff,
               X[5, 1:4] %*% cubCoeff) %*% aPoly$coeff
lines(c(ySeq[4], percEight) ~ c(4, 5), col="green", lwd=2)
points(c(ySeq[4], percEight) ~ c(4, 5), pch=22, bg="red", cex=4)

### Do you understand the main idea in these models?
### Can we now reduce the model somehow (this is a hierarchical model)?

### A formal test for H_0: beta_3 = 0
l <- c(0, 0, 0, 1)
LSest(aPoly, l)

### A formal test for H_0: beta_2 = 0 & beta_3 = 0
DF <- aPoly$df.residual
L <- matrix(rbind(c(0, 0, 1, 0),
                  c(0, 0, 0, 1)), nrow=2, ncol=4)
m <- nrow(L)
thetahat <- L %*% coef(aPoly)
### test statistic
(Q <- 1/m * t(thetahat) %*% solve(L %*% vcov(aPoly) %*% t(L)) %*% thetahat)
### p-value
1 - pf(Q, df1=m, df2=DF)

### A formal test for H_0: beta_1 = 0 & beta_2 = 0 & beta_3 = 0
DF <- aPoly$df.residual
L <- matrix(rbind(c(0, 1, 0, 0),
                  c(0, 0, 1, 0),
                  c(0, 0, 0, 1)), nrow=3, ncol=4)
m <- nrow(L)
thetahat <- L %*% coef(aPoly)
### test statistic
(Q <- 1/m * t(thetahat) %*% solve(L %*% vcov(aPoly) %*% t(L)) %*% thetahat)
### p-value
1 - pf(Q, df1=m, df2=DF)

### The sample means can be calculated from the regression coefficients as
Beta.Poly <- coef(aPoly)
### Sample means
Beta.Poly[1] + C.poly %*% Beta.Poly[-1]
### or equivalently
(cbind(rep(1, 4), C.poly) %*% aPoly$coeff)

### Thus the full-rank parametrization without intercept can be recovered as
LSest(aPoly, L=cbind(1, C.poly))
summary(aWithoutInt)$coef
### and vice versa
LSest(aWithoutInt, L=solve(cbind(1, C.poly)))
summary(aPoly)$coeff

### -------------------------------------------------------------------------###
### Block 3: ADDITIONAL MATERIAL                                             ###
### -------------------------------------------------------------------------###

### model with a fourth-degree polynomial | a different parametrization of the
### same continuous covariate
a4 <- lm(weight ~ percentage + I(percentage^2) + I(percentage^3) + I(percentage^4),
         data=Koreny)
summary(a4)
### can you explain the NA in the last row?
cbind(fitted(aPoly), fitted(a4))

pKoreny <- data.frame(percentage=seq(0, 6, length=101))
fit4 <- predict(a4, newdata=pKoreny)
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
lines(fit4 ~ percentage, col="deeppink3", data=pKoreny, lwd=3)
### versus the means from the full-rank model (factor covariates)
points(aWithoutInt$coeff ~ c(0, 2, 4, 6), bg="red", pch=21, cex=2)

### Explain why models a3, a and aPoly are equivalent.
### Explain why model a2 is a submodel of a.
### Explain why model a1 is a submodel of a.
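### One way to see these relations (a small sketch added here, using a
### column-space/rank comparison, not part of the original assignment):
### the columns of the model matrix of a2 (and likewise of a1 and a3) all lie
### in the column space of the model matrix of the factor model a, so
### appending them does not increase the rank.
Xa  <- model.matrix(a)
Xa2 <- model.matrix(a2)
qr(cbind(Xa, Xa2))$rank   # still 4: a2 is a submodel of a
qr(Xa)$rank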
### percentage as a factor covariate
summary(aPoly)$coef
### percentage as a continuous covariate
aPoly2 <- lm(weight ~ poly(percentage, degree=3), data=Koreny)
summary(aPoly2)$coef

### compare four (in a sense different but statistically/geometrically the same)
### models
fitted.all <- cbind(fitted(a), fitted(a3), fitted(aPoly), fitted(aPoly2))
colnames(fitted.all) <- c("~fpercentage", "~poly(percentage,3,raw)",
                          "~contr.poly(fpercentage)", "~poly(percentage,3)")
unique(fitted.all)

### formal tests - what are the conclusions?
anova(a2, a)
anova(a1, a)
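### A manual reconstruction of the anova(a1, a) comparison (a sketch, assuming
### the usual F-test for a submodel): the statistic is computed from the two
### residual sums of squares.
RSS1 <- sum(residuals(a1)^2)             # submodel: linear in percentage
RSS  <- sum(residuals(a)^2)              # factor model
dfn  <- a1$df.residual - a$df.residual   # numerator degrees of freedom
(Fman <- ((RSS1 - RSS) / dfn) / (RSS / a$df.residual))
1 - pf(Fman, df1=dfn, df2=a$df.residual)
### compare with the 'F' and 'Pr(>F)' columns of anova(a1, a) above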