### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression                                              ###
### Winter Term 2021/2022                                                    ###
###                                                                          ###
### EXERCISE #5 Various parametrizations for factor covariates/group means   ###
### -------------------------------------------------------------------------###

### -------------------------------------------------------------------------###
### Block 1: TEACHING MATERIAL                                               ###
### -------------------------------------------------------------------------###

library("mffSM")

data(Koreny)   ### "Koreny" means roots in English.
help(Koreny)

dim(Koreny)
head(Koreny)
summary(Koreny)

table(Koreny$percentage)

### the percentage variable will be considered as a categorical covariate
Koreny$fpercentage <- factor(Koreny$percentage)
summary(Koreny)

### Exploratory analysis
par(mfrow=c(2, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
plot(weight ~ fpercentage, data=Koreny, col="sandybrown")
par(mfrow=c(1, 1))

### number of groups
print(G <- nlevels(Koreny$fpercentage))

### Standard ANOVA
aov(weight ~ fpercentage, data=Koreny)
summary(aov(weight ~ fpercentage, data=Koreny))
### What are the assumptions of the underlying model? What is the conclusion?

### 1. FULL-RANK PARAMETRIZATION WITHOUT INTERCEPT
aWithoutInt <- lm(weight ~ -1 + fpercentage, data=Koreny)
summary(aWithoutInt)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a * (x == 0) + b * (x == 2) + c * (x == 4) + d * (x == 6))
}
### What are the limitations of this function?

### The model matrix
model.matrix(aWithoutInt)
### with unique rows only
unique(model.matrix(aWithoutInt))

aggregate(Koreny$weight, list(Koreny$fpercentage), mean)
aWithoutInt$coef

### Note that as the intercept is not explicitly included, many of the numbers
### in the output are useless. Among others:

### Multiple R-squared:
(1 - (aWithoutInt$df.residual * (summary(aWithoutInt)$sigma^2)) / sum(Koreny$weight^2))
(1 - (sum(aWithoutInt$residuals^2)) / sum(Koreny$weight^2))
(R2 <- sum(fitted(aWithoutInt)^2) / sum(Koreny$weight^2))

### F-statistic (calculated from 'Multiple R-squared') is also useless:
((sum(Koreny$weight^2) - sum(aWithoutInt$residuals^2)) / G) /
  (sum(aWithoutInt$residuals^2) / (nrow(Koreny) - G))
(nrow(Koreny) - G) / (G - 0) * (R2 / (1 - R2))

### Nevertheless, as the intercept is implicitly present, it makes
### sense to calculate the multiple R-squared - manually:

### R-squared in a model with intercept
(1 - (sum(aWithoutInt$residuals^2)) / sum((Koreny$weight - mean(Koreny$weight))^2))
(R2 <- sum((fitted(aWithoutInt) - mean(fitted(aWithoutInt)))^2) /
       sum((Koreny$weight - mean(Koreny$weight))^2))

### manually computed F-statistic (here 3 = G - 1 and 50 = n - G)
((sum((Koreny$weight - mean(Koreny$weight))^2) - sum(aWithoutInt$residuals^2)) / 3) /
  (sum(aWithoutInt$residuals^2) / 50)
(nrow(Koreny) - G) / (G - 1) * R2 / (1 - R2)

### Moreover, the output of the 'summary()' table can be used to estimate any
### linear combination of the group expectations together with the appropriate
### standard error and confidence interval.

### Task: Estimate the difference in expectations of the groups for the
### percentage of sugar equal to 2 and equal to 0.
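### A general remark before the computation (a small sketch; the helper
### 'lincomb' below is NOT part of the course material and only mirrors what
### LSest() from "mffSM" does): for a vector l, the estimate of l'beta is
### l'betahat, its standard error is sqrt(l' vcov(betahat) l), and the
### confidence interval uses the t-quantile with the residual degrees of freedom.
lincomb <- function(model, l, level = 0.95) {
  est <- sum(l * coef(model))                              # point estimate l'betahat
  se  <- sqrt(drop(t(l) %*% vcov(model) %*% l))            # standard error
  tq  <- qt(1 - (1 - level) / 2, df = model$df.residual)   # t-quantile
  c(estimate = est, std.error = se, lower = est - tq * se, upper = est + tq * se)
}
### e.g. lincomb(aWithoutInt, c(-1, 1, 0, 0))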
### The vector to create the linear combination of regression coefficients
levels(Koreny$fpercentage)
l <- c(-1, 1, 0, 0)

### The corresponding point estimate
coef(aWithoutInt)[2] - coef(aWithoutInt)[1]
### or equivalently
(estim <- c(l %*% coef(aWithoutInt)))

### Standard errors - in this parametrization the covariance matrix of
### the estimated coefficients is diagonal - can you explain why?
vcov(aWithoutInt)

### So, the standard error can be calculated by the formula given below
SE <- summary(aWithoutInt)$coefficients[, 2]
sqrt(SE[1]^2 + SE[2]^2)
### or equivalently
(SE.estim <- sqrt(t(l) %*% vcov(aWithoutInt) %*% l))

### Confidence interval
DF <- aWithoutInt$df.residual   # degrees of freedom
estim + c(-1, 1) * c(SE.estim * qt(0.975, df=DF))

### or equivalently, using the function LSest from the package "mffSM"
LSest(aWithoutInt, l)

### QUESTIONS:
### Let mu[j] stand for the expectation of the group with percentage = j for
### j = 1, 2, 3, 4. With the help of model 'aWithoutInt' test the null hypothesis
### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0
l <- c(1, 1, -1, -1)
### point estimate
(estim <- c(l %*% coef(aWithoutInt)))
### estimated SE
sqrt(sum(SE^2))
(SE.estim <- sqrt(t(l) %*% vcov(aWithoutInt) %*% l))
### confidence interval
as.numeric(estim) + c(-1, 1) * c(SE.estim) * qt(0.975, df=DF)
### or, simply (and equivalently)
LSest(aWithoutInt, l)

### With the help of model 'aWithoutInt' test the null hypothesis
### that 'mu[1] = mu[2] and at the same time mu[3] = mu[4]'
### against the alternative hypothesis that
### 'mu[1] is not equal to mu[2] and/or mu[3] is not equal to mu[4]'.
(L <- matrix(rbind(c(1, -1, 0,  0),
                   c(0,  0, 1, -1)), nrow=2, ncol=4))
m <- nrow(L)

### point estimates
(thetahat <- L %*% coef(aWithoutInt))

### test statistic - a quadratic form, F-distributed under the null hypothesis
(Q <- 1/m * t(thetahat) %*% solve(L %*% vcov(aWithoutInt) %*% t(L)) %*% thetahat)
1 - pf(Q, df1=m, df2=DF)

### another way of testing the same hypothesis: a submodel where the groups with
### mu[1] and mu[2] share one regressor and the groups with mu[3] and mu[4]
### share another
asub <- lm(weight ~ -1 + I(percentage < 3), data=Koreny)
summary(asub)
anova(asub, aWithoutInt)

### compare with the following output, where LSest considers the rows of L to
### correspond to two separate tests, which is not the correct test in this
### situation
LSest(aWithoutInt, L=L)

### 2. CONTR.TREATMENT PARAMETRIZATION | REFERENCE GROUP: THE FIRST GROUP

### Default options for contrasts
options()$contrasts

### model with intercept
a <- lm(weight ~ fpercentage, data=Koreny)
summary(a)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 2) + c * (x == 4) + d * (x == 6))
}
# compare with
aggregate(Koreny$weight, list(Koreny$fpercentage), mean)

### (Pseudo)Contrast matrix C.treat
(C.treat <- contr.treatment(G))

### Model matrix (unique rows only)
unique(model.matrix(a))

### the fitted values from weight ~ fpercentage - 1 and weight ~ fpercentage
### are the same
all.equal(fitted(aWithoutInt), fitted(a))

### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 0 and equal to 4.
means <- tapply(Koreny$weight, Koreny$percentage, mean)
names(means) <- paste(names(means), " % sugar", sep="")
unname(means[3] - means[1])
### or equivalently
a$coef[3]

### the complete matrix of contrasts
C <- cbind(1, contr.treatment(G))
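### A small check (a sketch added here, not part of the original assignment):
### the matrix C maps the treatment-contrast coefficients back to the group
### means, since mu[1] = beta_0 and mu[j] = beta_0 + beta_j for j > 1.
all.equal(unname(drop(C %*% coef(a))), unname(means))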
### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 2 and equal to 4.

# the difference we consider
l <- c(0, 1, -1, 0)
l %*% means

### Thus the full-rank parametrization without intercept can be recovered as
LSest(a, L=C)
summary(aWithoutInt)$coef
### or vice versa (using the inverse of C)
LSest(aWithoutInt, L=solve(C))
summary(a)$coef

### thus, the models are equivalent, but one must be careful about the
### F-statistics and R-squared, which are generally different
summary(aWithoutInt)
summary(a)

### 3. CONTR.SUM: SUM CONTRASTS
aSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.sum))
summary(aSum)

C <- cbind(1, contr.sum(G))

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 0) + c * (x == 2) + d * (x == 4) - (b + c + d) * (x == 6))
}

### Model matrix
unique(model.matrix(aSum))

### Intercept is now the average of the group means (not the average weight!)
mean(Koreny$weight)
mean(means)
coef(aSum)

### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 0 and equal to 6.
means[1] - means[4]
l <- c(1, 0, 0, -1)
l %*% means
(lc <- l %*% C)
lc %*% aSum$coef
LSest(aSum, lc)

### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(aSum), Mu=means)

### Thus the full-rank parametrization without intercept can be recovered as
LSest(aSum, L=C)
summary(aWithoutInt)$coef
### or vice versa again
LSest(aWithoutInt, L=solve(C))
summary(aSum)$coef

### 4. WEIGHTED SUM CONTRASTS
### Not available in the standard R installation; therefore, we need to
### prepare the (pseudo)contrast matrix ourselves.

### Group sample sizes
(ng <- table(Koreny$percentage))

### (pseudo)contrast matrix
C.wSum <- rbind(diag(G - 1), -ng[-G] / ng[G])
rownames(C.wSum) <- levels(Koreny[["fpercentage"]])
print(C.wSum)

### custom contrast specification
awSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=C.wSum))
summary(awSum)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 0) + c * (x == 2) + d * (x == 4) -
           (1.25 * b + c + 1.25 * d) * (x == 6))
}

### Model matrix - what is it good for?
unique(model.matrix(awSum))

### Intercept is now the mean weight (of all responses - not the mean of the
### group means - as it was before)
mean(means)
mean(Koreny$weight)
awSum$coef[1]
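### Why (a small check, not in the original material): with the weighted-sum
### contrasts the intercept equals the weighted average of the group means,
### with weights n_g / n, which is exactly the overall sample mean.
sum(ng * means) / sum(ng)
mean(Koreny$weight)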
### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 0 and equal to 4.
C <- cbind(1, C.wSum)
rownames(C) <- levels(Koreny$fpercentage)
means[1] - means[3]
l <- c(1, 0, -1, 0)
l %*% means
(lc <- l %*% C)
lc %*% awSum$coef
LSest(awSum, lc)

### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(awSum), Mu=means)

### Full-rank parametrization without intercept recovery
LSest(awSum, L=cbind(1, C.wSum))
summary(aWithoutInt)
### and again, vice versa
LSest(aWithoutInt, L=solve(cbind(1, C.wSum)))
summary(awSum)

### LINEAR MODEL WITH PERCENTAGES TAKEN AS A NUMERICAL COVARIATE

### Assuming independence
summary(a0 <- lm(weight ~ 1, data=Koreny))
par(mfrow=c(1, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
abline(h=coef(a0), col="red")

### Assuming linear dependence
summary(a1 <- lm(weight ~ percentage, data=Koreny))
abline(a1, col="darkblue", lwd=3)

### Assuming quadratic dependence (but still a linear model)
summary(a2 <- lm(weight ~ percentage + I(percentage^2), data=Koreny))
xSeq <- seq(0, 6, length=100)
ySeq <- cbind(rep(1, 100), xSeq, xSeq^2) %*% a2$coeff
lines(ySeq ~ xSeq, col="green", lwd=3)

### Assuming cubic dependence (but still a linear model!!!)
summary(a3 <- lm(weight ~ percentage + I(percentage^2) + I(percentage^3), data=Koreny))
### recall that there are four parameters in the model now
### (and four unique x values/groups too) - what does it mean?
ySeq2 <- cbind(rep(1, 100), xSeq, xSeq^2, xSeq^3) %*% a3$coeff
lines(ySeq2 ~ xSeq, col="orange", lwd=3)

### how do you explain the following?
points(means ~ c(0, 2, 4, 6), pch=21, bg="red", cex=2)

### The models with the percentage being a four-level factor are all equivalent
### in terms of the fitted values (for groups with the percentages 0, 2, 4,
### and 6) to the model where the percentages are considered to be numerical
### values and a cubic curve is used to fit the data (thus, four parameters
### are used)!
(predict(a3, newdata=data.frame(percentage=c(0, 2, 4, 6))))
### and compare with
(means)
(aWithoutInt$coeff)

### or in more detail
p <- c(0, 2, 4, 6)
L <- matrix(c(p^0, p, p^2, p^3), nrow=4, byrow=FALSE)
LSest(a3, L)
### versus
summary(aWithoutInt)

### but this is not (generally) achieved for other (sub)models
LSest(a2, L[, 1:3])
LSest(a1, L[, 1:2])

### Estimate the expectation of weight for percentage of sugar = 1.
### Is there any problem with such a prediction?
(pred1 <- predict(a2, newdata=data.frame(percentage=1)))
points(1, pred1, col="black", bg="darkgreen", pch=23, cex=1.5)
(pred2 <- predict(a3, newdata=data.frame(percentage=1)))
points(1, pred2, col="black", bg="darkorange", pch=23, cex=1.5)

### -------------------------------------------------------------------------###
### Block 2: INDIVIDUAL WORK                                                 ###
### -------------------------------------------------------------------------###

### What are the assumptions of the model 'a'? Compare the following outputs:
summary(a)
summary(aov(weight ~ fpercentage, data=Koreny))
anova(a)
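### A short check (a sketch, not part of the original questions): the overall
### F-test printed by summary(a) is the same test as the one reported by
### anova(a) and by the aov() table above.
unname(summary(a)$fstatistic["value"])
anova(a)[1, "F value"]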
### 5. CONTR.SAS PARAMETRIZATION | REFERENCE GROUP: THE LAST GROUP
### An alternative parametrization with a reference group, but in this
### case the reference category is the last group
aSAS <- lm(weight ~ fpercentage, data=Koreny,
           contrasts=list(fpercentage=contr.SAS))
summary(aSAS)

### The fitted function is of the form
Y <- function(a, b, c, d, x){
  return(a + b * (x == 0) + c * (x == 2) + d * (x == 4))
}

### Model matrix (unique rows only)
(C <- unique(model.matrix(aSAS)))

### Based on the above output estimate the difference in expectations of the
### groups for the percentage of sugar equal to 2 and equal to 4.
means[2] - means[3]
l <- c(0, 1, -1, 0)
l %*% means
(lc <- l %*% C)
lc %*% aSAS$coef
LSest(aSAS, lc)

### Again, the sample means can be calculated from the regression coefficients
LSest(aSAS, L=C)
summary(aWithoutInt)$coef

### 6. ORTHOGONAL CONTRASTS
(C.poly <- contr.poly(G))

### Check orthogonality -- the columns of the matrix are orthonormal
round(t(C.poly) %*% C.poly, 10)

### Illustration of C.poly - recall that again four parameters are used
plot(levels(Koreny$fpercentage), C.poly[, 1], ylab="", col="blue",
     xlab="sugar percentage", cex=2, main="Orthonormal polynomial contrasts")
lines(levels(Koreny$fpercentage), C.poly[, 1], col="blue", lwd=2)
#
points(levels(Koreny$fpercentage), C.poly[, 2], col="red3", pch=2)
lines(levels(Koreny$fpercentage), C.poly[, 2], col="red3", lwd=2)
#
points(levels(Koreny$fpercentage), C.poly[, 3], col="darkgreen", pch=3)
lines(levels(Koreny$fpercentage), C.poly[, 3], col="darkgreen", lwd=2)
#
legend(x=2.5, y=0.65, pch=1:3, col=c("blue", "red3", "darkgreen"),
       lty=rep("solid", 3),
       legend=c("first column (.L)", "second column (.Q)", "third column (.C)"))

### the model with the orthogonal contrasts
aPoly <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.poly))
summary(aPoly)

### Model aPoly suggests that the percentage can be considered as
### an equidistant numerical covariate and the evolution of the group means can
### be described by a cubic function.
ySeq <- means
plot(weight ~ fpercentage, data=Koreny, col="sandybrown")
lines(ySeq ~ c(1, 2, 3, 4), col="blue", lwd=2)
points(means ~ c(1, 2, 3, 4), pch=22, bg="red", cex=2)

### this, however, automatically gives an estimate for another level - the
### percentage of 8, for instance (if there were such observations).
X <- NULL
for (i in c(1, 2, 3, 4, 5)){
  X <- rbind(X, c(i^0, i, i^2, i^3))
}
linCoeff  <- solve(X[1:2, 1:2], contr.poly(G)[1:2, 1])
quadCoeff <- solve(X[1:3, 1:3], contr.poly(G)[1:3, 2])
cubCoeff  <- solve(X[1:4, 1:4], contr.poly(G)[, 3])

### verify that you obtained the analytic coefficients for the polynomials in C.poly
cbind(X[1:4, 1:2] %*% linCoeff,  C.poly[, 1])   ## linear term
cbind(X[1:4, 1:3] %*% quadCoeff, C.poly[, 2])   ## quadratic term
cbind(X[1:4, 1:4] %*% cubCoeff,  C.poly[, 3])   ## cubic term

### plot again - with some room for the percentage of 8
plot(weight ~ fpercentage, data=Koreny, col="sandybrown", xlim=c(0.5, 6))
lines(ySeq ~ c(1, 2, 3, 4), col="blue", lwd=2)
points(means ~ c(1, 2, 3, 4), pch=22, bg="red", cex=2)

### the predicted mean for percentage 8 can be obtained by using the linear
### combination as in the underlying model
### (extrapolation using the model with the factor covariate for another level)
percEight <- c(1, X[5, 1:2] %*% linCoeff, X[5, 1:3] %*% quadCoeff,
               X[5, 1:4] %*% cubCoeff) %*% aPoly$coeff
lines(c(ySeq[4], percEight) ~ c(4, 5), col="green", lwd=2)
points(c(ySeq[4], percEight) ~ c(4, 5), pch=22, bg="red", cex=4)

### Do you understand the main idea in these models?
### Can we now reduce the model somehow (this is a hierarchical model)?

### A formal test for H_0: beta_3 = 0
l <- c(0, 0, 0, 1)
LSest(aPoly, l)

### A formal test for H_0: beta_2 = 0 & beta_3 = 0
DF <- aPoly$df.residual
L <- matrix(rbind(c(0, 0, 1, 0),
                  c(0, 0, 0, 1)), nrow=2, ncol=4)
m <- nrow(L)
thetahat <- L %*% coef(aPoly)
### test statistic
(Q <- 1/m * t(thetahat) %*% solve(L %*% vcov(aPoly) %*% t(L)) %*% thetahat)
### p-value
1 - pf(Q, df1=m, df2=DF)

### A formal test for H_0: beta_1 = 0 & beta_2 = 0 & beta_3 = 0
DF <- aPoly$df.residual
L <- matrix(rbind(c(0, 1, 0, 0),
                  c(0, 0, 1, 0),
                  c(0, 0, 0, 1)), nrow=3, ncol=4)
m <- nrow(L)
thetahat <- L %*% coef(aPoly)
### test statistic
(Q <- 1/m * t(thetahat) %*% solve(L %*% vcov(aPoly) %*% t(L)) %*% thetahat)
### p-value
1 - pf(Q, df1=m, df2=DF)

### The sample means can be calculated from the regression coefficients as
Beta.Poly <- coef(aPoly)
### Sample means
Beta.Poly[1] + C.poly %*% Beta.Poly[-1]
### or equivalently
(cbind(rep(1, 4), C.poly) %*% aPoly$coeff)

### Thus the full-rank parametrization without intercept can be recovered as
LSest(aPoly, L=cbind(1, C.poly))
summary(aWithoutInt)$coef
### and vice versa
LSest(aWithoutInt, L=solve(cbind(1, C.poly)))
summary(aPoly)$coeff

### -------------------------------------------------------------------------###
### Block 3: ADDITIONAL MATERIAL                                             ###
### -------------------------------------------------------------------------###

### model with a fourth-degree polynomial | a different parametrization of the
### same continuous covariate
a4 <- lm(weight ~ percentage + I(percentage^2) + I(percentage^3) + I(percentage^4),
         data=Koreny)
summary(a4)
### can you explain the NA in the last row?
cbind(fitted(aPoly), fitted(a4))

pKoreny <- data.frame(percentage=seq(0, 6, length=101))
fit4 <- predict(a4, newdata=pKoreny)
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
lines(fit4 ~ percentage, col="deeppink3", data=pKoreny, lwd=3)
### versus the means from the full-rank model (factor covariates)
points(aWithoutInt$coeff ~ c(0, 2, 4, 6), bg="red", pch=21, cex=2)

### Explain why models a3, a and aPoly are equivalent.
### Explain why model a2 is a submodel of a.
### Explain why model a1 is a submodel of a.
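### One way to see these relations (a small sketch added here, using a
### column-space/rank comparison, not part of the original assignment):
### the columns of the model matrix of a2 (and likewise of a1 and a3) all lie
### in the column space of the model matrix of the factor model a, so
### appending them does not increase the rank.
Xa  <- model.matrix(a)
Xa2 <- model.matrix(a2)
qr(cbind(Xa, Xa2))$rank   # still 4: a2 is a submodel of a
qr(Xa)$rank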
### percentage as a factor covariate
summary(aPoly)$coef
### percentage as a continuous covariate
aPoly2 <- lm(weight ~ poly(percentage, degree=3), data=Koreny)
summary(aPoly2)$coef

### compare four (in a sense different but statistically/geometrically the same)
### models
fitted.all <- cbind(fitted(a), fitted(a3), fitted(aPoly), fitted(aPoly2))
colnames(fitted.all) <- c("~fpercentage", "~poly(percentage,3,raw)",
                          "~contr.poly(fpercentage)", "~poly(percentage,3)")
unique(fitted.all)

### formal tests - what are the conclusions?
anova(a2, a)
anova(a1, a)
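### A manual reconstruction of the anova(a1, a) comparison (a sketch, assuming
### the usual F-test for a submodel): the statistic is computed from the two
### residual sums of squares.
RSS1 <- sum(residuals(a1)^2)             # submodel: linear in percentage
RSS  <- sum(residuals(a)^2)              # factor model
dfn  <- a1$df.residual - a$df.residual   # numerator degrees of freedom
(Fman <- ((RSS1 - RSS) / dfn) / (RSS / a$df.residual))
1 - pf(Fman, df1=dfn, df2=a$df.residual)
### compare with the 'F' and 'Pr(>F)' columns of anova(a1, a) above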