### --------------------------------------------------------------------------------------###
### Winter Term 2019/2020                                                                 ###
### NMSA407 - Linear Regression                                                           ###
###                                                                                       ###
### EXERCISE #5  Different parametrizations of one-way classified group means             ###
###                                                                                       ###
### --------------------------------------------------------------------------------------###

# In K4/K10/K11 you might not be able to load the package 'mffSM' because of a low 
# version of R software (< 3.2.0). In this case tyou can run R software from the 
# network disc L, where the version 3.2.2. is installed. The path is
#  L:\Statistika\R\bin\x64\Rgui.exe                                              

### ======================
### Working directory
### ======================
setwd("H:/nmsa407_LinRegr/")

library("mffSM")

data(Koreny) # "Koreny" means roots in English.
help(Koreny)
head(Koreny) 
summary(Koreny)

table(Koreny$percentage)

### In the sequel it will be useful to consider the percentage as a categorical covariate 
Koreny$fpercentage <- factor(Koreny$percentage)                                    
summary(Koreny)                                                                

### Exploratory analysis: two figures
par(mfrow=c(2, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
plot(weight ~ fpercentage, data=Koreny, col="sandybrown")
par(mfrow=c(1, 1))

### ================================================================
### Different parametrization of one-way classified group means  ###
### ================================================================

### number of the groups
print(G <- nlevels(Koreny$fpercentage))

### Standard ANOVA table 
aov(weight~fpercentage, data=Koreny)
summary(aov(weight ~ fpercentage, data=Koreny));

### QUESTIONS
### What are the assumptions of the underlying model? 
### What is the conclusion? 

### 1. Full-rank parameterization without intercept
### -----------------------------------------------

aWithoutInt <- lm(weight ~ -1 + fpercentage, data=Koreny);
summary(aWithoutInt) 

## The model matrix of X looks like this
model.matrix(aWithoutInt)
##  Unique rows of the model matrix X
unique(model.matrix(aWithoutInt))

### QUESTIONS: 
### Which of the numbers in the output are useful within the problem context?

aggregate(Koreny$weight, list(Koreny$fpercentage), mean)
aWithoutInt$coef
 
### Note that as the intercept is not explicitly included, many of the numbers 
### in the output are useless. Among others:  
### Multiple R-squared: - see help(summary.lm)
### F-statistic - it is calculated from 'Multiple R-squared' and thus it is also useless. 

### R-squared in a model wihtout intercept
(R2.0<-sum(fitted(aWithoutInt)^2)/sum(Koreny$weight^2))
# R-squared and a test of a submodel
asub0 = lm(weight~-1,data=Koreny)
anova(asub0,aWithoutInt)
summary(aWithoutInt)$fstatistic
# manually computed F-statistic
n = nrow(Koreny)
(n-G)/(G-0)*(R2.0/(1-R2.0))

### Nevertheless as the intercept is implicitly present, it makes 
### sense to calculate the multiple R-squared. One can do it 
### 'manually' as

### R-squared in a model with intercept
(R2<-var(fitted(aWithoutInt))/var(Koreny$weight))
### (This is the usual calculation of R-squared for models with intercept)
asub = lm(weight~1, data=Koreny)
anova(asub,aWithoutInt)
# manually computed F-statistic
(n-G)/(G-1)*R2/(1-R2)

### Moreover the output of 'summary()' can be used to 
### estimate any linear combination of the group expectations 
### and the appropriate standard error and confidence interval.  

### Task: Estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 2 and equal to 0. 

### The vector to create the linear combination of regression coefficients
levels(Koreny$fpercentage)
l <- c(-1,1,0,0);

### The corresponding point estimate
coef(aWithoutInt)[2] - coef(aWithoutInt)[1]

### or equivalently 
(estim <- c(l%*%coef(aWithoutInt)))

### Standard errors - in this parametrization the covariance matrix of 
### the estimated coefficients is diagonal
vcov(aWithoutInt)

### So, the standard error can be calculated by the formula given below 
SE <- summary(aWithoutInt)$coefficients[,2]; 
sqrt(SE[1]^2 + SE[2]^2)
### or equivalently 
(SE.estim <- sqrt(t(l)%*%vcov(aWithoutInt)%*%l))

### Confidence interval 
DF <- aWithoutInt$df.residual; # degrees of freedom
estim + c(-1,1)*c(SE.estim*qt(0.975, df=DF))

# The estimates above (and even more) can be simply calculated 
# with the help of function LSest from the package "mffSM"
LSest(aWithoutInt, l)

# manual computation of the t value and the P value
estim/SE.estim
2*pt(estim/SE.estim,df=DF,lower.tail=FALSE)

### QUESTIONS:
### Let mu[j] stand for the expectation of the group with percentage = j (j=1,2,3,4)
### With the help of model 'aWithoutInt' test the null hypohesis 
### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0

l <- c(1,1,-1,-1);
# point estimate
(estim <- c(l%*%coef(aWithoutInt)))
# estimated SE
sqrt(sum(SE^2))
(SE.estim <- sqrt(t(l)%*%vcov(aWithoutInt)%*%l))
# confidence interval
as.numeric(estim) + c(-1,1)*c(SE.estim)*qt(0.975, df=DF)    

# ...or simply also  
LSest(aWithoutInt, l)
  
### 
### With the help of model 'aWithoutInt' test the null hypohesis 
### that 'mu[1] = mu[2] and at the same time mu[3] = mu[4]' 
### against the alternative hypothesis that 
### 'mu[1] is not equal to mu[2] and/or mu[3] is not equal to mu[4]'.

(L <- matrix(rbind(c(1,-1,0,0), c(0,0,1,-1)), nrow=2, ncol=4))
m <- nrow(L);

# point estimates
(thetahat <- L%*%coef(aWithoutInt))
# test statistic - a quadratic form, F-distributed
(Q <- 1/m * t(thetahat)%*%solve(L%*%vcov(aWithoutInt)%*%t(L))%*%thetahat)
1 - pf(Q, df1=m, df2=DF)

# another way of testing the same hypothesis: a submodel where 
# mu[1] and mu[2] are considered the same regressor, and mu[3] and mu[4] are the same
asub = lm(weight ~ -1 + I(percentage<3), data=Koreny);
summary(asub) 
anova(asub,aWithoutInt)
  
# compare with the following output
LSest(aWithoutInt,L=L)
# LSest considers the rows of L to correspond to two separate tests!
# This is not the correct test in our situation

### 2. Contr.treatment: Reference group pseudocontrast with  the reference 
###    given by the first group (default in R)
### ---------------------------------------------------------------------- 

### Default options for contrasts 
options()$contrasts;

a <- lm(weight ~ fpercentage, data=Koreny)
summary(a)

# compare with
aggregate(Koreny$weight, list(Koreny$fpercentage), mean)

### (Pseudo)Contrast matrix C.treat 
(C.treat <- contr.treatment(G))

### Model matrix (unique rows only)
unique(model.matrix(a))

# the final model from weight~fpercentage - 1 and weight ~ fpercentage is the same
all.equal(fitted(aWithoutInt), fitted(a))

### QUESTIONS:
### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 0 and equal to 4. 
###
  
# point estimate
means <- tapply(Koreny$weight, Koreny$percentage, mean)
names(means) <- paste(names(means), " % sugar", sep="")
unname(means[3]-means[1])
a$coef[3]

# the complete matrix of contrasts  
C = cbind(1,contr.treatment(G))
rownames(C) = levels(Koreny$fpercentage)
colnames(C) = paste("b",1:4,sep="")
C
# its inverse
(C.treat.inv = solve(C))

# the difference in means that we consider 
l = c(1,0,-1,0)
l%*%means
# and the same difference in the basis given by contrasts  
(lc<-l%*%C)
lc%*%a$coef

### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 2 and equal to 4.

# the difference we consider
l = c(0,1,-1,0)
l%*%means
# and the same in the basis of contrasts  
(lc<-l%*%C)
lc%*%a$coef

### Comparison of the regression coefficients and sample means
### What did we obtain? 
print(means)
data.frame(Beta=coef(a), Mu=means)

### The sample means can be calculated from the regression coefficients as 
Beta.treat <- coef(a)

### Sample means 
Beta.treat[1] + C.treat %*% Beta.treat[-1]
C%*%Beta.treat

### Thus the full-rank parameterization without intercept can be recovered as 
LSest(a, L=C)
summary(aWithoutInt)$coef

# the inverse transformation
(C.treat.inv)
LSest(aWithoutInt, L = C.treat.inv)
summary(a)$coef

# the models are equivalent, but - again - be careful about the F-statistics and R-squared
summary(aWithoutInt)
summary(a)

### QUESTIONS:
### What are the assumptions of the model? 
### Compare the following outputs:
summary(a);
summary(aov(weight ~ fpercentage, data=Koreny));
anova(a);

### QUESTIONS:
### Let mu[j] stand for the expectation of the group with percentage = j (j=1,2,3,4)
### With the help of model 'a' test the null hypohesis 
### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0
  
l = c(1,1,-1,-1)
l%*%means
(lc<-l%*%C)
LSest(a,L=lc)
LSest(aWithoutInt, l)

### 3. Contr.SAS: Reference group pseudocontrast with the reference given by the last group
### ---------------------------------------------------------------------------------------

aSAS <- lm(weight ~ fpercentage, data=Koreny, contrasts = list(fpercentage = contr.SAS));
summary(aSAS)

### Model matrix (unique rows only)
unique(model.matrix(aSAS))

### QUESTIONS:
### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 0 and equal to 6. 
###

# analogously as above
(C = cbind(1,contr.SAS(G)))
(C.inv = solve(C))

l = c(1,0,0,-1)
l%*%means

(lc<-l%*%C)
lc%*%aSAS$coef
  
# reconstruct mu[4]
l = c(0,0,0,1)
l%*%means
  
(lc<-l%*%C)
lc%*%aSAS$coef

### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 2 and equal to 4.

means[2]-means[3]

l = c(0,1,-1,0)
l%*%means
  
(lc<-l%*%C)
lc%*%aSAS$coef
  

### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(aSAS), Mu=means)

### The sample means can be calculated from the regression coefficients as 
(C.SAS <- contr.SAS(G));
Beta.SAS <- coef(aSAS);

### Sample means 
Beta.SAS[1] + C.SAS %*% Beta.SAS[-1]
C%*%Beta.SAS

### Thus the full-rank parameterization without intercept can be again recovered as 
LSest(aSAS, L=cbind(1, C.SAS))
summary(aWithoutInt)$coef



### 4. Contr.sum: Sum contrasts (reference = the last group)
### --------------------------------------------------------

(C.Sum <- contr.Sum(G));
aSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.sum))
summary(aSum)

### Model matrix 
unique(model.matrix(aSum))
  
### QUESTIONS:
### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 0 and equal to 4. 
###

C = cbind(1,contr.Sum(G))
rownames(C) = levels(Koreny$fpercentage)
colnames(C) = paste("b",1:4,sep="")
C
(C.inv = solve(C))

### Intercept is the average of the group means (not the average weight!)
mean(means)  
coef(aSum)

l = c(1,0,-1,0)
l%*%means
  
(lc<-l%*%C)
lc%*%aSum$coef
  
### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 0 and equal to 6.

l = c(1,0,0,-1)
l%*%means
  
(lc<-l%*%C)
lc%*%aSum$coef
  
### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(aSum), Mu=means)

### The sample means can be calculated from the regression coefficients as 
(C.Sum <- contr.Sum(G));
C
Beta.Sum <- coef(aSum);

### Sample means 
Beta.Sum[1] + C.Sum %*% Beta.Sum[-1]
C%*%Beta.Sum

### Thus the full-rank parameterization without intercept can be recovered as 
LSest(aSum, L=C)
summary(aWithoutInt)$coef

### QUESTIONS:
### Let mu[j] stand for the expectation of the group with percentage = j
### With the help of model 'aSum' test the null hypothesis 
### that (mu[1] + mu[2]) - (mu[3] + mu[4]) = 0

l = c(1,1,-1,-1)
l%*%means
(lc<-l%*%C)
LSest(aSum,L=lc)
LSest(aWithoutInt, l)


### 5. Weighted sum contrasts (reference = the last group)
### ------------------------------------------------------

### Note that this method is not readily available in the standard R-software instalation, 
### therefore, we need to prepare the (pseudo)contrast matrix by ourselves.
### See Section 8.4 of the course notes (page 170)

### Group sample sizes 
(ng <- table(Koreny$percentage));

# formula (8.28)
C.wSum <- rbind(diag(G - 1), - ng[-G] / ng[G])
rownames(C.wSum) <- levels(Koreny[["fpercentage"]])
print(C.wSum)

# custom contrast specification
awSum <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=C.wSum))
summary(awSum)

### Model matrix 
unique(model.matrix(awSum))

### Intercept is now the mean weight (of all responses)
awSum$coef
mean(Koreny$weight)

### QUESTIONS:
### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 0 and equal to 4. 
###

C = cbind(1,C.wSum)
rownames(C) = levels(Koreny$fpercentage)
colnames(C) = paste("b",1:4,sep="")
C
(C.inv = solve(C))
  
l = c(1,0,-1,0)
l%*%means
  
(lc<-l%*%C)
lc%*%aSum$coef

### Based on the above output estimate the difference in expectations of the groups 
### for the percentage of sugar equal to 0 and equal to 6.

l = c(1,0,0,-1)
l%*%means
  
(lc<-l%*%C)
lc%*%awSum$coef

### Comparison of the regression coefficients and sample means
print(means)
data.frame(Beta=coef(awSum), Mu=means)

### The sample means can be calculated from the regression coefficients as 
Beta.wSum <- coef(awSum);

### Sample means 
Beta.wSum[1] + C.wSum %*% Beta.wSum[-1]
C%*%Beta.wSum

### And thus, the full-rank parameterization without intercept can be recovered as 
LSest(awSum, L=cbind(1, C.wSum))
summary(aWithoutInt)


###  6. Orthonormal polynomial contrasts  
### ------------------------------------

(C.poly <- contr.poly(G));

### Check orthogonality -- the columns of the matrix are orthonormal 
round(t(C.poly)%*%C.poly,10)

### Figure illustrating C.poly 
plot(levels(Koreny$fpercentage), C.poly[,1], ylab="", col="blue", xlab="sugar percentage", cex=2, main="Orthonormal polynomial contrasts");
lines(levels(Koreny$fpercentage), C.poly[,1], col="blue", lwd=2);
#
points(levels(Koreny$fpercentage), C.poly[,2], col="red3", pch=2);
lines(levels(Koreny$fpercentage), C.poly[,2], col="red3", lwd=2);
#
points(levels(Koreny$fpercentage), C.poly[,3], col="darkgreen", pch=3);
lines(levels(Koreny$fpercentage), C.poly[,3], col="darkgreen", lwd=2);
#
legend(x=2.5, y=0.65, pch=1:3, col=c("blue", "red3", "darkgreen"), lty=rep("solid", 3), legend=c("first column (.L)", "second column (.Q)", "third column (.C)"))

### Model matrix 
unique(model.matrix(aPoly))

aPoly <- lm(weight ~ fpercentage, data=Koreny, contr=list(fpercentage=contr.poly))
summary(aPoly)

### Model aPoly suggests that the percentage can be considered as 
### an equidistant numerical covariate and the evolution of the group means can be
### described by a quadratic function. 

plot(weight ~ fpercentage, data=Koreny, col="sandybrown");

### A formal test for H_0: beta_3 = 0
l <- c(0,0,0,1);
LSest(aPoly, l)


### A formal test for H_0: beta_2 = 0 & beta_3 = 0
DF <- aPoly$df.residual;
L <- matrix(rbind(c(0,0,1,0), c(0,0,0,1)), nrow=2, ncol=4)
m <- nrow(L);
thetahat <- L%*%coef(aPoly);

### test-statistic 
(Q <- 1/m * t(thetahat)%*%solve(L%*%vcov(aPoly)%*%t(L))%*%thetahat) 
### p-value 
1 - pf(Q, df1=m, df2=DF) 


### A formal test for H_0: beta_1 = 0 & beta_2 = 0 & beta_3 = 0
DF <- aPoly$df.residual;
L <- matrix(rbind(c(0,1,0,0), c(0,0,1,0), c(0,0,0,1)), nrow=3, ncol=4)
m <- nrow(L);
thetahat <- L%*%coef(aPoly);

### test-statistic 
(Q <- 1/m * t(thetahat)%*%solve(L%*%vcov(aPoly)%*%t(L))%*%thetahat) 
### p-value 
1 - pf(Q, df1=m, df2=DF) 


### The sample means can be calculated from the regression coefficients as 
Beta.Poly <- coef(aPoly);

###Sample means 
Beta.Poly[1] + C.poly %*% Beta.Poly[-1]

### Thus the full-rank parameterization without intercept can be recovered as 
LSest(aPoly, L=cbind(1, C.poly))
summary(aWithoutInt)$coef


### Note:
### In practice it might be of interest to take into the account the issue of multiple testing 
### and sometimes also to make all the pairwise comparisons. This will be 
### discussed in the second half of the semester.

### ============================================================================ ###
### The linear model  with the percentage of sugar taken as a numeric covariate  ### 
### ============================================================================ ###

### Assuming independence
summary(a0 <- lm(weight ~ 1, data=Koreny))

### Assuming linear dependence 
summary(a1 <- lm(weight ~ percentage, data=Koreny))

par(mfrow=c(1, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
abline(a1, col="darkblue", lwd=3)

### Assuming quadratic model 
summary(a2 <- lm(weight ~ percentage + I(percentage^2), data=Koreny))

pKoreny <- data.frame(percentage=seq(0, 6, length=101))
fit2 <- predict(a2, newdata=pKoreny)

### Figure 
par(mfrow=c(1, 1), bty="n")
plot(weight ~ percentage, data=Koreny, pch=23, col="red3", bg="orange")
lines(fit2 ~ percentage, col="darkgreen", data=pKoreny, lwd=3)
abline(a1, col="darkblue", lwd=3)
legend(1.5, 0.2, legend=c("Linear", "Quadratic"), lty=1, col=c("darkblue", "darkgreen"), lwd=3)

### QUESTIONS: 
### Estimate the expectation of weight for percentage of sugar = 1. 
### Is there any problem with such a prediction? 
(pred1 = predict(a2, newdata=data.frame(percentage=1))) 
points(1,pred1,col="seagreen",bg="darkgreen",pch=23,cex=1.5)

### Assuming a cubic model 
summary(a3 <- lm(weight ~ percentage + I(percentage^2) + I(percentage^3), data=Koreny))
fit3 <- predict(a3, newdata=pKoreny)

### Figure 
means <- tapply(Koreny$weight, Koreny$percentage, mean)
BARVY <- c(linear="darkblue", kvadr="darkgreen", kubic="red")

par(mfrow=c(1, 1), bty="n")
plot(weight ~ percentage, data=Koreny, col="grey50")
abline(a1, col=BARVY["linear"], lwd=3)
lines(fit2 ~ percentage, col=BARVY["kvadr"], data=pKoreny, lwd=3)
lines(fit3 ~ percentage, col=BARVY["kubic"], data=pKoreny, lwd=3)
points(means ~ seq(0, 6, by=2), pch=15, col="orange", cex=1.5)
legend(1, 0.2, legend=c("Linear", "Quadratic", "Cubic"), lty=1, col=BARVY, lwd=3)

### predicted means are the same as the observed means now
predict(a3,newdata = data.frame(percentage=seq(0, 6, length=4)))
means
  
### model with a fourth degree polynomial
lm(weight ~ poly(percentage,degree=4), data=Koreny)
a4<-lm(weight ~ percentage + I(percentage^2) + I(percentage^3)+I(percentage^4), data=Koreny)
fit4 <- predict(a4, newdata=pKoreny)
lines(fit4 ~ percentage, col="deeppink3", data=pKoreny, lwd=3)
  
### Recall the contr.treatment() parametrization 
summary(a)
  
### QUESTIONS:
### Explain why models a3, a and aPoly are equivalent. 
### Explain why model a2 is a submodel of a. 
### Explain why model a1 is a submodel of a. 

summary(aPoly)$coef
aPoly2 = lm(weight ~ poly(percentage,degree=3), data=Koreny)
summary(aPoly2)$coef
fitted.all = cbind(fitted(a),fitted(a3),fitted(aPoly),fitted(aPoly2))
colnames(fitted.all) = c("~fpercentage","~poly(percentage,3,raw)","~contr.poly(fpercentage)","~poly(percentage,3)")
unique(fitted.all)

anova(a2,a)
anova(a1,a)