### --------------------------------------------------------------------------------------###
### Winter Term 2019/2020                                                                 ###
### NMSA407 - Linear Regression                                                           ###
###                                                                                       ###
### EXERCISE #6  Additivity and partial effects of covariates                             ###
###              (Simple linear regression models with interactions)                      ###
###                                                                                       ###
### --------------------------------------------------------------------------------------###


# In K4/K10/K11 you might not be able to load the package 'mffSM' because of a low 
# version of R software (< 3.2.0). In this case tyou can run R software from the 
# network disc L, where the version 3.2.2. is installed. The path is
#  L:\Statistika\R\bin\x64\Rgui.exe .                                              


### ======================
### Working directory
### ======================
setwd("H:/nmsa407_LinRegr/")

library("mffSM")


### ============================
### Data we will be working on
### ============================
data(Dris, package = "mffSM")
# help("Dris")

head(Dris)
summary(Dris)

### =======================================================
### As in one of the previous exercise classes we will be 
### interested in the dependence of yield on the amount of 
### magnesium yield ~ Mg but now we include also other 
### measurements to improve our model. 
### =======================================================

### Add transformations of Mg, Ca, N to the dataset 
Dris <- transform(Dris, lMg = log2(Mg), lMg2 = (log2(Mg))^2, lCa = log2(Ca), 
                  lCa2 = (log2(Ca))^2, lN = log2(N), lN2=(log2(N))^2)


### First, start with adding Ca in the form of log2(Ca)

### Let start with a simple additive model 
summary(m1_addit <- lm(yield ~ lMg + lCa, data = Dris))

### QUESTIONS:
### Interpret the estimated regression coefficients. 
### What is being tested on the rows started with 'lMg' and 'lCa'?
### Interpret the test on the last line of the output started with 'F-statistic'.

### Compare the above output with the following model output:
summary(lm(yield ~ lMg, data = Dris))

### QUESTION:
### What is the difference in interpretation of the regression coefficients? 


### Interpret the following models that include also the quadratic term of lMg2.
summary(m20 <- lm(yield ~ lMg + lMg2, data = Dris))
summary(m21 <- lm(yield ~ lMg + lMg2 + lCa, data = Dris)) 
summary(m22 <- lm(yield ~ lMg + lMg2 + lCa + lCa2, data = Dris)) 

### The change of lMg by +log2(1.1) <=> the increase of Mg by 10%
eps <- 0.1;
delta <- log2(1+eps);
lMg.grid <- seq(min(Dris$lMg), max(Dris$lMg)-delta, length = 1000); 
effect.lMg <- function(b1,b2) b1*delta + b2*(2*delta*lMg.grid + delta^2); 

### Plotting the effect of lMg on the yield 
plot(lMg.grid, effect.lMg(coef(m20)['lMg'], coef(m20)['lMg2']), col = "blue", xlab = "log2(Mg)", ylab = "Change of expected yield", main = "The effect of 10%-increase of Mg on the yield", type="l", lwd=2, ylim=range(effect.lMg(coef(m20)['lMg'], coef(m20)['lMg2']), effect.lMg(coef(m21)['lMg'], coef(m21)['lMg2']), effect.lMg(coef(m22)['lMg'], coef(m22)['lMg2'])));
lines(lMg.grid, effect.lMg(coef(m21)['lMg'], coef(m21)['lMg2']), lwd=2, col="red")
lines(lMg.grid, effect.lMg(coef(m22)['lMg'], coef(m22)['lMg2']), lwd=2, col="yellow", lty="dotted") 
abline(h=0, lty="dotted")
legend(3.25, -0.1, col=c("blue", "red", "yellow"), lwd=c(2,2,2), lty=c("solid", "solid", "dotted"), legend=c("m20", "m21", "m22"), cex=1.3)

### Plotting the effect of Mg on the yield 
Mg.grid <- seq(min(Dris$Mg), max(Dris$Mg)-1, length = 1000); 
effect.Mg <- function(b1,b2) b1*(log2(Mg.grid+1) - log2(Mg.grid)) + b2*((log2(1+Mg.grid))^2 - (log2(Mg.grid))^2); 

plot(Mg.grid, effect.Mg(coef(m20)['lMg'], coef(m20)['lMg2']), col = "blue", xlab = "Mg", ylab = "Change of expected yield", main = "The effect of 10%-increase of Mg on the yield", type="l", lwd=2, ylim=range(effect.Mg(coef(m20)['lMg'], coef(m20)['lMg2']), effect.Mg(coef(m21)['lMg'], coef(m21)['lMg2']), effect.Mg(coef(m22)['lMg'], coef(m22)['lMg2'])));
lines(Mg.grid, effect.Mg(coef(m21)['lMg'], coef(m21)['lMg2']), lwd=2, col="red")
lines(Mg.grid, effect.Mg(coef(m22)['lMg'], coef(m22)['lMg2']), lwd=2, col="yellow", lty="dotted")
abline(h=0, lty="dotted")
legend(15, 0.3, col=c("blue", "red", "yellow"), lwd=c(2,2,2), lty=c("solid", "solid", "dotted"), legend=c("m20", "m21", "m22"), cex=1.3)
rug(Dris$Mg+runif(nrow(Dris))-0.5);


### Assume that Ca and Mg influences only the expectation of 'yield' and the linear model holds. 
### How would you test that given 'Ca' the amount of 'yield' is independent of 'Mg'? 
### That is we would like to test that L(Y|Ca) is independent with L(Mg|Ca). 
summary(m01 <- lm(yield ~ lCa, data = Dris)) 
summary(m02 <- lm(yield ~ lCa+lCa2, data = Dris)) 

anova(m02, m22)
anova(m01, m21)

### Task: How would you test that given 'Mg' the amount of 'yield' is independent of 'Ca'? 


### Nest: we add variable lN into the model
summary(m211 <- lm(yield ~ lMg + lMg2 + lCa + lN, data = Dris)) 

### QUESTIONS:
### Can we say that the amount of 'yield' is independent of 'Ca'?  
### Can we say that given 'Mg' and 'Ca' the amount of 'yield' is independent of 'N'?  
### Can we say that given 'N' and 'Ca' the amount of 'yield' is independent of 'Mg'?  

summary(m011 <- lm(yield ~ lCa + lN, data = Dris)) 
anova(m011, m211)

### A simple interaction model 
summary(m1_inter <- lm(yield ~ lMg + lCa + lMg:lCa, data = Dris))

### QUESTIONS: 
### What is the interpretation of the regression coefficients?
### How would you quantify the effect of Mg on yield (when taking Ca into account)? 

lCa.grid <- seq(min(Dris$lCa), max(Dris$lCa), length = 1000); 
eps <- 0.1;
delta <- log2(1+eps);

par(mfrow = c(2, 1))
plot(lCa.grid, delta*coef(m1_inter)['lMg'] + delta*coef(m1_inter)['lMg:lCa'] * lCa.grid, col = "blue", xlab = "log2(Ca)", ylab = "Change of expected yield", main = "The effect of 10%-increase of Mg on the yield", type="l", lwd=2)
Ca.grid <- seq(min(Dris$Ca), max(Dris$Ca), length = 1000)
plot(Ca.grid, coef(m1_inter)['lMg'] * (log2(Mg.grid+1)-log2(Mg.grid)) + coef(m1_inter)['lMg:lCa'] * (log2(Mg.grid+1)-log2(Mg.grid)) * log2(Ca.grid), col = "blue", xlab = "Ca", ylab = "Change of expected yield", main = "The effect of increas of Mg by +1 on the yield", type="l", lwd=2)
par(mfrow = c(1, 1))


###  Data from the American Association of University Professors
###  (AAUP) annual faculty salary survey of American colleges and
###  universities, 1995.
###
###  Data are from 1161 colleges obtained (mostly) by a census method.


### Data
data(AAUP, package = "mffSM")

head(AAUP)
summary(AAUP)

###  Model for dependence of salary of the associate professor [salary.assoc]
###  on the following variables:
###  * number of full professors [n.prof]
###  * number of associate professors [n.assoc]
###  * number of assistant professors [n.assist]
###  * type of the institution [type]
###
### Our main aim will be to have a model which sufficiently well
### expresses E(salary.assoc|...) such that inference concerning
### its dependence on considered factors can be done.

### Create a subset dataset containing only needed
### or otherwise interesting variables
aaup <- subset(AAUP, select = c("FICE", "name", "state", "type", "n.prof", "n.assoc", "n.assist", "salary.assoc"))
summary(aaup)

### Descriptive statistics for the quantitative variables to be analyzed,
### also categorized by type

TABS <- list()
for (v in c("salary.assoc", "n.prof", "n.assoc", "n.assist")){
  MeanType <- tapply(aaup[, v], aaup[, "type"], mean, na.rm = TRUE)
  MedType  <- tapply(aaup[, v], aaup[, "type"], median, na.rm = TRUE)
  SDType   <- tapply(aaup[, v], aaup[, "type"], sd, na.rm = TRUE)    
  Q1Type   <- tapply(aaup[, v], aaup[, "type"], quantile, prob = 0.25, na.rm = TRUE)
  Q3Type   <- tapply(aaup[, v], aaup[, "type"], quantile, prob = 0.75, na.rm = TRUE)    
  IQRType  <- tapply(aaup[, v], aaup[, "type"], IQR, na.rm = TRUE)  
  NAsType <- tapply(is.na(aaup[, v]), aaup[, "type"], sum)
  
  Mean <- mean(aaup[, v], na.rm = TRUE)
  Med  <- median(aaup[, v], na.rm = TRUE)  
  SD   <- sd(aaup[, v], na.rm = TRUE)
  Q1   <- quantile(aaup[, v], prob = 0.25, na.rm = TRUE)
  Q3   <- quantile(aaup[, v], prob = 0.75, na.rm = TRUE)  
  IQR  <- IQR(aaup[, v], na.rm = TRUE)
  NAs <- sum(is.na(aaup[, v]))

  TABS[[v]] <- data.frame(Mean   = round(c(Mean, MeanType), 1),
                          SD     = round(c(SD, SDType), 1),
                          Median = round(c(Med, MedType), 1),
                          Q1     = round(c(Q1, Q1Type), 1),
                          Q3     = round(c(Q3, Q3Type), 1),
                          IQR    = round(c(IQR, IQRType), 1),
                          NAs    = round(c(NAs, NAsType), 1))
  rownames(TABS[[v]]) <- c("All", "I", "IIA", "IIB")
  colnames(TABS[[v]])[7] <- "NA's"
}  

print(TABS)


### Frequency table for the categorical variable
table(aaup[, "type"])
prop.table(table(aaup[, "type"]))
round(prop.table(table(aaup[, "type"])) * 100, 1)


### FYI only ###
### LaTeX tables. Perhaps some additional editing
### towards getting aesthetically sufficiently nice
### tables is still needed!
###
###  TO BE EXPLORED IN MORE DETAIL AT HOME
###  (IF BEING INTERESTED).
###
###  DURING THE EXERCISE CLASS, ONLY QUICKLY RUN IT
###  AND ADMIRE THE OUTPUT.
 
library("xtable")

### All choices default
print(xtable(TABS[["salary.assoc"]]))

### Some setting changed
print(xtable(TABS[["salary.assoc"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)),
      floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4))
print(xtable(TABS[["n.prof"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)),
      floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4))
print(xtable(TABS[["n.assoc"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)),
      floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4))
print(xtable(TABS[["n.assist"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)),
      floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4))


### Frequency table
(tTAB <- table(aaup[, "type"]))
(pTAB <- format(round(prop.table(tTAB) * 100, 1), nsmall = 1))
(tpTAB <- paste(tTAB, " (", pTAB, "\\%)", sep = ""))
names(tpTAB) <- names(tTAB)
print(tpTAB)

texTAB <- paste("\\begin{tabular}{rrr}\n",
                "\\hline\\hline\n",
                paste(paste("\\textbf{", names(tpTAB), "}", sep = ""), collapse = " & "), "\\\\\n",
                "\\hline\n",
                paste(tpTAB, collapse = " & "), "\\\\\n",
                "\\hline\\hline\n",
                "\\end{tabular}\n\n", sep = "")
cat(texTAB)
### End of LaTeX corner.
##################################################################################################


### Scatterplot matrix for quantitative variables at hand
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~salary.assoc + n.prof + n.assoc + n.assist,
                  reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = aaup, pch = 16, cex=0.25)

### Boxplots of salary.assoc by type
COL3 <- rainbow_hcl(3)
plot(salary.assoc ~ type, data = aaup, col = COL3, xlab = "Type of institution", ylab = "Salary associate professor [USD 100]")


### Scatterplots of Y versus all quantitative x's while distinguishing the points by type
XLAB <- c("Num. of professors", "Num. of assoc. prof.", "Num. of assistants")
names(XLAB) <- c("n.prof", "n.assoc", "n.assist")

BCOL3 <- rainbow_hcl(3)
COL3 <- c("red4", "darkgreen", "blue4")
PCH3 <- c(21, 22, 23)
names(COL3) <- names(BCOL3) <- names(PCH3) <- levels(aaup[, "type"])
layout(matrix(c(rep(1:3, each = 3), 4), nrow = 1))
for (v in c("n.prof", "n.assoc", "n.assist")){
  plot(formula(paste("salary.assoc ~", v)), data = aaup, pch = PCH3[type], col = COL3[type], bg = BCOL3[type],
       xlab = XLAB[v], ylab = "Salary [USD 100]", cex=0.5)
}
plot.new()
legend(0.1, 0.9, legend = levels(aaup[, "type"]), title = "Type", pch = PCH3, col = COL3, pt.bg = BCOL3, cex=1.25)
par(mfrow = c(1, 1))


### QUESTIONS:
### In case we would consider each quantitative regressor separately, would a simple line be a useful model?
### Is it likely that type of institution must be taken into account?
### Do you expect normal and/or homoscedastic errors?


### In the following, we will fit several models,
### not all necessarily including all considered covariates.
### To make those models comparable (by F-tests etc.) we
### create a dataset containing only those institutions
### for which all needed variables are available.
### Consequently, all fitted models will be based on
### the same dataset.
Data <- subset(aaup, complete.cases(aaup[, c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")]))
nrow(Data)
nrow(aaup)

### Which institutions are lost?
whichREM <- with(aaup, !complete.cases(aaup[, c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")]))
subset(aaup, whichREM)


### Simple additive model to start with ... 
summary(m1orig <- lm(salary.assoc ~ type + n.prof + n.assoc + n.assist, data = Data));

### QUESTIONS: 
### Interpret the estimated regression coefficients. What do the p-values in the last column of the table tell  us?
### Based on the above output compare the salary of assoc. professor at type IIA school and type IIB school. 
### Can we conclude that hiring one more professor would increase the salary of associate professors? 

l <- c(0,1,-1,0,0,0);
LSest(m1orig, l)

### Can you explain how is it possible that the Std. error in the above output 
### is smaller than Std. Errors in summary(m1) at rows starting by 'typeIIA' 
### and 'typeIIB'?

###  (Overall) medians of n.prof, n.assist are 40,
###  for n.assoc it is 38 (which is roughly 40 as well).
###
###  To get better interpretation of the regression coefficients
###  we create new covariates n.prof40, n.assoc40, n.assist40
###  obtained from subtracting 40 from the original ones.
###  "40" versions will then be used in models.
###
###  Which regression coefficients become influenced by this
###  transformation?

	
Data <- transform(Data, n.prof40 = n.prof - 40,
                        n.assoc40 = n.assoc - 40,
                        n.assist40 = n.assist - 40)


### Simple additive model with transformed variables 
m1 <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40, data = Data)
summary(m1)

### QUESTIONS: 
### Interpret the estimated regression coefficients. 
### Which parameters have different interpretation  in models m1 and m1orig?

summary(m1orig)
summary(m1)


### What is being tested here? 
m0 <- lm(salary.assoc ~ n.prof40 + n.assoc40 + n.assist40, data = Data)
anova(m0, m1);

### Let consider the contrast sum ("contr.sum:) for the varaible  'type'
m1B <- lm(salary.assoc ~ type +n.prof40 + n.assoc40 + n.assist40, data = Data, contrasts = list(type = "contr.sum"))
summary(m1B);

### QUESTION: Interpret the regression coefficients on the rows that start with 'type1' and 'type2' 
C <- contr.sum(3);
rownames(C) <- levels(Data$type);
colnames(C) <- names(coef(m1B))[2:3];
C

 
### Compare the following two summary outputs 
summary(m1B);
summary(m1);

### Basic diagnostic plots 
plotLM(m1)


### Slightly more complicated model?
summary(m2 <- lm(salary.assoc ~ type + n.prof40*n.assoc40 + n.assist40, data = Data)) 

### Based on the model m2 describe what is the effect of the numbers of professors on the salary of associate professors?

### Compare the following two outputs 
summary(m2)
summary(m2orig <- lm(salary.assoc ~ type + n.prof*n.assoc + n.assist, data = Data)) 


### Try to add the quadratic term of number of professors. 
summary(m3 <- lm(salary.assoc ~ type + n.prof40*n.assoc40 + I(n.prof40^2) + n.assist40, data = Data))

### Based on the model m3 describe what is the effect of the numbers of professors on the salary of associate professors?