### --------------------------------------------------------------------------------------###
###                              Winter Term 2019/2020                                     ###
###                           NMSA407 - Linear Regression                                  ###
###                                                                                        ###
###              EXERCISE #6  Additivity and partial effects of covariates                 ###
###              (Simple linear regression models with interactions)                       ###
### --------------------------------------------------------------------------------------###

# In K4/K10/K11 you might not be able to load the package 'mffSM' because of a low
# version of the R software (< 3.2.0). In this case you can run R from the
# network disc L, where version 3.2.2 is installed. The path is
# L:\Statistika\R\bin\x64\Rgui.exe .

### ======================
### Working directory
### ======================
setwd("H:/nmsa407_LinRegr/")
library("mffSM")

### ============================
### Data we will be working on
### ============================
data(Dris, package = "mffSM")
# help("Dris")

head(Dris)
summary(Dris)

### =======================================================
### As in one of the previous exercise classes we will be
### interested in the dependence of yield on the amount of
### magnesium (yield ~ Mg), but now we also include other
### measurements to improve the model.
### =======================================================

### Add transformations of Mg, Ca, N to the dataset
Dris <- transform(Dris, lMg = log2(Mg), lMg2 = (log2(Mg))^2,
                        lCa = log2(Ca), lCa2 = (log2(Ca))^2,
                        lN  = log2(N),  lN2  = (log2(N))^2)

### First, we add Ca in the form of log2(Ca).
### Let's start with a simple additive model
summary(m1_addit <- lm(yield ~ lMg + lCa, data = Dris))

### QUESTIONS:
### Interpret the estimated regression coefficients.
### What is being tested on the rows starting with 'lMg' and 'lCa'?
### Interpret the test on the last line of the output starting with 'F-statistic'.

### Submodel 1: yield ~ lMg
m1_p1 <- lm(yield ~ lMg, data = Dris)
anova(m1_p1, m1_addit)
summary(m1_addit)$coef[3,3]^2   # squared t-statistic of lCa = the F-statistic above

### Submodel 2: yield ~ lCa
m1_p2 <- lm(yield ~ lCa, data = Dris)
anova(m1_p2, m1_addit)
summary(m1_addit)$coef[2,3]^2   # squared t-statistic of lMg = the F-statistic above

### Submodel 3: yield ~ 1
m1_p3 <- lm(yield ~ 1, data = Dris)
anova(m1_p3, m1_addit)
summary(m1_addit)$fstatistic    # the overall F-test from the summary output

### Compare the above output with the following model output:
summary(m1_addit)
summary(lm(yield ~ lMg, data = Dris))

### QUESTION:
### What is the difference in interpretation of the regression coefficients?
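### A small sketch (not part of the original analysis) of why the marginal and
### partial slopes of lMg differ: if lMg and lCa are correlated, part of the
### marginal association of lMg with yield is attributed to lCa once lCa is
### included in the model.
cor(Dris$lMg, Dris$lCa)                      # sample correlation of the regressors
coef(lm(yield ~ lMg, data = Dris))["lMg"]    # marginal slope of lMg
coef(m1_addit)["lMg"]                        # partial slope of lMg, given lCa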
### Interpret the following models that also include the quadratic term lMg2.
summary(m20 <- lm(yield ~ lMg + lMg2, data = Dris))
summary(m21 <- lm(yield ~ lMg + lMg2 + lCa, data = Dris))
summary(m22 <- lm(yield ~ lMg + lMg2 + lCa + lCa2, data = Dris))

### The change of lMg by +log2(1.1) <=> the increase of Mg by 10%
eps <- 0.1
delta <- log2(1 + eps)
lMg.grid <- seq(min(Dris$lMg), max(Dris$lMg) - delta, length = 1000)
### change of b1*x + b2*x^2 when x increases by delta
effect.lMg <- function(b1, b2) b1*delta + b2*(2*delta*lMg.grid + delta^2)

### Plotting the effect of lMg on the yield
plot(lMg.grid, effect.lMg(coef(m20)['lMg'], coef(m20)['lMg2']), col = "blue",
     xlab = "log2(Mg)", ylab = "Change of expected yield",
     main = "The effect of 10%-increase of Mg on the yield",
     type = "l", lwd = 2,
     ylim = range(effect.lMg(coef(m20)['lMg'], coef(m20)['lMg2']),
                  effect.lMg(coef(m21)['lMg'], coef(m21)['lMg2']),
                  effect.lMg(coef(m22)['lMg'], coef(m22)['lMg2'])))
lines(lMg.grid, effect.lMg(coef(m21)['lMg'], coef(m21)['lMg2']), lwd = 2, col = "red")
lines(lMg.grid, effect.lMg(coef(m22)['lMg'], coef(m22)['lMg2']), lwd = 2, col = "yellow", lty = "dotted")
abline(h = 0, lty = "dotted")
legend("topright", col = c("blue", "red", "yellow"), lwd = c(2, 2, 2),
       lty = c("solid", "solid", "dotted"), legend = c("m20", "m21", "m22"), cex = 1.2)
rug(log2(Dris$Mg + runif(nrow(Dris)) - 0.5))   # jittered observed values of lMg

### Coefficients of the models: compare with the plot
coefs <- rbind(c(coef(m20), 0, 0), c(coef(m21), 0), coef(m22))
rownames(coefs) <- c("m20", "m21", "m22")
colnames(coefs) <- names(coef(m22))
coefs

### Plotting the effect of Mg on the yield
Mg.grid <- seq(min(Dris$Mg), max(Dris$Mg) - 1, length = 1000)
### change of the expected yield when Mg increases by +1
effect.Mg <- function(b1, b2) b1*(log2(Mg.grid + 1) - log2(Mg.grid)) +
                              b2*((log2(Mg.grid + 1))^2 - (log2(Mg.grid))^2)
plot(Mg.grid, effect.Mg(coef(m20)['lMg'], coef(m20)['lMg2']), col = "blue",
     xlab = "Mg", ylab = "Change of expected yield",
     main = "The effect of increase of Mg by +1 on the yield",
     type = "l", lwd = 2,
     ylim = range(effect.Mg(coef(m20)['lMg'], coef(m20)['lMg2']),
                  effect.Mg(coef(m21)['lMg'], coef(m21)['lMg2']),
                  effect.Mg(coef(m22)['lMg'], coef(m22)['lMg2'])))
lines(Mg.grid, effect.Mg(coef(m21)['lMg'], coef(m21)['lMg2']), lwd = 2, col = "red")
lines(Mg.grid, effect.Mg(coef(m22)['lMg'], coef(m22)['lMg2']), lwd = 2, col = "yellow", lty = "dotted")
abline(h = 0, lty = "dotted")
legend(15, 0.3, col = c("blue", "red", "yellow"), lwd = c(2, 2, 2),
       lty = c("solid", "solid", "dotted"), legend = c("m20", "m21", "m22"), cex = 1.2)
rug(Dris$Mg + runif(nrow(Dris)) - 0.5)         # jittered observed values of Mg

### Assume that Ca and Mg influence only the expectation of 'yield' and the linear model holds.
### How would you test that, given 'Ca', the amount of 'yield' is independent of 'Mg'?
### That is, we would like to test that L(Y | Ca, Mg) = L(Y | Ca), i.e. that Y and Mg
### are conditionally independent given Ca.
### (Here L means "Law", i.e. the probability distribution.)
summary(m01 <- lm(yield ~ lCa, data = Dris))
summary(m02 <- lm(yield ~ lCa + lCa2, data = Dris))

# test of model H0: yield ~ lCa + lCa2 against H1: yield ~ lCa + lCa2 + lMg + lMg2
anova(m02, m22)
# we reject H0, i.e. the larger model is needed

# test of model H0: yield ~ lCa against H1: yield ~ lCa + lMg + lMg2
anova(m01, m21)
# again, we reject H0, i.e. yield depends on lMg given lCa

### Task: How would you test that, given 'Mg', the amount of 'yield' is independent of 'Ca'?
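### One possible answer to the Task (a sketch): compare the model containing only
### the Mg terms against the models that add the Ca terms; the F-test then tests
### the Ca coefficients given Mg.
anova(m20, m21)   # H0: yield ~ lMg + lMg2  against  H1: + lCa
anova(m20, m22)   # H0: yield ~ lMg + lMg2  against  H1: + lCa + lCa2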
### Next: we add the variable lN into the model
summary(m211 <- lm(yield ~ lMg + lMg2 + lCa + lN, data = Dris))

### QUESTIONS:
### Can we say that the amount of 'yield' is independent of 'Ca'?
### => unconditional (marginal) analysis: we ask directly about the relation of yield and lCa
summary(m01)
summary(m02)

### Visualisation
plot(yield ~ lCa, data = Dris, col = "orange", pch = 21, bg = "red",
     main = "Marginal relation of yield and lCa")
abline(m01, col = "darkgreen", lwd = 2)
lCa.grid <- seq(min(Dris$lCa), max(Dris$lCa), length = 1000)
lines(lCa.grid, m02$coef[1] + m02$coef[2]*lCa.grid + m02$coef[3]*lCa.grid^2,
      col = "coral", lwd = 3)
legend("topright", c("m01", "m02"), lwd = c(2, 3), col = c("darkgreen", "coral"))

### Can we say that, given 'Mg' and 'Ca', the amount of 'yield' is independent of 'N'?
summary(m211)
anova(m21, m211)

### Can we say that, given 'N' and 'Ca', the amount of 'yield' is independent of 'Mg'?
summary(m011 <- lm(yield ~ lCa + lN, data = Dris))
anova(m011, m211)

### Interactions
### ------------

### A simple interaction model
summary(m1_inter <- lm(yield ~ lMg + lCa + lMg:lCa, data = Dris))

### Interaction columns are given as the product of the corresponding main effect columns
head(X <- model.matrix(m1_inter))
all.equal(X[,2]*X[,3], X[,4])

### Visualisation of the fitted regression surface - contours
lMg.grid <- seq(min(Dris$lMg), max(Dris$lMg), length = 100)
lCa.grid <- seq(min(Dris$lCa), max(Dris$lCa), length = 100)
Yhat <- outer(lMg.grid, lCa.grid,
              function(x, y) predict(m1_inter, newdata = data.frame(lMg = x, lCa = y)))
contour(lMg.grid, lCa.grid, Yhat, xlab = "log2(Mg)", ylab = "log2(Ca)")
title("Contours of the fitted \n regression surface (with interactions)")
cols <- gray.colors(nrow(Dris), start = 0.1, end = 0.5, gamma = 2.2,
                    alpha = NULL)[round(rank(Dris$yield))]
points(Dris$lMg, Dris$lCa, cex = Dris$yield/3, col = cols, pch = 16)
###
### Bigger and lighter dots represent larger values of the observed yield.

### The same figure for the model without interactions
Yhat <- outer(lMg.grid, lCa.grid,
              function(x, y) predict(m1_addit, newdata = data.frame(lMg = x, lCa = y)))
contour(lMg.grid, lCa.grid, Yhat, xlab = "log2(Mg)", ylab = "log2(Ca)")
title("Contours of the fitted \n regression surface (without interactions)")
points(Dris$lMg, Dris$lCa, cex = Dris$yield/3, col = cols, pch = 16)

### QUESTIONS:
### What is the interpretation of the regression coefficients?
### How would you quantify the effect of Mg on yield (when taking Ca into account)?
eps <- 0.1
delta <- log2(1 + eps)
par(mfrow = c(2, 1))
### In the interaction model, the effect of a 10%-increase of Mg depends on Ca:
plot(lCa.grid, delta*coef(m1_inter)['lMg'] + delta*coef(m1_inter)['lMg:lCa'] * lCa.grid,
     col = "blue", xlab = "log2(Ca)", ylab = "Change of expected yield",
     main = "The effect of 10%-increase of Mg \n on the yield", type = "l", lwd = 2)
### The effect of increasing Mg by +1 depends on both Mg and Ca;
### in this plot Mg and Ca vary jointly along their respective grids:
Ca.grid <- seq(min(Dris$Ca), max(Dris$Ca), length = 1000)
plot(Ca.grid, coef(m1_inter)['lMg'] * (log2(Mg.grid + 1) - log2(Mg.grid)) +
       coef(m1_inter)['lMg:lCa'] * (log2(Mg.grid + 1) - log2(Mg.grid)) * log2(Ca.grid),
     col = "blue", xlab = "Ca", ylab = "Change of expected yield",
     main = "The effect of increase of Mg by +1 \n on the yield", type = "l", lwd = 2)
par(mfrow = c(1, 1))

summary(m1_inter)
### Test whether the interaction is significant
anova(m1_addit, m1_inter)
### Test whether lCa is significant
### (note: dropping lCa from the interaction model removes both its main effect
### and the interaction term)
m10 <- lm(yield ~ lMg, data = Dris)
anova(m10, m1_inter)
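### Analogously (a sketch): test whether lMg is significant; dropping lMg from the
### interaction model again removes both its main effect and the interaction term.
anova(m01, m1_inter)   # H0: yield ~ lCa  against  H1: yield ~ lMg + lCa + lMg:lCa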
### Data from the American Association of University Professors
### (AAUP) annual faculty salary survey of American colleges and
### universities, 1995.
###
### Data are from 1161 colleges obtained (mostly) by a census method.

### Data
data(AAUP, package = "mffSM")
head(AAUP)
summary(AAUP)

### Model for the dependence of the salary of associate professors [salary.assoc]
### on the following variables:
###   * number of full professors      [n.prof]
###   * number of associate professors [n.assoc]
###   * number of assistant professors [n.assist]
###   * type of the institution        [type]
###
### Our main aim will be to have a model which sufficiently well
### expresses E(salary.assoc | ...) such that inference concerning
### its dependence on the considered factors can be done.

### Create a subset of the dataset containing only the needed
### or otherwise interesting variables
aaup <- subset(AAUP, select = c("FICE", "name", "state", "type",
                                "n.prof", "n.assoc", "n.assist", "salary.assoc"))
summary(aaup)

### Descriptive statistics for the quantitative variables to be analyzed,
### also categorized by type
TABS <- list()
for (v in c("salary.assoc", "n.prof", "n.assoc", "n.assist")){
  MeanType <- tapply(aaup[, v], aaup[, "type"], mean, na.rm = TRUE)
  MedType  <- tapply(aaup[, v], aaup[, "type"], median, na.rm = TRUE)
  SDType   <- tapply(aaup[, v], aaup[, "type"], sd, na.rm = TRUE)
  Q1Type   <- tapply(aaup[, v], aaup[, "type"], quantile, prob = 0.25, na.rm = TRUE)
  Q3Type   <- tapply(aaup[, v], aaup[, "type"], quantile, prob = 0.75, na.rm = TRUE)
  IQRType  <- tapply(aaup[, v], aaup[, "type"], IQR, na.rm = TRUE)
  NAsType  <- tapply(is.na(aaup[, v]), aaup[, "type"], sum)

  Mean   <- mean(aaup[, v], na.rm = TRUE)
  Med    <- median(aaup[, v], na.rm = TRUE)
  SD     <- sd(aaup[, v], na.rm = TRUE)
  Q1     <- quantile(aaup[, v], prob = 0.25, na.rm = TRUE)
  Q3     <- quantile(aaup[, v], prob = 0.75, na.rm = TRUE)
  IQRall <- IQR(aaup[, v], na.rm = TRUE)   # renamed so that the function IQR() is not shadowed
  NAs    <- sum(is.na(aaup[, v]))

  TABS[[v]] <- data.frame(Mean   = round(c(Mean, MeanType), 1),
                          SD     = round(c(SD, SDType), 1),
                          Median = round(c(Med, MedType), 1),
                          Q1     = round(c(Q1, Q1Type), 1),
                          Q3     = round(c(Q3, Q3Type), 1),
                          IQR    = round(c(IQRall, IQRType), 1),
                          NAs    = round(c(NAs, NAsType), 1))
  rownames(TABS[[v]]) <- c("All", "I", "IIA", "IIB")
  colnames(TABS[[v]])[7] <- "NA's"
}
print(TABS)

### Frequency table for the categorical variable
table(aaup[, "type"])
prop.table(table(aaup[, "type"]))
round(prop.table(table(aaup[, "type"])) * 100, 1)
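### FYI, a compact alternative (a sketch): per-type means in one call. Note that
### the formula interface of aggregate() drops a row whenever any of the listed
### variables is NA, so the results may differ slightly from TABS, where NAs are
### removed variable by variable.
aggregate(cbind(salary.assoc, n.prof, n.assoc, n.assist) ~ type,
          data = aaup, FUN = mean)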
library("xtable") ### All choices default print(xtable(TABS[["salary.assoc"]])) ### Some settings changed print(xtable(TABS[["salary.assoc"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) print(xtable(TABS[["n.prof"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) print(xtable(TABS[["n.assoc"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) print(xtable(TABS[["n.assist"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) ### Frequency table (tTAB <- table(aaup[, "type"])) (pTAB <- format(round(prop.table(tTAB) * 100, 1), nsmall = 1)) (tpTAB <- paste(tTAB, " (", pTAB, "\\%)", sep = "")) names(tpTAB) <- names(tTAB) print(tpTAB) texTAB <- paste("\\begin{tabular}{rrr}\n", "\\hline\\hline\n", paste(paste("\\textbf{", names(tpTAB), "}", sep = ""), collapse = " & "), "\\\\\n", "\\hline\n", paste(tpTAB, collapse = " & "), "\\\\\n", "\\hline\\hline\n", "\\end{tabular}\n\n", sep = "") cat(texTAB) ### End of LaTeX corner. ################################################################################################## ### Scatterplot matrix for quantitative variables at hand palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5))) scatterplotMatrix(~salary.assoc + n.prof + n.assoc + n.assist, reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = aaup, pch = 16, cex=0.25) ### Boxplots of salary.assoc by type COL3 <- rainbow_hcl(3) plot(salary.assoc ~ type, data = aaup, col = COL3, xlab = "Type of institution", ylab = "Salary associate professor [USD 100]") ### Scatterplots of Y versus all quantitative x's while distinguishing the points by type XLAB <- c("Num. of professors", "Num. of assoc. prof.", "Num. of assistants") names(XLAB) <- c("n.prof", "n.assoc", "n.assist") BCOL3 <- rainbow_hcl(3) COL3 <- c("red4", "darkgreen", "blue4") PCH3 <- c(21, 22, 23) names(COL3) <- names(BCOL3) <- names(PCH3) <- levels(aaup[, "type"]) layout(matrix(c(rep(1:3, each = 1), 4), nrow = 1)) for (v in c("n.prof", "n.assoc", "n.assist")){ plot(formula(paste("salary.assoc ~", v)), data = aaup, pch = PCH3[type], col = COL3[type], bg = BCOL3[type], xlab = XLAB[v], ylab = "Salary [USD 100]", cex=1.25) } plot.new() legend(0.1, 0.9, legend = levels(aaup[, "type"]), title = "Type", pch = PCH3, col = COL3, pt.bg = BCOL3, cex=1.25) par(mfrow = c(1, 1)) ### QUESTIONS: ### In case we would consider each quantitative regressor separately, ### would a simple line be a useful model? ### Is it likely that type of institution must be taken into account? ### Do you expect normal and/or homoscedastic errors? ### In the following, we will fit several models, ### not all necessarily including all considered covariates. ### To make those models comparable (by F-tests etc.) we ### create a dataset containing only those institutions ### for which all needed variables are available. ### Consequently, all fitted models will be based on ### the same dataset. Data <- subset(aaup, complete.cases(aaup[, c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")])) nrow(Data) nrow(aaup) ### Which institutions are lost? whichREM <- with(aaup, !complete.cases(aaup[, c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")])) subset(aaup, whichREM) ### Simple additive model to start with ... 
### Simple additive model to start with ...
summary(m1orig <- lm(salary.assoc ~ type + n.prof + n.assoc + n.assist, data = Data))

### QUESTIONS:
### Interpret the estimated regression coefficients.
### What do the p-values in the last column of the table tell us?

# For instance, the interpretation of the coefficient of n.prof in the two models:
summary(lm(salary.assoc ~ n.prof, data = Data))
summary(m1orig)

### Based on the above output compare the salary of an associate professor
### at a type IIA school and at a type IIB school.
m1orig

### Let's take the conditional expectations for:
### (arbitrary but fixed numbers follow)
n.p  <- 10        # number of professors
n.a1 <- -17.2*pi  # number of associate professors
n.a2 <- 19        # number of assistant professors

## Point estimates of the conditional expectations for type IIA and type IIB schools
lIIA <- c(1, 1, 0, n.p, n.a1, n.a2)
lIIB <- c(1, 0, 1, n.p, n.a1, n.a2)
lIIA - lIIB
## The difference does not depend on the numbers of professors, assoc. profs,
## or assist. profs at all.
## <= A consequence of additivity
l <- c(0, 1, -1, 0, 0, 0)
LSest(m1orig, l)

### Can you explain how it is possible that the Std. Error in the above output
### is smaller than the Std. Errors in summary(m1orig) on the rows starting with
### 'typeIIA' and 'typeIIB'?
summary(m1orig)$coef

### Compare with the following output
vcov(m1orig)
cov2cor(vcov(m1orig))
sqrt(l %*% vcov(m1orig) %*% l)
sqrt(vcov(m1orig)[2,2] + vcov(m1orig)[3,3] - 2*vcov(m1orig)[2,3])

### Can we conclude that hiring one more professor would increase
### the salary of associate professors?
coef(m1orig)

### To assess significance, a test of a submodel
m_sub <- lm(salary.assoc ~ type + n.assoc + n.assist, data = Data)
anova(m_sub, m1orig)
summary(m1orig)

### Alternatively, directly testing the regression coefficient;
### one-sided test
### H0: beta_n.prof <= 0 against H1: beta_n.prof > 0
LSest(m1orig, L = c(0, 0, 0, 1, 0, 0), alternative = "greater")

### (Overall) medians of n.prof, n.assist are 40,
### for n.assoc it is 38 (which is roughly 40 as well).
TABS

### To get a better interpretation of the regression coefficients
### we create new covariates n.prof40, n.assoc40, n.assist40
### obtained by subtracting 40 from the original ones.
### The "40" versions will then be used in the models.
###
### Which regression coefficients are affected by this
### transformation?
Data <- transform(Data, n.prof40   = n.prof - 40,
                        n.assoc40  = n.assoc - 40,
                        n.assist40 = n.assist - 40)

### Simple additive model with the transformed variables
m1 <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40, data = Data)
summary(m1)

### QUESTIONS:
### Interpret the estimated regression coefficients.
### Which parameters have a different interpretation in models m1 and m1orig?
summary(m1orig)$coef
summary(m1)$coef

### Model m1 is just a reparametrization of m1orig
l <- c(1, 0, 0, -40, -40, -40)   # intercept in m1orig
coef(m1) %*% l
coef(m1orig)[1]
LSest(m1, L = l)
LSest(m1orig, L = c(1, 0, 0, 0, 0, 0))

# the whole model transformed
(L <- unname(rbind(l, cbind(0, diag(5)))))
LSest(m1, L = L)
summary(m1orig)$coef
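### Sanity check (a sketch): since m1 is a reparametrization of m1orig, the two
### models must have identical fitted values.
all.equal(fitted(m1), fitted(m1orig))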
### What is being tested here?
m0 <- lm(salary.assoc ~ n.prof40 + n.assoc40 + n.assist40, data = Data)
anova(m0, m1)

### Let's consider the sum contrasts ("contr.sum") for the variable 'type'
m1B <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40, data = Data,
          contrasts = list(type = "contr.sum"))
summary(m1B)

### QUESTION: Interpret the regression coefficients on the rows starting
### with 'type1' and 'type2'
C <- contr.sum(3)
rownames(C) <- levels(Data$type)
colnames(C) <- names(coef(m1B))[2:3]
C

### Compare the following two summary outputs
summary(m1B)
summary(m1)

# Compare the output below with the two summaries above
(L <- cbind(1, C, 0, 0, 0))
LSest(m1B, L = L)

## In simple (one-way) ANOVA these regression coefficients were the same as
## the estimated group-specific means, and the mean of the group-specific means
## was the intercept. With additional covariates this no longer holds unconditionally.
(means <- tapply(Data$salary.assoc, Data$type, mean))

### Compare with the marginal model, where only type is taken into account
(coefs <- lm(salary.assoc ~ type, data = Data)$coef)
### In the marginal model the regression coefficients indeed reproduce
### the estimated group-specific means
cbind(1, contr.treatment(3)) %*% coefs

### Still, the two models m1 and m1B are equivalent
(C2 <- cbind(1, contr.treatment(3)))
(Cf <- solve(C2) %*% L)
(Cf <- unname(rbind(Cf, cbind(0, 0, 0, diag(3)))))
LSest(m1B, L = Cf)
## which is the same as the original output
summary(m1)$coef

### Basic diagnostic plots
plotLM(m1)

### A slightly more complicated model?
summary(m2 <- lm(salary.assoc ~ type + n.prof40*n.assoc40 + n.assist40, data = Data))

### Based on the model m2, describe the effect of the number of professors
### on the salary of associate professors.
## Let's take, for instance, the conditional expectations for:
n.p  <- 10       # number of professors
n.a1 <- 100*pi   # number of associate professors
n.a2 <- -15      # number of assistant professors
ty   <- levels(Data$type)[2]   # type of school
coef(m2) %*% c(1, 1, 0, 0, n.a1, n.a2, 0) + n.p*(coef(m2)[4] + n.a1*coef(m2)[7])
#
predict(m2, newdata = data.frame(type = ty, n.prof40 = n.p,
                                 n.assoc40 = n.a1, n.assist40 = n.a2))
## The relation is now more complex than in the additive model.

### Compare the following two outputs;
### the only difference is the shift of the covariates by 40
summary(m2)
summary(m2orig <- lm(salary.assoc ~ type + n.prof*n.assoc + n.assist, data = Data))

### The models are still equivalent. For instance, the coefficient of n.prof in m2orig:
m2$coef[4] - 40*m2$coef[7]
m2orig$coef[4]
# or the intercept of m2orig:
m2$coef %*% c(1, 0, 0, rep(-40, 3), 40^2)
m2orig$coef[1]

### Try to add a quadratic term in the number of professors.
summary(m3 <- lm(salary.assoc ~ type + n.prof40*n.assoc40 + I(n.prof40^2) + n.assist40,
                 data = Data))

### Based on the model m3, describe the effect of the number of professors
### on the salary of associate professors.
coef(m3)
#
n.p  <- 10
n.a1 <- 100
n.a2 <- -15
ty   <- levels(Data$type)[2]
#
coef(m3) %*% c(1, 1, 0, 0, n.a1, 0, n.a2, 0) +
  n.p*(coef(m3)[4] + n.a1*coef(m3)[8] + n.p*coef(m3)[6])
predict(m3, newdata = data.frame(type = ty, n.prof40 = n.p,
                                 n.assoc40 = n.a1, n.assist40 = n.a2))
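### A sketch, following the Dris plots above: the estimated change of the expected
### salary when one more professor is hired, as a function of the current number
### of professors. Here n.assoc is fixed at its median, 38 (an arbitrary choice;
### any other fixed value could be used, and the remaining covariates cancel out).
np.grid <- seq(min(Data$n.prof40), max(Data$n.prof40) - 1, length = 1000)
na40 <- 38 - 40   # n.assoc fixed at its median 38, on the "40" scale
eff.prof <- coef(m3)["n.prof40"] +
            coef(m3)["I(n.prof40^2)"] * (2*np.grid + 1) +
            coef(m3)["n.prof40:n.assoc40"] * na40
plot(np.grid + 40, eff.prof, type = "l", lwd = 2, col = "blue",
     xlab = "Number of professors", ylab = "Change of expected salary [USD 100]",
     main = "The effect of one additional professor (model m3)")
abline(h = 0, lty = "dotted")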