### --------------------------------------------------------------------------------------### ### Winter Term 2019/2020 ### ### NMSA407 - Linear Regression ### ### ### ### EXERCISE #6 Additivity and partial effects of covariates ### ### (Simple linear regression models with interactions) ### ### ### ### --------------------------------------------------------------------------------------### # In K4/K10/K11 you might not be able to load the package 'mffSM' because of a low # version of R software (< 3.2.0). In this case tyou can run R software from the # network disc L, where the version 3.2.2. is installed. The path is # L:\Statistika\R\bin\x64\Rgui.exe . ### ====================== ### Working directory ### ====================== setwd("H:/nmsa407_LinRegr/") library("mffSM") ### ============================ ### Data we will be working on ### ============================ data(Dris, package = "mffSM") # help("Dris") head(Dris) summary(Dris) ### ======================================================= ### As in one of the previous exercise classes we will be ### interested in the dependence of yield on the amount of ### magnesium yield ~ Mg but now we include also other ### measurements to improve our model. ### ======================================================= ### Add transformations of Mg, Ca, N to the dataset Dris <- transform(Dris, lMg = log2(Mg), lMg2 = (log2(Mg))^2, lCa = log2(Ca), lCa2 = (log2(Ca))^2, lN = log2(N), lN2=(log2(N))^2) ### First, start with adding Ca in the form of log2(Ca) ### Let start with a simple additive model summary(m1_addit <- lm(yield ~ lMg + lCa, data = Dris)) ### QUESTIONS: ### Interpret the estimated regression coefficients. ### What is being tested on the rows started with 'lMg' and 'lCa'? ### Interpret the test on the last line of the output started with 'F-statistic'. ### Compare the above output with the following model output: summary(lm(yield ~ lMg, data = Dris)) ### QUESTION: ### What is the difference in interpretation of the regression coefficients? ### Interpret the following models that include also the quadratic term of lMg2. summary(m20 <- lm(yield ~ lMg + lMg2, data = Dris)) summary(m21 <- lm(yield ~ lMg + lMg2 + lCa, data = Dris)) summary(m22 <- lm(yield ~ lMg + lMg2 + lCa + lCa2, data = Dris)) ### The change of lMg by +log2(1.1) <=> the increase of Mg by 10% eps <- 0.1; delta <- log2(1+eps); lMg.grid <- seq(min(Dris$lMg), max(Dris$lMg)-delta, length = 1000); effect.lMg <- function(b1,b2) b1*delta + b2*(2*delta*lMg.grid + delta^2); ### Plotting the effect of lMg on the yield plot(lMg.grid, effect.lMg(coef(m20)['lMg'], coef(m20)['lMg2']), col = "blue", xlab = "log2(Mg)", ylab = "Change of expected yield", main = "The effect of 10%-increase of Mg on the yield", type="l", lwd=2, ylim=range(effect.lMg(coef(m20)['lMg'], coef(m20)['lMg2']), effect.lMg(coef(m21)['lMg'], coef(m21)['lMg2']), effect.lMg(coef(m22)['lMg'], coef(m22)['lMg2']))); lines(lMg.grid, effect.lMg(coef(m21)['lMg'], coef(m21)['lMg2']), lwd=2, col="red") lines(lMg.grid, effect.lMg(coef(m22)['lMg'], coef(m22)['lMg2']), lwd=2, col="yellow", lty="dotted") abline(h=0, lty="dotted") legend(3.25, -0.1, col=c("blue", "red", "yellow"), lwd=c(2,2,2), lty=c("solid", "solid", "dotted"), legend=c("m20", "m21", "m22"), cex=1.3) ### Plotting the effect of Mg on the yield Mg.grid <- seq(min(Dris$Mg), max(Dris$Mg)-1, length = 1000); effect.Mg <- function(b1,b2) b1*(log2(Mg.grid+1) - log2(Mg.grid)) + b2*((log2(1+Mg.grid))^2 - (log2(Mg.grid))^2); plot(Mg.grid, effect.Mg(coef(m20)['lMg'], coef(m20)['lMg2']), col = "blue", xlab = "Mg", ylab = "Change of expected yield", main = "The effect of 10%-increase of Mg on the yield", type="l", lwd=2, ylim=range(effect.Mg(coef(m20)['lMg'], coef(m20)['lMg2']), effect.Mg(coef(m21)['lMg'], coef(m21)['lMg2']), effect.Mg(coef(m22)['lMg'], coef(m22)['lMg2']))); lines(Mg.grid, effect.Mg(coef(m21)['lMg'], coef(m21)['lMg2']), lwd=2, col="red") lines(Mg.grid, effect.Mg(coef(m22)['lMg'], coef(m22)['lMg2']), lwd=2, col="yellow", lty="dotted") abline(h=0, lty="dotted") legend(15, 0.3, col=c("blue", "red", "yellow"), lwd=c(2,2,2), lty=c("solid", "solid", "dotted"), legend=c("m20", "m21", "m22"), cex=1.3) rug(Dris$Mg+runif(nrow(Dris))-0.5); ### Assume that Ca and Mg influences only the expectation of 'yield' and the linear model holds. ### How would you test that given 'Ca' the amount of 'yield' is independent of 'Mg'? ### That is we would like to test that L(Y|Ca) is independent with L(Mg|Ca). summary(m01 <- lm(yield ~ lCa, data = Dris)) summary(m02 <- lm(yield ~ lCa+lCa2, data = Dris)) anova(m02, m22) anova(m01, m21) ### Task: How would you test that given 'Mg' the amount of 'yield' is independent of 'Ca'? ### Nest: we add variable lN into the model summary(m211 <- lm(yield ~ lMg + lMg2 + lCa + lN, data = Dris)) ### QUESTIONS: ### Can we say that the amount of 'yield' is independent of 'Ca'? ### Can we say that given 'Mg' and 'Ca' the amount of 'yield' is independent of 'N'? ### Can we say that given 'N' and 'Ca' the amount of 'yield' is independent of 'Mg'? summary(m011 <- lm(yield ~ lCa + lN, data = Dris)) anova(m011, m211) ### A simple interaction model summary(m1_inter <- lm(yield ~ lMg + lCa + lMg:lCa, data = Dris)) ### QUESTIONS: ### What is the interpretation of the regression coefficients? ### How would you quantify the effect of Mg on yield (when taking Ca into account)? lCa.grid <- seq(min(Dris$lCa), max(Dris$lCa), length = 1000); eps <- 0.1; delta <- log2(1+eps); par(mfrow = c(2, 1)) plot(lCa.grid, delta*coef(m1_inter)['lMg'] + delta*coef(m1_inter)['lMg:lCa'] * lCa.grid, col = "blue", xlab = "log2(Ca)", ylab = "Change of expected yield", main = "The effect of 10%-increase of Mg on the yield", type="l", lwd=2) Ca.grid <- seq(min(Dris$Ca), max(Dris$Ca), length = 1000) plot(Ca.grid, coef(m1_inter)['lMg'] * (log2(Mg.grid+1)-log2(Mg.grid)) + coef(m1_inter)['lMg:lCa'] * (log2(Mg.grid+1)-log2(Mg.grid)) * log2(Ca.grid), col = "blue", xlab = "Ca", ylab = "Change of expected yield", main = "The effect of increas of Mg by +1 on the yield", type="l", lwd=2) par(mfrow = c(1, 1)) ### Data from the American Association of University Professors ### (AAUP) annual faculty salary survey of American colleges and ### universities, 1995. ### ### Data are from 1161 colleges obtained (mostly) by a census method. ### Data data(AAUP, package = "mffSM") head(AAUP) summary(AAUP) ### Model for dependence of salary of the associate professor [salary.assoc] ### on the following variables: ### * number of full professors [n.prof] ### * number of associate professors [n.assoc] ### * number of assistant professors [n.assist] ### * type of the institution [type] ### ### Our main aim will be to have a model which sufficiently well ### expresses E(salary.assoc|...) such that inference concerning ### its dependence on considered factors can be done. ### Create a subset dataset containing only needed ### or otherwise interesting variables aaup <- subset(AAUP, select = c("FICE", "name", "state", "type", "n.prof", "n.assoc", "n.assist", "salary.assoc")) summary(aaup) ### Descriptive statistics for the quantitative variables to be analyzed, ### also categorized by type TABS <- list() for (v in c("salary.assoc", "n.prof", "n.assoc", "n.assist")){ MeanType <- tapply(aaup[, v], aaup[, "type"], mean, na.rm = TRUE) MedType <- tapply(aaup[, v], aaup[, "type"], median, na.rm = TRUE) SDType <- tapply(aaup[, v], aaup[, "type"], sd, na.rm = TRUE) Q1Type <- tapply(aaup[, v], aaup[, "type"], quantile, prob = 0.25, na.rm = TRUE) Q3Type <- tapply(aaup[, v], aaup[, "type"], quantile, prob = 0.75, na.rm = TRUE) IQRType <- tapply(aaup[, v], aaup[, "type"], IQR, na.rm = TRUE) NAsType <- tapply(is.na(aaup[, v]), aaup[, "type"], sum) Mean <- mean(aaup[, v], na.rm = TRUE) Med <- median(aaup[, v], na.rm = TRUE) SD <- sd(aaup[, v], na.rm = TRUE) Q1 <- quantile(aaup[, v], prob = 0.25, na.rm = TRUE) Q3 <- quantile(aaup[, v], prob = 0.75, na.rm = TRUE) IQR <- IQR(aaup[, v], na.rm = TRUE) NAs <- sum(is.na(aaup[, v])) TABS[[v]] <- data.frame(Mean = round(c(Mean, MeanType), 1), SD = round(c(SD, SDType), 1), Median = round(c(Med, MedType), 1), Q1 = round(c(Q1, Q1Type), 1), Q3 = round(c(Q3, Q3Type), 1), IQR = round(c(IQR, IQRType), 1), NAs = round(c(NAs, NAsType), 1)) rownames(TABS[[v]]) <- c("All", "I", "IIA", "IIB") colnames(TABS[[v]])[7] <- "NA's" } print(TABS) ### Frequency table for the categorical variable table(aaup[, "type"]) prop.table(table(aaup[, "type"])) round(prop.table(table(aaup[, "type"])) * 100, 1) ### FYI only ### ### LaTeX tables. Perhaps some additional editing ### towards getting aesthetically sufficiently nice ### tables is still needed! ### ### TO BE EXPLORED IN MORE DETAIL AT HOME ### (IF BEING INTERESTED). ### ### DURING THE EXERCISE CLASS, ONLY QUICKLY RUN IT ### AND ADMIRE THE OUTPUT. library("xtable") ### All choices default print(xtable(TABS[["salary.assoc"]])) ### Some setting changed print(xtable(TABS[["salary.assoc"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) print(xtable(TABS[["n.prof"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) print(xtable(TABS[["n.assoc"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) print(xtable(TABS[["n.assist"]], align = c("l", rep("r", 7)), digits = c(0, rep(1, 6), 0)), floating = FALSE, hline.after = c(-1, -1, 0, 1, 4, 4)) ### Frequency table (tTAB <- table(aaup[, "type"])) (pTAB <- format(round(prop.table(tTAB) * 100, 1), nsmall = 1)) (tpTAB <- paste(tTAB, " (", pTAB, "\\%)", sep = "")) names(tpTAB) <- names(tTAB) print(tpTAB) texTAB <- paste("\\begin{tabular}{rrr}\n", "\\hline\\hline\n", paste(paste("\\textbf{", names(tpTAB), "}", sep = ""), collapse = " & "), "\\\\\n", "\\hline\n", paste(tpTAB, collapse = " & "), "\\\\\n", "\\hline\\hline\n", "\\end{tabular}\n\n", sep = "") cat(texTAB) ### End of LaTeX corner. ################################################################################################## ### Scatterplot matrix for quantitative variables at hand palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5))) scatterplotMatrix(~salary.assoc + n.prof + n.assoc + n.assist, reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = aaup, pch = 16, cex=0.25) ### Boxplots of salary.assoc by type COL3 <- rainbow_hcl(3) plot(salary.assoc ~ type, data = aaup, col = COL3, xlab = "Type of institution", ylab = "Salary associate professor [USD 100]") ### Scatterplots of Y versus all quantitative x's while distinguishing the points by type XLAB <- c("Num. of professors", "Num. of assoc. prof.", "Num. of assistants") names(XLAB) <- c("n.prof", "n.assoc", "n.assist") BCOL3 <- rainbow_hcl(3) COL3 <- c("red4", "darkgreen", "blue4") PCH3 <- c(21, 22, 23) names(COL3) <- names(BCOL3) <- names(PCH3) <- levels(aaup[, "type"]) layout(matrix(c(rep(1:3, each = 3), 4), nrow = 1)) for (v in c("n.prof", "n.assoc", "n.assist")){ plot(formula(paste("salary.assoc ~", v)), data = aaup, pch = PCH3[type], col = COL3[type], bg = BCOL3[type], xlab = XLAB[v], ylab = "Salary [USD 100]", cex=0.5) } plot.new() legend(0.1, 0.9, legend = levels(aaup[, "type"]), title = "Type", pch = PCH3, col = COL3, pt.bg = BCOL3, cex=1.25) par(mfrow = c(1, 1)) ### QUESTIONS: ### In case we would consider each quantitative regressor separately, would a simple line be a useful model? ### Is it likely that type of institution must be taken into account? ### Do you expect normal and/or homoscedastic errors? ### In the following, we will fit several models, ### not all necessarily including all considered covariates. ### To make those models comparable (by F-tests etc.) we ### create a dataset containing only those institutions ### for which all needed variables are available. ### Consequently, all fitted models will be based on ### the same dataset. Data <- subset(aaup, complete.cases(aaup[, c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")])) nrow(Data) nrow(aaup) ### Which institutions are lost? whichREM <- with(aaup, !complete.cases(aaup[, c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")])) subset(aaup, whichREM) ### Simple additive model to start with ... summary(m1orig <- lm(salary.assoc ~ type + n.prof + n.assoc + n.assist, data = Data)); ### QUESTIONS: ### Interpret the estimated regression coefficients. What do the p-values in the last column of the table tell us? ### Based on the above output compare the salary of assoc. professor at type IIA school and type IIB school. ### Can we conclude that hiring one more professor would increase the salary of associate professors? l <- c(0,1,-1,0,0,0); LSest(m1orig, l) ### Can you explain how is it possible that the Std. error in the above output ### is smaller than Std. Errors in summary(m1) at rows starting by 'typeIIA' ### and 'typeIIB'? ### (Overall) medians of n.prof, n.assist are 40, ### for n.assoc it is 38 (which is roughly 40 as well). ### ### To get better interpretation of the regression coefficients ### we create new covariates n.prof40, n.assoc40, n.assist40 ### obtained from subtracting 40 from the original ones. ### "40" versions will then be used in models. ### ### Which regression coefficients become influenced by this ### transformation? Data <- transform(Data, n.prof40 = n.prof - 40, n.assoc40 = n.assoc - 40, n.assist40 = n.assist - 40) ### Simple additive model with transformed variables m1 <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40, data = Data) summary(m1) ### QUESTIONS: ### Interpret the estimated regression coefficients. ### Which parameters have different interpretation in models m1 and m1orig? summary(m1orig) summary(m1) ### What is being tested here? m0 <- lm(salary.assoc ~ n.prof40 + n.assoc40 + n.assist40, data = Data) anova(m0, m1); ### Let consider the contrast sum ("contr.sum:) for the varaible 'type' m1B <- lm(salary.assoc ~ type +n.prof40 + n.assoc40 + n.assist40, data = Data, contrasts = list(type = "contr.sum")) summary(m1B); ### QUESTION: Interpret the regression coefficients on the rows that start with 'type1' and 'type2' C <- contr.sum(3); rownames(C) <- levels(Data$type); colnames(C) <- names(coef(m1B))[2:3]; C ### Compare the following two summary outputs summary(m1B); summary(m1); ### Basic diagnostic plots plotLM(m1) ### Slightly more complicated model? summary(m2 <- lm(salary.assoc ~ type + n.prof40*n.assoc40 + n.assist40, data = Data)) ### Based on the model m2 describe what is the effect of the numbers of professors on the salary of associate professors? ### Compare the following two outputs summary(m2) summary(m2orig <- lm(salary.assoc ~ type + n.prof*n.assoc + n.assist, data = Data)) ### Try to add the quadratic term of number of professors. summary(m3 <- lm(salary.assoc ~ type + n.prof40*n.assoc40 + I(n.prof40^2) + n.assist40, data = Data)) ### Based on the model m3 describe what is the effect of the numbers of professors on the salary of associate professors?