### ----------------------------------------------------------------------- ###
### Winter Term 2018/2019                                                   ###
### NMSA407 - Linear Regression                                             ###
###                                                                         ###
### EXERCISE #8  ANOVA tables                                               ###
### ----------------------------------------------------------------------- ###

### Working directory (machine-specific path; adjust on other systems)
setwd("H:/nmsa407_LinRegr/")

library("mffSM")
library("car")

### Data from the American Association of University Professors (AAUP)
### annual faculty salary survey of American colleges and universities, 1995.
### The data are from 1161 colleges obtained (mostly) by a census method.
data(AAUP, package = "mffSM")
# help("AAUP")
head(AAUP)
summary(AAUP)

### Keep only the variables used in this exercise.
aaup <- subset(AAUP,
               select = c("FICE", "name", "state", "type",
                          "n.prof", "n.assoc", "n.assist", "salary.assoc"))

### In the following we fit several models, not all of which include all
### considered covariates. To make those models comparable (by F-tests etc.)
### we create a dataset containing only those institutions for which all
### needed variables are available, so that every fitted model is based on
### the same dataset.
keep_vars <- c("salary.assoc", "n.prof", "n.assoc", "n.assist", "type")
Data <- subset(aaup, complete.cases(aaup[, keep_vars]))
nrow(Data)

### Which institutions are lost?
whichREM <- !complete.cases(aaup[, keep_vars])
subset(aaup, whichREM)

### (Overall) medians of n.prof and n.assist are 40; for n.assoc it is 38
### (which is roughly 40 as well).
###
### To get a better interpretation of the regression coefficients we create
### new covariates n.prof40, n.assoc40, n.assist40 obtained by subtracting 40
### from the original ones. The "40" versions will then be used in the models.
###
### Which regression coefficients become influenced by this transformation?
Data <- transform(Data,
                  n.prof40   = n.prof   - 40,
                  n.assoc40  = n.assoc  - 40,
                  n.assist40 = n.assist - 40)

### A simple linear model without interactions
m1 <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40, data = Data)
summary(m1)
plotLM(m1)

### Slightly more complicated model (all two-way interactions)?
m2 <- lm(salary.assoc ~ (type + n.prof40 + n.assoc40 + n.assist40)^2,
         data = Data)
summary(m2)

### QUESTIONS:
### Interpret the estimated regression coefficients on the lines starting with
### '(Intercept)', 'typeIIA', 'typeIIB', 'n.prof40', 'n.assoc40', 'n.assist40'.
### Describe the effect of the number of professors on the salary of associate
### professors. Describe the effect of the type of the school on the salary of
### associate professors.
levels(Data$type)

## Convenience wrapper: fitted mean of m2 at a given covariate combination.
mu2 <- function(tp, nprof, nassoc, nassist) {
  predict(m2, newdata = data.frame(type = tp, n.prof40 = nprof,
                                   n.assoc40 = nassoc, n.assist40 = nassist))
}

## Intercept: fitted mean for a type I school with all centered covariates = 0
mu2("I", 0, 0, 0)
coef(m2)[1]

## typeIIA: difference IIA - I at centered covariates = 0
mu2("IIA", 0, 0, 0) - mu2("I", 0, 0, 0)
coef(m2)[2]

## n.prof40: effect of one more professor (type I, other covariates at 0)
x0 <- 171   # for instance; holds for any x0
mu2("I", x0 + 1, 0, 0) - mu2("I", x0, 0, 0)
coef(m2)[4]

## typeIIA:n.prof40: the same one-unit effect within type IIA schools
mu2("IIA", x0 + 1, 0, 0) - mu2("IIA", x0, 0, 0)
coef(m2)[4] + coef(m2)[7]

## n.prof40:n.assoc40: slope in n.prof40 when n.assoc40 = 1, and vice versa
mu2("I", x0 + 1, 1, 0) - mu2("I", x0, 1, 0)
coef(m2)[4] + coef(m2)[13]
mu2("I", 1, x0 + 1, 0) - mu2("I", 1, x0, 0)
coef(m2)[5] + coef(m2)[13]

### Type I (sequential) ANOVA table (standard 'anova' command)
(a1 <- anova(m2))
### QUESTIONS:
### Make sure that you understand what is being tested on each line of the
### output (for example, consider the line starting with 'type:n.assist40').
###
### On this line we test the following null model
### (i.e. the model that holds under the null hypothesis)
m_null <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40 +
               type:n.prof40 + type:n.assoc40,
             data = Data)
### against the alternative model (the model under the alternative hypothesis)
m_alt <- lm(salary.assoc ~ type + n.prof40 + n.assoc40 + n.assist40 +
              type:n.prof40 + type:n.assoc40 + type:n.assist40,
            data = Data)
(a_null_alt <- anova(m_null, m_alt))

### Note that the degrees of freedom and the sums of squares coincide with the
### output from anova(m2), but the F statistics and p-values differ slightly.
### The reason is that the estimates of the residual variance (MSe) differ,
### and hence also different denominator degrees of freedom are used.
### In anova(m2) we use the MSe from the model m2 (i.e.
### 2963012/1110) and the F-statistic is then given by
(32569 / 2) / (2963012 / 1110)
## recall that
(df_num <- m_null$df.residual - m_alt$df.residual)
(df_den <- m2$df.residual)
((deviance(m_null) - deviance(m_alt)) / df_num) / (deviance(m2) / df_den)

### More precisely (row 7 of a1 = 'type:n.assist40', row 11 = 'Residuals'):
(Fstat_anova_m2 <-
   (a1[["Sum Sq"]][7] / a1$Df[7]) / (a1[["Sum Sq"]][11] / a1$Df[11]))
### and the p-value is calculated as
1 - pf(Fstat_anova_m2, df1 = a1$Df[7], df2 = a1$Df[11])

### On the other hand, when comparing only the models m_null and m_alt
### (and not a series of models), MSe is taken from m_alt and thus
(32569 / 2) / (3022526 / 1113)
(df_num <- m_null$df.residual - m_alt$df.residual)
(df_den <- m_alt$df.residual)
((deviance(m_null) - deviance(m_alt)) / df_num) / (deviance(m_alt) / df_den)
### More precisely
(Fstat_anova_null_alt <-
   (a_null_alt[["Sum of Sq"]][2] / a_null_alt$Df[2]) /
   (a_null_alt$RSS[2] / a_null_alt$Res.Df[2]))
1 - pf(Fstat_anova_null_alt,
       df1 = a_null_alt$Df[2], df2 = a_null_alt$Res.Df[2])

#### Note that the sum of the "Sum Sq" column of a1
sum(a1[, "Sum Sq"])
### corresponds to the residual sum of squares of the intercept-only model
m0 <- lm(salary.assoc ~ 1, data = Data)
deviance(m0)
### which is just sum((Y - mean(Y))^2), or
crossprod(Data$salary.assoc - mean(Data$salary.assoc))
### or
with(Data, (length(salary.assoc) - 1) * var(salary.assoc))

Anova(m2, type = "I")   # does not exist (this call errors on purpose)

### Type II ANOVA table
### ('Anova' command comes from package car)
(a2 <- Anova(m2, type = "II"))   ## Type II ANOVA table

### QUESTIONS:
### Make sure that you understand what is being tested on each line of the
### output. Interpret the values given in the table.
### For example, consider the line starting with 'type:n.assoc40'.
### On this line we test the following null model
### (i.e. the model that holds under the null hypothesis)
m_null <- lm(salary.assoc ~ (type + n.prof40 + n.assoc40 + n.assist40)^2 -
               type:n.assoc40,
             data = Data)
### against the alternative model m2.
anova(m_null, m2)
a2[6, ]
### The alternative is the 'full' model m2; this output now corresponds to the
### line starting with 'type:n.assoc40' of the output from
### Anova(m2, type = "II").
### Analogously, the line with n.assoc40:n.assist40 coincides with the test
### from summary(m2).

### 2. The line starting with 'n.assoc40' is less straightforward.
###
### On this line we test the following null model
### (i.e. a model without any term containing 'n.assoc40')
m_null <- lm(salary.assoc ~ (type + n.prof40 + n.assist40)^2, data = Data)
### against the alternative model
m_alt <- lm(salary.assoc ~ (type + n.prof40 + n.assist40)^2 + n.assoc40,
            data = Data)
(a2_null_alt <- anova(m_null, m_alt))
a2[3, ]

### Note that the degrees of freedom and the sums of squares coincide with the
### output from Anova(m2, type = "II"), but the F statistics and p-values
### differ slightly. The reason is that the estimates of the residual variance
### (MSe) differ and hence also different denominator degrees of freedom are
### used. In Anova(m2, type = "II") we use the MSe from the model m2 (i.e.
### 2963012/1110)
### and the F-statistic is then given by
(18968 / 1) / (2963012 / 1110)
(df_num <- m_null$df.residual - m_alt$df.residual)
(df_den <- m2$df.residual)
((deviance(m_null) - deviance(m_alt)) / df_num) / (deviance(m2) / df_den)
### More precisely (row 3 of a2 = 'n.assoc40', row 11 = 'Residuals'):
(Fstat_anova2_m2 <-
   (a2[["Sum Sq"]][3] / a2$Df[3]) / (a2[["Sum Sq"]][11] / a2$Df[11]))
### and the p-value is calculated as
1 - pf(Fstat_anova2_m2, df1 = a2$Df[3], df2 = a2$Df[11])

### On the other hand, when comparing only the models m_null and m_alt
### (and not a series of models), MSe is taken from m_alt and thus
(18968 / 1) / (3135352 / 1114)
(df_num <- m_null$df.residual - m_alt$df.residual)
(df_den <- m_alt$df.residual)
((deviance(m_null) - deviance(m_alt)) / df_num) / (deviance(m_alt) / df_den)
### More precisely
(Fstat_anova_null_alt <-
   (a2_null_alt[["Sum of Sq"]][2] / a2_null_alt$Df[2]) /
   (a2_null_alt$RSS[2] / a2_null_alt$Res.Df[2]))
1 - pf(Fstat_anova_null_alt,
       df1 = a2_null_alt$Df[2], df2 = a2_null_alt$Res.Df[2])

### Type III ANOVA table
### ('Anova' command comes from package car)
(a3 <- Anova(m2, type = "III"))   # Type III ANOVA table

### QUESTIONS:
### Make sure that you understand what is being tested on each line of the
### output. Note that the lines with the interactions are the same as in
### Anova(m2, type = "II"). The lines with the numeric covariates correspond
### to the output of
summary(m2)
### as we test that the corresponding regression coefficient is zero.
### Such a test usually does not make any sense for e.g. 'n.assoc',
### as interactions with this variable are included.
### On the line with the categorical covariate 'type' we test the null model
(m_null <- lm(salary.assoc ~ (type + n.prof40 + n.assoc40 + n.assist40)^2 -
                type,
              data = Data))
### against the model m2
anova(m_null, m2)
a3[2, ]

### From which output below is it / is it NOT possible to conclude whether we
### need each particular interaction term?
anova(m2)
Anova(m2, type = "II")
Anova(m2, type = "III")

m_null <- lm(salary.assoc ~ (type + n.prof40 + n.assoc40 + n.assist40)^2 -
               type:n.prof40,
             data = Data)
anova(m_null, m2)
## This test is only to be found in the type II and type III tables.
## The type I table is useless for this purpose - it is sequential.

### Only the relevant type III F-tests:
drop1(m2)
drop1(m2, test = "F")
### The numbers are the same as the numbers on equally labeled rows of the
### type II and type III Anova outputs.

### Model m3: the highly non-significant interaction term of m2 is omitted.
m3 <- lm(salary.assoc ~ (n.prof40 + n.assoc40 + n.assist40) * type +
           n.prof40:n.assoc40 + n.assoc40:n.assist40,
         data = Data)
summary(m3)
plotLM(m3)

### Model m3 vs. m2 again.
anova(m3, m2)

### How about significance of the remaining terms in model m3?
drop1(m3, test = "F")

### Can you interpret the parameters under a different parameterization of
### the categorical covariate type?
m3Sum <- lm(salary.assoc ~ (n.prof40 + n.assoc40 + n.assist40) * type +
              n.prof40:n.assoc40 + n.assoc40:n.assist40,
            data = Data,
            contrasts = list(type = "contr.sum"))

### Contrast matrix (sum-to-zero coding)
Csum <- contr.sum(3)
colnames(Csum) <- paste0("type", c(1, 2))
rownames(Csum) <- levels(Data$type)
Csum
summary(m3Sum)

### QUESTIONS:
### Interpret the estimated regression coefficients on the lines starting with
### '(Intercept)', 'type1', 'type2', 'n.prof40', 'n.assoc40', 'n.assist40'.
### Describe the effect of the number of professors on the salary of associate
### professors. Describe the effect of the type of the school on the salary of
### associate professors.
(ICsum <- solve(cbind(1, Csum)))

## Convenience wrapper: fitted mean of m3Sum at a given covariate combination.
mu3s <- function(tp, nprof, nassoc, nassist) {
  predict(m3Sum, newdata = data.frame(type = tp, n.prof40 = nprof,
                                      n.assoc40 = nassoc, n.assist40 = nassist))
}

## Intercept: average of the three type-specific means (centered covariates 0)
(mu3s("I", 0, 0, 0) + mu3s("IIA", 0, 0, 0) + mu3s("IIB", 0, 0, 0)) / 3
coef(m3Sum)["(Intercept)"]

## type1: Intercept + type1 gives the mean for type I schools
mu3s("I", 0, 0, 0)
unname(coef(m3Sum)["(Intercept)"] + coef(m3Sum)["type1"])

## type2: Intercept + type2 gives the mean for type IIA schools
mu3s("IIA", 0, 0, 0)
unname(coef(m3Sum)["(Intercept)"] + coef(m3Sum)["type2"])

## n.prof40: slope in n.prof40 averaged over the three school types
x0 <- pi   # for any value of x0
(mu3s("I",   x0 + 1, 0, 0) - mu3s("I",   x0, 0, 0) +
   mu3s("IIA", x0 + 1, 0, 0) - mu3s("IIA", x0, 0, 0) +
   mu3s("IIB", x0 + 1, 0, 0) - mu3s("IIB", x0, 0, 0)) / 3
coef(m3Sum)["n.prof40"]

## n.prof40:type1: the n.prof40 slope specific to type I schools
mu3s("I", x0 + 1, 0, 0) - mu3s("I", x0, 0, 0)
coef(m3Sum)["n.prof40"] + coef(m3Sum)["n.prof40:type1"]

### Compare the following two outputs
summary(m3Sum)
summary(m3)
summary(m3Sum)$coef
summary(m3)$coef

### And how about this parameterization?
levels(Data[, "type"])
m3Poly <- lm(salary.assoc ~ (n.prof40 + n.assoc40 + n.assist40) * type +
               n.prof40:n.assoc40 + n.assoc40:n.assist40,
             data = Data,
             contrasts = list(type = "contr.poly"))
contr.poly(3)
summary(m3Poly)

### Does model m4 below and its comparison with m3 via F-test make sense?
### How would you interpret model m4?
Data <- transform(Data, ntype = as.numeric(type) - 1)
summary(Data[, c("type", "ntype")])
with(Data, table(type, ntype))

m4 <- lm(salary.assoc ~ (n.prof40 + n.assoc40 + n.assist40) * ntype +
           n.prof40:n.assoc40 + n.assoc40:n.assist40,
         data = Data)
summary(m4)
anova(m4, m3)

### Does model m5 below and its comparison with m3 via F-test make sense?
### How would you interpret model m5?
Data <- transform(Data, n.staff120 = n.prof40 + n.assoc40 + n.assist40)
summary(Data[, c("n.staff120", "n.prof40", "n.assoc40", "n.assist40")])

m5 <- lm(salary.assoc ~ n.staff120 * type +
           n.prof40:n.assoc40 + n.assoc40:n.assist40,
         data = Data)
summary(m5)
anova(m5, m3)

### In the following we consider m3 (or any of its equivalent
### parameterizations) as a useful model and use it for statistical inference.
m3Form <- formula(salary.assoc ~ (n.prof40 + n.assoc40 + n.assist40) * type +
                    n.prof40:n.assoc40 + n.assoc40:n.assist40)
m3     <- lm(m3Form, data = Data)
m3Sum  <- lm(m3Form, data = Data, contrasts = list(type = "contr.sum"))
m3Poly <- lm(m3Form, data = Data, contrasts = list(type = "contr.poly"))
summary(m3)
summary(m3Sum)
summary(m3Poly)

### SOME QUESTIONS TO BE ANSWERED.
###
### (1) Can it be claimed that the expected salary at type IIA institutions
###     with 40 full professors, 40 associate professors and 40 assistant
###     professors is by more than 1000 USD higher than at type IIB
###     institutions of the same size? Supplement the statistical test by a
###     relevant point and interval estimate.
### NOTE(review): theta0 = 10 below presumably because salaries are recorded
### in hundreds of USD - confirm with help("AAUP").
coef(m3)
l1 <- c(1, 0, 0, 0, 1, 0, rep(0, 8))   # mean for type IIA, covariates at 40
l2 <- c(1, 0, 0, 0, 0, 1, rep(0, 8))   # mean for type IIB, covariates at 40
LSest(m3, L = l1)
LSest(m3, L = l2)
(l <- l1 - l2)
LSest(m3, L = l, alternative = "greater", theta0 = 10)

### (2) What is the answer to the above question if there are 50/50/50
###     full professors/associate professors/assistant professors
###     working at institutions under comparison?
###
## 50 of each rank means centered covariates equal to 10 (= 50 - 40).
l1 <- c(1, 10, 10, 10, 1, 0, 10, 0, 10, 0, 10, 0, 100, 100)
l2 <- c(1, 10, 10, 10, 0, 1, 0, 10, 0, 10, 0, 10, 100, 100)
LSest(m3, L = l1)
LSest(m3, L = l2)
(l <- l1 - l2)
LSest(m3, L = l, alternative = "greater", theta0 = 10)

### (3) Provide an estimate (both point and interval) of the expected salary
###     at an institution of type IIA with 75 full profs, 60 assoc. profs
###     and 50 assistant profs.
### (Covariates are centered at 40, hence 35/20/10 below.)
prof <- 35
asoc <- 20
assi <- 10
(l <- c(1, prof, asoc, assi, 1, 0, prof, 0, asoc, 0, assi, 0,
        prof * asoc, asoc * assi))
LSest(m3, L = l)

### (4) Provide an interval prediction of the salary at an institution of
###     type IIA with 75 full profs, 60 assoc. profs, 50 assistant profs.
predict(m3,
        newdata = data.frame(type = "IIA", n.prof40 = 35,
                             n.assoc40 = 20, n.assist40 = 10),
        interval = "prediction", se.fit = TRUE)

### Using model m3 (parameterization 'contr.treatment'):
### one row of L per question.
L <- matrix(c(0, 0, 0, 0, 1, -1, rep(0, 8),
              0, 0, 0, 0, 1, -1, rep(c(10, -10), 3), 0, 0,
              1, 35, 20, 10, 1, 0, 35, 0, 20, 0, 10, 0, 35 * 20, 20 * 10),
            nrow = 3, byrow = TRUE)
colnames(L) <- names(coef(m3))
rownames(L) <- c("Question 1", "Question 2", "Question 3")
print(L)

### Which one answers which of the above questions?
LSest(m3, L = L, theta0 = 10)
LSest(m3, L = L, alternative = "greater", theta0 = 10)

pdata <- data.frame(n.prof40 = 35, n.assoc40 = 20, n.assist40 = 10,
                    type = factor("IIA", levels = c("I", "IIA", "IIB")))
print(pdata)
predict(m3, newdata = pdata, interval = "confidence")
predict(m3, newdata = pdata, interval = "prediction")

### Basic residual plots for m3.
plotLM(m3)