### --------------------------------------------------------------------------------------###
### Winter Term 2017/2018                                                                  ###
### NMSA407 - Linear Regression                                                            ###
###                                                                                        ###
### EXERCISE #10  Generalized Least Squares, Breusch-Pagan test, Box-Cox transformation,   ###
###               and VIF (Variance inflation factor)                                      ###
###                                                                                        ###
### --------------------------------------------------------------------------------------###

### Working directory
setwd("H:/nmsa407_LinRegr/")

### Load the needed packages
library("mffSM")
library("car")        ### ncvTest, vif, scatterplotMatrix (may already be attached by mffSM)
library("colorspace") ### diverge_hcl, rainbow_hcl, heat_hcl (may already be attached by mffSM)

### 1. G E N E R A L I Z E D   L E A S T   S Q U A R E S

### Dataset Hlavy
### * Application of the Generalized Least Squares (GLS)
### * Piecewise polynomial model fitted using a linear model
###   with constraints on continuity/smoothness
###
### Data on head sizes of fetuses (from ultrasound monitoring) in different periods of pregnancy.
### Each data value (variable 'y') is the average over n measurements.
###
### i: serial number
### t: week of pregnancy
### n: number of measurements used to calculate the average head size
###    given in the 'y' variable
### y: the average head size obtained from the 'n' measurements
###    performed in week 't' of pregnancy
data(Hlavy, package = "mffSM")

### Exploratory analysis
head(Hlavy)
dim(Hlavy)
summary(Hlavy)

plot(y ~ t, data = Hlavy, pch = 23, col = "red3", bg = "orange")

### Basic scatterplot showing also the sample size used to calculate each value of y
with(Hlavy, quantile(n, prob = seq(0, 1, by = 0.2)))
Hlavy <- transform(Hlavy, fn = cut(n, breaks = c(0, 10, 30, 50, 60, 90),
                                   labels = c("<=10", "11-30", "31-50", "51-60", ">60")))
with(Hlavy, summary(fn))

PAL <- diverge_hcl(length(levels(Hlavy[, "fn"])))
names(PAL) <- levels(Hlavy[, "fn"])
plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)

plotData <- function(){
  PAL <- rev(heat_hcl(length(levels(Hlavy[, "fn"])), c. = c(80, 30), l = c(30, 90),
                      power = c(1/5, 1.3)))
  names(PAL) <- levels(Hlavy[, "fn"])
  plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
  legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)
  abline(v = 27, col = "lightblue", lty = 2)
}
plotData()

### MOTIVATION:
### The practitioner says that the y ~ t relationship could be described by a piecewise quadratic
### function with t = 27 being the point where the quadratic relationship changes its shape.
###
### From the practical point of view, the fitted curve should be CONTINUOUS at t = 27
### and perhaps also SMOOTH (i.e., with at least a continuous first derivative),
### as it is biologically not defensible to claim that at t = 27 there is a jump
### in the speed (rate) of the growth.
Hlavy <- transform(Hlavy, t27 = t - 27)

### Grid to calculate the fitted regression function
tgrid <- seq(16, 42, length = 100)
pdata <- data.frame(t27 = tgrid - 27)

### Piecewise quadratic model whose regression function is CONTINUOUS at t = 27.
### Generalized least squares (GLS) used to calculate the estimates
### (here GLS reduces to weighted least squares with weights n).
summary(mCont <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2),
                    weights = n, data = Hlavy))

### QUESTIONS:
### Can you identify both quadratic functions in the output?
### Suppose that m(t) denotes the regression function. Can you find a p-value of the test
###
###   H0: the first derivative of m(t) is continuous (we will say that m(t) is SMOOTH)
###
### against the alternative
###
###   H1: the first derivative of m(t) is not continuous (we will say that m(t) is CONTINUOUS)?
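### A possible answer sketch (added hint, not part of the original exercise): under H0 the
### coefficient of the term (t27 > 0):t27 is zero, so its p-value can be read off the
### t-test in summary(mCont); the relevant rows can be picked out by name.
cf <- coef(summary(mCont))
cf[grep("t27 > 0", rownames(cf), fixed = TRUE), ]
### Equivalently, as a submodel F-test (F = t^2, identical p-value) against the
### SMOOTH model that is fitted below as mSmooth:
anova(lm(y ~ t27 + I(t27^2) + (t27 > 0):I(t27^2), weights = n, data = Hlavy), mCont)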
### Some naive calculation
W <- diag(Hlavy$n); Y <- Hlavy$y; X <- model.matrix(mCont)

### Estimated regression coefficients
(betahat <- solve(t(X) %*% W %*% X) %*% (t(X) %*% W %*% Y)); coef(mCont)

### Generalized fitted values
(Yg <- X %*% betahat); fitted(mCont)

### Generalized residual sum of squares
(RSS <- t(Y - Yg) %*% W %*% (Y - Yg))

### Generalized mean square
RSS / (length(Y) - ncol(X))
(summary(mCont)$sigma)^2

### Be careful: the Multiple R-squared of 0.9968 reported in the output of summary(mCont)
### is NOT equal to
sum((Yg - mean(Yg))^2) / sum((Y - mean(Y))^2)

### Instead, it equals the following quantity, which is more difficult to interpret.
barYw <- sum(Y * Hlavy$n) / sum(Hlavy$n)   ### weighted mean of the original observations
### something like a "generalized" regression sum of squares
SSR <- sum((Yg - barYw)^2 * Hlavy$n)
### something like a "generalized" total sum of squares
SST <- sum((Y - barYw)^2 * Hlavy$n)
### "generalized" R squared
SSR / SST
summary(mCont)$r.squared

### The regression coefficients can also be estimated by the standard OLS method
### from the data where each row is repeated Hlavy$n times.
### But then one CANNOT USE the standard errors of the regression coefficients
### reported in the output of the lm function.
iii <- rep(1:nrow(Hlavy), Hlavy$n)
Hlavy2 <- Hlavy[iii, ]   # new dataset
summary(mCont2 <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), data = Hlavy2))
summary(mCont)

### Figure for the fit
fitCont <- predict(mCont, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)

### Just for comparison: the GLS fit versus the OLS (ordinary least squares)
### fit which does not take the weights into account
summary(mContOLS <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), data = Hlavy))
fitContOLS <- predict(mContOLS, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitContOLS, col = "blue", lwd = 2)
legend(33, 7, legend = c("GLS", "OLS"), col = c("red2", "blue"), lwd = 2, lty = 1)

### QUESTION:
### Can you explain why the red line does not follow so tightly
### the points corresponding to the low values of t?

### Piecewise quadratic model whose regression function is SMOOTH at t = 27.
### Generalized least squares (GLS) used to calculate the estimates.
summary(mSmooth <- lm(y ~ t27 + I(t27^2) + (t27 > 0):I(t27^2), weights = n, data = Hlavy))

fitSmooth <- predict(mSmooth, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
legend(33, 7, legend = c("Continuous", "Smooth"), col = c("red2", "blue"), lwd = 2, lty = 1)

### Strictly speaking, this model is not Hierarchically Well Formulated (HWF).
### Nevertheless, is it sensible in this context?

### Piecewise quadratic model whose regression function is not even continuous at t = 27.
### Generalized least squares (GLS) used to calculate the estimates.
summary(mJump <- lm(y ~ t27 + I(t27^2) + (t27 > 0) + (t27 > 0):t27 + (t27 > 0):I(t27^2),
                    weights = n, data = Hlavy))

### QUESTION:
### Can you find a p-value of the test
###
###   H0: the jump of m(t) at t = 27 is equal to 0
###
### against
###
###   H1: the jump of m(t) at t = 27 is not equal to 0?
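### A possible answer sketch (added hint): the jump at t = 27 equals the coefficient of the
### indicator (t27 > 0) in mJump, so its t-test in summary(mJump) answers the question.
### Equivalently, compare the nested models mCont and mJump by an F-test:
anova(mCont, mJump)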
fitJump <- predict(mJump, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
lines(tgrid, fitJump, col = "darkgreen", lwd = 2)
legend(33, 7, legend = c("Jump", "Continuous", "Smooth"),
       col = c("darkgreen", "red2", "blue"), lwd = 2, lty = 1)

### 2. B R E U S C H - P A G A N  |  B O X - C O X  |  V I F

### Dataset on housing values in suburbs of Boston.
###
### Data contain information on 506 municipalities in the metropolitan area of Boston,
### Massachusetts. Data were collected in 1978.
###
### 'zn'      proportion of residential land zoned for lots over 25,000 sq.ft.
### 'indus'   proportion of non-retail business acres per municipality.
### 'chas'    Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
### 'nox'     nitrogen oxides concentration (parts per 10 million).
### 'rm'      average number of rooms per dwelling.
### 'age'     proportion of owner-occupied units built prior to 1940.
### 'rad'     index of accessibility to radial highways.
### 'tax'     full-value property-tax rate per $10,000.
### 'ptratio' pupil-teacher ratio by town.
### 'black'   the proportion of blacks by town.
### 'lstat'   percentage of population with low income.
### 'medv'    median value of owner-occupied homes (in thousands of $).
### 'crime'   number of crimes per 1000 citizens, categorized as follows:
###             1 .... almost no crime (<= 0.1)
###             2 .... some crime (0.1-1)
###             3 .... high crime rate (> 1)
### 'ldis'    base-2 logarithm of the weighted mean of distances to five Boston
###           employment centres.
Data <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/Boston2.csv",
                 header = TRUE, sep = ",", na.strings = "")
head(Data)

### The main task is to create a model that evaluates the influence of the nitrogen oxides
### concentration [nox] on the median value of owner-occupied homes [medv] (or its
### transformation), while possibly taking into account also some other variables,
### specifically the following:
###
### * crime rate [crime]
### * base-2 logarithm of the distance from the Boston centre [ldis]
### * pupil-teacher ratio [ptratio]
### * proportion of non-retail business acres per municipality [indus]
### * proportion of owner-occupied units built prior to 1940 [age]
### * the proportion of blacks by town [black]
Data <- transform(Data, fcrime = factor(crime, levels = 1:3, labels = c("None", "Some", "High")))
Data <- subset(Data, select = c("medv", "nox", "fcrime", "ldis", "ptratio", "indus", "age", "black"))
head(Data)
summary(Data)

with(Data, summary(medv))
hist(Data[, "medv"], prob = TRUE, col = "olivedrab",
     xlab = "Median value of the owner-occupied houses [1000 USD]")
subset(Data, medv == 50)

### Is a value of 50 a real value or only the lower bound of the true value
### ("right-censoring")? Who knows...
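### A small added sketch: how many observations sit exactly at the cap of 50,
### i.e., how many values are possibly right-censored?
sum(Data$medv == 50)
mean(Data$medv == 50)   # proportion of the sample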
### Scatterplot matrix
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~ medv + nox + ldis + indus + age + ptratio + black,
                  reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram",
                  data = Data, pch = 16, cex = 0.5)

### Dependence of medv on nox while taking crime into account
BGC3 <- rainbow_hcl(3)
COL3 <- c("red3", "darkgreen", "blue3")
PCH3 <- c(21, 22, 23)
names(BGC3) <- names(COL3) <- names(PCH3) <- levels(Data[, "fcrime"])
plot(medv ~ nox, pch = PCH3[fcrime], col = COL3[fcrime], bg = BGC3[fcrime], data = Data,
     xlab = "Nitrogen oxides concentration (parts per 10 million)",
     ylab = "Median value of the owner-occupied houses [1000 USD]")
legend(0.75, 50, title = "Crime level", legend = levels(Data[, "fcrime"]),
       pch = PCH3, pt.bg = BGC3, col = COL3)

### As a starting model (denote it m1), we consider a linear model that includes all regressors
### (nox, crime, ldis, ptratio, indus, age, black), all two-way interactions between nox and
### the remaining covariates (that is, only interactions nox:z, where z are covariates other
### than nox) and additionally a quadratic term in nox (to account for a possible non-linear
### effect of nox on medv).
m1 <- lm(medv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(m1)
plotLM(m1)

### Shapiro-Wilk test of normality
shapiro.test(residuals(m1))

### Breusch-Pagan test - assuming normality of errors
library("lmtest")
ncvTest(m1); bptest(m1, ~fitted(m1), studentize = FALSE)
### ncvTest : var.formula: "a one-sided formula for the error variance; if omitted,
###           the error variance depends on the fitted values."
### bptest  : varformula: "by default the same explanatory variables are taken as
###           in the main regression model."

### 'Manual' calculation
U <- residuals(m1); U2 <- U^2
Yhat <- fitted(m1); n <- length(U)
### Test statistic
(BP <- sum(U^2 * (Yhat - mean(Yhat)))^2 / (2 * (mean(U^2))^2 * sum((Yhat - mean(Yhat))^2)))
### p-value
1 - pchisq(BP, df = 1)
bptest(m1, ~fitted(m1), studentize = FALSE)$p.value

### Koenker's studentized variant of the Breusch-Pagan test - normality of errors not assumed
bptest(m1, ~fitted(m1))
### Manual calculation
(BP.Koenker <- (sqrt(n) * cor(U^2, Yhat))^2)
1 - pchisq(BP.Koenker, df = 1)
bptest(m1, ~fitted(m1))$p.value

### Up to some minor (and asymptotically negligible) differences,
### this test can also be performed as the overall test of the following 'linear model'
anova(lm(I(U^2) ~ Yhat))

### One can also try to consider all the covariates
bptest(m1, studentize = FALSE)
### Manual calculation
Z <- model.matrix(m1)[, -1]
sigma2 <- mean(U^2)
Suz <- colMeans(U^2/sigma2 * (Z - matrix(colMeans(Z), nrow(Z), ncol(Z), byrow = TRUE)))
n <- nrow(Z)
### Test statistic
n * t(Suz) %*% solve(2 * var(Z) * (n-1)/n) %*% Suz

### Studentized version
bptest(m1)
### Manual calculation
kurt.estim <- mean((U^2 - sigma2)^2) / sigma2^2
n * t(Suz) %*% solve(kurt.estim * var(Z) * (n-1)/n) %*% Suz

### Again, up to some minor (and asymptotically negligible) differences,
### this test can also be performed as the overall test of the following 'linear model'
anova(lm(I(U^2) ~ model.matrix(m1)))

### Box-Cox transformation
library(MASS)
boxcox(m1)
boxcox(m1, lambda = seq(-0.1, 0.5, by = 0.05))

### Fit the same model as above, however, with the transformed response
### as suggested by the Box-Cox transformation (denote the transformed response lmedv).
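### An added sketch: extract the lambda that maximizes the Box-Cox profile log-likelihood;
### it lies close to 0, which corresponds to the log transformation used next.
bc <- boxcox(m1, lambda = seq(-0.1, 0.5, by = 0.01), plotit = FALSE)
bc$x[which.max(bc$y)]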
Data <- transform(Data, lmedv = log(medv))
lm1 <- lm(lmedv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(lm1)
plotLM(lm1)

### Shapiro-Wilk test of normality
shapiro.test(residuals(lm1))

### Koenker's studentized variant of the Breusch-Pagan test - normality of errors not assumed
bptest(lm1, ~fitted(lm1))

### The model still does not seem to be homoscedastic and it is not clear how to make
### it "more" homoscedastic. The inference should ideally take this into consideration.
###
### During the following lectures the so-called sandwich estimator of the variance matrix
### of the regression coefficients will be explained. This estimator is a consistent
### estimator of the variance matrix even under heteroscedasticity.
### In what follows we IGNORE the problem of HETEROSCEDASTICITY.

### QUESTIONS:
### Can we simplify the model?
drop1(lm1, test = "F")

### Leaving out the non-significant interactions, we arrive at the model
lm2 <- lm(lmedv ~ nox + I(nox^2) + fcrime + ldis + ptratio + indus + age + black +
                  nox:(ldis + ptratio + indus), data = Data)
anova(lm2, lm1)

### What about a further simplification of the model?
drop1(lm2, test = "F")

### Model that leaves out the quadratic term in nox and fcrime
lm3 <- lm(lmedv ~ nox + ldis + ptratio + indus + age + black +
                  nox:(ldis + ptratio + indus), data = Data)
anova(lm3, lm2)

### Thus a candidate for the final model would be lm3
summary(lm3)

### M U L T I C O L L I N E A R I T Y

### When we are interested in the effect of a particular regressor ('nox' in our situation),
### we should also explore the possible problem of multicollinearity.
###
### In particular, we are interested in how the other variables are related to 'nox'.
### The reason is that while we want to keep 'nox' in the model, we do not want variables
### that are highly correlated with 'nox' but contribute little, e.g., in terms of the
### R-squared value.

##### Correlation coefficients among the regressors
(R <- round(cor(Data[, c("nox", "ldis", "ptratio", "indus", "age", "black")]), 2))
### Mutually high correlations: nox, ldis, indus, age
COL <- "red3"
BG <- "orange"
PCH <- 23
par(mfrow = c(2, 3))
plot(nox ~ ldis, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "ldis"]))
plot(nox ~ indus, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "indus"]))
plot(nox ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "age"]))
plot(ldis ~ indus, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["ldis", "indus"]))
plot(ldis ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["ldis", "age"]))
plot(indus ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["indus", "age"]))
par(mfrow = c(1, 1))

### Squared coefficient of multiple correlation between nox and (ldis, indus, age).
### Where is it in the output?
summary(lm(nox ~ ldis + indus + age, data = Data))

### Coefficient of multiple correlation between nox and (ldis, indus, age)
### (up to a sign) and including the sign
mnox <- lm(nox ~ ldis + indus + age, data = Data)
sqrt(summary(mnox)[["r.squared"]])
cor(mnox[["model"]][, "nox"], fitted(mnox))
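### An added side note: this squared multiple correlation is exactly what drives the
### variance inflation factor used below, VIF = 1/(1 - R^2).
1 / (1 - summary(mnox)[["r.squared"]])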
### How much does the multiple correlation decrease if we remove ldis
### (visually the closest linear dependence with nox)?
mnox2 <- lm(nox ~ indus + age, data = Data)
cor(mnox2[["model"]][, "nox"], fitted(mnox2))

### Relationship of the quantitative regressors with crime
par(mfrow = c(2, 3))
plot(nox ~ fcrime, data = Data, col = BGC3)
plot(ldis ~ fcrime, data = Data, col = BGC3)
plot(ptratio ~ fcrime, data = Data, col = BGC3)
plot(indus ~ fcrime, data = Data, col = BGC3)
plot(age ~ fcrime, data = Data, col = BGC3)
plot(black ~ fcrime, data = Data, col = BGC3)
par(mfrow = c(1, 1))
### --> also mostly quite strong dependence
### --> again a potential cause of multicollinearity

### Initial model (to treat multicollinearity first)
m1 <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black + fcrime, data = Data)
summary(m1)
plotLM(m1)

### (Generalized) variance inflation factors (function from package car)
vif(m1)
### Not that bad. But still, the largest inflation is for the coefficient of nox
### (which is also the most interesting one for us).

### For a numeric covariate, GVIF coincides with VIF and it is calculated as 1/(1 - R_j^2),
### where R_j^2 is the squared multiple correlation coefficient between the j-th covariate
### and the remaining covariates. For instance:
Rj2 <- summary(lm(nox ~ ldis + indus + age + ptratio + black + fcrime, data = Data))[["r.squared"]]
1 / (1 - Rj2)

### For the categorical variable fcrime, GVIF is calculated as
R <- cor(model.matrix(m1)[, -1])   # correlation matrix of the regressors in the model
R11 <- R[7:8, 7:8]                 # correlation submatrix of the regressors generated by 'fcrime'
R22 <- R[1:6, 1:6]                 # correlation submatrix of the remaining regressors
R12 <- R[1:6, 7:8]
det(R11) * det(R22) / det(R)

### What happens if we remove each of ldis, indus, age, fcrime (one at a time)?
m2 <- list()
m2[["ldis"]] <- lm(lmedv ~ nox + indus + age + ptratio + black + fcrime, data = Data)
m2[["indus"]] <- lm(lmedv ~ nox + ldis + age + ptratio + black + fcrime, data = Data)
m2[["age"]] <- lm(lmedv ~ nox + ldis + indus + ptratio + black + fcrime, data = Data)
m2[["fcrime"]] <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black, data = Data)
summary(m1)[["r.squared"]]
sapply(lapply(m2, summary), "[[", "r.squared")
vif(m1)
lapply(m2, vif)

### Removal of fcrime practically does not change R^2
### and slightly (by about one unit) decreases the VIF of nox.

### What if we additionally remove each of ldis, indus, age,
### or two of them, or all of them?
m3 <- list()
m3[["ldis"]] <- lm(lmedv ~ nox + indus + age + ptratio + black, data = Data)
m3[["indus"]] <- lm(lmedv ~ nox + ldis + age + ptratio + black, data = Data)
m3[["age"]] <- lm(lmedv ~ nox + ldis + indus + ptratio + black, data = Data)
m3[["ldis-indus"]] <- lm(lmedv ~ nox + age + ptratio + black, data = Data)
m3[["ldis-age"]] <- lm(lmedv ~ nox + indus + ptratio + black, data = Data)
m3[["indus-age"]] <- lm(lmedv ~ nox + ldis + ptratio + black, data = Data)
m3[["ldis-indus-age"]] <- lm(lmedv ~ nox + ptratio + black, data = Data)
summary(m1)[["r.squared"]]
sapply(lapply(m3, summary), "[[", "r.squared")
vif(m1)
lapply(m3, vif)

### Suggestion:
### Drop indus as well.
### The reason is that after its removal, R^2 still does not drop too much
### (roughly by the same amount as after the removal of age), but the VIF
### of nox after the removal of indus is lower than after the removal of age.
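### An added helper: tabulate R^2 and the VIF of nox side by side for the candidate
### reduced models, to make the trade-off behind the suggestion explicit.
data.frame(R2     = sapply(lapply(m3, summary), "[[", "r.squared"),
           VIFnox = sapply(m3, function(m) vif(m)["nox"]))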
### Let us now consider a more complex model including only the variables
### nox, ldis, age, ptratio, black...
lm4 <- lm(lmedv ~ nox + I(nox^2) + nox*(ldis + age + ptratio + black), data = Data)
drop1(lm4, test = "F")

### Is it possible to remove all three interactions
### which are not significant at the 20% level?
lm5 <- lm(lmedv ~ nox + I(nox^2) + ldis + age + ptratio + black + nox:ldis, data = Data)
summary(lm5)
anova(lm5, lm4)
### The more complex model lm4 is far from being significantly better than model lm5.
drop1(lm5, test = "F")
### All terms that could be removed are now quite highly significant.

### Final model?
mFin <- lm5    # model with an initial analysis of multicollinearity
mFin0 <- lm3   # model without the initial analysis of multicollinearity
summary(mFin)

### How would you describe the dependence of medv on nox given this model?
plotLM(mFin)
### Heteroscedasticity is likely a problem.
### Inference on the betas (or their linear combinations) should probably be corrected.
### --> time for the SANDWICH ESTIMATOR of the variance (coming soon)

### Final models from both parts of the exercise class
summary(mFin)    # model with the initial analysis of multicollinearity
summary(mFin0)   # model without the initial analysis of multicollinearity

### Possible explanation of why nox^2 was (quite highly) non-significant
### in model mFin0 (the first part of the exercise class)

### Some visualization
summary(Data)
lgr <- 200
xindus <- c(5, 10, 18)   ## approximately the quartiles and the median of indus
xnox <- seq(0.35, 0.90, length = lgr)
xdata <- data.frame(nox = rep(xnox, 3), indus = rep(xindus, each = lgr),
                    ptratio = 19, ldis = 1.17, age = 75, black = 0.01)
head(xdata)
pFin <- predict(mFin, newdata = xdata)
pFin0 <- predict(mFin0, newdata = xdata)

Data <- transform(Data, findus = cut(indus, breaks = c(0.1, 6.2, 18.1, 28)))
summary(Data[, "findus"])

COL33 <- diverge_hcl(3); COL33[2] <- "grey60"
BGC33 <- c("lightblue", "grey80", "red")
PCH33 <- c(21, 22, 23)
names(BGC33) <- names(COL33) <- names(PCH33) <- levels(Data[, "findus"])

par(mfrow = c(1, 2))
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = Data,
     xlab = "Nitrogen oxides concentration (parts per 10 million)",
     ylab = "Log median value of the owner-occupied houses [log(1000 USD)]",
     main = "Final model with multicoll. considered")
lines(xnox, pFin[1:lgr], lwd = 2)
legend(0.4, 2, legend = names(COL33), title = "indus", col = COL33, pt.bg = BGC33, pch = PCH33)
#
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = Data,
     xlab = "Nitrogen oxides concentration (parts per 10 million)",
     ylab = "Log median value of the owner-occupied houses [log(1000 USD)]",
     main = "Final model without multicoll. considered")
lines(xnox, pFin0[1:lgr], col = COL33[1], lwd = 2)
lines(xnox, pFin0[(lgr+1):(2*lgr)], col = COL33[2], lwd = 2)
lines(xnox, pFin0[(2*lgr+1):(3*lgr)], col = COL33[3], lwd = 2)
par(mfrow = c(1, 1))

layout(matrix(c(1,1,0,4, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:3){
  plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus],
       data = subset(Data, findus == levels(findus)[i]),
       xlab = "Nitrogen oxides concentration (parts per 10 million)",
       ylab = "Log median value of the owner-occupied houses [log(1000 USD)]")
  lines(xnox, pFin[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2)
  lines(xnox, pFin0[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2, lty = 2)
}
plot.new()
legend(0.1, 0.9, legend = names(COL33), title = "indus", col = COL33, pt.bg = BGC33, pch = PCH33)
legend(0.1, 0.4, legend = c("Fin", "Fin0"), title = "Model", lty = c(1, 2), lwd = 2, col = "black")
par(mfrow = c(1, 1))
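### An added preview of the sandwich estimator mentioned above (a minimal sketch, using
### the 'sandwich' package together with lmtest::coeftest): heteroscedasticity-consistent
### standard errors correct the inference on the betas without refitting the model.
library("sandwich")
coeftest(mFin, vcov. = vcovHC(mFin, type = "HC0"))   # compare with summary(mFin)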