### --------------------------------------------------------------------------------------###
### Winter Term 2019/2020                                                                 ###
### NMSA407 - Linear Regression                                                           ###
###                                                                                       ###
### EXERCISE #10  Generalized Least Squares, Breusch-Pagan test, Box-Cox transformation,  ###
###               and VIF (Variance inflantion factor)                                    ###
###                                                                                       ###
### --------------------------------------------------------------------------------------###


### Working directory
setwd("H:/nmsa407_LinRegr/")


### Load the needed packages
library("mffSM")


###  1. G E N E R A L I Z E D   L E A S T    S Q U A R E S 
###  Dataset Hlavy
###     * Application of the Generalized Least Squares (GLS)
###     * Piecewise polynomial model fitted using a linear model
###       with constraints on continuity/smoothness
###
###  Data on head sizes of fetuses (from the ultrasound monitoring) in different periods of pregnancy. 
###  Each data value (variable ‘y’) is the average over n measurements.
###
###     i: serial number
###
###     t: week of pregnancy
###
###     n: number of measurements used to calculate the average head size
###           given in the ‘y’ variable
###
###     y: the average head size obtained from the ‘n’ measurements
###           performed in week ‘t’ of pregnancy

data(Hlavy, package = "mffSM")

### Exploratory analysis
head(Hlavy)
dim(Hlavy)
summary(Hlavy)


plot(y ~ t, data = Hlavy, pch = 23, col = "red3", bg = "orange")


### Basic scatterplot showing also sample size used to calculate each value of y
with(Hlavy, quantile(n, prob = seq(0, 1, by = 0.2)))
Hlavy <- transform(Hlavy, fn = cut(n, breaks = c(0, 10, 30, 50, 60, 90), labels = c("<=10", "11-30", "31-50", "51-60", ">60")))
with(Hlavy, summary(fn))

PAL <- diverge_hcl(length(levels(Hlavy[, "fn"])))
names(PAL) <- levels(Hlavy[, "fn"])
plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)

plotData <- function(){
  PAL <- rev(heat_hcl(length(levels(Hlavy[, "fn"])), c.=c(80, 30), l=c(30, 90), power=c(1/5, 1.3)))
  names(PAL) <- levels(Hlavy[, "fn"])
  plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
  legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)
  abline(v = 27, col = "lightblue", lty = 2)
}    

plotData()


### MOTIVATION: 
### Practitioner says that y ~ t relationship could be described by a piecewise quadratic
### function with t = 27 being a point where the quadratic relationship changes its shape.
###
### From the practical point of view, the fitted curve should be CONTINUOUS at t = 27
### and perhaps also SMOOTH (i.e., with continuous at least the first derivative)
### as it is biologically not defensible to claim that at t = 27 the growth has a jump
### in the speed (rate) of the growth.


Hlavy <- transform(Hlavy, t27 = t - 27)


### Grid to calculate the fitted regression function
tgrid <- seq(16, 42, length = 100)
pdata <- data.frame(t27 = tgrid - 27)


### Piecewise quadratic model which has a regression function
### CONTINUOUS at t = 27

### Generalized least squares (GLS) used to calculate the estimates.
summary(mCont <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), weights = n, data = Hlavy))

### QUESTIONS: 
### Can you identify both quadratic functions in the output?
### Suppose that m(t) denotes the regression function. Can you find a p-value of the test
###
### H0: the first derivative of m(t) is continous  (we will say that m(t) is SMOOTH) 
###
### against the alternative
###
### H1: the first derivative of m(t) is not continous  (we will say that m(t) is CONTINUOUS) 


### Some naive calculation 
W <- diag(Hlavy$n);
Y <- Hlavy$y;
X <- model.matrix(mCont)

### Estimated regression coefficients 
(betahat <- solve(t(X)%*%W%*%X)%*%(t(X)%*%W%*%Y));
coef(mCont)

### Generalized fitted values 
(Yg <- X%*%betahat);
fitted(mCont)

### Generalized residual sum of squares 
(RSS <- t(Y - Yg)%*%W%*%(Y - Yg));

### Generalized mean square 
RSS/(length(Y)-ncol(X))
(summary(mCont)$sigma)^2


### Be careful that Multiple R-squared:  0.9968 reported in the output "summary(mCont)" is not equal to 
sum((Yg-mean(Yg))^2)/sum((Y-mean(Y))^2)

### Instead, it equals the following quantity that is more difficult to interpret 
barYw <- sum(Y*Hlavy$n)/sum(Hlavy$n);   ### weigthed mean of the original observations 

### something as "generalized" regression sum of squares 
SSR <- sum((Yg - barYw)^2 * Hlavy$n)

### something as "generalized" total variance 
SST <- sum((Y - barYw)^2 * Hlavy$n); 

### "generalized" R squared
SSR/SST
summary(mCont)$r.squared


### The regression coefficients can be also estimated by the standard 
### OLS method from the data, where each row is repeated Hlavy$n - times. 
### But then one CANNOT USE the standard errors of the regression coefficients 
### in the output of the lm -function. 


iii <- rep(1:nrow(Hlavy), Hlavy$n)
Hlavy2 <- Hlavy[iii,]; # new dataset
summary(mCont2 <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), data = Hlavy2))
summary(mCont)


### Figure for the fit
fitCont <- predict(mCont, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)


### Just for comparison, GLS fit versus OLS (ordinary least squares)
### fit which does not take the weights into account
summary(mContOLS <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), data = Hlavy))

fitContOLS <- predict(mContOLS, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitContOLS, col = "blue", lwd = 2)
legend(33, 7, legend = c("GLS", "OLS"), col = c("red2", "blue"), lwd = 2, lty = 1)


### QUESTIONS: 
### Do you have explanation why the red line does not follow so tightly
### the points corresponding to the low values of t?


### Piecewise quadratic model which has a regression function SMOOTH at the point t = 27
### Generalized least squares (GLS) used to calculate the estimates.
summary(mSmooth <- lm(y ~ t27 + I(t27^2) + (t27 > 0):I(t27^2), weights = n, data = Hlavy))

fitSmooth <- predict(mSmooth, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
legend(33, 7, legend = c("Continuous", "Smooth"), col = c("red2", "blue"), lwd = 2, lty = 1)


### Strictly speaking, this model is not Hierarchically Well Formulated (HWF).
### Nevertheless, is it sensible in this context?

### Piecewise quadratic model which has a regression function not even continuous at t = 27
### Generalized least squares (GLS) used to calculate the estimates.
summary(mJump <- lm(y ~ t27 + I(t27^2) + (t27 > 0) + (t27 > 0):t27 + (t27 > 0):I(t27^2), weights = n, data = Hlavy))

### QUESTIONS: 
### Can you find a p-value of the test
###
### H0: Jump of m(t) at t = 27 is equal to 0
### 
### against
###
### H1: Jump of m(t) at t = 27 is not equal to 0?
   
fitJump <- predict(mJump, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
lines(tgrid, fitJump, col = "darkgreen", lwd = 2)
legend(33, 7, legend = c("Jump", "Continuous", "Smooth"), col = c("darkgreen", "red2", "blue"), lwd = 2, lty = 1)


###  2. B R E U S C H - P A G A N  |  B O X - C O X  |  V I F
### Dataset on housing values in suburbs of Boston.
###
### Data contain information on 506 municipalities in the metropolitan area of Boston 
### and Massachusetts. Data were collected in 1978.
### 
###  'zn' proportion of residential land zones for lots over 25,000 sq.ft.
### 
###  'indus' proportion of non-retail business acres per municipality.
### 
###  'chas' Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
### 
###  'nox' nitrogen oxides concentration (parts per 10 million).
### 
###  'rm' average number of rooms per dwelling.
### 
###  'age' proportion of owner-occupied units built prior to 1940.
### 
###  'rad' index of accessibility to radial highways.
### 
###  'tax' full-value property-tax rate per $10,000.
### 
###  'ptratio' pupil-teacher ratio by town.
### 
###  'black' the proportion of blacks by town.
### 
###  'lstat' percentage of population with low income 
###
###  'medv'  median value of owner-occupied homes (in thousands of $).
###
###  'crime' categorized number of crimes per 1000 thousand citizens categorized as follows:
###        1 .... almost no crime (<= 0.1)
###        2 .... some crime (0.1-1)
###        3 .... high crime rate (> 1)
### 
###  'ldis' logarithm (to the power of 2) of the weighted mean of distances to five Boston employment centres.

Data <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/Boston2.csv", header = TRUE, sep = ",", na.strings = "")
head(Data)

###  The main task is to create a model that evaluates influence of the nitrogen oxides concentration [nox]
###  on the median value of owner-occupied home [medv] (or its transformation)
###  while possibly taking into account also some other variables, specifically the following: 
###
###   * crime rate [crime]
###   * logarithm (to the power of 2) of the distance from the Boston centre [ldis]
###   * pupil-teacher ratio [ptratio]
###   * proportion of non-retail business acres per municipality [indus]
###   * proportion of owner-occupied units built prior to 1940 [age]
###   * the proportion of blacks by town [black]

Data <- transform(Data, fcrime = factor(crime, levels = 1:3, labels = c("None", "Some", "High")))
Data <- subset(Data, select = c("medv", "nox", "fcrime", "ldis", "ptratio", "indus", "age", "black"))
head(Data)
summary(Data)


with(Data, summary(medv))
hist(Data[, "medv"], prob = TRUE, col = "olivedrab", xlab = "Median value of the owners-occupied houses [1000 USD]")

subset(Data, medv == 50)
### Is a value of 50 a real value or only the lower bound of the true value ("right-censoring")? Who knows...

### Scatterplot matrix
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~medv + nox + ldis + indus + age + ptratio + black, reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = Data, pch = 16, cex=0.5)


### Dependence of medv on nox while taking crime into account
BGC3 <- rainbow_hcl(3)
COL3 <- c("red3", "darkgreen", "blue3")
PCH3 <- c(21, 22, 23)
names(BGC3) <- names(COL3) <- names(PCH3) <- levels(Data[, "fcrime"])
plot(medv ~ nox, pch = PCH3[fcrime], col = COL3[fcrime], bg = BGC3[fcrime], data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Median value of the owners-occupied houses [1000 USD]")
legend(0.75, 50, title = "Crime level", legend = levels(Data[, "fcrime"]), pch = PCH3, pt.bg = BGC3, col = COL3)


###  As a starting model (denote it m1), we consider a linear model where we include all regressors
###  (nox, crime, ldis, ptratio, indus, age, black), all two-way interactions
###  between nox and the remaining covariates (that is, only interactions nox:z,
###  where z are covariates other than nox) and additionally
###  a quadratic term of nox (to account for possible non-linear effect of nox on medv).
m1 <- lm(medv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(m1)
plotLM(m1)

### Shapiro-Wilk test of normality 
shapiro.test(residuals(m1))

### Breusch-Pagan test - assuming normality of errors 
library("lmtest")

ncvTest(m1); 

bptest(m1, ~fitted(m1), studentize = FALSE);

### ncvTest : var.formula: "a one-sided formula for the error variance; if omitted, the error variance depends on the fitted values."
### bptest : var.formula:  "by default the same explanatory variables are taken as in the main regression model."


### 'Manual calculation' 
U <- residuals(m1);
U2 <- U^2
Yhat <- fitted(m1);
n <- length(U)

### Test statistic 
(BP <- sum(U^2 * (Yhat - mean(Yhat)))^2 / (2 * (mean(U^2))^2 * sum((Yhat-mean(Yhat))^2)))

### p-value 
1-pchisq(BP, df=1)
bptest(m1, ~fitted(m1), studentize = FALSE)$p.value

### Koenker's studentized variant of Breusch-Pagan test - normality of errors not assumed 
bptest(m1, ~fitted(m1))

### Manual calculation 
(BP.Koenker <- (sqrt(n)*cor(U^2, Yhat))^2)
1-pchisq(BP.Koenker, df=1)
bptest(m1, ~fitted(m1))$p.value

### Up to some minor (and asymptotically negligible) differences 
### this test can be also performed by considering the overall test 
### of the following 'linear model'
anova(lm(I(U^2)~Yhat))


### One can also try to consider all the covariates 
bptest(m1, studentize=FALSE);

### Manual calculation 
Z <- model.matrix(m1)[,-1];
sigma2 <- mean(U^2); 
Suz <- colMeans(U^2/sigma2*(Z - matrix(colMeans(Z), nrow(Z), ncol(Z), byrow=T)));
n <- nrow(Z);

### Test statistic 
n*t(Suz)%*%solve(2*var(Z)*(n-1)/n)%*%Suz

### Studentized version 
bptest(m1);

### Manual calculation 
kurt.estim <- mean((U^2-sigma2)^2)/sigma2^2
n*t(Suz)%*%solve(kurt.estim*var(Z)*(n-1)/n)%*%Suz

### Again, up to some minor (and asymptotically negligible) differences 
### this test can be also performed by considering the overall test 
### of the following 'linear model
anova(lm(I(U^2)~model.matrix(m1)))


### Box-Cox transformation 
library(MASS)
boxcox(m1)

boxcox(m1, lambda = seq(-0.1, 0.5, by = 0.05))


### Fit the same model as above, however, with the transformed response
### as suggested by the Box-Cox transformation (denote the transformed response as lmedv).
Data <- transform(Data, lmedv = log(medv)); 
lm1 <- lm(lmedv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(lm1)
plotLM(lm1)

### Shapiro-Wilk test of normality 
shapiro.test(residuals(lm1))

### Koenker's studentized variance of Breusch-Pagan test - normality of errors not assumed 
bptest(lm1, ~fitted(lm1));


### The model still does not seem to be homoscedastic and it is not clear how to make 
### it "more" homoscedastic. The inference should ideally take this into consideration. 
###
### During the following lectures  the so called sandwich estimator of the variance matrix 
### of the regression coefficient will be explained. This estimator is consistent 
### estimator of the variance matrix even under heteroscedasticity.  
### In what follows we IGNORE the problem of HETEROSCEDASTICITY. 


### QUESTIONS:
### Can we simplify the model?
drop1(lm1, test = "F")

### Leaving out not significant interactions we arrive at model 
lm2 <- lm(lmedv ~ nox + I(nox^2) + fcrime + ldis + ptratio + indus + age + black + nox:(ldis + ptratio + indus), data = Data)
anova(lm2, lm1); 

### What about further simplification of a model?
drop1(lm2, test = "F")

### Model that leaves out the quadratic term of nox and fcrime 
lm3 <- lm(lmedv ~ nox + ldis + ptratio + indus + age + black + nox:(ldis + ptratio + indus), data = Data)
anova(lm3, lm2)

### Thus a candidate on the final model would be lm3 
summary(lm3); 


### M U L T I C O L I N E A R I T Y 
### When we are interested in the effect of particular regressors (i.e. 'nox' in our situation)
### then we should also explore the possible problem of multicollinearity. 
###
### In particular we are interested how are the other variables related with 'nox'.
### The reason is that while we want to have 'nox' we do not want to have in 
### the model variables that are highly correlated with 'nox' but do not contribute 
### in terms of the R-squared value, for instance. 


##### Correlation coefficients among regressors
(R <- round(cor(Data[, c("nox", "ldis", "ptratio", "indus", "age", "black")]), 2))

### Mutually high correlations: nox, ldis, indus, age
COL <- "red3"
BG <- "orange"
PCH <- 23
par(mfrow = c(2, 3))
plot(nox ~ ldis, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "ldis"]))
plot(nox ~ indus, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "indus"]))
plot(nox ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "age"]))
plot(ldis ~ indus, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["ldis", "indus"]))
plot(ldis ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["ldis", "age"]))
plot(indus ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["indus", "age"]))
par(mfrow = c(1, 1))


### Squared coefficient of multiple correlation between nox and (ldis, indus, age)
### Where is it?
summary(lm(nox ~ ldis + indus + age, data = Data))

### Coefficient of multiple correlation between nox and (ldis, indus, age)
### (up to a sign) and including the sign
mnox <- lm(nox ~ ldis + indus + age, data = Data)
sqrt(summary(mnox)[["r.squared"]])
cor(mnox[["model"]][, "nox"], fitted(mnox))


### How much does the multiple correlation decrease if we remove ldis?
### (visually the closest linear dependence with nox)?
mnox2 <- lm(nox ~ indus + age, data = Data)
cor(mnox2[["model"]][, "nox"], fitted(mnox2))


### Relationship of quantitative regressors with crime
par(mfrow = c(2, 3))
plot(nox ~ fcrime, data = Data, col = BGC3)
plot(ldis ~ fcrime, data = Data, col = BGC3)
plot(ptratio ~ fcrime, data = Data, col = BGC3)
plot(indus ~ fcrime, data = Data, col = BGC3)
plot(age ~ fcrime, data = Data, col = BGC3)
plot(black ~ fcrime, data = Data, col = BGC3)
par(mfrow = c(1, 1))
### --> also mostly quite strong dependence
###  --> again potential cause of multicollinearity


### Initial model (to treat multicollinearity first)
m1 <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black + fcrime, data = Data)
summary(m1)
plotLM(m1)

### (Generalized) variance inflation factors (function from package car)
vif(m1)

### Not that bad. But still, the worst inflation for the coefficient on nox. 
### (also the most interesting one for us)

### For a numeric covariate GVIF conincides with VIF and it is calculated as 1/(1-R_j^2), where 
### R_j^2 is the multiple correlation coefficent between the j-th covariate 
### and the remaining covariates. For instance: 

Rj2 <- summary(lm(nox~ldis + indus + age + ptratio + black + fcrime, data = Data))[["r.squared"]];
1/(1-Rj2)

### For a categorical variable fcrime, GIV is calculated as 
R <- cor(model.matrix(m1)[,-1]);  # correlation matrix of regressors in the model 
R11 <- R[7:8,7:8]                 # correlation submatrix corresponding to regressors generated by 'fcrime'
R22 <- R[1:6,1:6]                 # correlation submatrix corresponding to the remaining regressors 
R12 <- R[1:6,7:8] 
det(R11)*det(R22)/det(R)


### How about if we remove each of ldis, indus, age, fcrime
### (always only one of them)
m2 <- list()
m2[["ldis"]]   <- lm(lmedv ~ nox + indus + age + ptratio + black + fcrime, data = Data)
m2[["indus"]]  <- lm(lmedv ~ nox + ldis + age + ptratio + black + fcrime, data = Data)
m2[["age"]]    <- lm(lmedv ~ nox + ldis + indus + ptratio + black + fcrime, data = Data)
m2[["fcrime"]] <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black, data = Data)

summary(m1)[["r.squared"]]
sapply(lapply(m2, summary), "[[", "r.squared")

vif(m1)
lapply(m2, vif)

### Removal of fcrime practically does not change R^2
### and slightly (by about one unit) decreases VIF for nox.


### How about if we additionally remove each of ldis, indus, age
### or two of them or all of them?

m3 <- list()
m3[["ldis"]]            <- lm(lmedv ~ nox + indus + age + ptratio + black, data = Data)
m3[["indus"]]           <- lm(lmedv ~ nox + ldis + age + ptratio + black, data = Data)
m3[["age"]]             <- lm(lmedv ~ nox + ldis + indus + ptratio + black, data = Data)
m3[["ldis-indus"]]      <- lm(lmedv ~ nox + age + ptratio + black, data = Data)
m3[["ldis-age"]]        <- lm(lmedv ~ nox + indus + ptratio + black, data = Data)
m3[["indus-age"]]       <- lm(lmedv ~ nox + ldis + ptratio + black, data = Data)
m3[["ldis-indus-age"]]  <- lm(lmedv ~ nox + ptratio + black, data = Data)

summary(m1)[["r.squared"]]
sapply(lapply(m3, summary), "[[", "r.squared")

vif(m1)
lapply(m3, vif)

### Suggestion: 
### Do not consider also indus.
### The reason is that after its removal, R^2 still does not drop too much 
### (more or less in the same magnitude as after removal of age) but the VIF 
### of nox after removal of indus is lower than the VIF of nox after removal of age.


### Let us consider now a more complex model including only the variables 
### nox, ldis, age, ptratio, black...
lm4 <- lm(lmedv ~ nox + I(nox^2) + nox*(ldis + age + ptratio + black), data = Data)
drop1(lm4, test = "F")


### Is it possible to remove all three interactions
### which are not significant at 20%

lm5 <- lm(lmedv ~ nox + I(nox^2) + ldis + age + ptratio + black + nox:ldis, data = Data)
summary(lm5)

anova(lm5, lm4)      
### More complex model lm4 is far from being
### significantly better than model lm5.

drop1(lm5, test = "F")
### All terms that could be removed
### are now quite highly significant.

### Final model?
mFin <- lm5;  # Model with an initial analysis of multicollinearity
mFin0 <- lm3; # Model without the initial analysis of multicollinearity

summary(mFin)
### How would you describe dependence
### of medv on nox given this model?

plotLM(mFin)
### Heteroscedasticity is likely a problem.
### Inference on beta's (or their linear combinations)
### should probably be corrected.
### --> time for SANDWICH ESTIMATOR of variance (coming soon)

### Final model from the first part of the exercise class
summary(mFin);  # Model with initial analysis of multicollinearity
summary(mFin0); # Model without initial analysis of multicollinearity


### Possible explanation of why nox^2 was (quite highly) non-significant
### in model mFin0 (the first part of the exercise class)


### Some visualization
summary(Data)

lgr <- 200
xindus <- c(5, 10, 18)   ## approx quartiles and median of indus
xnox <- seq(0.35, 0.90, length = lgr)
xdata <- data.frame(nox = rep(xnox, 3), indus = rep(xindus, each = lgr), ptratio = 19, ldis = 1.17, age = 75, black = 0.01)

head(xdata)

pFin <- predict(mFin, newdata = xdata)
pFin0  <- predict(mFin0,  newdata = xdata)

Data <- transform(Data, findus = cut(indus, breaks = c(0.1, 6.2, 18.1, 28)))
summary(Data[, "findus"])

COL33 <- diverge_hcl(3); COL33[2] <- "grey60"
BGC33 <- c("lightblue", "grey80", "red")
PCH33 <- c(21, 22, 23)
names(BGC33) <- names(COL33) <- names(PCH33) <- levels(Data[, "findus"])

par(mfrow = c(1, 2))
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Log median value of the owners-occupied houses [log(1000 USD)]", main="Final model with multicoll. considered")
lines(xnox, pFin[1:lgr], lwd = 2)
legend(0.4, 2, legend = names(COL33), title = "indus", col = COL33, pt.bg = BGC33, pch = PCH33)
#
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Log median value of the owners-occupied houses [log(1000 USD)]", main="Final model without multicoll. considered")
lines(xnox, pFin0[1:lgr], col = COL33[1], lwd = 2)
lines(xnox, pFin0[(lgr+1):(2*lgr)], col = COL33[2], lwd = 2)
lines(xnox, pFin0[(2*lgr+1):(3*lgr)], col = COL33[3], lwd = 2)
par(mfrow = c(1, 1))


layout(matrix(c(1,1,0,4, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:3){
  plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = subset(Data, findus == levels(findus)[i]), xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Log median value of the owners-occupied houses [log(1000 USD)]")
  lines(xnox, pFin[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2)  
  lines(xnox, pFin0[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2, lty = 2)
}
plot.new()
legend(0.1, 0.9, legend = names(COL33), title = "indus", col = COL33, pt.bg = BGC33, pch = PCH33)
legend(0.1, 0.4, legend = c("Fin", "Fin0"), title = "Model", lty = c(1, 2), lwd = 2, col = "black")
par(mfrow = c(1, 1))