rm(list = ls())
setwd("/home/komarek/teach/mff_2021/nmst432_AdvRegr/Problem_3")
library("Epi")
library("splines")
(load("AdvRegr_3_iev.RData"))
head(iev)
summary(iev)
##### ============================================================
##### Some new variables
##### that might be useful in either the model building phase
##### or even in the final model.
##### ============================================================
### New age groups
### - some of the following variables might be used further on
with(iev, table(agegr))
#
iev <- transform(iev, agegr4 = Relevel(agegr, list("25-34" = 1, "35-44" = 2, "45-54" = 3, ">=55" = 4:6)))
iev <- transform(iev, agegr5 = Relevel(agegr, list("25-34" = 1, "35-44" = 2, "45-54" = 3, "55-64" = 4, ">=65" = 5:6)))
with(iev, table(agegr4))
with(iev, table(agegr5))
### New tob groups
### - some of the following variables might be used further on
with(iev, table(tobgr))
#
iev <- transform(iev, tobgr4 = Relevel(tobgr, list("None" = 1, "1-4 g/day" = 2, "5-29 g/day" = 3:6, ">=30 g/day" = 7:9)))
with(iev, table(tobgr4))
### Factor from alctot
### - will be useful to choose the right parameterization of alco
### - effect of alco on log-odds on cancer is not necessarily linear
### - see also "Hints on logistic regression practice" in extended course notes
iev <- transform(iev, alctotf = cut(alctot, breaks = c(-Inf, 0, 25, 50, 75, 100, 150, Inf)))
levels(iev[, "alctotf"]) <- c("0", "(0,25]", "(25,50]", "(50,75]", "(75,100]", "(100,150]", ">150")
with(iev, table(alctotf))
### If effect not linear, the first parameterization to try is often logarithm.
### - BUT, is logarithm really so different from a linear function within our range of age values?
x <- seq(25, 91, length = 100)
par(mfrow = c(1, 2), mar = c(4, 4, 1, 1))
plot(x, log(x), type = "l", lwd = 2, col = "red3", xlab = "x", ylab = "log(x)")
plot(x, log(x - 24), type = "l", lwd = 2, col = "red3", xlab = "x", ylab = "log(x - 24)")
par(mfrow = c(1, 1))
### Log of age at diag.
#iev <- transform(iev, lagediag = log(agediag), ## Not really useful...
# lagediag24 = log(agediag - 24))
### ALSO, it is useful to maintain reasonable interpretation of the logit model
### if such logarithm used.
### log_2 will have nicer interpretation, use that
iev <- transform(iev, lagediag = log(agediag, 2), ## Not really useful...
lagediag24 = log(agediag - 24, 2))
### QUESTION: If lagediag24 used in the model as one of predictors (additively to others),
### how would you interprete beta, and how exp(beta)?
##### ===================================================================
##### For yourself, use "standard" methods to get basic insight
##### about (a) marginal relationship between x's (potential covariates)
##### and y, (b) mutual associations between x's (to avoid really obvious
##### collinearity problems if any present)
#####
##### Keep in mind that alctot should not be used in one model jointly
##### with per-partes alcohol variables (beer, ...). At the same time,
##### when trying to answer the question whether different types of alcohol
##### have different effects on odds on cancer, we must somehow include
##### per-partes alco variables in one model. That is, do not decide now
##### that e.g., beer will no more be considered because it is somehow
##### associated with wine.
##### ===================================================================
### Use plots like this
COL <- c("darkgreen", "red2")
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
plot(factor(case) ~ agegr, data = iev, col = COL)
plot(factor(case) ~ tobgr, data = iev, col = COL)
plot(factor(case) ~ alctotf, data = iev, col = COL)
par(mfrow = c(1, 1))
## For yourself, everything can be supplemented by reasonable
## empirical estimates of important quantities.
### Think a bit, where the following code is good for.
(pp <- prop.table(with(iev, table(case, agegr)), margin = 2))
(lo <- log(pp[2,] / pp[1,]))
(ma <- c(30, 40, 50, 60, 70, 80))
plot(ma, lo, type = "l", lty = 2, col = "red3", xlab = "Age (midpoint)", ylab = "Log(odds)")
points(ma, lo, pch = 23, col = "red3", bg = "skyblue", cex = 2)
### IN YOUR REPORT: briefly discuss, which parameterizations of continuous versions
### of age, total alcohol and tobacco would you recommend/consider (just by evaluation
### of plots like above) if your task was to evaluate a marginal effect of the considered
### covariates on odds of cancer. In other words, which functions (you preliminary think)
### may describe effect of age, alcohol, tobbaco on evolution of log(odds(cancer))
### in a population of Ille-et-Vilaine?
##### ==================================================================================
##### First, ignore all "per-partes" alcohol variables (beer, cider, wine, aper, digest)
##### and try to find a reasonable model which describes effect of alcohol consumption
##### and tobbaco usage on odds on cancer. Keep in mind that this effect may change
##### with age which itself may have some influence on odds on cancer.
##### That is, we need some (logistic) model that includes (a) age, (b) alcohol,
##### (c) tobbaco (all of them parameterized in a way reflecting their effect on the odds
##### on cancer). Perhaps, interactions of the type alco:age and tobbaco:age might be needed
##### if it appears that effect of alco and/or tobbaco on odds on cancer changes
##### with age.
##### ==================================================================================
### Try to propose reasonable parameterization for each of the three
### considered covariates (age, alcohol, tobbaco). Nevertheless, we
### are interested in a partial effect of each of them, so decisions
### should be made on models that include all of them. Clearly,
### some iterations are needed, roughly as follows.
###
### (i) include all covariates in one model, a safe way is to use factor
### version of all of them (with reasonable number of categories) to avoid
### model misspecification. Concentrate on finding reasonable parameterization
### for one covariate. Your decisions can be driven not only by residual plots
### (which are mostly not really informative in case of binary response) but
### also by other useful plots and and comparison of deviances, perhaps by
### deviance test.
### (ii) repeat step (i) for each considered covariate
### (iii) include found parameterizations of the three covariates in one model
### and verify this "overall" model (plots, deviance tests against model
### with categorical versions of covariates)
### (iv) try to include also interactions (age:alco and age:tob)
### and verify their importance. Here, have in mind that for interactions,
### found parameterization of age may not be suitable for inclusion in interactions
### (effect modification of, let say alcohol, may work in a different way
### than age itself). Verify always your choice also against an interaction
### in which age is included as a factor (categorical), e.g., by using
### a deviance test.
###
### Sometimes, it is necessary to return back to some of previous steps that were
### perhaps considered as finalized...
###
### Have in mind that in real applications (like this), effect of any of considered factors
### is only rarely linear (even on log(odds) scale). Perhaps even worse, it even does not
### have to be monotone!!!
### Have also in mind that your client should have a chance to understand your model.
### Parameterizations to consider are:
### * factor (categorical): sometimes it is not possible to describe effect of
### a covariate in a simple and understandable way and it is then better to keep
### that effect categorical (medical doctors or politicians even like it as it is
### than possible to say, e.g., odds on cancer for people of age 35-45 is 2x as high
### as for people of age 25-35 - if exp(beta) = odds ratio = 2)
### * identity: exp(beta) than gives odds ratio when comparing two odds (on cancer)
### in two groups in which the respective covariate values differ by one unit
### (and remaining covariates do not change)
### * some simple function: but here, with only some functions (log), you can easily
### explain to client what the estimated exp(beta) = some odds ratio says
### * piecewise simple (e.g., linear) function:
### * spline: if this is used, report neither estimated beta's nor exp(beta's),
### it is much more useful to plot estimated spline (beta_1*B_1(x) + ... + beta_k*B_k(x))
### and discuss at least some features of the estimated trend, e.g., odds on cancer
### is rapidly increasing till x = 30, then this increase slows down and from x = 45,
### odds on cancer even starts to decrease with x
###
### Compare different models mainly by deviance test. Even if the two models are not
### strictly speaking forming a pair supermodel - submodel, you can use formally calculated
### p-value (coming from the chi-sq. distribution) to evaluate whether the two deviances
### differ "considerably" (yes if p-value "small", no if p-value "high"). The point is
### that comparison of p-value is performed on a standardized (0, 1) scale, whereas
### the magnitude of a deviance depends on data under consideration and difference of 10
### between two deviances can be both "high" and "small".
### --> use anova(fit1, fit2, test = "Chisq") or anova(fit1, fit2, test = "LRT")
### to get such p-values easily
### As with any model building, do not be sticking on 5% significance level.
### You will be performing a sequence of tests that, moreover, were not pre-specified
### in advance. So the significance level of each separate test is quite a vague concept.
### Simply use the p-value as a standardized measure of evidence
### of whether one model is better than the other.
##### Start with multivariate model with all variables additively
##### and use plots of estimated coefficients to propose
##### reasonable parameterizations of covariates.
#####
##### WARNING: Plots should provide you with reasonable proposals (plural!!!).
##### Unless you are really a wizard/witch (or politician), you should never do your
##### final decision simply by looking at one plot....
#####
##### ============================================================
fit0 <- glm(case ~ agegr + tobgr + alctotf, family = binomial, data = iev)
summary(fit0) ### only rough information, not really important now
drop1(fit0, test = "Chisq") ### all highly significant, also just for information now
be0 <- coef(fit0)
#### Think, why also 0 is plotted for the first age category
(bage <- coef(fit0)[grep("agegr", names(be0))])
plot(0:length(bage), c(0, bage), xaxt = "n", pch = 23, col = "darkblue", bg = "skyblue", ylim = c(0, max(bage)), xlab = "Age", ylab = expression(beta))
axis(1, at = 0:length(bage), labels = levels(iev[, "agegr"]))
with(iev, table(agegr))
### Reasonable parameterizations of age?
### Useful to merge some age groups?
### Be careful when interpreting trend that "goes through"
### estimated beta's related to "small" groups (high standard error).
### Not really linear effect of age...
(btob <- coef(fit0)[grep("tobgr", names(be0))])
plot(0:length(btob), c(0, btob), xaxt = "n", pch = 23, col = "darkblue", bg = "skyblue", ylim = c(0, max(btob)), xlab = "Tobacco", ylab = expression(beta))
axis(1, at = 0:length(btob), labels = levels(iev[, "tobgr"]))
with(iev, table(tobgr))
### Reasonable parameterizations of tobbaco?
### Useful to merge some tobbaco groups?
### Be careful when interpreting trend that "goes through"
### estimated beta's related to "small" groups (high standard error).
### Quite some difference between no smoking and some smoking
### of smoke then perhaps linear effect?
(balc <- coef(fit0)[grep("alctotf", names(be0))])
plot(0:length(balc), c(0, balc), xaxt = "n", pch = 23, col = "darkblue", bg = "skyblue", xlab = "Alco", ylab = expression(beta))
axis(1, at = 0:length(balc), labels = levels(iev[, "alctotf"]))
with(iev, table(alctotf))
### Linear might be reasonable
### when interpreting the last jump, do not forget that the last alco group is rather small
### Now, you should have reasonable proposals for parameterizations of the three covariates
### or suggestions to merge some categories. All those proposals should now be further
### explored and verified by (deviance) tests. Various combinations of reasonable proposals
### should be tried! As reference model, fit0 from above can be used or, if first verified,
### some model with less categories.
### For example, especially tobbaco was splitted into quite many categories, some of them
### very small. The code below verifies whether tobgr4 could be used instead of tobgr:
fit1 <- glm(case ~ agegr + tobgr4 + alctotf, family = binomial, data = iev)
anova(fit1, fit0, test = "Chisq")
## --> looks OK, for me, fit1 would now be a new reference model
##### In the following, try various combinations of reasonable parameterizations
##### of the three covariates and propose their "best" parameterization...
##### ===============================================================================
##### When working with factors, decisions concerning merging the groups
##### == the same effect (beta) on risk of cancer, can be supported by a statistical test
##### ===================================================================================
### Less tobacco groups?
### ----------------------
fit.1 <- glm(case ~ agegr + tobgr + alctot, family = binomial, data = iev)
summary(fit.1)
drop1(fit.1, test = "Chisq")
#
with(iev, table(tobgr4, tobgr))
#
fit.2 <- glm(case ~ agegr + tobgr4 + alctot, family = binomial, data = iev)
summary(fit.2) ## Dev = 670.5
#
anova(fit.2, fit.1, test = "Chisq") ## P = 0.544
### etc.
##### Some reasonable models with respect to age
##### ============================================
fit.4 <- glm(case ~ agegr5 + tobgr4 + alctot, family = binomial, data = iev)
summary(fit.4) ## Dev = 670.5
fit.6 <- glm(case ~ lagediag24 + tobgr4 + alctot, family = binomial, data = iev)
summary(fit.6) ## Dev = 673.8
anova(fit.6, fit.4, test = "Chisq")
## P = 0.357, comparable models
## fit.6 preferred as age is included as a numeric covariate here
### Any better parameterization of age?
(bage.4 <- coef(fit.4)[grep("agegr5", names(coef(fit.4)))])
plot(0:length(bage.4), c(0, bage.4), xaxt = "n", pch = 23, col = "darkblue", bg = "skyblue", ylim = c(0, max(bage.4)), xlab = "Age", ylab = expression(beta))
axis(1, at = 0:length(bage.4), labels = levels(iev[, "agegr5"]))
### etc.
##### Effect of alcohol
##### ============================================
### In the following, we keep using age as factor with 5 levels
### (to decide about alcohol)
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### Alcohol linearly?
fit.4 <- glm(case ~ agegr5 + tobgr4 + alctot, family = binomial, data = iev)
fit.7 <- glm(case ~ agegr5 + tobgr4 + alctotf, family = binomial, data = iev)
summary(fit.7) ## Dev = 658.9
anova(fit.4, fit.7, test = "Chisq") ### P = 0.040
### --> alctot in a linear way not really the right form
### How to parameterize alcohol?
(bealc <- coef(fit.7)[grep("alctotf", names(coef(fit.7)))])
plot(0:length(bealc), c(0, bealc), pch = 23, col = "darkblue", bg = "skyblue", ylim = c(min(c(0, bealc)), max(bealc)), xaxt = "n", xlab = "Alcohol", ylab = expression(beta))
axis(1, at = 0:length(bealc), labels = levels(iev[, "alctotf"]))
#
sum(iev[, "alctot"] == 0)
table(iev$alctot==0, iev$case)
### etc.
##### Now some remarks to interpretation of estimated coefficients.
##### You certainly know everything from the lecture but in case you don't...
##### ========================================================================
#### Let say that fit2 below is (for the moment being) the final model.
####
fit2 <- glm(case ~ agegr + tobgr4 + alctot, family = binomial, data = iev)
summary(fit2)
(be2 <- coef(fit2))
### For factors, interpretation of beta's depends on used parameterization,
### by default contr.treatment (reference group contrasts used). But you know
### everything from linear regression... Only now, more interesting is exp(beta)
### which is some odds ratio (OR), i.e., ratio of two odds. Which two odds now
### depends on used parameterization. With contr.treatment it's easy:
(ageind <- grep("agegr", names(be2)))
(orage <- exp(be2[ageind]))
print(orage["agegr35-44"]) ## = OR when comparing group aged 35-44 and a reference group (aged 25-34)
## with the same alco and tobbaco consumption
## here: group 35-44 has a higher (approx. 6x) odds on cancer than group 25-34,
## since odds (Å¡ance) is increasing function of probability (pravdÄ›podobnost),
## also probability of cancer is higher in group 35-44 than in group 25-34
exp(be2["agegr45-54"] - be2["agegr35-44"])
### comparison of two "non-reference" groups, here = OR when comparing
### group 45-54 and group 35-44
### For numeric covariates, exp(beta) = OR when comparing odds on cancer
### in groups that differ by 1 unit of the numeric covariate and other covariates are
### unchanged.
(oralco <- exp(be2["alctot"])) ### alcohol consumption increases odds (and probability)
### of cancer
### but 1 g of alcohol is pretty nothing, the numbers below
### are more interesting (in case you do not know, one typical
### glass of a Czech beer is 0.5 l and the alcohol amount is about 5%,
### i.e., roughly 25 g of alcohol in one glass
oralco^25
oralco^50
oralco^75
oralco^100
### Not yet considering a change of your student life?
### But maybe a linear trend (on log(odds)) scale does not fit data :-) That's what you may hope for!
### The same interpretation then holds for limits of confidence intervals:
(ci2 <- confint(fit2))
exp(ci2[-1,])
exp(ci2["alctot",]*100)
### Warning: confint() produces confidence intervals being dual to the deviance (likelihood-ratio)
### test whereas summary() provides p-values of the Wald test. The two outputs are hence
### only asymptotically dual.
### IN YOUR REPORT: provide interpretation of your model that you have at this stage.
### As mentioned above, in case you use a more "complex" parameterization of some
### of the covariates (e.g., spline), do not calculate exp(beta's) but rather plot
### the respective spline and discuss "dynamic" of the effect of that covariate.
### ALSO: provide one "overall" P-value for effect of each of the three covariates
drop1(fit2, test = "LR")
### In case either of the factors is included as spline, the above drop1() will not provide
### the right p-value. anova(.,., test = "LR") must be used instead.
##### And now, examine necessity to include interactions of alcohol and tobbaco
##### with age. Remember that you should consider perhaps different parameterization
##### of age for each interaction. It is wise to start with including alco:age(factor)
##### and tobbaco:age(factor) interactions in the model, test whether they are needed
##### and only if so, deal with replacement of age(factor) by something continuous/smooth
##### (but as simple as possible)...
#####
##### If some interactions appear important, try to interprete the final model in your report.
#####
##### ====================================================================================
##### And finally, take your final model and try to replace total alcohol by a reasonable
##### combination of "per-partes" alcohol values (beer, cider, wine, aper, digest).
##### At the end, try to answer the question whether a general consumption of alcohol
##### has some effect on odds on cancer or whether perhaps some types of alcohol
##### are even protecting the consumer from cancer. In your conclusions, be aware
##### of possible extrapoloation overinterpretation. Before you recommend drinking
##### of something, check ranges and briefly also distribution of the values
##### of beer etc. in data.
##### =======================================================================================