library(dplyr)
library(ggplot2)
library(epitools)
library(forcats)
library(tidyr)
Assignment 2: Case-control analysis via logistic regression
Assignment
Ille-et-Vilaine Data contain the results of a case-control study investigating the effect of alcohol and tobacco consumption on the risk of oesophageal cancer. There are 200 male cases and 775 male controls, all of them inhabitants of the French departement Ille-et-Vilaine. (See [BD1], Sec. 4.1, p. 122-124).
- Reproduce the descriptive results in Table 4.1 of [BD1], p. 123.
- Conduct a grouped analysis of alcohol risk adjusted for age (see [BD1], p. 210-213).
- Conduct a joint grouped analysis of alcohol and tobacco risk adjusted for age (see [BD1], p. 213-221, esp. Tables 6.5 and 6.6).
- Conduct a joint ungrouped analysis of alcohol and tobacco risk adjusted for age (see [BD1], p. 227-231, esp. Table 6.12).
Data
Classical R framework
First some data exploration
load("C:/Práce/Medical Studies/2018/Solution/02_iev/iev.RData")
dim(iev)
[1] 975 11
names(iev)
[1] "case" "agediag" "agegr" "tobgr" "tob" "beer" "cider"
[8] "wine" "aper" "digest" "alctot"
head(iev)
case agediag agegr tobgr tob beer cider wine aper digest alctot
1 0 42 35-44 None 0.0 0 63 68 0 7 139
2 0 45 45-54 5-9 g/day 7.5 0 0 54 5 7 66
3 0 35 35-44 None 0.0 0 17 3 1 4 24
4 0 78 >=75 None 0.0 0 30 7 0 2 39
5 0 45 45-54 5-9 g/day 7.5 0 60 0 0 4 64
6 0 64 55-64 15-19 g/day 17.5 5 34 3 0 7 49
Reproduce the descriptive results
Next step, we check the reference [BD1] and see how did authors decide on grouping of the data. We re-create respective Table 4.1 per response.
## Reproduce the tables
with(iev, table(agegr, case))
case
agegr 0 1
25-34 115 1
35-44 190 9
45-54 167 46
55-64 166 76
65-74 106 55
>=75 31 13
# tobacco and alcohol needs to be reformatted
$tobgr2 <- cut(iev$tob, c(-Inf, 9, 19, 29, Inf))
ievwith(iev, table(tobgr2, case))
case
tobgr2 0 1
(-Inf,9] 448 78
(9,19] 178 58
(19,29] 98 33
(29, Inf] 51 31
$alcgr <- cut(iev$alctot, c(-Inf, 39, 79, 119, Inf))
ievwith(iev, table(alcgr, case))
case
alcgr 0 1
(-Inf,39] 385 29
(39,79] 280 75
(79,119] 88 51
(119, Inf] 22 45
Alternatively, we can do it via tidyverse. Instead of using simply break, we also recode the category names.
<- iev %>%
iev2 mutate(AlcCateg = cut(alctot, breaks=c(-Inf, 39.5, 79.5, 119.5, Inf),
labels = c("0-39","40-79","80-119", "120+"))) %>%
mutate(TobCateg = fct_recode(tobgr, "0-9" = "None", "0-9" = "1-4 g/day", "0-9" = "5-9 g/day",
"10-19" = "10-14 g/day","10-19" = "15-19 g/day",
"20-29" = "20-29 g/day",
"30+" = "30-39 g/day", "30+" = "40-49 g/day",
"30+" = ">=50 g/day")) %>%
mutate(AlcExposure = cut(alctot, breaks=c(-Inf, 39.5, 79.5, 119.5, Inf),
labels = c("0-39","40-79","80-119", "120+")))
# re-create the table
%>%
iev2 group_by(agegr) %>%
summarize(cases = sum(case), controls = sum(1-case))
# A tibble: 6 × 3
agegr cases controls
<fct> <int> <dbl>
1 25-34 1 115
2 35-44 9 190
3 45-54 46 167
4 55-64 76 166
5 65-74 55 106
6 >=75 13 31
%>%
iev2 group_by(AlcCateg) %>%
summarize(cases = sum(case), controls = sum(1-case))
# A tibble: 4 × 3
AlcCateg cases controls
<fct> <int> <dbl>
1 0-39 29 385
2 40-79 75 280
3 80-119 51 88
4 120+ 45 22
%>%
iev2 group_by(TobCateg) %>%
summarize(cases = sum(case), controls = sum(1-case))
# A tibble: 4 × 3
TobCateg cases controls
<fct> <int> <dbl>
1 0-9 78 448
2 10-19 58 178
3 20-29 33 98
4 30+ 31 51
Note that when looking at population characteristis, we can see that the age distribution differs among controls and cases. We also see that tobacco and alcohol consumption is higher for cases. Therefore, in consequent analysis, we would expect all three of them to have some effect and such effect to be positive.
# add some summary statistics to compare the cases and controls
%>%
iev group_by(case) %>%
summarize(ageMean = mean(agediag), ageSD = sd(agediag),
AlcMean = mean(alctot), AlcSD = sd(alctot),
TobMean = mean(tob), TobSD = sd(tob)) %>%
mutate_if(is.numeric, round, 2)
# A tibble: 2 × 7
case ageMean ageSD AlcMean AlcSD TobMean TobSD
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 50.2 14.2 44.4 31.9 10.5 11.9
2 1 60.0 9.23 85.1 48.5 16.7 12.9
Conduct a grouped analysis
First, let us recreate the Table 6.1. However, instead of last column with Total, we use column Controls which would allow us to use these data directly in model fitting stage. Note that row order is different in this implementation than in [BD1] Table 6.1.
# first, let us create the new alcohol variable exposure 0/1
$alcgr2 <- cut(iev$alctot, c(-Inf, 79, Inf))
iev
# Table 6.1 but with non-cases instead of total
<- aggregate(cbind(case, 1-case) ~ agegr + alcgr2, data = iev, FUN = sum)
lldt names(lldt)[4] <- "contr"
lldt
agegr alcgr2 case contr
1 25-34 (-Inf,79] 0 106
2 35-44 (-Inf,79] 5 164
3 45-54 (-Inf,79] 21 138
4 55-64 (-Inf,79] 34 138
5 65-74 (-Inf,79] 36 88
6 >=75 (-Inf,79] 8 31
7 25-34 (79, Inf] 1 9
8 35-44 (79, Inf] 4 26
9 45-54 (79, Inf] 25 29
10 55-64 (79, Inf] 42 28
11 65-74 (79, Inf] 19 18
12 >=75 (79, Inf] 5 0
We can do the same via tidyverse. Note that as implemented here, the order of the rows will differ from code above. It corresponds to [BD1] output.
<- iev2 %>%
iev2 mutate(AlcCateg2 = cut(alctot, breaks=c(-Inf, 79.5, Inf),
labels = c("0-79","80+")))
<- iev2 %>%
lldt2 group_by(agegr, AlcCateg2) %>%
summarize(cases = sum(case), contr = sum(1-case)) %>%
# if you put summarize(case = sum(case), contr = sum(1-case)), the input in
# sum(1-case) will not come from original iev2, but from newly created case variable
mutate(case = cases) %>%
select(-cases)
lldt2
# A tibble: 12 × 4
# Groups: agegr [6]
agegr AlcCateg2 contr case
<fct> <fct> <dbl> <int>
1 25-34 0-79 106 0
2 25-34 80+ 9 1
3 35-44 0-79 164 5
4 35-44 80+ 26 4
5 45-54 0-79 138 21
6 45-54 80+ 29 25
7 55-64 0-79 138 34
8 55-64 80+ 28 42
9 65-74 0-79 88 36
10 65-74 80+ 18 19
11 >=75 0-79 31 8
12 >=75 80+ 0 5
We can calculate the odds ratio for alcohol risk here using epitools library function oddsratio, without taking into account age grouping.
<- with(iev, table(alcgr2, case))) (marginalTable
case
alcgr2 0 1
(-Inf,79] 665 104
(79, Inf] 110 96
oddsratio(marginalTable)
$data
case
alcgr2 0 1 Total
(-Inf,79] 665 104 769
(79, Inf] 110 96 206
Total 775 200 975
$measure
odds ratio with 95% C.I.
alcgr2 estimate lower upper
(-Inf,79] 1.000000 NA NA
(79, Inf] 5.565434 3.951577 7.859804
$p.value
two-sided
alcgr2 midp.exact fisher.exact chi.square
(-Inf,79] NA NA NA
(79, Inf] 0 1.708811e-22 1.599308e-25
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
oddsratio.wald(marginalTable)
$data
case
alcgr2 0 1 Total
(-Inf,79] 665 104 769
(79, Inf] 110 96 206
Total 775 200 975
$measure
odds ratio with 95% C.I.
alcgr2 estimate lower upper
(-Inf,79] 1.00000 NA NA
(79, Inf] 5.58042 3.960066 7.863779
$p.value
two-sided
alcgr2 midp.exact fisher.exact chi.square
(-Inf,79] NA NA NA
(79, Inf] 0 1.708811e-22 1.599308e-25
$correction
[1] FALSE
attr(,"method")
[1] "Unconditional MLE & normal approximation (Wald) CI"
# Same results via tidyr is not that straightforward due to format requirements of oddstratio
# it can be obtained via OddsRatio, but matrix needs to be created first
Now let us refit the models from the Table 6.2 (see [BD1], p. 212).
$agegr = factor(lldt$agegr)
lldt
<- lldt2 %>%
lldt2 as.data.frame() %>%
mutate(agegr = factor(agegr))
# We can use either "classical data" or tidy version
# both will work and provide the same output
# fit.1 <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt)
.1 <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt2)
fitsummary(fit.1)
Call:
glm(formula = cbind(case, contr) ~ -1 + agegr, family = binomial,
data = lldt2)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.431 -1.299 0.368 2.481 4.917
Coefficients:
Estimate Std. Error z value Pr(>|z|)
agegr25-34 -4.7449 1.0043 -4.725 2.31e-06 ***
agegr35-44 -3.0498 0.3411 -8.940 < 2e-16 ***
agegr45-54 -1.2894 0.1665 -7.743 9.70e-15 ***
agegr55-64 -0.7813 0.1385 -5.641 1.69e-08 ***
agegr65-74 -0.6561 0.1662 -3.948 7.88e-05 ***
agegr>=75 -0.8690 0.3304 -2.630 0.00854 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 572.341 on 12 degrees of freedom
Residual deviance: 89.148 on 6 degrees of freedom
AIC: 141.5
Number of Fisher Scoring iterations: 6
# calculate the G for chi-squared: note that we need contribution from both cases/controls
# Note: the numbers are slightly different than in the book, since they use different calculatin based on standardized residuals
# cases:
<- unlist(fit.1$fitted.values* (lldt2[,"contr"] + lldt2[,"case"]))
Expect # controls:
<- unlist((1-fit.1$fitted.values)* (lldt2[,"contr"] + lldt2[,"case"]))
ExpectB # Deviance
sum((Expect - lldt2[,"case"])^2/Expect) +
sum((ExpectB - lldt2[,"contr"])^2/ExpectB)
[1] 100.2541
Note that if we consider that the saturated model is the aggregated model (i.e. we ignore the alcohol and proceed with aggregating the data over the age groups), we get zero deviance, since we work above saturated model.
# let us fit aggregated model
<- lldt2 %>% group_by(agegr) %>%
lldt.fit1B summarize(case = sum(case), contr = sum(contr))
.1B <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt.fit1B)
fitsummary(fit.1B)
Call:
glm(formula = cbind(case, contr) ~ -1 + agegr, family = binomial,
data = lldt.fit1B)
Deviance Residuals:
[1] 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
agegr25-34 -4.7449 1.0043 -4.724 2.31e-06 ***
agegr35-44 -3.0498 0.3411 -8.940 < 2e-16 ***
agegr45-54 -1.2894 0.1665 -7.743 9.70e-15 ***
agegr55-64 -0.7813 0.1385 -5.641 1.69e-08 ***
agegr65-74 -0.6561 0.1662 -3.948 7.88e-05 ***
agegr>=75 -0.8690 0.3304 -2.630 0.00854 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4.8319e+02 on 6 degrees of freedom
Residual deviance: 1.3989e-14 on 0 degrees of freedom
AIC: 38.718
Number of Fisher Scoring iterations: 4
<- unlist(fit.1B$fitted.values* (lldt.fit1B[,"contr"] + lldt.fit1B[,"case"]))
Expect <- unlist((1-fit.1B$fitted.values)* (lldt.fit1B[,"contr"] + lldt.fit1B[,"case"]))
ExpectB sum((Expect - lldt.fit1B[,"case"])^2/Expect) +
sum((ExpectB - lldt.fit1B[,"contr"])^2/ExpectB)
[1] 1.106186e-23
Now, let us fit remaining models. Note that for model 3 below, we have some issues with putting correct category as baseline that need to be addressed carefully.
#fit.2 <- glm(cbind(case, contr)~ -1 + agegr + alcgr2, family = binomial, data = lldt)
.2 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg2, family = binomial, data = lldt2)
fitsummary(fit.2)
Call:
glm(formula = cbind(case, contr) ~ -1 + agegr + AlcCateg2, family = binomial,
data = lldt2)
Deviance Residuals:
1 2 3 4 5 6 7 8
-1.16422 0.97386 0.03277 -0.03839 -0.14465 0.17015 -0.30323 0.38863
9 10 11 12
0.94443 -1.55898 -0.68418 2.12217
Coefficients:
Estimate Std. Error z value Pr(>|z|)
agegr25-34 -5.0493 1.0093 -5.003 5.65e-07 ***
agegr35-44 -3.5053 0.3562 -9.842 < 2e-16 ***
agegr45-54 -1.8490 0.1935 -9.557 < 2e-16 ***
agegr55-64 -1.3432 0.1648 -8.151 3.61e-16 ***
agegr65-74 -1.0833 0.1840 -5.889 3.89e-09 ***
agegr>=75 -1.0900 0.3440 -3.169 0.00153 **
AlcCateg280+ 1.6541 0.1892 8.742 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 572.341 on 12 degrees of freedom
Residual deviance: 10.893 on 5 degrees of freedom
AIC: 65.242
Number of Fisher Scoring iterations: 5
<- unlist(fit.2$fitted.values* (lldt2[,"contr"] + lldt2[,"case"]))
Expect2 <- unlist((1-fit.2$fitted.values)* (lldt2[,"contr"] + lldt2[,"case"]))
Expect2B sum((Expect2 - lldt2[,"case"])^2/Expect2) +
sum((Expect2B - lldt2[,"contr"])^2/Expect2B)
[1] 9.162488
# from Table 6.3 manually
<- c(1.19, -0.82, -0.06, 0.05, 0.14, -0.12, 0.46, -0.35, -1.63, 0.98, 1.68, -0.66)
stRes sum(stRes^2)
[1] 9.338
# NOTE: direct fit will not work here, because the wrong category will be taken as baseline
# fit.3 <- glm(cbind(case, contr)~ -1 + agegr + alcgr2 + I(as.integer(agegr)-3.5):alcgr2, family = binomial, data = lldt)
# let us do it explicitly
<- lldt2 %>%
lldt.fit3 mutate(AlcAgeInt = (as.integer(agegr)-3.5)*(as.numeric(AlcCateg2)-1))
.3 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg2 + AlcAgeInt,
fitfamily = binomial, data = lldt.fit3)
summary(fit.3)
Call:
glm(formula = cbind(case, contr) ~ -1 + agegr + AlcCateg2 + AlcAgeInt,
family = binomial, data = lldt.fit3)
Deviance Residuals:
1 2 3 4 5 6 7 8
-1.09152 0.80776 0.25962 -0.28529 0.05078 -0.05897 -0.34034 0.43652
9 10 11 12
0.75231 -1.24076 -0.78422 2.32798
Coefficients:
Estimate Std. Error z value Pr(>|z|)
agegr25-34 -5.1786 1.0353 -5.002 5.67e-07 ***
agegr35-44 -3.6105 0.3966 -9.103 < 2e-16 ***
agegr45-54 -1.8946 0.2073 -9.137 < 2e-16 ***
agegr55-64 -1.3362 0.1647 -8.111 5.03e-16 ***
agegr65-74 -1.0443 0.1912 -5.462 4.71e-08 ***
agegr>=75 -1.0524 0.3448 -3.052 0.00227 **
AlcCateg280+ 1.6989 0.2004 8.479 < 2e-16 ***
AlcAgeInt -0.1268 0.1889 -0.671 0.50196
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 572.341 on 12 degrees of freedom
Residual deviance: 10.445 on 4 degrees of freedom
AIC: 66.793
Number of Fisher Scoring iterations: 5
<- unlist(fit.3$fitted.values* (lldt.fit3[,"contr"] + lldt.fit3[,"case"]))
Expect3 <- unlist((1-fit.2$fitted.values)* (lldt.fit3[,"contr"] + lldt.fit3[,"case"]))
Expect3B sum((Expect3 - lldt.fit3[,"case"])^2/Expect3 ) +
sum((Expect3B - lldt.fit3[,"contr"])^2/Expect3B)
[1] 8.69365
# Note: see the typo in the last coefficient (wrong sign in the book)
Let us test the models and let us stay with model 2 for time being and look at the prediction and compare it with fitted values. We can see that we have got reasonable fit, except maybe for fifth age category, so we would use this model to draw results for the analysis (see BD1 for the interpretation of results).
anova(fit.2, fit.1, test = "Chisq")
Analysis of Deviance Table
Model 1: cbind(case, contr) ~ -1 + agegr + AlcCateg2
Model 2: cbind(case, contr) ~ -1 + agegr
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 5 10.893
2 6 89.148 -1 -78.255 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit.3, fit.2, test = "Chisq")
Analysis of Deviance Table
Model 1: cbind(case, contr) ~ -1 + agegr + AlcCateg2 + AlcAgeInt
Model 2: cbind(case, contr) ~ -1 + agegr + AlcCateg2
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 4 10.445
2 5 10.893 -1 -0.44819 0.5032
<- data.frame(ObservedCases = lldt.fit3[,"case"], PredictedCases = round(fit.2$fitted.values* (lldt.fit3[,"contr"] + lldt.fit3[,"case"]), 3) ) %>%
checkModelFit mutate(Difference = ObservedCases - PredictedCases)
checkModelFit
ObservedCases PredictedCases Difference
1 0 0.676 -0.676
2 1 0.324 0.676
3 5 4.928 0.072
4 4 4.072 -0.072
5 21 21.623 -0.623
6 25 24.377 0.623
7 34 35.602 -1.602
8 42 40.398 1.602
9 36 31.359 4.641
10 19 23.641 -4.641
11 8 9.813 -1.813
12 5 3.187 1.813
Conduct a joint grouped analysis of alcohol and tobacco risk adjusted for age
Let us recreate results Table 6.5. First we prepare the data.
<- iev2 %>%
lldt3 group_by(agegr, AlcCateg, TobCateg) %>%
summarize(cases = sum(case), contr = sum(1-case)) %>%
# if you put summarize(case = sum(case), contr = sum(1-case)), the input in
# sum(1-case) will not come from original iev2, but from newly created case variable
mutate(case = cases) %>%
select(-cases)
lldt3
# A tibble: 88 × 5
# Groups: agegr, AlcCateg [24]
agegr AlcCateg TobCateg contr case
<fct> <fct> <fct> <dbl> <int>
1 25-34 0-39 0-9 41 0
2 25-34 0-39 10-19 10 0
3 25-34 0-39 20-29 6 0
4 25-34 0-39 30+ 5 0
5 25-34 40-79 0-9 27 0
6 25-34 40-79 10-19 7 0
7 25-34 40-79 20-29 3 0
8 25-34 40-79 30+ 7 0
9 25-34 80-119 0-9 2 0
10 25-34 80-119 10-19 1 0
# … with 78 more rows
Let us fit the models listed in Table 6.5
.1 <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt3)
fit3round(fit3.1$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74 agegr>=75
-4.74 -3.05 -1.29 -0.78 -0.66 -0.87
.2 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg, family = binomial, data = lldt3)
fit3round(fit3.2$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74
-6.14 -4.52 -2.72 -2.20 -1.79
agegr>=75 AlcCateg40-79 AlcCateg80-119 AlcCateg120+
-1.72 1.44 1.99 3.68
.3 <- glm(cbind(case, contr)~ -1 + agegr + TobCateg , family = binomial, data = lldt3)
fit3round(fit3.3$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74
-5.36 -3.57 -1.78 -1.24 -0.94
agegr>=75 TobCateg10-19 TobCateg20-29 TobCateg30+
-1.27 0.61 0.67 1.74
.4 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg + TobCateg, family = binomial, data = lldt3)
fit3round(fit3.4$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74
-6.89 -4.92 -3.12 -2.56 -2.00
agegr>=75 AlcCateg40-79 AlcCateg80-119 AlcCateg120+ TobCateg10-19
-2.07 1.44 1.97 3.60 0.44
TobCateg20-29 TobCateg30+
0.52 1.64
# and expand
round(exp(fit3.4$coeff),2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74
0.00 0.01 0.04 0.08 0.14
agegr>=75 AlcCateg40-79 AlcCateg80-119 AlcCateg120+ TobCateg10-19
0.13 4.21 7.15 36.71 1.56
TobCateg20-29 TobCateg30+
1.67 5.18
As expected, we can see that both alcohol and tobacco are important for the response and that they have both positive effect (in sense of response occurence, so negative in terms of the health risks).
We continue with model fitted to obtain Table 6.8.
.5 <- iev2 %>%
lldt3mutate(AlcCateg3 = cut(alctot, breaks=c(-Inf, 24.5,49.5, 74.5, 99, Inf),
labels = c("0-24","24-49","50-74", "75-99", "100+"))) %>%
mutate(TobCateg2 = fct_recode(tobgr, "0" = "None", "1-4" = "1-4 g/day", "5-14" = "5-9 g/day",
"5-14" = "10-14 g/day","15-29" = "15-19 g/day",
"15-29" = "20-29 g/day",
"30+" = "30-39 g/day", "30+" = "40-49 g/day",
"30+" = ">=50 g/day")) %>%
group_by(agegr, AlcCateg3, TobCateg2) %>%
summarize(cases = sum(case), contr = sum(1-case)) %>%
# if you put summarize(case = sum(case), contr = sum(1-case)), the input in
# sum(1-case) will not come from original iev2, but from newly created case variable
mutate(case = cases) %>%
select(-cases)
# Note slight computational differences from Table 6.8
.5 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg3 + TobCateg2, family = binomial, data = lldt3.5)
fit3
<- c(1,exp(coef(fit3.5)[7:10]))
beta.alc <- c(1,exp(coef(fit3.5)[11:14]))
beta.tob
round(beta.alc %*% t(beta.tob), 1)
TobCateg21-4 TobCateg25-14 TobCateg215-29 TobCateg230+
[1,] 1.0 4.4 7.3 5.8 19.4
[2,] 1.6 7.1 11.6 9.3 30.8
[3,] 5.0 22.3 36.4 29.3 97.1
[4,] 6.6 29.5 48.2 38.8 128.4
[5,] 17.6 78.0 127.6 102.7 340.2
Again, we see that the alcohol and tobacco has strong positive effect. However, we also notice that the there is rather strong change from tobacco level zero and 1-4g level. Hence, the previous categorisation will not work very well, since it is pulling two distinct categories. It also give us an idea that linear relationship won’t perhaps work for next section.
Conduct a joint ungrouped analysis of alcohol and tobacco risk adjusted for age
Based on findings from previous section, we will try to include tobacco and alcohol as well in some non-linear fashion (note: we still use linear models in sense of linearity in parameters). We reconstruct results of Table 6.12. Note different result for model 6.
<- iev2 %>%
lldt4 mutate(AlcCont = alctot) %>%
mutate(TobCont = as.numeric(as.character(fct_recode(tobgr, "0" = "None", "2.5" = "1-4 g/day", "7.5" = "5-9 g/day",
"12.5" = "10-14 g/day","17.5" = "15-19 g/day",
"25" = "20-29 g/day",
"35" = "30-39 g/day", "45" = "40-49 g/day",
"60" = ">=50 g/day")))) %>%
mutate(AlcContLog = log(AlcCont+1)) %>%
mutate(TobContLog = log(TobCont+1)) %>%
mutate(AlcCont2 = AlcCont^2) %>%
mutate(TobCont2 = TobCont^2)
.1 <- glm(case ~ -1 + agegr + AlcCont + TobCont, family = binomial, data = lldt4)
fit4round(fit3.2$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74
-6.14 -4.52 -2.72 -2.20 -1.79
agegr>=75 AlcCateg40-79 AlcCateg80-119 AlcCateg120+
-1.72 1.44 1.99 3.68
.2 <- glm(case ~ -1 + agegr + TobCont + AlcContLog, family = binomial, data = lldt4)
fit4round(fit4.2$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74 agegr>=75 TobCont
-9.00 -7.29 -5.59 -5.05 -4.61 -4.59 0.04
AlcContLog
0.93
.3 <- glm(case ~ -1 + agegr + AlcCont + TobContLog, family = binomial, data = lldt4)
fit4round(fit4.3$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74 agegr>=75 AlcCont
-7.55 -5.97 -4.20 -3.68 -3.19 -3.08 0.03
TobContLog
0.54
.4 <- glm(case ~ -1 + agegr + AlcContLog + TobContLog, family = binomial, data = lldt4)
fit4round(fit4.4$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74 agegr>=75 AlcContLog
-9.32 -7.77 -6.08 -5.56 -5.17 -5.07 0.89
TobContLog
0.56
.5 <- glm(case ~ -1 + agegr + AlcCont + TobCont + TobCont2, family = binomial, data = lldt4)
fit4round(fit4.5$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74 agegr>=75 AlcCont
-7.18 -5.52 -3.73 -3.19 -2.67 -2.61 0.03
TobCont TobCont2
0.06 0.00
# mistake in book it seems:
.6 <- glm(case ~ -1 + agegr + AlcCont + TobContLog + AlcCont2, family = binomial, data = lldt4)
fit4round(fit4.6$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74 agegr>=75 AlcCont
-7.44 -5.84 -4.05 -3.54 -3.05 -2.96 0.02
TobContLog AlcCont2
0.54 0.00
.7 <- glm(case ~ -1 + agegr + AlcCont + TobContLog + I(TobContLog^2), family = binomial, data = lldt4)
fit4round(fit3.2$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74
-6.14 -4.52 -2.72 -2.20 -1.79
agegr>=75 AlcCateg40-79 AlcCateg80-119 AlcCateg120+
-1.72 1.44 1.99 3.68
Now how to decide which model is the best. One option would be looking at deviance metric or base the model selection on sequential testing (stepwise) of anova models. Alternatively, information criterion can be used. Based on AIC, model 7 will be selected as just slightly better than model 3 and 6. Note that ordering of AIC is not the same as of Chi squared statistics.
In any case, it should not be used in automated mode only. The model selection needs to include careful consideration of both statistical as well as applied aspects (practical relevance of the effects). Hence, the manual search for the correct model is a must.
AIC(fit4.1, fit4.2, fit4.3, fit4.4, fit4.5, fit4.6, fit4.7)
df AIC
fit4.1 8 711.4680
fit4.2 8 765.5021
fit4.3 8 699.2473
fit4.4 8 750.8665
fit4.5 9 711.9197
fit4.6 9 700.8066
fit4.7 9 699.1563