Assignment 2: Case-control analysis via logistic regression

Author

Martin Otava

library(dplyr)
library(ggplot2)
library(epitools)
library(forcats)
library(tidyr)

Assignment

Ille-et-Vilaine Data contain the results of a case-control study investigating the effect of alcohol and tobacco consumption on the risk of oesophageal cancer. There are 200 male cases and 775 male controls, all of them inhabitants of the French departement Ille-et-Vilaine. (See [BD1], Sec. 4.1, p. 122-124).

  1. Reproduce the descriptive results in Table 4.1 of [BD1], p. 123.
  2. Conduct a grouped analysis of alcohol risk adjusted for age (see [BD1], p. 210-213).
  3. Conduct a joint grouped analysis of alcohol and tobacco risk adjusted for age (see [BD1], p. 213-221, esp. Tables 6.5 and 6.6).
  4. Conduct a joint ungrouped analysis of alcohol and tobacco risk adjusted for age (see [BD1], p. 227-231, esp. Table 6.12).

Data

Classical R framework

First some data exploration

load("C:/Práce/Medical Studies/2018/Solution/02_iev/iev.RData")
dim(iev)
[1] 975  11
names(iev)
 [1] "case"    "agediag" "agegr"   "tobgr"   "tob"     "beer"    "cider"  
 [8] "wine"    "aper"    "digest"  "alctot" 
head(iev)
  case agediag agegr       tobgr  tob beer cider wine aper digest alctot
1    0      42 35-44        None  0.0    0    63   68    0      7    139
2    0      45 45-54   5-9 g/day  7.5    0     0   54    5      7     66
3    0      35 35-44        None  0.0    0    17    3    1      4     24
4    0      78  >=75        None  0.0    0    30    7    0      2     39
5    0      45 45-54   5-9 g/day  7.5    0    60    0    0      4     64
6    0      64 55-64 15-19 g/day 17.5    5    34    3    0      7     49

Reproduce the descriptive results

Next step, we check the reference [BD1] and see how did authors decide on grouping of the data. We re-create respective Table 4.1 per response.

## Reproduce the tables

with(iev, table(agegr, case))
       case
agegr     0   1
  25-34 115   1
  35-44 190   9
  45-54 167  46
  55-64 166  76
  65-74 106  55
  >=75   31  13
# tobacco and alcohol needs to be reformatted
iev$tobgr2 <- cut(iev$tob, c(-Inf, 9, 19, 29, Inf))
with(iev, table(tobgr2, case))
           case
tobgr2        0   1
  (-Inf,9]  448  78
  (9,19]    178  58
  (19,29]    98  33
  (29, Inf]  51  31
iev$alcgr <- cut(iev$alctot, c(-Inf, 39, 79, 119, Inf))
with(iev, table(alcgr, case))
            case
alcgr          0   1
  (-Inf,39]  385  29
  (39,79]    280  75
  (79,119]    88  51
  (119, Inf]  22  45

Alternatively, we can do it via tidyverse. Instead of using simply break, we also recode the category names.

iev2 <- iev  %>%
  mutate(AlcCateg = cut(alctot, breaks=c(-Inf, 39.5, 79.5, 119.5, Inf), 
                        labels = c("0-39","40-79","80-119", "120+"))) %>%
  mutate(TobCateg = fct_recode(tobgr, "0-9" = "None", "0-9" = "1-4 g/day", "0-9" = "5-9 g/day",
                               "10-19" = "10-14 g/day","10-19" = "15-19 g/day",
                               "20-29" = "20-29 g/day",
                               "30+" = "30-39 g/day", "30+" = "40-49 g/day", 
                               "30+" = ">=50 g/day")) %>%
  mutate(AlcExposure = cut(alctot, breaks=c(-Inf, 39.5, 79.5, 119.5, Inf), 
                        labels = c("0-39","40-79","80-119", "120+")))

# re-create the table
iev2 %>%
  group_by(agegr) %>%
  summarize(cases = sum(case), controls = sum(1-case))
# A tibble: 6 × 3
  agegr cases controls
  <fct> <int>    <dbl>
1 25-34     1      115
2 35-44     9      190
3 45-54    46      167
4 55-64    76      166
5 65-74    55      106
6 >=75     13       31
iev2 %>%
  group_by(AlcCateg) %>%
  summarize(cases = sum(case), controls = sum(1-case))
# A tibble: 4 × 3
  AlcCateg cases controls
  <fct>    <int>    <dbl>
1 0-39        29      385
2 40-79       75      280
3 80-119      51       88
4 120+        45       22
iev2 %>%
group_by(TobCateg) %>%
  summarize(cases = sum(case), controls = sum(1-case))
# A tibble: 4 × 3
  TobCateg cases controls
  <fct>    <int>    <dbl>
1 0-9         78      448
2 10-19       58      178
3 20-29       33       98
4 30+         31       51

Note that when looking at population characteristis, we can see that the age distribution differs among controls and cases. We also see that tobacco and alcohol consumption is higher for cases. Therefore, in consequent analysis, we would expect all three of them to have some effect and such effect to be positive.

# add some summary statistics to compare the cases and controls 
iev %>%
  group_by(case) %>%
  summarize(ageMean = mean(agediag), ageSD = sd(agediag),
            AlcMean = mean(alctot), AlcSD = sd(alctot),
            TobMean = mean(tob), TobSD = sd(tob)) %>% 
            mutate_if(is.numeric, round, 2)
# A tibble: 2 × 7
   case ageMean ageSD AlcMean AlcSD TobMean TobSD
  <dbl>   <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl>
1     0    50.2 14.2     44.4  31.9    10.5  11.9
2     1    60.0  9.23    85.1  48.5    16.7  12.9

Conduct a grouped analysis

First, let us recreate the Table 6.1. However, instead of last column with Total, we use column Controls which would allow us to use these data directly in model fitting stage. Note that row order is different in this implementation than in [BD1] Table 6.1.

# first, let us create the new alcohol variable exposure 0/1
iev$alcgr2 <- cut(iev$alctot, c(-Inf, 79, Inf))

# Table 6.1 but with non-cases instead of total
lldt <- aggregate(cbind(case, 1-case) ~ agegr + alcgr2, data = iev, FUN = sum)
names(lldt)[4] <- "contr"
lldt
   agegr    alcgr2 case contr
1  25-34 (-Inf,79]    0   106
2  35-44 (-Inf,79]    5   164
3  45-54 (-Inf,79]   21   138
4  55-64 (-Inf,79]   34   138
5  65-74 (-Inf,79]   36    88
6   >=75 (-Inf,79]    8    31
7  25-34 (79, Inf]    1     9
8  35-44 (79, Inf]    4    26
9  45-54 (79, Inf]   25    29
10 55-64 (79, Inf]   42    28
11 65-74 (79, Inf]   19    18
12  >=75 (79, Inf]    5     0

We can do the same via tidyverse. Note that as implemented here, the order of the rows will differ from code above. It corresponds to [BD1] output.

iev2 <- iev2  %>%
  mutate(AlcCateg2 = cut(alctot, breaks=c(-Inf, 79.5, Inf), 
                        labels = c("0-79","80+"))) 

lldt2 <- iev2 %>%
  group_by(agegr, AlcCateg2) %>%
  summarize(cases = sum(case), contr = sum(1-case)) %>%
  # if you put summarize(case = sum(case), contr = sum(1-case)), the input in
  # sum(1-case) will not come from original iev2, but from newly created case variable
  mutate(case = cases) %>%
  select(-cases)
lldt2
# A tibble: 12 × 4
# Groups:   agegr [6]
   agegr AlcCateg2 contr  case
   <fct> <fct>     <dbl> <int>
 1 25-34 0-79        106     0
 2 25-34 80+           9     1
 3 35-44 0-79        164     5
 4 35-44 80+          26     4
 5 45-54 0-79        138    21
 6 45-54 80+          29    25
 7 55-64 0-79        138    34
 8 55-64 80+          28    42
 9 65-74 0-79         88    36
10 65-74 80+          18    19
11 >=75  0-79         31     8
12 >=75  80+           0     5

We can calculate the odds ratio for alcohol risk here using epitools library function oddsratio, without taking into account age grouping.

(marginalTable <- with(iev, table(alcgr2, case)))
           case
alcgr2        0   1
  (-Inf,79] 665 104
  (79, Inf] 110  96
oddsratio(marginalTable)
$data
           case
alcgr2        0   1 Total
  (-Inf,79] 665 104   769
  (79, Inf] 110  96   206
  Total     775 200   975

$measure
           odds ratio with 95% C.I.
alcgr2      estimate    lower    upper
  (-Inf,79] 1.000000       NA       NA
  (79, Inf] 5.565434 3.951577 7.859804

$p.value
           two-sided
alcgr2      midp.exact fisher.exact   chi.square
  (-Inf,79]         NA           NA           NA
  (79, Inf]          0 1.708811e-22 1.599308e-25

$correction
[1] FALSE

attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
oddsratio.wald(marginalTable)
$data
           case
alcgr2        0   1 Total
  (-Inf,79] 665 104   769
  (79, Inf] 110  96   206
  Total     775 200   975

$measure
           odds ratio with 95% C.I.
alcgr2      estimate    lower    upper
  (-Inf,79]  1.00000       NA       NA
  (79, Inf]  5.58042 3.960066 7.863779

$p.value
           two-sided
alcgr2      midp.exact fisher.exact   chi.square
  (-Inf,79]         NA           NA           NA
  (79, Inf]          0 1.708811e-22 1.599308e-25

$correction
[1] FALSE

attr(,"method")
[1] "Unconditional MLE & normal approximation (Wald) CI"
# Same results via tidyr is not that straightforward due to format requirements of oddstratio
# it can be obtained via OddsRatio, but matrix needs to be created first

Now let us refit the models from the Table 6.2 (see [BD1], p. 212).

lldt$agegr = factor(lldt$agegr)

lldt2 <- lldt2 %>%
  as.data.frame() %>% 
  mutate(agegr = factor(agegr)) 

# We can use either "classical data" or tidy version
# both will work and provide the same output 

# fit.1 <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt)
fit.1 <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt2)
summary(fit.1)

Call:
glm(formula = cbind(case, contr) ~ -1 + agegr, family = binomial, 
    data = lldt2)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-3.431  -1.299   0.368   2.481   4.917  

Coefficients:
           Estimate Std. Error z value Pr(>|z|)    
agegr25-34  -4.7449     1.0043  -4.725 2.31e-06 ***
agegr35-44  -3.0498     0.3411  -8.940  < 2e-16 ***
agegr45-54  -1.2894     0.1665  -7.743 9.70e-15 ***
agegr55-64  -0.7813     0.1385  -5.641 1.69e-08 ***
agegr65-74  -0.6561     0.1662  -3.948 7.88e-05 ***
agegr>=75   -0.8690     0.3304  -2.630  0.00854 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 572.341  on 12  degrees of freedom
Residual deviance:  89.148  on  6  degrees of freedom
AIC: 141.5

Number of Fisher Scoring iterations: 6
# calculate the G for chi-squared: note that we need contribution from both cases/controls
# Note: the numbers are slightly different than in the book, since they use different calculatin based on standardized residuals

# cases:
Expect <- unlist(fit.1$fitted.values* (lldt2[,"contr"] + lldt2[,"case"])) 
# controls:
ExpectB <- unlist((1-fit.1$fitted.values)* (lldt2[,"contr"] + lldt2[,"case"])) 
# Deviance
sum((Expect - lldt2[,"case"])^2/Expect) + 
  sum((ExpectB - lldt2[,"contr"])^2/ExpectB)
[1] 100.2541

Note that if we consider that the saturated model is the aggregated model (i.e. we ignore the alcohol and proceed with aggregating the data over the age groups), we get zero deviance, since we work above saturated model.

# let us fit aggregated model
lldt.fit1B <- lldt2 %>% group_by(agegr) %>%
  summarize(case = sum(case), contr = sum(contr))
fit.1B <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt.fit1B)
summary(fit.1B)

Call:
glm(formula = cbind(case, contr) ~ -1 + agegr, family = binomial, 
    data = lldt.fit1B)

Deviance Residuals: 
[1]  0  0  0  0  0  0

Coefficients:
           Estimate Std. Error z value Pr(>|z|)    
agegr25-34  -4.7449     1.0043  -4.724 2.31e-06 ***
agegr35-44  -3.0498     0.3411  -8.940  < 2e-16 ***
agegr45-54  -1.2894     0.1665  -7.743 9.70e-15 ***
agegr55-64  -0.7813     0.1385  -5.641 1.69e-08 ***
agegr65-74  -0.6561     0.1662  -3.948 7.88e-05 ***
agegr>=75   -0.8690     0.3304  -2.630  0.00854 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4.8319e+02  on 6  degrees of freedom
Residual deviance: 1.3989e-14  on 0  degrees of freedom
AIC: 38.718

Number of Fisher Scoring iterations: 4
Expect <- unlist(fit.1B$fitted.values* (lldt.fit1B[,"contr"] + lldt.fit1B[,"case"])) 
ExpectB <- unlist((1-fit.1B$fitted.values)* (lldt.fit1B[,"contr"] + lldt.fit1B[,"case"])) 
sum((Expect - lldt.fit1B[,"case"])^2/Expect) + 
  sum((ExpectB - lldt.fit1B[,"contr"])^2/ExpectB)
[1] 1.106186e-23

Now, let us fit remaining models. Note that for model 3 below, we have some issues with putting correct category as baseline that need to be addressed carefully.

#fit.2 <- glm(cbind(case, contr)~ -1 + agegr + alcgr2, family = binomial, data = lldt)
fit.2 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg2, family = binomial, data = lldt2)
summary(fit.2)

Call:
glm(formula = cbind(case, contr) ~ -1 + agegr + AlcCateg2, family = binomial, 
    data = lldt2)

Deviance Residuals: 
       1         2         3         4         5         6         7         8  
-1.16422   0.97386   0.03277  -0.03839  -0.14465   0.17015  -0.30323   0.38863  
       9        10        11        12  
 0.94443  -1.55898  -0.68418   2.12217  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
agegr25-34    -5.0493     1.0093  -5.003 5.65e-07 ***
agegr35-44    -3.5053     0.3562  -9.842  < 2e-16 ***
agegr45-54    -1.8490     0.1935  -9.557  < 2e-16 ***
agegr55-64    -1.3432     0.1648  -8.151 3.61e-16 ***
agegr65-74    -1.0833     0.1840  -5.889 3.89e-09 ***
agegr>=75     -1.0900     0.3440  -3.169  0.00153 ** 
AlcCateg280+   1.6541     0.1892   8.742  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 572.341  on 12  degrees of freedom
Residual deviance:  10.893  on  5  degrees of freedom
AIC: 65.242

Number of Fisher Scoring iterations: 5
Expect2 <- unlist(fit.2$fitted.values* (lldt2[,"contr"] + lldt2[,"case"])) 
Expect2B <- unlist((1-fit.2$fitted.values)* (lldt2[,"contr"] + lldt2[,"case"])) 
sum((Expect2 - lldt2[,"case"])^2/Expect2) + 
  sum((Expect2B - lldt2[,"contr"])^2/Expect2B)
[1] 9.162488
# from Table 6.3 manually
stRes <-  c(1.19, -0.82, -0.06, 0.05, 0.14, -0.12, 0.46, -0.35, -1.63, 0.98, 1.68, -0.66)
sum(stRes^2)  
[1] 9.338
# NOTE: direct fit will not work here, because the wrong category will be taken as baseline
# fit.3 <- glm(cbind(case, contr)~ -1 + agegr + alcgr2 + I(as.integer(agegr)-3.5):alcgr2, family = binomial, data = lldt)
# let us do it explicitly
lldt.fit3 <- lldt2 %>%
  mutate(AlcAgeInt = (as.integer(agegr)-3.5)*(as.numeric(AlcCateg2)-1))

fit.3 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg2 + AlcAgeInt, 
             family = binomial, data = lldt.fit3)
summary(fit.3)

Call:
glm(formula = cbind(case, contr) ~ -1 + agegr + AlcCateg2 + AlcAgeInt, 
    family = binomial, data = lldt.fit3)

Deviance Residuals: 
       1         2         3         4         5         6         7         8  
-1.09152   0.80776   0.25962  -0.28529   0.05078  -0.05897  -0.34034   0.43652  
       9        10        11        12  
 0.75231  -1.24076  -0.78422   2.32798  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
agegr25-34    -5.1786     1.0353  -5.002 5.67e-07 ***
agegr35-44    -3.6105     0.3966  -9.103  < 2e-16 ***
agegr45-54    -1.8946     0.2073  -9.137  < 2e-16 ***
agegr55-64    -1.3362     0.1647  -8.111 5.03e-16 ***
agegr65-74    -1.0443     0.1912  -5.462 4.71e-08 ***
agegr>=75     -1.0524     0.3448  -3.052  0.00227 ** 
AlcCateg280+   1.6989     0.2004   8.479  < 2e-16 ***
AlcAgeInt     -0.1268     0.1889  -0.671  0.50196    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 572.341  on 12  degrees of freedom
Residual deviance:  10.445  on  4  degrees of freedom
AIC: 66.793

Number of Fisher Scoring iterations: 5
Expect3 <- unlist(fit.3$fitted.values* (lldt.fit3[,"contr"] + lldt.fit3[,"case"])) 
Expect3B <- unlist((1-fit.2$fitted.values)* (lldt.fit3[,"contr"] + lldt.fit3[,"case"])) 
sum((Expect3  - lldt.fit3[,"case"])^2/Expect3 ) + 
  sum((Expect3B - lldt.fit3[,"contr"])^2/Expect3B)
[1] 8.69365
# Note: see the typo in the last coefficient (wrong sign in the book)

Let us test the models and let us stay with model 2 for time being and look at the prediction and compare it with fitted values. We can see that we have got reasonable fit, except maybe for fifth age category, so we would use this model to draw results for the analysis (see BD1 for the interpretation of results).

anova(fit.2, fit.1, test = "Chisq")
Analysis of Deviance Table

Model 1: cbind(case, contr) ~ -1 + agegr + AlcCateg2
Model 2: cbind(case, contr) ~ -1 + agegr
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1         5     10.893                          
2         6     89.148 -1  -78.255 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit.3, fit.2, test = "Chisq")
Analysis of Deviance Table

Model 1: cbind(case, contr) ~ -1 + agegr + AlcCateg2 + AlcAgeInt
Model 2: cbind(case, contr) ~ -1 + agegr + AlcCateg2
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1         4     10.445                     
2         5     10.893 -1 -0.44819   0.5032
checkModelFit <- data.frame(ObservedCases = lldt.fit3[,"case"], PredictedCases = round(fit.2$fitted.values* (lldt.fit3[,"contr"] + lldt.fit3[,"case"]), 3) ) %>%
  mutate(Difference = ObservedCases - PredictedCases)
checkModelFit
   ObservedCases PredictedCases Difference
1              0          0.676     -0.676
2              1          0.324      0.676
3              5          4.928      0.072
4              4          4.072     -0.072
5             21         21.623     -0.623
6             25         24.377      0.623
7             34         35.602     -1.602
8             42         40.398      1.602
9             36         31.359      4.641
10            19         23.641     -4.641
11             8          9.813     -1.813
12             5          3.187      1.813

Conduct a joint grouped analysis of alcohol and tobacco risk adjusted for age

Let us recreate results Table 6.5. First we prepare the data.

lldt3 <- iev2 %>%
  group_by(agegr, AlcCateg, TobCateg) %>%
  summarize(cases = sum(case), contr = sum(1-case)) %>%
  # if you put summarize(case = sum(case), contr = sum(1-case)), the input in
  # sum(1-case) will not come from original iev2, but from newly created case variable
  mutate(case = cases) %>%
  select(-cases)
lldt3
# A tibble: 88 × 5
# Groups:   agegr, AlcCateg [24]
   agegr AlcCateg TobCateg contr  case
   <fct> <fct>    <fct>    <dbl> <int>
 1 25-34 0-39     0-9         41     0
 2 25-34 0-39     10-19       10     0
 3 25-34 0-39     20-29        6     0
 4 25-34 0-39     30+          5     0
 5 25-34 40-79    0-9         27     0
 6 25-34 40-79    10-19        7     0
 7 25-34 40-79    20-29        3     0
 8 25-34 40-79    30+          7     0
 9 25-34 80-119   0-9          2     0
10 25-34 80-119   10-19        1     0
# … with 78 more rows

Let us fit the models listed in Table 6.5

fit3.1 <- glm(cbind(case, contr)~ -1 + agegr, family = binomial, data = lldt3)
round(fit3.1$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74  agegr>=75 
     -4.74      -3.05      -1.29      -0.78      -0.66      -0.87 
fit3.2 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg, family = binomial, data = lldt3)
round(fit3.2$coeff, 2)
    agegr25-34     agegr35-44     agegr45-54     agegr55-64     agegr65-74 
         -6.14          -4.52          -2.72          -2.20          -1.79 
     agegr>=75  AlcCateg40-79 AlcCateg80-119   AlcCateg120+ 
         -1.72           1.44           1.99           3.68 
fit3.3 <- glm(cbind(case, contr)~ -1 + agegr + TobCateg , family = binomial, data = lldt3)
round(fit3.3$coeff, 2)
   agegr25-34    agegr35-44    agegr45-54    agegr55-64    agegr65-74 
        -5.36         -3.57         -1.78         -1.24         -0.94 
    agegr>=75 TobCateg10-19 TobCateg20-29   TobCateg30+ 
        -1.27          0.61          0.67          1.74 
fit3.4 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg + TobCateg, family = binomial, data = lldt3)
round(fit3.4$coeff, 2)
    agegr25-34     agegr35-44     agegr45-54     agegr55-64     agegr65-74 
         -6.89          -4.92          -3.12          -2.56          -2.00 
     agegr>=75  AlcCateg40-79 AlcCateg80-119   AlcCateg120+  TobCateg10-19 
         -2.07           1.44           1.97           3.60           0.44 
 TobCateg20-29    TobCateg30+ 
          0.52           1.64 
# and expand
round(exp(fit3.4$coeff),2)
    agegr25-34     agegr35-44     agegr45-54     agegr55-64     agegr65-74 
          0.00           0.01           0.04           0.08           0.14 
     agegr>=75  AlcCateg40-79 AlcCateg80-119   AlcCateg120+  TobCateg10-19 
          0.13           4.21           7.15          36.71           1.56 
 TobCateg20-29    TobCateg30+ 
          1.67           5.18 

As expected, we can see that both alcohol and tobacco are important for the response and that they have both positive effect (in sense of response occurence, so negative in terms of the health risks).

We continue with model fitted to obtain Table 6.8.

lldt3.5 <- iev2 %>%
  mutate(AlcCateg3 = cut(alctot, breaks=c(-Inf, 24.5,49.5, 74.5, 99, Inf), 
                        labels = c("0-24","24-49","50-74", "75-99", "100+"))) %>%
  mutate(TobCateg2 = fct_recode(tobgr, "0" = "None", "1-4" = "1-4 g/day", "5-14" = "5-9 g/day",
                               "5-14" = "10-14 g/day","15-29" = "15-19 g/day",
                               "15-29" = "20-29 g/day",
                               "30+" = "30-39 g/day", "30+" = "40-49 g/day", 
                               "30+" = ">=50 g/day")) %>%
  group_by(agegr, AlcCateg3, TobCateg2) %>%
  summarize(cases = sum(case), contr = sum(1-case)) %>%
  # if you put summarize(case = sum(case), contr = sum(1-case)), the input in
  # sum(1-case) will not come from original iev2, but from newly created case variable
  mutate(case = cases) %>%
  select(-cases)

# Note slight computational differences from Table 6.8
fit3.5 <- glm(cbind(case, contr)~ -1 + agegr + AlcCateg3 + TobCateg2, family = binomial, data = lldt3.5)

beta.alc <- c(1,exp(coef(fit3.5)[7:10]))
beta.tob <- c(1,exp(coef(fit3.5)[11:14]))

round(beta.alc %*% t(beta.tob), 1)
          TobCateg21-4 TobCateg25-14 TobCateg215-29 TobCateg230+
[1,]  1.0          4.4           7.3            5.8         19.4
[2,]  1.6          7.1          11.6            9.3         30.8
[3,]  5.0         22.3          36.4           29.3         97.1
[4,]  6.6         29.5          48.2           38.8        128.4
[5,] 17.6         78.0         127.6          102.7        340.2

Again, we see that the alcohol and tobacco has strong positive effect. However, we also notice that the there is rather strong change from tobacco level zero and 1-4g level. Hence, the previous categorisation will not work very well, since it is pulling two distinct categories. It also give us an idea that linear relationship won’t perhaps work for next section.

Conduct a joint ungrouped analysis of alcohol and tobacco risk adjusted for age

Based on findings from previous section, we will try to include tobacco and alcohol as well in some non-linear fashion (note: we still use linear models in sense of linearity in parameters). We reconstruct results of Table 6.12. Note different result for model 6.

lldt4 <- iev2 %>%
  mutate(AlcCont = alctot) %>%
  mutate(TobCont = as.numeric(as.character(fct_recode(tobgr, "0" = "None", "2.5" = "1-4 g/day", "7.5" = "5-9 g/day",
                               "12.5" = "10-14 g/day","17.5" = "15-19 g/day",
                               "25" = "20-29 g/day",
                               "35" = "30-39 g/day", "45" = "40-49 g/day", 
                               "60" = ">=50 g/day")))) %>%
  mutate(AlcContLog = log(AlcCont+1)) %>%
  mutate(TobContLog = log(TobCont+1)) %>%
  mutate(AlcCont2 = AlcCont^2) %>%
  mutate(TobCont2 = TobCont^2)


fit4.1 <- glm(case ~ -1 + agegr + AlcCont + TobCont, family = binomial, data = lldt4)
round(fit3.2$coeff, 2)
    agegr25-34     agegr35-44     agegr45-54     agegr55-64     agegr65-74 
         -6.14          -4.52          -2.72          -2.20          -1.79 
     agegr>=75  AlcCateg40-79 AlcCateg80-119   AlcCateg120+ 
         -1.72           1.44           1.99           3.68 
fit4.2 <- glm(case ~ -1 + agegr + TobCont + AlcContLog, family = binomial, data = lldt4)
round(fit4.2$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74  agegr>=75    TobCont 
     -9.00      -7.29      -5.59      -5.05      -4.61      -4.59       0.04 
AlcContLog 
      0.93 
fit4.3 <- glm(case ~ -1 + agegr + AlcCont + TobContLog, family = binomial, data = lldt4)
round(fit4.3$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74  agegr>=75    AlcCont 
     -7.55      -5.97      -4.20      -3.68      -3.19      -3.08       0.03 
TobContLog 
      0.54 
fit4.4 <- glm(case ~ -1 + agegr + AlcContLog + TobContLog, family = binomial, data = lldt4)
round(fit4.4$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74  agegr>=75 AlcContLog 
     -9.32      -7.77      -6.08      -5.56      -5.17      -5.07       0.89 
TobContLog 
      0.56 
fit4.5 <- glm(case ~ -1 + agegr + AlcCont + TobCont + TobCont2, family = binomial, data = lldt4)
round(fit4.5$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74  agegr>=75    AlcCont 
     -7.18      -5.52      -3.73      -3.19      -2.67      -2.61       0.03 
   TobCont   TobCont2 
      0.06       0.00 
# mistake in book it seems:
fit4.6 <- glm(case ~ -1 + agegr + AlcCont + TobContLog + AlcCont2, family = binomial, data = lldt4)
round(fit4.6$coeff, 2)
agegr25-34 agegr35-44 agegr45-54 agegr55-64 agegr65-74  agegr>=75    AlcCont 
     -7.44      -5.84      -4.05      -3.54      -3.05      -2.96       0.02 
TobContLog   AlcCont2 
      0.54       0.00 
fit4.7 <- glm(case ~ -1 + agegr + AlcCont + TobContLog + I(TobContLog^2), family = binomial, data = lldt4)
round(fit3.2$coeff, 2)
    agegr25-34     agegr35-44     agegr45-54     agegr55-64     agegr65-74 
         -6.14          -4.52          -2.72          -2.20          -1.79 
     agegr>=75  AlcCateg40-79 AlcCateg80-119   AlcCateg120+ 
         -1.72           1.44           1.99           3.68 

Now how to decide which model is the best. One option would be looking at deviance metric or base the model selection on sequential testing (stepwise) of anova models. Alternatively, information criterion can be used. Based on AIC, model 7 will be selected as just slightly better than model 3 and 6. Note that ordering of AIC is not the same as of Chi squared statistics.

In any case, it should not be used in automated mode only. The model selection needs to include careful consideration of both statistical as well as applied aspects (practical relevance of the effects). Hence, the manual search for the correct model is a must.

AIC(fit4.1, fit4.2, fit4.3, fit4.4, fit4.5, fit4.6, fit4.7)
       df      AIC
fit4.1  8 711.4680
fit4.2  8 765.5021
fit4.3  8 699.2473
fit4.4  8 750.8665
fit4.5  9 711.9197
fit4.6  9 700.8066
fit4.7  9 699.1563