rm(list = ls()) # clean-up workspace
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(faraway)
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.5.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] faraway_1.0.8   lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
##  [5] dplyr_1.1.3     purrr_1.0.2     readr_2.1.4     tidyr_1.3.0    
##  [9] tibble_3.2.1    ggplot2_3.4.3   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.6       utf8_1.2.3       generics_0.1.3   stringi_1.7.12  
##  [5] lattice_0.21-8   lme4_1.1-34      hms_1.1.3        digest_0.6.33   
##  [9] magrittr_2.0.3   evaluate_0.20    grid_4.3.0       timechange_0.2.0
## [13] fastmap_1.1.1    jsonlite_1.8.4   Matrix_1.5-4     fansi_1.0.4     
## [17] scales_1.2.1     jquerylib_0.1.4  cli_3.6.1        rlang_1.1.1     
## [21] munsell_0.5.0    splines_4.3.0    withr_2.5.0      cachem_1.0.8    
## [25] yaml_2.3.7       tools_4.3.0      tzdb_0.4.0       nloptr_2.0.3    
## [29] minqa_1.2.6      colorspace_2.1-0 boot_1.3-28.1    vctrs_0.6.3     
## [33] R6_2.5.1         lifecycle_1.0.3  MASS_7.3-60      pkgconfig_2.0.3 
## [37] pillar_1.9.0     bslib_0.4.2      gtable_0.3.4     Rcpp_1.0.11     
## [41] glue_1.6.2       xfun_0.39        tidyselect_1.2.0 rstudioapi_0.14 
## [45] knitr_1.42       htmltools_0.5.5  nlme_3.1-162     rmarkdown_2.21  
## [49] compiler_4.3.0

Announcement

Acknowledgement

Dr. Hua Zhou’s slides

Heart disease example

The dataframe wcgs in faraway package contains data from the Western Collaborative Group Study.

wcgs %>% head(10)
##    age height weight sdp dbp chol behave cigs dibep chd  typechd timechd
## 1   49     73    150 110  76  225     A2   25     A  no     none    1664
## 2   42     70    160 154  84  177     A2   20     A  no     none    3071
## 3   42     69    160 110  78  181     B3    0     B  no     none    3071
## 4   41     68    152 124  78  132     B4   20     B  no     none    3064
## 5   59     70    150 144  86  255     B3   20     B yes infdeath    1885
## 6   44     72    204 150  90  182     B4    0     B  no     none    3102
## 7   44     72    164 130  84  155     B4    0     B  no     none    3074
## 8   40     71    150 138  60  140     A2    0     A  no     none    3071
## 9   43     72    190 146  76  149     B3   25     B  no     none    3064
## 10  42     70    175 132  90  325     A2    0     A  no     none    1032
##      arcus
## 1   absent
## 2  present
## 3   absent
## 4   absent
## 5  present
## 6   absent
## 7   absent
## 8   absent
## 9   absent
## 10 present

We convert the dataframe into a tibble for compatibility with tidyverse.

wcgs <- wcgs %>% 
  as_tibble() %>%
  print(width = Inf)
## # A tibble: 3,154 × 13
##      age height weight   sdp   dbp  chol behave  cigs dibep chd   typechd 
##    <int>  <int>  <int> <int> <int> <int> <fct>  <int> <fct> <fct> <fct>   
##  1    49     73    150   110    76   225 A2        25 A     no    none    
##  2    42     70    160   154    84   177 A2        20 A     no    none    
##  3    42     69    160   110    78   181 B3         0 B     no    none    
##  4    41     68    152   124    78   132 B4        20 B     no    none    
##  5    59     70    150   144    86   255 B3        20 B     yes   infdeath
##  6    44     72    204   150    90   182 B4         0 B     no    none    
##  7    44     72    164   130    84   155 B4         0 B     no    none    
##  8    40     71    150   138    60   140 A2         0 A     no    none    
##  9    43     72    190   146    76   149 B3        25 B     no    none    
## 10    42     70    175   132    90   325 A2         0 A     no    none    
##    timechd arcus  
##      <int> <fct>  
##  1    1664 absent 
##  2    3071 present
##  3    3071 absent 
##  4    3064 absent 
##  5    1885 present
##  6    3102 absent 
##  7    3074 absent 
##  8    3071 absent 
##  9    3064 absent 
## 10    1032 present
## # ℹ 3,144 more rows

For now, we focus just on variables
- chd, whether the person develops coronary heard disease or not,
- height, height of the person in inches,
- cigs, number of cigarettes smoked per day.

wcgs %>%
  select(chd, height, cigs) %>%
  summary()
##   chd           height           cigs     
##  no :2897   Min.   :60.00   Min.   : 0.0  
##  yes: 257   1st Qu.:68.00   1st Qu.: 0.0  
##             Median :70.00   Median : 0.0  
##             Mean   :69.78   Mean   :11.6  
##             3rd Qu.:72.00   3rd Qu.:20.0  
##             Max.   :78.00   Max.   :99.0

We use side-by-side boxplots to summarize the qulitative variable chd and quantitative variables height and cigs

ggplot(data = wcgs) +
  geom_boxplot(mapping = aes(x = chd, y = height))

and number of cigarretes smoked per day

ggplot(data = wcgs) +
  geom_boxplot(mapping = aes(x = chd, y = cigs))

It seems more cigarettes is associated with heard disease, but not height. How can we formally analyze this? If we use linear regression (straight line) for the anlaysis, the line will eventually extends beyond the [0, 1] range, making interpretation hard.

ggplot(data = wcgs) +
  geom_point(mapping = aes(x = cigs, y = chd))

Logistic regression

ggplot(data = tibble(x = 0), mapping = aes(x = x)) + # null data
  stat_function(fun = ilogit) + # ilogit is from faraway
  xlim(-6, 6) + 
  labs(x = expression(eta), y = "p", title = "Logistic function (inverse link)")

# curve(ilogit(x), -6, 6, xlab = expression(eta), ylab = "p")

Fitting logistic regression

(lmod <- glm(chd ~ height + cigs, family = binomial, wcgs))
## 
## Call:  glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Coefficients:
## (Intercept)       height         cigs  
##    -4.50161      0.02521      0.02313  
## 
## Degrees of Freedom: 3153 Total (i.e. Null);  3151 Residual
## Null Deviance:       1781 
## Residual Deviance: 1749  AIC: 1755

Summary of the result:

(lmod_sm <- summary(lmod))
## 
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.50161    1.84186  -2.444   0.0145 *  
## height       0.02521    0.02633   0.957   0.3383    
## cigs         0.02313    0.00404   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1781.2  on 3153  degrees of freedom
## Residual deviance: 1749.0  on 3151  degrees of freedom
## AIC: 1755
## 
## Number of Fisher Scoring iterations: 5

Interpretation

# dataframe
wcgs %>%
  select(chd, height, cigs) %>%
  head(10)
## # A tibble: 10 × 3
##    chd   height  cigs
##    <fct>  <int> <int>
##  1 no        73    25
##  2 no        70    20
##  3 no        69     0
##  4 no        68    20
##  5 yes       70    20
##  6 no        72     0
##  7 no        72     0
##  8 no        71     0
##  9 no        72    25
## 10 no        70     0
# response
lmod$y %>% head(10)
##  1  2  3  4  5  6  7  8  9 10 
##  0  0  0  0  1  0  0  0  0  0
# predictors
model.matrix(lmod) %>% head(10)
##    (Intercept) height cigs
## 1            1     73   25
## 2            1     70   20
## 3            1     69    0
## 4            1     68   20
## 5            1     70   20
## 6            1     72    0
## 7            1     72    0
## 8            1     71    0
## 9            1     72   25
## 10           1     70    0
library(gtsummary)
lmod %>%
  tbl_regression() %>%
  bold_labels() %>%
  bold_p(t = 0.05)
Characteristic log(OR)1 95% CI1 p-value
height 0.03 -0.03, 0.08 0.3
cigs 0.02 0.02, 0.03 <0.001
1 OR = Odds Ratio, CI = Confidence Interval

Summarize the odds:

lmod %>%
  tbl_regression(intercept = TRUE, exponentiate = TRUE) %>%
  bold_labels() %>%
  bold_p(t = 0.05)
Characteristic OR1 95% CI1 p-value
(Intercept) 0.01 0.00, 0.40 0.015
height 1.03 0.97, 1.08 0.3
cigs 1.02 1.02, 1.03 <0.001
1 OR = Odds Ratio, CI = Confidence Interval
# same as lmod$coefficients
# coef(lmod) is a named numeric vector
(beta_hat <- unname(coef(lmod)))
## [1] -4.50161397  0.02520779  0.02312740
exp(beta_hat)
## [1] 0.01109108 1.02552819 1.02339691

How to interpret the effect of a pack a day (20 cigarettes) on heart disease?

exp(beta_hat[3] * 20)
## [1] 1.588115
(p1 <- ilogit(sum(beta_hat * c(1, 68, 20))))
## [1] 0.08907868

and

(p2 <- ilogit(sum(beta_hat * c(1, 68, 0))))
## [1] 0.05800425

Then the relative risk is

p1 / p2
## [1] 1.535727

Inference (analysis of deviance)

lmod$null.deviance - lmod$deviance
## [1] 32.19451

giving p-value

pchisq(lmod$null.deviance - lmod$deviance, 2, lower.tail = FALSE)
## [1] 1.02106e-07

Therefore our model gives a significantly better fit than the null (intercept-only) model.

# fit a model without height
lmodc <- glm(chd ~ cigs, family = binomial, wcgs)
anova(lmodc, lmod, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: chd ~ cigs
## Model 2: chd ~ height + cigs
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      3152       1750                     
## 2      3151       1749  1  0.92025   0.3374
drop1(lmod, test = "Chi")
## Single term deletions
## 
## Model:
## chd ~ height + cigs
##        Df Deviance    AIC     LRT Pr(>Chi)    
## <none>      1749.0 1755.0                     
## height  1   1750.0 1754.0  0.9202   0.3374    
## cigs    1   1780.1 1784.1 31.0695 2.49e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmod_sm$coefficients
##                Estimate  Std. Error   z value     Pr(>|z|)
## (Intercept) -4.50161397 1.841862743 -2.444055 1.452321e-02
## height       0.02520779 0.026327385  0.957474 3.383281e-01
## cigs         0.02312740 0.004040183  5.724344 1.038339e-08

In general, deviance-based test is preferred over the z-test.

Confidence intervals

tibble(
  `coef`  = beta_hat,
  `2.5%`  = beta_hat - 1.96 * lmod_sm$coefficients[, 2],
  `97.5%` = beta_hat + 1.96 * lmod_sm$coefficients[, 2])
## # A tibble: 3 × 3
##      coef  `2.5%` `97.5%`
##     <dbl>   <dbl>   <dbl>
## 1 -4.50   -8.11   -0.892 
## 2  0.0252 -0.0264  0.0768
## 3  0.0231  0.0152  0.0310

or from profile-likelihood

confint(lmod)
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept) -8.13475465 -0.91297018
## height      -0.02619902  0.07702835
## cigs         0.01514949  0.03100534

Diagnostics

linpred  <- predict(lmod)
linpred %>% head(10)
##         1         2         3         4         5         6         7         8 
## -2.083261 -2.274521 -2.762277 -2.324936 -2.274521 -2.686653 -2.686653 -2.711861 
##         9        10 
## -2.108468 -2.737069

The second on the scale of response, \(p = \text{logit}^{-1}(\eta)\),

predprob <- predict(lmod, type = "response")
predprob %>% head(10)
##          1          2          3          4          5          6          7 
## 0.11073449 0.09325523 0.05939705 0.08907868 0.09325523 0.06376553 0.06376553 
##          8          9         10 
## 0.06227708 0.10827647 0.06082112
# same as residuals(lmod, type = "response")
rawres <- lmod$y - predprob

The plot of raw residuals against the fitted values is not very informative.

wcgs %>%
  mutate(rawres = rawres, linpred = linpred) %>%
  ggplot() +
  geom_point(mapping = aes(x = linpred, y = rawres)) +
  labs(x = "Linear predictor", y = "Raw residuals")

We do not expect the raw residuals to have equal variance because the binary variance is \(p(1 - p)\).

devres <- residuals(lmod)
devres %>% head(10)
##          1          2          3          4          5          6          7 
## -0.4844779 -0.4424800 -0.3499548 -0.4319693  2.1782631 -0.3630133 -0.3630133 
##          8          9         10 
## -0.3586106 -0.4787466 -0.3542579

Sanity check:

sqrt(-2 * (lmod$y * log(predprob) + (1 - lmod$y) * log(1 - predprob))) %>%
  head(10)
##         1         2         3         4         5         6         7         8 
## 0.4844779 0.4424800 0.3499548 0.4319693 2.1782631 0.3630133 0.3630133 0.3586106 
##         9        10 
## 0.4787466 0.3542579

The plot of deviance residuals against the fitted values.

wcgs %>%
  mutate(devres = devres, linpred = linpred) %>%
  ggplot() +
  geom_point(mapping = aes(x = linpred, y = devres)) +
  labs(x = "Linear predictor", y = "Deviance residuals")

Again we see the residuals are clustered into two lines: the upper one corresponding to \(y_i=1\) and the lower one to \(y_i=0\). We can improve this plot by binning: divide the range of linear predictor into 100 bins of roughly equal points and plot average residual against average linear predictors per bin.

wcgs %>%
  mutate(devres = devres, linpred = linpred) %>% 
  group_by(cut(linpred, breaks = unique(quantile(linpred, (1:100)/101)))) %>%
  summarize(devres = mean(devres), 
            linpred = mean(linpred)) %>%
  ggplot() +
  geom_point(mapping = aes(x = linpred, y = devres)) + 
  labs(x = "Linear predictor", y = "Binned deviance residual")

Question: is there a concern that the deviance residuals are not centered around 0?

qqnorm(devres)

halfnorm(hatvalues(lmod))

We see two high leverage cases, who smoke an unusual number of cigarettes per day!

wcgs %>%
  slice(c(2527, 2695)) %>%
  print(width = Inf)
## # A tibble: 2 × 13
##     age height weight   sdp   dbp  chol behave  cigs dibep chd   typechd timechd
##   <int>  <int>  <int> <int> <int> <int> <fct>  <int> <fct> <fct> <fct>     <int>
## 1    52     71    168   120    80   251 A1        99 A     no    none       2956
## 2    47     64    158   116    76   206 A1        80 A     no    none       2114
##   arcus  
##   <fct>  
## 1 present
## 2 absent
halfnorm(cooks.distance(lmod))

wcgs %>%
  slice(c(953, 2082)) %>%
  print(width = Inf)
## # A tibble: 2 × 13
##     age height weight   sdp   dbp  chol behave  cigs dibep chd   typechd timechd
##   <int>  <int>  <int> <int> <int> <int> <fct>  <int> <fct> <fct> <fct>     <int>
## 1    57     63    155   128    88   196 A2         0 A     yes   silent     2349
## 2    49     77    210   138    86   235 B3         0 B     yes   angina     3048
##   arcus 
##   <fct> 
## 1 absent
## 2 absent

Goodness of fit

Hosmer-Lemeshow statistic

  • Intuitively if we divide observations into \(J\) bins according to linear predictors \(\eta\), then \(y_j / n_j\) (observed proportion of “successes”) for \(j\)-th bin should be close to the average predicted probabilities in that bin.
wcgs_binned <- wcgs %>%
  mutate(predprob = predict(lmod, type = "response"), 
         linpred  = predict(lmod, type = "link"),
         bin      = cut(linpred, breaks = unique(quantile(linpred, (1:100) / 101)))) %>%
  group_by(bin) %>%
  summarize(y       = sum(ifelse(chd == "yes", 1, 0)), 
            avgpred = mean(predprob), 
            count   = n()) %>%
  mutate(se_fit = sqrt(avgpred * (1 - avgpred) / count))
wcgs_binned %>%
  ggplot(mapping = aes(x = avgpred, y = y / count)) + 
  geom_point() +
  geom_linerange(mapping = aes(ymin = y / count - 2 * se_fit,
                               ymax = y / count + 2 * se_fit), alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1) +
  labs(x = "Predicted probability", y = "Observed proportion")

  • The Hosmer-Lemeshow test formalizes this idea by testing the statistic \[ X_{\text{HL}}^2 = \sum_{j=1}^J \frac{(y_i - m_j \widehat{p}_j)^2}{m_j \widehat{p}_j (1 - \widehat{p}_j)} \] against the \(\chi^2\) distribution with \(J-1\) degrees of freedom.
# Hosmer-Lemeshow test statistic
(hlstat <- with(wcgs_binned, sum((y - count * avgpred)^2 / (count * avgpred * (1 - avgpred)))))
## [1] 64.87001
# J
nrow(wcgs_binned)
## [1] 52
# p-value
pchisq(hlstat, nrow(wcgs_binned) - 1, lower.tail = FALSE)
## [1] 0.09172918

We see a moderate p-value, which indicates no lack of fit.

ROC curve

  • Logistic regression is often used as a tool for classification.

  • If we choose a threshold, say 0.2, then the predicted probabilities give a classification rule \[ \text{case $i$ is a} \begin{cases} \text{"success"} & \text{if } \widehat{p}_i \ge 0.2 \\ \text{"failure"} & \text{if } \widehat{p}_i < 0.2 \end{cases}. \]

wcgs %>% 
  mutate(predprob = predict(lmod, type = "response")) %>% 
  mutate(predout = ifelse(predprob >= 0.2, "yes", "no")) %>%
  xtabs(~ chd + predout, data = .)
##      predout
## chd     no  yes
##   no  2886   11
##   yes  254    3
  • With this classification rule, we see the error rate is about
(11 + 254) / (2886 + 254 + 11 + 3)
## [1] 0.08402029

The sensitivity is \[ \frac{\text{TP}}{\text{TP + FN}} = \frac{3}{257} = 1.17\% \] and the specificity is \[ \frac{\text{TN}}{\text{FP + TN}} = \frac{2886}{11 + 2886} = 99.62\% \]

  • If we lower the threshold, then we increase the sensitivity but decrease the specificity. If we plot sensitivity against 1-specificity by varying the threshold, then we get the receiver operating characteristic (ROC) curve.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
lmod_roc <- roc(chd ~ predprob, wcgs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
ggroc(lmod_roc)

The area under the curve (AUC) is a measure of the overal classification of the classifier. Larger AUC means better classification performance.

auc(lmod_roc)
## Area under the curve: 0.6007

Model selection by AIC

wcgs <- wcgs %>%
  # 1 in = 0.0254 m, 1 lb = 0.4536 kg
  mutate(bmi = 703 * weight / height)
biglm <- glm(chd ~ age + height + weight + bmi + sdp + dbp + chol + dibep + cigs + arcus,
             family = binomial, data = wcgs)

and then do sequential backward search using the step function

step(biglm, trace = TRUE, direction = "both") %>%
  tbl_regression() %>%
  bold_labels()
## Start:  AIC=1591.05
## chd ~ age + height + weight + bmi + sdp + dbp + chol + dibep + 
##     cigs + arcus
## 
##          Df Deviance    AIC
## - dbp     1   1569.1 1589.1
## - weight  1   1569.3 1589.3
## - bmi     1   1569.5 1589.5
## - height  1   1569.5 1589.5
## <none>        1569.0 1591.0
## - arcus   1   1571.1 1591.1
## - sdp     1   1576.8 1596.8
## - dibep   1   1590.4 1610.4
## - cigs    1   1592.0 1612.0
## - age     1   1593.7 1613.7
## - chol    1   1619.8 1639.8
## 
## Step:  AIC=1589.06
## chd ~ age + height + weight + bmi + sdp + chol + dibep + cigs + 
##     arcus
## 
##          Df Deviance    AIC
## - weight  1   1569.4 1587.4
## - bmi     1   1569.5 1587.5
## - height  1   1569.5 1587.5
## <none>        1569.1 1589.1
## - arcus   1   1571.2 1589.2
## + dbp     1   1569.0 1591.0
## - sdp     1   1586.2 1604.2
## - dibep   1   1590.4 1608.4
## - cigs    1   1592.5 1610.5
## - age     1   1593.7 1611.7
## - chol    1   1619.9 1637.9
## 
## Step:  AIC=1587.36
## chd ~ age + height + bmi + sdp + chol + dibep + cigs + arcus
## 
##          Df Deviance    AIC
## - height  1   1570.3 1586.3
## <none>        1569.4 1587.4
## - arcus   1   1571.5 1587.5
## + weight  1   1569.1 1589.1
## + dbp     1   1569.3 1589.3
## - bmi     1   1574.4 1590.4
## - sdp     1   1586.7 1602.7
## - dibep   1   1590.7 1606.7
## - cigs    1   1592.8 1608.8
## - age     1   1594.0 1610.0
## - chol    1   1620.0 1636.0
## 
## Step:  AIC=1586.32
## chd ~ age + bmi + sdp + chol + dibep + cigs + arcus
## 
##          Df Deviance    AIC
## <none>        1570.3 1586.3
## - arcus   1   1572.6 1586.6
## + height  1   1569.4 1587.4
## + weight  1   1569.5 1587.5
## + dbp     1   1570.3 1588.3
## - bmi     1   1577.3 1591.3
## - sdp     1   1587.2 1601.2
## - dibep   1   1591.9 1605.9
## - age     1   1594.2 1608.2
## - cigs    1   1594.6 1608.6
## - chol    1   1620.0 1634.0
Characteristic log(OR)1 95% CI1 p-value
age 0.06 0.04, 0.08 <0.001
bmi 0.00 0.00, 0.00 0.008
sdp 0.02 0.01, 0.03 <0.001
chol 0.01 0.01, 0.01 <0.001
dibep


    A
    B -0.66 -0.95, -0.38 <0.001
cigs 0.02 0.01, 0.03 <0.001
arcus


    absent
    present 0.22 -0.06, 0.50 0.13
1 OR = Odds Ratio, CI = Confidence Interval

Model selection by lasso

(wcgs <- wcgs %>% 
  select(-c(behave, typechd, timechd)) %>%
  drop_na())
## # A tibble: 3,140 × 11
##      age height weight   sdp   dbp  chol  cigs dibep chd   arcus     bmi
##    <int>  <int>  <int> <int> <int> <int> <int> <fct> <fct> <fct>   <dbl>
##  1    49     73    150   110    76   225    25 A     no    absent  1445.
##  2    42     70    160   154    84   177    20 A     no    present 1607.
##  3    42     69    160   110    78   181     0 B     no    absent  1630.
##  4    41     68    152   124    78   132    20 B     no    absent  1571.
##  5    59     70    150   144    86   255    20 B     yes   present 1506.
##  6    44     72    204   150    90   182     0 B     no    absent  1992.
##  7    44     72    164   130    84   155     0 B     no    absent  1601.
##  8    40     71    150   138    60   140     0 A     no    absent  1485.
##  9    43     72    190   146    76   149    25 B     no    absent  1855.
## 10    42     70    175   132    90   325     0 A     no    present 1758.
## # ℹ 3,130 more rows

split data into 80% training cases and 20% validation cases.

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:faraway':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# set seed for reproducibility
set.seed(200)

# list = FALSE, request result to be in a matrix (of row position) not a list
training_samples <- wcgs$chd %>%
  createDataPartition(p = 0.8, list = FALSE)
# (train_data <- wcgs %>%
#   slice(training_samples[, 1]))
# (val_data   <- wcgs %>%
#   slice(-training_samples[, 1]))

The glmnet package takes a matrix (of predictors) and a vector (of responses) as input. We use model.matrix function to create them. glmnet will add intercept by default, so we drop intercept term when forming x matrix.

# X and y from original data
x_all <- model.matrix(
  chd ~ - 1 + age + height + weight + bmi + sdp + dbp + chol + dibep + cigs + arcus, 
  data = wcgs)
y_all <- ifelse(wcgs$chd == "yes", 1, 0)
# training X and y
x_train <- x_all[training_samples[, 1], ]
y_train <- y_all[training_samples[, 1]]
# validation X and y
x_val <- x_all[-training_samples[, 1], ]
y_val <- y_all[-training_samples[, 1]]

Fit lasso regression and plot solution path:

lasso_fit <- glmnet(x_train, y_train, family = "binomial", alpha = 1)
summary(lasso_fit)
##            Length Class     Mode     
## a0          49    -none-    numeric  
## beta       539    dgCMatrix S4       
## df          49    -none-    numeric  
## dim          2    -none-    numeric  
## lambda      49    -none-    numeric  
## dev.ratio   49    -none-    numeric  
## nulldev      1    -none-    numeric  
## npasses      1    -none-    numeric  
## jerr         1    -none-    numeric  
## offset       1    -none-    logical  
## classnames   2    -none-    character
## call         5    -none-    call     
## nobs         1    -none-    numeric
plot(lasso_fit, xvar = "lambda", label = TRUE)

# predict validation case probabilities at different \lambda values and calculate test deviance
pred_val <- predict(lasso_fit, newx = x_val, type = "response", s = lasso_fit$lambda)
dev_val <- -2 * colSums(y_val * log(pred_val) + (1 - y_val) * log(1 - pred_val))
tibble(lambda = lasso_fit$lambda, dev_val = dev_val) %>%
  ggplot() + 
  geom_point(mapping = aes(x = lambda, y = dev_val)) + 
  scale_x_log10() +
  labs(y = "Binomial deviance on validation set", x = "Lambda")

From the graph, it seems that \(\lambda = 0.005\) yields a model that performs best on the validation set.

set.seed(200)
cv_lasso <- cv.glmnet(x_all, y_all, alpha = 1, family = "binomial", 
                      type.measure = "auc")
plot(cv_lasso)

The plot displays the cross-validation error (devience by default, we chose AUC here) according to the log of lambda. The left dashed vertical line indicates that the log of the optimal value of log lambda is approximately -5.2, which is the one that minimizes the average AUC. This lambda value will give the most accurate model. The exact value of lambda and corresponding model can be viewed as follow:

cv_lasso$lambda.min
## [1] 0.005696827
coef(cv_lasso, cv_lasso$lambda.min)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                         s1
## (Intercept)  -1.040951e+01
## age           4.954011e-02
## height        .           
## weight        5.334625e-03
## bmi           .           
## sdp           1.537576e-02
## dbp           .           
## chol          9.228292e-03
## dibepA        5.162308e-01
## dibepB       -5.494421e-15
## cigs          1.656021e-02
## arcuspresent  9.149817e-02

We see that this model differs from the best model chosen by AIC by replacing the predictor bmi by weight.