sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.5.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.33   R6_2.5.1        fastmap_1.1.1   xfun_0.39      
##  [5] cachem_1.0.8    knitr_1.42      htmltools_0.5.5 rmarkdown_2.21 
##  [9] cli_3.6.1       sass_0.4.6      jquerylib_0.1.4 compiler_4.3.0 
## [13] rstudioapi_0.14 tools_4.3.0     evaluate_0.20   bslib_0.4.2    
## [17] yaml_2.3.7      rlang_1.1.1     jsonlite_1.8.4
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(faraway)

Announcement

Acknowledgement

Dr. Hua Zhou’s slides

O-ring example

orings <- orings %>%
  as_tibble(rownames = "mission") %>%
  print(n = Inf)
## # A tibble: 23 × 3
##    mission  temp damage
##    <chr>   <dbl>  <dbl>
##  1 1          53      5
##  2 2          57      1
##  3 3          58      1
##  4 4          63      1
##  5 5          66      0
##  6 6          67      0
##  7 7          67      0
##  8 8          67      0
##  9 9          68      0
## 10 10         69      0
## 11 11         70      1
## 12 12         70      0
## 13 13         70      1
## 14 14         70      0
## 15 15         72      0
## 16 16         73      0
## 17 17         75      0
## 18 18         75      1
## 19 19         76      0
## 20 20         76      0
## 21 21         78      0
## 22 22         79      0
## 23 23         81      0

Descriptive statistics

There seems a pattern: lower temperature, more damages.

oring_plot <- orings %>%
  mutate(failrate = damage / 6) %>%
  ggplot(mapping = aes(x = temp, y = failrate)) + 
  geom_point() + 
  labs(x = "Temperature (F)", y = "Proportion of damages", title = "O-ring damage data")
plot(oring_plot)

Binomial model

Logistic regression

For Binomial model, we input \((\text{successes}_i, \text{failures}_i)\) as responses.

library(gtsummary)
lmod <- glm(cbind(damage, 6 - damage) ~ temp, family = binomial, data = orings)
lmod %>%
  tbl_regression(intercept = TRUE)
Characteristic log(OR)1 95% CI1 p-value
(Intercept) 12 5.6, 19 <0.001
temp -0.22 -0.33, -0.12 <0.001
1 OR = Odds Ratio, CI = Confidence Interval

Let’s fit an equivalent (weighted) Bernoulli model. Not surprisingly, we get identical estimate.

obs_wt = c(rbind(orings$damage, 6 - orings$damage))
orings %>%
  slice(rep(1:n(), each = 2)) %>% # replicate each row twice
  mutate(damage = rep(c(1, 0), 23)) %>%
  mutate(obs_wt = obs_wt) %>%
  glm(damage ~ temp, weights = obs_wt, family = binomial, data = .) %>%
  tbl_regression(intercept = TRUE)
Characteristic log(OR)1 95% CI1 p-value
(Intercept) 12 5.6, 19 <0.001
temp -0.22 -0.33, -0.12 <0.001
1 OR = Odds Ratio, CI = Confidence Interval

Prediction

Now we can predict the failure probability. The failure probability at 31F is nearly 1.

tibble(temp     = seq(25, 85, 0.1), 
       predprob = predict(lmod, newdata = tibble(temp = temp), type = "response")) %>%
  ggplot() + 
  geom_line(mapping = aes(x = temp, y = predprob)) + 
  geom_vline(xintercept = 31) +
  labs(x = "Temperature (F)", y = "Predicted failure probability")

Goodness of fit

pchisq(lmod$deviance, lmod$df.residual, lower = FALSE)
## [1] 0.7164099
drop1(lmod, test = "Chi")
## Single term deletions
## 
## Model:
## cbind(damage, 6 - damage) ~ temp
##        Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>      16.912 33.675                     
## temp    1   38.898 53.660 21.985 2.747e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pearson \(X^2\)

predprob <- predict(lmod, type = "response")
# Pearson X2 statistic
(px2 <- sum((orings$damage - 6 * predprob)^2 / (6 * predprob * (1 - predprob))))
## [1] 28.06738
pchisq(px2, lmod$df.residual, lower.tail = FALSE)
## [1] 0.1382507

This large p-value indicates that this model fits data well.

(presid <- residuals(lmod, type = "pearson"))
##          1          2          3          4          5          6          7 
##  1.3928147 -0.8972668 -0.6821441  0.3214099 -0.6647553 -0.5966330 -0.5966330 
##          8          9         10         11         12         13         14 
## -0.5966330 -0.5354916 -0.4806159  1.9587595 -0.4313637  1.9587595 -0.4313637 
##         15         16         17         18         19         20         21 
## -0.3474838 -0.3118746 -0.2512296  3.7710639 -0.2254843 -0.2254843 -0.1816382 
##         22         23 
## -0.1630244 -0.1313239

Diagnostics

orings %>%
  mutate(devres  = residuals(lmod, type = "deviance"),
         linpred = predict(lmod, type = "link")) %>%
  ggplot + 
  geom_point(mapping = aes(x = linpred, y = devres)) +
  labs(x = "Linear predictor", y = "Deviance residual")

halfnorm(hatvalues(lmod))

orings %>%
  slice(c(1, 2))
## # A tibble: 2 × 3
##   mission  temp damage
##   <chr>   <dbl>  <dbl>
## 1 1          53      5
## 2 2          57      1
halfnorm(cooks.distance(lmod))

orings %>%
  slice(c(1, 18))
## # A tibble: 2 × 3
##   mission  temp damage
##   <chr>   <dbl>  <dbl>
## 1 1          53      5
## 2 18         75      1

Overdispersion

troutegg
##    survive total location period
## 1       89    94        1      4
## 2      106   108        2      4
## 3      119   123        3      4
## 4      104   104        4      4
## 5       49    93        5      4
## 6       94    98        1      7
## 7       91   106        2      7
## 8      100   130        3      7
## 9       80    97        4      7
## 10      11   113        5      7
## 11      77    86        1      8
## 12      87    96        2      8
## 13      88   119        3      8
## 14      67    99        4      8
## 15      18    88        5      8
## 16     141   155        1     11
## 17     104   122        2     11
## 18      91   125        3     11
## 19     111   132        4     11
## 20       0   138        5     11

We fit a binomial model for the two main effects.

library(gtsummary)
bmod <- glm(cbind(survive, total - survive) ~ location + period,
            family = "binomial", data = troutegg)
#bmod %>%
#  tbl_regression(exponentiate = TRUE) %>%
#  bold_labels()
summary(bmod)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ location + period, 
##     family = "binomial", data = troutegg)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.6358     0.2813  16.479  < 2e-16 ***
## location2    -0.4168     0.2461  -1.694   0.0903 .  
## location3    -1.2421     0.2194  -5.660 1.51e-08 ***
## location4    -0.9509     0.2288  -4.157 3.23e-05 ***
## location5    -4.6138     0.2502 -18.439  < 2e-16 ***
## period7      -2.1702     0.2384  -9.103  < 2e-16 ***
## period8      -2.3256     0.2429  -9.573  < 2e-16 ***
## period11     -2.4500     0.2341 -10.466  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1021.469  on 19  degrees of freedom
## Residual deviance:   64.495  on 12  degrees of freedom
## AIC: 157.03
## 
## Number of Fisher Scoring iterations: 5

Analysis of deviance for the significance of two factors

drop1(bmod, test = "Chisq")
## Single term deletions
## 
## Model:
## cbind(survive, total - survive) ~ location + period
##          Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>         64.50 157.03                     
## location  4   913.56 998.09 849.06 < 2.2e-16 ***
## period    3   228.57 315.11 164.08 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(sigma2 <- sum(residuals(bmod, type = "pearson")^2) / 12)
## [1] 5.330322

which is substantially larger than 1. We can now see the scaled up standard errors

summary(bmod, dispersion = sigma2)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ location + period, 
##     family = "binomial", data = troutegg)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.6358     0.6495   7.138 9.49e-13 ***
## location2    -0.4168     0.5682  -0.734   0.4632    
## location3    -1.2421     0.5066  -2.452   0.0142 *  
## location4    -0.9509     0.5281  -1.800   0.0718 .  
## location5    -4.6138     0.5777  -7.987 1.39e-15 ***
## period7      -2.1702     0.5504  -3.943 8.05e-05 ***
## period8      -2.3256     0.5609  -4.146 3.38e-05 ***
## period11     -2.4500     0.5405  -4.533 5.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 5.330322)
## 
##     Null deviance: 1021.469  on 19  degrees of freedom
## Residual deviance:   64.495  on 12  degrees of freedom
## AIC: 157.03
## 
## Number of Fisher Scoring iterations: 5

and make F tests on predictors

drop1(bmod, scale = sigma2, test = "F")
## Warning in drop1.glm(bmod, scale = sigma2, test = "F"): F test assumes
## 'quasibinomial' family
## Single term deletions
## 
## Model:
## cbind(survive, total - survive) ~ location + period
## 
## scale:  5.330322 
## 
##          Df Deviance    AIC F value    Pr(>F)    
## <none>         64.50 157.03                      
## location  4   913.56 308.32  39.494 8.142e-07 ***
## period    3   228.57 181.81  10.176  0.001288 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Quasi-binomial

modl <- glm(survive / total ~ location + period, 
            family = "quasibinomial", data = troutegg)
summary(modl)
## 
## Call:
## glm(formula = survive/total ~ location + period, family = "quasibinomial", 
##     data = troutegg)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.4915     0.6133   7.324 9.17e-06 ***
## location2    -0.3677     0.5574  -0.660  0.52191    
## location3    -1.2027     0.5055  -2.379  0.03481 *  
## location4    -0.9569     0.5166  -1.852  0.08874 .  
## location5    -4.4356     0.5529  -8.022 3.66e-06 ***
## period7      -2.0660     0.5137  -4.022  0.00169 ** 
## period8      -2.2046     0.5141  -4.288  0.00105 ** 
## period11     -2.3426     0.5143  -4.555  0.00066 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 0.04670695)
## 
##     Null deviance: 8.88362  on 19  degrees of freedom
## Residual deviance: 0.58247  on 12  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

Individual predictor significance is

drop1(modl, test = "F")
## Single term deletions
## 
## Model:
## survive/total ~ location + period
##          Df Deviance F value    Pr(>F)    
## <none>        0.5825                      
## location  4   7.9985  38.196 9.788e-07 ***
## period    3   2.0545  10.109  0.001325 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We see similar p-values as the overdispersion model.

Beta regression

library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
modb <- gam(survive / total ~ location + period,
            family = betar(), data = troutegg)
## Warning in family$saturated.ll(y, prior.weights, theta): saturated likelihood
## may be inaccurate
summary(modb)
## 
## Family: Beta regression(7.992) 
## Link function: logit 
## 
## Formula:
## survive/total ~ location + period
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.8507     0.5757   8.426  < 2e-16 ***
## location2    -0.3353     0.5961  -0.563   0.5738    
## location3    -0.9843     0.5739  -1.715   0.0863 .  
## location4    -0.3002     0.5975  -0.502   0.6153    
## location5    -5.3264     0.6241  -8.534  < 2e-16 ***
## period7      -2.7315     0.5614  -4.865 1.14e-06 ***
## period8      -2.9455     0.5551  -5.306 1.12e-07 ***
## period11     -3.4724     0.5410  -6.419 1.37e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.891   Deviance explained =  110%
## -REML = -67.01  Scale est. = 1         n = 20