# Remove the objects returned by ls() from the workspace.
# NOTE(review): this does not detach packages or reset options, so it is not
# equivalent to restarting R — confirm a fresh session is not what is intended.
rm(list = ls()) # clean-up workspace
# Attach the core tidyverse packages (ggplot2, dplyr, tibble, ...) used below.
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.5.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0   dplyr_1.1.3    
##  [5] purrr_1.0.2     readr_2.1.4     tidyr_1.3.0     tibble_3.2.1   
##  [9] ggplot2_3.4.3   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4     jsonlite_1.8.4   compiler_4.3.0   tidyselect_1.2.0
##  [5] jquerylib_0.1.4  scales_1.2.1     yaml_2.3.7       fastmap_1.1.1   
##  [9] R6_2.5.1         generics_0.1.3   knitr_1.42       munsell_0.5.0   
## [13] bslib_0.4.2      pillar_1.9.0     tzdb_0.3.0       rlang_1.1.1     
## [17] utf8_1.2.3       stringi_1.7.12   cachem_1.0.8     xfun_0.39       
## [21] sass_0.4.6       timechange_0.2.0 cli_3.6.1        withr_2.5.0     
## [25] magrittr_2.0.3   digest_0.6.33    grid_4.3.0       rstudioapi_0.14 
## [29] hms_1.1.3        lifecycle_1.0.3  vctrs_0.6.3      evaluate_0.20   
## [33] glue_1.6.2       fansi_1.0.4      colorspace_2.1-0 rmarkdown_2.21  
## [37] tools_4.3.0      pkgconfig_2.0.3  htmltools_0.5.5

Announcement

Acknowledgement

Dr. Hua Zhou’s slides

A typical data science project:

Data visualization

“The simple graph has brought more information to the data analyst’s mind than any other device.”

John Tukey

mpg data

print(ggplot2::mpg, width = Inf)
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans      drv     cty   hwy fl   
##    <chr>        <chr>      <dbl> <int> <int> <chr>      <chr> <int> <int> <chr>
##  1 audi         a4           1.8  1999     4 auto(l5)   f        18    29 p    
##  2 audi         a4           1.8  1999     4 manual(m5) f        21    29 p    
##  3 audi         a4           2    2008     4 manual(m6) f        20    31 p    
##  4 audi         a4           2    2008     4 auto(av)   f        21    30 p    
##  5 audi         a4           2.8  1999     6 auto(l5)   f        16    26 p    
##  6 audi         a4           2.8  1999     6 manual(m5) f        18    26 p    
##  7 audi         a4           3.1  2008     6 auto(av)   f        18    27 p    
##  8 audi         a4 quattro   1.8  1999     4 manual(m5) 4        18    26 p    
##  9 audi         a4 quattro   1.8  1999     4 auto(l5)   4        16    25 p    
## 10 audi         a4 quattro   2    2008     4 manual(m6) 4        20    28 p    
##    class  
##    <chr>  
##  1 compact
##  2 compact
##  3 compact
##  4 compact
##  5 compact
##  6 compact
##  7 compact
##  8 compact
##  9 compact
## 10 compact
## # ℹ 224 more rows

Aesthetic mappings | r4ds chapter 3.3

A graphing template

ggplot(data = <DATA>) + 
  <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))

Scatter plot

  • hwy vs displ

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy))

  • An aesthetic maps data to a specific feature of the plot.

  • Check available aesthetics for a geometric object by ?geom_point.

Color of points

  • Color points according to class:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy, color = class))

Size of points

  • Assign different sizes to points according to class:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy, size = class))

    #> Warning: Using size for a discrete variable is not advised.

Transparency of points

  • Assign different transparency levels to points according to class:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
    ## Warning: Using alpha for a discrete variable is not advised.

Shape of points

  • Assign different shapes to points according to class:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy, shape = class))

  • Maximum of 6 shapes at a time. By default, additional groups will go unplotted.

Manual setting of an aesthetic

  • Set the color of all points to be blue:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

Facets | r4ds chapter 3.5

Facets

  • Facets divide a plot into subplots based on the values of one or more discrete variables.

  • A subplot for each car type:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy)) + 
      facet_wrap(~ class, nrow = 2)


  • A subplot for each car type and drive:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy)) + 
      facet_grid(drv ~ class)

Geometric objects | r4ds chapter 3.6

geom_smooth(): smooth line

How are these two plots similar?

  • hwy vs displ line:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy))

    ggplot(data = mpg) + 
      geom_smooth(mapping = aes(x = displ, y = hwy))

Different line types

  • Different line types according to drv:

    ggplot(data = mpg) + 
      geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))

Different line colors

  • Different line colors according to drv:

    ggplot(data = mpg) + 
      geom_smooth(mapping = aes(x = displ, y = hwy, color = drv))

Points and lines

  • Lines overlaid over scatter plot:

    ggplot(data = mpg) + 
      geom_point(mapping = aes(x = displ, y = hwy)) + 
      geom_smooth(mapping = aes(x = displ, y = hwy))


  • Same as

    ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
      geom_point() + geom_smooth()

Aesthetics for each geometric object

  • Different aesthetics in different layers:

    ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
      geom_point(mapping = aes(color = class)) + 
      geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)

Bar plots | r4ds chapter 3.7

diamonds data

  • diamonds data:

    diamonds
    ## # A tibble: 53,940 × 10
    ##    carat cut       color clarity depth table price     x     y     z
    ##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
    ##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
    ##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
    ##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
    ##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
    ##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
    ##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
    ##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
    ##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
    ##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
    ## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
    ## # ℹ 53,930 more rows

Bar plot

  • geom_bar() creates bar chart:

    ggplot(data = diamonds) + 
      geom_bar(mapping = aes(x = cut))


  • Bar charts, like histograms, frequency polygons, smoothers, and boxplots, plot some computed variables instead of raw data.

  • Check available computed variables for a geometric object via help:

    ?geom_bar

  • Use stat_count() directly:

    ggplot(data = diamonds) + 
      stat_count(mapping = aes(x = cut))

  • stat_count() has a default geom geom_bar().


  • Display relative frequencies (proportions) instead of counts:

    # Map bar height to the proportion computed by stat_count() rather than the
    # raw count. after_stat(prop) replaces stat(prop), which was deprecated in
    # ggplot2 3.4.0 and emitted a lifecycle warning.
    ggplot(data = diamonds) +
      geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))

    Note that the aesthetic mapping group = 1 overrides the default grouping (by cut), so that all observations are treated as a single group. Without this we get

    # Same plot without group = 1: each level of cut forms its own group, so the
    # proportion is computed within each bar and every bar has height 1.
    # after_stat(prop) replaces the deprecated stat(prop).
    ggplot(data = diamonds) +
      geom_bar(mapping = aes(x = cut, y = after_stat(prop)))

geom_bar() vs geom_col()

  • geom_bar() makes the height of the bar proportional to the number of cases in each group (or if the weight aesthetic is supplied, the sum of the weights).

    ggplot(data = diamonds) + 
      geom_bar(mapping = aes(x = cut))

    The height of bar is the number of diamonds in each cut category.

  • geom_col() makes the heights of the bars represent values in the data.

    ggplot(data = diamonds) + 
      geom_col(mapping = aes(x = cut, y = carat))

    The height of each bar is the total carat in each cut category. Equivalently, using geom_bar() with the weight aesthetic:

    ggplot(data = diamonds) + 
      geom_bar(mapping = aes(x = cut, weight = carat))

Positional adjustments | r4ds chapter 3.8




  1. The stacking is performed automatically by the position adjustment specified by the position argument.

  2. If you don’t want a stacked bar chart, you can use one of three other options:

    • "identity"

    • "dodge"

    • "fill"

    • "stack" (default)












# Outline of New Zealand as a data frame of polygon vertices (long, lat, group).
# map_data() reads it from the maps package, which must be installed.
nz <- map_data("nz")

# Draw each polygon (one per value of `group`) with a white fill and black border.
ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "black")


Graphics for communications | r4ds chapter 28

Label

labs()

Title

  • Figure title should be descriptive:

    ggplot(mpg, aes(x = displ, y = hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth(se = FALSE) +
      labs(title = "Fuel efficiency generally decreases with engine size")

Subtitle and caption

  • subtitle adds additional detail in a smaller font beneath the title.

  • caption adds text at the bottom right of the plot, often used to describe the source of the data.

    ggplot(mpg, aes(displ, hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth(se = FALSE) + 
      labs(
        title = "Fuel efficiency generally decreases with engine size",
        subtitle = "Two seaters (sports cars) are an exception because of their light weight",
        caption = "Data from fueleconomy.gov"
      )

Axis labels

  • ggplot(mpg, aes(displ, hwy)) +
    geom_point(aes(colour = class)) +
    geom_smooth(se = FALSE) +
    labs(
      x = "Engine displacement (L)",
      y = "Highway fuel economy (mpg)"
    )

Math equations

  • read about available options in ?plotmath

    df <- tibble(x = runif(10), y = runif(10))
    ggplot(df, aes(x, y)) + geom_point() +
      labs(
        x = quote(sum(x[i] ^ 2, i == 1, n)),
        y = quote(alpha + beta + frac(delta, theta))
      )

  • The R package latex2exp can convert TeX math expressions into plotmath expressions (Ref)

    library(latex2exp)
    df <- tibble(x = runif(10), y = runif(10))
    ggplot(df, aes(x, y)) + geom_point() +
      labs(
        y = TeX("Example: $\\alpha + \\beta + \\frac{\\delta}{\\theta}$"),
        x = TeX("$\\sum_{i = 1}^{n} x_i^2$")
      )

Annotations

  • Find the most fuel efficient car in each car class:

    # Keep, within each class, the single row with the highest hwy.
    # row_number(desc(hwy)) ranks rows from highest to lowest hwy inside each
    # group and gives tied rows distinct ranks, so exactly one row has rank 1.
    best_in_class <- mpg %>%
      group_by(class) %>%
      filter(row_number(desc(hwy)) == 1)
    
    # equivalent to
    # best_in_class <- filter(group_by(mpg, class), row_number(desc(hwy)) == 1)
    # The result is still grouped by class (see the "Groups" line in the printout).
    best_in_class
    ## # A tibble: 7 × 11
    ## # Groups:   class [7]
    ##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
    ##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
    ## 1 chevrolet    corvette      5.7  1999     8 manu… r        16    26 p     2sea…
    ## 2 dodge        caravan 2wd   2.4  1999     4 auto… f        18    24 r     mini…
    ## 3 nissan       altima        2.5  2008     4 manu… f        23    32 r     mids…
    ## 4 subaru       forester a…   2.5  2008     4 manu… 4        20    27 r     suv  
    ## 5 toyota       toyota tac…   2.7  2008     4 manu… 4        17    22 r     pick…
    ## 6 volkswagen   jetta         1.9  1999     4 manu… f        33    44 d     comp…
    ## 7 volkswagen   new beetle    1.9  1999     4 manu… f        35    44 d     subc…
  • dplyr::desc function transforms a vector into a format that will be sorted in descending order

  • dplyr::filter function subsets a data frame, retaining all rows that satisfy your conditions


  • Annotate points
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_text(aes(label = model), data = best_in_class)


  • geom_label() draws a rectangle behind the text
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = class)) +
  geom_label(aes(label = model), data = best_in_class, nudge_y = 2, alpha = 0.5)


  • The ggrepel package automatically adjusts labels so that they don’t overlap:

    library("ggrepel")
    ggplot(mpg, aes(displ, hwy)) +
      geom_point(aes(colour = class)) +
      geom_point(size = 3, shape = 1, data = best_in_class) +
      ggrepel::geom_label_repel(aes(label = model), data = best_in_class)

Scales

  • ggplot(mpg, aes(displ, hwy)) +
      geom_point(aes(colour = class))

    automatically adds scales

    ggplot(mpg, aes(displ, hwy)) +
      geom_point(aes(colour = class)) +
      scale_x_continuous() +
      scale_y_continuous() +
      scale_colour_discrete()


  • breaks

    ggplot(mpg, aes(displ, hwy)) +
      geom_point() +
      scale_y_continuous(breaks = seq(15, 40, by = 5))

Custom breaks are useful when you have relatively few data points and want to highlight exactly where the observations occur. This plot shows when each US president started and ended their term.

# Draw one horizontal segment per row of `presidential`, from start to end date.
# NOTE(review): 33 + row_number() presumably converts the row index into the
# presidency number (first row = 34th president) — confirm against ?presidential.
presidential %>%
  mutate(id = 33 + row_number()) %>%
  ggplot(aes(start, id)) +
    geom_point() +
    geom_segment(aes(xend = end, yend = id)) +
    # NULL drops the axis title; one tick per term start, labelled with an
    # apostrophe followed by the two-digit year.
    scale_x_date(NULL, breaks = presidential$start, date_labels = "'%y")


  • labels

    ggplot(mpg, aes(displ, hwy)) +
      geom_point() +
      scale_x_continuous(labels = NULL) +
      scale_y_continuous(labels = NULL)


  • Plot y-axis at log scale:

    ggplot(mpg, aes(x = displ, y = hwy)) +
      geom_point() +
      scale_y_log10()


  • Plot x-axis in reverse order:

    ggplot(mpg, aes(x = displ, y = hwy)) +
      geom_point() +
      scale_x_reverse()


# install.packages("wesanderson")  # run once if the package is not installed
library(wesanderson)

# Preview every palette shipped with the package: printing a palette object
# displays it, so print each one in turn.
palette_names <- names(wes_palettes)
for (palette_name in palette_names) {
  print(wes_palette(palette_name))
}

  • use scale_colour_manual() to use predefined mapping between values and colors
# Term plot coloured by party, using an explicit party-to-colour mapping.
# NOTE(review): 33 + row_number() presumably converts the row index into the
# presidency number — confirm against ?presidential.
presidential %>%
  mutate(id = 33 + row_number()) %>%
  ggplot(aes(start, id, colour = party)) +
    geom_point() +
    geom_segment(aes(xend = end, yend = id)) +
    # The names of the vector are matched against the values of `party`.
    scale_colour_manual(values = c(Republican = "red", Democratic = "blue"))

  • the above plot can be improved
# Term plot coloured by party, with x-axis ticks placed at each term start.
# NOTE(review): 33 + row_number() presumably converts the row index into the
# presidency number — confirm against ?presidential.
presidential %>%
  mutate(id = 33 + row_number()) %>%
  ggplot(aes(start, id, colour = party)) +
    geom_point() +
    geom_segment(aes(xend = end, yend = id)) +
    # The names of the vector are matched against the values of `party`.
    scale_colour_manual(values = c(Republican = "red", Democratic = "blue")) +
    # NULL drops the axis title; labels are an apostrophe plus two-digit year.
    scale_x_date(NULL, breaks = presidential$start, date_labels = "'%y")


  • use scale_colour_gradient() or scale_fill_gradient() for continuous colour

  • viridis::scale_colour_viridis()

# 10,000 independent standard-normal (x, y) pairs.
# NOTE(review): no set.seed() is visible before this point, so the exact
# picture presumably differs between runs — confirm if reproducibility matters.
df <- tibble(
  x = rnorm(10000),
  y = rnorm(10000)
)
# Hexagonal 2-d binning with the default continuous fill scale.
# geom_hex() needs the hexbin package to be installed.
# coord_fixed() keeps one unit on x the same length as one unit on y.
ggplot(df, aes(x, y)) +
  geom_hex() +
  coord_fixed()

# Same plot with the viridis fill scale substituted for the default.
ggplot(df, aes(x, y)) +
  geom_hex() +
  viridis::scale_fill_viridis() +
  coord_fixed()


All color scales come in two varieties:

  • scale_colour_x() for colour aesthetics

  • scale_fill_x() for fill aesthetics

Legends

  • Set legend position: "left", "right", "top", "bottom", "none":

    ggplot(mpg, aes(displ, hwy)) +
      geom_point(aes(colour = class)) + 
      theme(legend.position = "left")


Zooming

  • Without clipping (coord_cartesian() only zooms in: all data points are kept, so the smoother is still fit to the full data)

    ggplot(mpg, mapping = aes(displ, hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth() +
      coord_cartesian(xlim = c(5, 7), ylim = c(10, 30))


  • With clipping (removes unseen data points)

    ggplot(mpg, mapping = aes(displ, hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth() +
      xlim(5, 7) + ylim(10, 30)

    same as

    mpg %>%
      filter(displ >= 5, displ <= 7, hwy >= 10, hwy <= 30) %>%
      ggplot(aes(displ, hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth()


  • ggplot(mpg, mapping = aes(displ, hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth() +
      scale_x_continuous(limits = c(5, 7)) +
      scale_y_continuous(limits = c(10, 30))

Themes

  • ggplot(mpg, aes(displ, hwy)) +
      geom_point(aes(color = class)) +
      geom_smooth(se = FALSE) +
      theme_bw()

Saving plots

# Draw the plot first: ggsave() below is called without a plot argument, so it
# saves the most recently displayed plot.
ggplot(mpg, aes(displ, hwy)) + geom_point()

# The output format is taken from the ".pdf" extension. No width or height is
# given; the message below reports the size that was used.
ggsave("my-plot.pdf")
## Saving 7 x 5 in image

Cheat sheet

The RStudio ggplot2 cheat sheet is extremely helpful.