R Bootcamp - Class 7

Tidyverse odds & ends

Jay Hesselberth

RNA Bioscience Initiative | CU Anschutz

2025-09-06

Class 7 outline

  • String manipulation with stringr
  • Factor operations with forcats
  • Join functions with dplyr
  • Advanced plotting with ggplot2

Setup

library(tidyverse)
library(here)
library(cowplot)

# set the theme for all plots
theme_set(theme_cowplot())

# overwrite as tibble
penguins <- as_tibble(penguins)

String operations

Combining strings with str_c()

str_c("letter: ", letters[1:5])
[1] "letter: a" "letter: b" "letter: c" "letter: d"
[5] "letter: e"
penguins |>
  mutate(
    id = str_c(species, island, sep = "_"),
    label = str_c(species, " (", year, ")")
  ) |>
  select(species, island, year, id, label)
# A tibble: 344 × 5
   species island     year id               label        
   <fct>   <fct>     <int> <chr>            <chr>        
 1 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 2 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 3 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 4 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 5 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 6 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 7 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 8 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
 9 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
10 Adelie  Torgersen  2007 Adelie_Torgersen Adelie (2007)
# ℹ 334 more rows

Detecting patterns with str_detect()

# stringr functions take the string first, then the pattern:
# which of the first ten capital letters contain "A"?
str_detect(LETTERS[1:10], "A")
 [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[10] FALSE
# move the row names into a `name` column,
# then convert the data frame to a tibble
mtcars_tbl <- mtcars |>
  rownames_to_column("name") |>
  as_tibble()

# keep only the rows whose name contains "Merc" (the Mercedes-Benz models)
mtcars_tbl |>
  filter(str_detect(name, "Merc"))
# A tibble: 7 × 12
  name         mpg   cyl  disp    hp  drat    wt  qsec    vs
  <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Merc 240D   24.4     4  147.    62  3.69  3.19  20       1
2 Merc 230    22.8     4  141.    95  3.92  3.15  22.9     1
3 Merc 280    19.2     6  168.   123  3.92  3.44  18.3     1
4 Merc 280C   17.8     6  168.   123  3.92  3.44  18.9     1
5 Merc 450SE  16.4     8  276.   180  3.07  4.07  17.4     0
6 Merc 450SL  17.3     8  276.   180  3.07  3.73  17.6     0
7 Merc 450S…  15.2     8  276.   180  3.07  3.78  18       0
# ℹ 3 more variables: am <dbl>, gear <dbl>, carb <dbl>

Splitting strings with str_split()

ids <- c("x-1", "x-2", "y-1", "y-2")
str_split(ids, "-")
[[1]]
[1] "x" "1"

[[2]]
[1] "x" "2"

[[3]]
[1] "y" "1"

[[4]]
[1] "y" "2"
# keep only the first piece of each split id
str_split_i(ids, "-", i = 1)
[1] "x" "x" "y" "y"
# create some sample data
df <- tribble(
  ~sample_ids,
  "gene_1_treatment",
  "gene_2_control",
  "gene_3_treatment"
)

# split each id on "_" into a list-column,
# then pull the pieces out by position
mutate(
  df,
  split_parts = str_split(sample_ids, "_"),
  gene_part = map_chr(split_parts, \(parts) parts[1]),
  number_part = map_chr(split_parts, \(parts) parts[2]),
  condition = map_chr(split_parts, \(parts) parts[3])
)
# A tibble: 3 × 5
  sample_ids     split_parts gene_part number_part condition
  <chr>          <list>      <chr>     <chr>       <chr>    
1 gene_1_treatm… <chr [3]>   gene      1           treatment
2 gene_2_control <chr [3]>   gene      2           control  
3 gene_3_treatm… <chr [3]>   gene      3           treatment

Factor operations

Counting factor levels with fct_count()

# equivalent with dplyr
penguins |>
  count(species)
# A tibble: 3 × 2
  species       n
  <fct>     <int>
1 Adelie      152
2 Chinstrap    68
3 Gentoo      124
fct_count(penguins$species)
# A tibble: 3 × 2
  f             n
  <fct>     <int>
1 Adelie      152
2 Chinstrap    68
3 Gentoo      124

Reordering factors with fct_reorder()

ggplot(
  penguins,
  aes(
    x = species,
    y = body_mass
  )
) +
  geom_boxplot(
    fill = "grey50"
  )

Reordering factors with fct_reorder()

Box plot of penguin body mass by species, with species in their default order

ggplot(
  penguins,
  aes(
    # order species by the spread (sd) of body mass, lowest first;
    # `.na_rm = TRUE` makes fct_reorder() drop missing body masses
    # itself instead of warning about them
    x = fct_reorder(
      species,
      body_mass,
      .fun = sd,
      .na_rm = TRUE
    ),
    y = body_mass
  )
) +
  geom_boxplot(
    fill = "grey50"
  )

Box plot of penguin body mass by species, with species ordered by the standard deviation of body mass from lowest to highest

Lumping infrequent levels with fct_lump_n()

# create some sample data with many levels.
# how many rows / cols are in this tibble?
sample_data <-
  tibble(
    category = factor(
      sample(
        letters[1:10],
        100,
        replace = TRUE
      )
    )
  )
# keep the most frequent categories; every other level becomes "Other"
sample_data |>
  mutate(lumped = fct_lump_n(category, n = 3))
# A tibble: 100 × 2
   category lumped
   <fct>    <fct> 
 1 e        Other 
 2 a        Other 
 3 d        d     
 4 d        d     
 5 b        Other 
 6 j        j     
 7 j        j     
 8 g        g     
 9 d        d     
10 d        d     
# ℹ 90 more rows
# count the lumped levels; levels tied at the cutoff are all kept,
# so more than `n` levels can survive alongside "Other"
sample_data |>
  mutate(lumped = fct_lump_n(category, n = 3)) |>
  count(lumped)
# A tibble: 5 × 2
  lumped     n
  <fct>  <int>
1 c         13
2 d         12
3 g         14
4 j         12
5 Other     49

Aside on sample() and reproducibility

# run this several times
sample(0:100, 10)
 [1] 68 80 96 83 23 35 27 41 62 19
sample(0:100, 10)
 [1] 47 94 62 21 12 25 51 80 23 33
# now make `sample()` reproducible.
# you have to set the seed each time.

set.seed(42) # set the seed for reproducibility
sample(0:100, 10)
 [1]  48  64  24  73  17 100  46  23  70  88
set.seed(42)
sample(0:100, 10)
 [1]  48  64  24  73  17 100  46  23  70  88
sample(0:100, 10)
 [1] 36 19 25  2 40 88 26 35  4 83

Join operations

Setup

Open up the tidyexplain page.

Understanding joins

Joins combine data from two tables based on matching keys.

band_members
# A tibble: 3 × 2
  name  band   
  <chr> <chr>  
1 Mick  Stones 
2 John  Beatles
3 Paul  Beatles
band_instruments
# A tibble: 3 × 2
  name  plays 
  <chr> <chr> 
1 John  guitar
2 Paul  bass  
3 Keith guitar

left_join() - keep all rows from left table

Most common join - keeps all observations from the “primary” table.

band_members |>
  left_join(
    band_instruments,
    by = "name"
  )
# A tibble: 3 × 3
  name  band    plays 
  <chr> <chr>   <chr> 
1 Mick  Stones  <NA>  
2 John  Beatles guitar
3 Paul  Beatles bass  

inner_join() - keep only matching rows

Only keeps rows that exist in both tables.

band_members |>
  inner_join(
    band_instruments,
    by = "name"
  )
# A tibble: 2 × 3
  name  band    plays 
  <chr> <chr>   <chr> 
1 John  Beatles guitar
2 Paul  Beatles bass  

full_join() - keep all rows from both tables

band_members |>
  full_join(
    band_instruments,
    by = "name"
  )
# A tibble: 4 × 3
  name  band    plays 
  <chr> <chr>   <chr> 
1 Mick  Stones  <NA>  
2 John  Beatles guitar
3 Paul  Beatles bass  
4 Keith <NA>    guitar

Keeps everything, filling missing values with NA.

Advanced plotting

Setup

library(patchwork)

scale functions in ggplot2

  • scale_color_brewer() and scale_fill_brewer() control color and fill aesthetics.
  • See available ggplot2 brewer palettes
p1 <- ggplot(
  mtcars_tbl,
  aes(
    x = mpg,
    y = hp,
    # why do we `factor` here?
    color = factor(cyl)
  )
) +
  geom_point(size = 5)

p1

scale functions in ggplot2

Example of a ggplot2 scatter plot with points colored by number of cylinders, using the cowplot theme

p1 + scale_color_brewer(palette = "Set1")

p1 + scale_color_brewer(palette = "Dark2")

Combining multiple plots into a figure?

Use the {patchwork} package.

# 2 x 2 grid: `+` places plots side by side, `/` stacks the rows;
# panels are tagged A, B, C, D and the legends are collected
(p1 + p1) /
  (p1 + p1) +
  plot_annotation(tag_levels = "A") +
  plot_layout(guides = "collect")

Combining multiple plots into a figure?

Example of combining multiple ggplot2 plots into a single figure using the patchwork package

Saving plots

Saves the last plot as a 5" x 5" (5 inch by 5 inch) file named plot_final.png in the project's img/ directory.

Matches file type to file extension (*.png, *.jpeg, *.pdf).

# default is to save last plot in the buffer
# can also specify with the `plot` argument
# `width` and `height` are in inches unless `units` says otherwise
ggsave(
  # `here()` builds the path from the project root, so this writes to img/
  filename = here("img/plot_final.png"),
  plot = last_plot(),
  width = 5,
  height = 5
)

Displaying data in tables

We’ll use a couple of approaches to display data in tables instead of graphs, which can be useful for reports or presentations.

Using knitr::kable()

library(knitr) # not attached by library(tidyverse); load it explicitly

# show the first 10 rows as a formatted table with a caption
penguins |>
  select(species, island, body_mass) |>
  head(10) |>
  kable(caption = "First 10 rows of the penguins dataset")

Using knitr::kable()

First 10 rows of the penguins dataset
species island body_mass
Adelie Torgersen 3750
Adelie Torgersen 3800
Adelie Torgersen 3250
Adelie Torgersen NA
Adelie Torgersen 3450
Adelie Torgersen 3650
Adelie Torgersen 3625
Adelie Torgersen 4675
Adelie Torgersen 3475
Adelie Torgersen 4250

Using gt

library(gt)

# build an interactive table with pagination and filtering enabled
penguins |>
  select(species, island, body_mass) |>
  gt() |>
  tab_header(title = "The {penguins} dataset") |>
  opt_interactive(
    use_pagination = TRUE,
    use_filters = TRUE
  )

Using gt

The {penguins} dataset