Class 12: Programming in R (part 2)

Ryan Sheridan and Kent Riemondy
2023-12-14

The Rmarkdown for this class is on github

# conditionally download all of the files used in rmarkdown from github
source("https://raw.githubusercontent.com/rnabioco/bmsc-7810-pbda/main/_posts/2023-12-13-class-11-programming-pt-1/download-files.R")


Introduction

As discussed in the previous class, you should try to limit duplication in your code. One way to do this is by writing functions, another way is through iteration. Reducing code duplication has several benefits:



for loops

for loops allow you to run the same code block repeatedly without copying and pasting.

for (i in 1:4) {
  message("printing ", i, " is fun")
}

values <- c("A", "B", "C", "D")

for(val in values){
  message(val)
}


When iterating over a vector, usually it is most useful to iterate over the index of each element (aka the position in the vector), rather than the values themselves.

for (i in 1:length(values)) {
  val <- values[i]
  message("index = ", i, " value = ", val)
}


We will generally want to store the output generated in the for loop. A common paradigm is to preallocate a place to store the output. This is a faster approach than growing the output at each iteration (for more detail see this R-bloggers post).

We can generate vectors (and lists) of a given type and length using the vector() function.

n <- length(values)

# Make an empty vector that is the same length as values (4)
output <- vector(mode = "character", length = n)

output
#> [1] "" "" "" ""
for(i in 1:n){ 
  
  # get value at position i
  val <- values[i]
  
  # assign value to output character vector at position i
  output[i] <- tolower(val)
}

output
#> [1] "a" "b" "c" "d"
# It's helpful to think about what happens during each cycle of the loop
output[1] <- tolower("A")  # i = 1
output[2] <- tolower("B")  # i = 2
output[3] <- tolower("C")  # i = 3
output[4] <- tolower("D")  # i = 4


Lets use rnorm() to create a list of 5 vectors with different values for ‘mean’

# One way to do this is by copying and pasting
vec_in <- c(1, 50, 20, 5, 70)                # input

out <- vector("list", length(vec_in))        # output

out[[1]] <- rnorm(n = 10, mean = vec_in[1])    
out[[2]] <- rnorm(n = 10, mean = vec_in[2])
out[[3]] <- rnorm(n = 10, mean = vec_in[3])
out[[4]] <- rnorm(n = 10, mean = vec_in[4])
out[[5]] <- rnorm(n = 10, mean = vec_in[5])

out
#> [[1]]
#>  [1] 3.93808636 1.52557256 1.46301683 2.21399126 1.41511927 0.05814919
#>  [7] 3.85651726 2.18437184 1.27088890 0.95658068
#> 
#> [[2]]
#>  [1] 50.24725 51.08728 49.70821 50.11644 48.60709 50.50690 50.42219
#>  [8] 50.71646 50.63284 49.87456
#> 
#> [[3]]
#>  [1] 20.59113 19.62665 19.46785 19.56616 19.48752 22.06041 20.44239
#>  [8] 19.70841 21.11495 19.75437
#> 
#> [[4]]
#>  [1] 6.735522 6.014315 5.357435 6.253662 5.966504 4.853880 5.019906
#>  [8] 4.257296 7.245420 2.439478
#> 
#> [[5]]
#>  [1] 68.77657 70.17230 69.13500 71.01586 72.27142 70.16347 70.89268
#>  [8] 68.67199 72.49300 70.53869
# Use a for loop to reduce code duplication
vec_in <- c(1, 50, 20, 5, 70)                  # input

out <- vector("list", length(vec_in))          # output

for (i in 1:length(vec_in)) {                  # sequence
  
  out[[i]] <- rnorm(n = 10, mean = vec_in[i])  # body
  
}


Write a for loop that uses rnorm() to create 3 vectors of different lengths. Store the vectors in a list. Use mean = 0 and sd = 1 (the default).

Show answer
vec_in <- c(5, 10, 2)                  # input

n   <- length(vec_in)
out <- vector("list", n)               # output

for (i in 1:length(vec_in)) {          # sequence
  
  out[[i]] <- rnorm(n = vec_in[i])
  
}

out
#> [[1]]
#> [1] -1.46000733 -0.54317663  0.34326973  0.10034414 -0.09198923
#> 
#> [[2]]
#>  [1]  0.4936298 -0.9133602 -0.3896098  0.7461872  0.1311157 -0.3008083
#>  [7] -0.6366033  1.6305274 -0.4965034 -0.1632586
#> 
#> [[3]]
#> [1] 0.6410277 0.6727666


So far we have used 1:length(x) to specify the sequence to iterate over. A better alternative is using seq_along(x) instead of 1:length(x). This guards against errors when an empty vector is passed to 1:length(x).

# seq_along() mimics 1:length() for non-empty vectors
vec_in <- c(5, 10, 2)

1:length(vec_in)
#> [1] 1 2 3
seq_along(vec_in)
#> [1] 1 2 3
# seq_along() correctly handles empty vectors
emp_vec <- vector("numeric", 0)

1:length(emp_vec)
#> [1] 1 0
seq_along(emp_vec)
#> integer(0)



Using the Brauer data

Using the Brauer gene expression data lets create a figure showing the growth rate vs expression for four genes

# Load data
brauer_gene_exp <- read_csv("data/brauer_gene_exp.csv.gz")
# This is the function we wrote last class
plot_expr <- function(input, sys_name, ...) {
  
  gg_data <- input |>
    filter(systematic_name == sys_name)
  
  plot_title <- gg_data$name[1]
  plot_sub <- gg_data$MF[1]
  
  gg_data |>
    ggplot(aes(rate, expression, color = nutrient)) +
    geom_point(...) +
    labs(title = plot_title, subtitle = plot_sub) +
    facet_wrap(~ nutrient) +
    theme(legend.position = "none")
}


Lets try this with the copy-and-paste method, storing the plots in a list.

vec_in <- c("YDL104C", "YLR115W", "YMR183C", "YML017W")       # input

out <- vector("list", length(vec_in))                         # output 

out[[1]] <- plot_expr(brauer_gene_exp, sys_name = vec_in[1])  
out[[2]] <- plot_expr(brauer_gene_exp, sys_name = vec_in[2])
out[[3]] <- plot_expr(brauer_gene_exp, sys_name = vec_in[3])
out[[4]] <- plot_expr(brauer_gene_exp, sys_name = vec_in[4])

wrap_plots(out)


Re-write the code from above using a for loop to generate our figure

Show answer
vec_in <- c("YDL104C", "YLR115W", "YMR183C", "YML017W")  # input

out <- vector("list", length(vec_in))                    # output

for (i in seq_along(vec_in)) {

  out[[i]] <- brauer_gene_exp |>
    plot_expr(sys_name = vec_in[i])

}

wrap_plots(out)



A note on vectorization

In general you should try to use a vectorized function/approach before using iteration. Vectorized approaches will be faster and require less code to run. If you are working with a vector or matrix, then there is likely a vectorized operation that can be used.

There are however a few common places that iteration is used:



Introduction to map()

for loops are a powerful tool to reduce code duplication, however your code can be further simplified using the tidyverse map() functions provided in the purrr package. These map() functions essentially run for (i in seq_along(x)) behind the scenes so you don’t have to explicitly type this.

There is a function for each type of output:

Each map() function requires two inputs: map(.x, .f, ...)

# We previously used a for loop to create vectors with different values for mean
vals <- c(1, 50, 20, 5, 70)                  # input

out <- vector("list", length(vals))          # output

for (i in seq_along(vals)) {                 # sequence
  
  out[[i]] <- rnorm(n = 10, mean = vals[i])  # body
  
}

# Using map() we can further simplify this code
# .x indicates where each element of the vector should be inserted
out <- map(
  .x = vals,
  .f = ~ rnorm(n = 10, mean = .x)
)

# You can use brackets to include a multi-line code block
out <- map(vals, ~ {
  
  rnorm(n = 10, mean = .x)

})

# map() allows for very readable code
# Each element of the vector is passed to the first available argument
out <- map(vals, rnorm, n = 10)


Use rnorm() and map() to create 3 vectors of different lengths

Show answer
lens <- c(5, 10, 2)

out <- map(lens, ~ rnorm(n = .x))

out <- map(lens, rnorm)

out
#> [[1]]
#> [1]  1.2339641 -0.8387456  0.7673099  0.2520608 -0.4574441
#> 
#> [[2]]
#>  [1]  0.14702459  0.24631769  0.56770155  1.67943511 -1.02173199
#>  [6]  0.72204226 -0.55428999 -0.06726648 -2.11617887  0.33682865
#> 
#> [[3]]
#> [1]  0.5836781 -1.2154266


Re-write the code from above using map() to generate our growth rate figure

Show answer
genes <- c("YDL104C", "YOR069W", "YLR115W", "YPR036W")

expr_plots <- map(
  .x    = genes, 
  .f    = plot_expr, 
  input = brauer_gene_exp
)

wrap_plots(expr_plots)


Note that we can iterate over lists in addition to vectors. A common operation might be to read in multiple files and perform some operation.

# get paths to files in "data" directory (dir() is an alias for list.files())
file_names <- dir("data", full.names = TRUE, pattern = ".csv.gz")

# read each file into R and store in a list
lst_of_dfs <- map(file_names, read_csv)

# get nrow of each file
map_int(lst_of_dfs, nrow)

# select 5 random rows
map(lst_of_dfs, slice_sample, n = 5)

# check if any NAs are present
map(lst_of_dfs, ~ !all(complete.cases(.x)))


Note that a data.frame is a list in R, such that each column is one element of a list (e.g. see output of typeof(mtcars)). So if we use map() on a data.frame it will iterate over each column.

map(mtcars, mean)
map(mtcars, class)



Mapping over multiple arguments

If you have two vectors containing values that you want to pass to a function this can be accomplished with map2().

genes  <- c("YDL104C", "YOR069W", "YLR115W", "YPR036W")
shapes <- c(1, 2, 3, 4)

expr_plots <- map2(genes, shapes, ~ {
  plot_expr(
    input    = brauer_gene_exp,
    sys_name = .x,
    shape    = .y
  )
})

wrap_plots(expr_plots)


Use map2() to create plots for 4 different genes, each with a different point size

Show answer
genes <- c("YDL104C", "YOR069W", "YLR115W", "YPR036W")
sizes <- c(1, 2, 4, 6)

expr_plots <- map2(genes, sizes, ~ {
  plot_expr(
    input    = brauer_gene_exp,
    sys_name = .x,
    size     = .y
  )
})

wrap_plots(expr_plots)


pmap can be used to map over any number of arguments.

# Create a list of input vectors
genes  <- c("YDL104C", "YOR069W", "YLR115W", "YPR036W")
sizes  <- c(1, 2, 4, 6)
shapes <- c(1, 2, 3, 4)

plot_args <- list(genes, sizes, shapes)

# Use an argument list with pmap
expr_plots <- pmap(plot_args, ~ {
  plot_expr(
    input = brauer_gene_exp,
    sys_name = ..1,
    size     = ..2,
    shape    = ..3
  )
})

# A simpler way
plot_args <- list(
  sys_name = c("YDL104C", "YOR069W", "YLR115W", "YPR036W"),
  size     = c(2, 4, 6, 8),
  shape    = c(1, 2, 3, 4)
)

expr_plots <- pmap(
  .l = plot_args, 
  .f = plot_expr, 
  input = brauer_gene_exp
)

wrap_plots(expr_plots)



Additional resources

Control Structures, from R Programming for Data Science

Programming Basics: Introduction to Data Science

Control Flow: Advanced R



Show session info
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Denver
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods  
#> [7] base     
#> 
#> other attached packages:
#>  [1] patchwork_1.1.3 lubridate_1.9.3 forcats_1.0.0   stringr_1.5.0  
#>  [5] dplyr_1.1.3     purrr_1.0.2     readr_2.1.4     tidyr_1.3.0    
#>  [9] tibble_3.2.1    ggplot2_3.4.3   tidyverse_2.0.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] sass_0.4.7        utf8_1.2.3        generics_0.1.3   
#>  [4] stringi_1.7.12    distill_1.6       hms_1.1.3        
#>  [7] digest_0.6.33     magrittr_2.0.3    evaluate_0.22    
#> [10] grid_4.3.1        timechange_0.2.0  fastmap_1.1.1    
#> [13] jsonlite_1.8.7    fansi_1.0.4       scales_1.2.1     
#> [16] jquerylib_0.1.4   cli_3.6.1         crayon_1.5.2     
#> [19] rlang_1.1.1       bit64_4.0.5       munsell_0.5.0    
#> [22] withr_2.5.1       cachem_1.0.8      yaml_2.3.7       
#> [25] parallel_4.3.1    tools_4.3.1       tzdb_0.4.0       
#> [28] memoise_2.0.1     colorspace_2.1-0  vctrs_0.6.3      
#> [31] R6_2.5.1          lifecycle_1.0.3   bit_4.0.5        
#> [34] vroom_1.6.3       pkgconfig_2.0.3   pillar_1.9.0     
#> [37] bslib_0.5.1       gtable_0.3.4      glue_1.6.2       
#> [40] xfun_0.40         tidyselect_1.2.0  rstudioapi_0.15.0
#> [43] knitr_1.44        farver_2.1.1      htmltools_0.5.6  
#> [46] labeling_0.4.3    rmarkdown_2.25    compiler_4.3.1   
#> [49] downlit_0.4.3