Reference building for clustifyr

Building reference matrix from single cell expression matrix

In its simplest form, a reference matrix is built by averaging expression (also includes an option to take the median) of a single cell RNA-seq expression matrix by cluster. Both log transformed or raw count matrices are supported.

library(clustifyr)
library(clustifyrdata)

new_ref_matrix <- average_clusters(
  mat = pbmc_matrix,
  metadata = pbmc_meta$classified, # or use metadata = pbmc_meta, cluster_col = "classified"
  if_log = TRUE
)

head(new_ref_matrix)
#>                        B CD14+ Mono      CD8 T        DC FCGR3A+ Mono
#> AL627309.1    0.00000000 0.04740195 0.02033764 0.0000000   0.00000000
#> AP006222.2    0.00000000 0.01082590 0.01184445 0.0000000   0.00000000
#> RP11-206L10.2 0.02043998 0.00000000 0.00000000 0.0812375   0.00000000
#> RP11-206L10.9 0.00000000 0.01044641 0.00000000 0.0000000   0.01192865
#> LINC00115     0.03814842 0.03685000 0.01929541 0.0000000   0.01458683
#> NOC2L         0.46155233 0.24101294 0.44279234 0.2841343   0.40564359
#>               Memory CD4 T Naive CD4 T        NK Platelet
#> AL627309.1     0.005909767 0.006109960 0.0000000        0
#> AP006222.2     0.008172591 0.000000000 0.0000000        0
#> RP11-206L10.2  0.000000000 0.007425455 0.0000000        0
#> RP11-206L10.9  0.000000000 0.000000000 0.0000000        0
#> LINC00115      0.024390599 0.018938463 0.0569015        0
#> NOC2L          0.307346059 0.403772470 0.5311082        0

Building reference matrix from `seurat` object

For further convenience, a shortcut function for generating reference matrix from seurat object is used here.

new_ref_matrix_v2 <- seurat_ref(
  seurat_object = s_small,
  cluster_col = "res.1"
)

new_ref_matrix_v3 <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1"
)

tail(new_ref_matrix_v3)
#>                 0         1        2
#> C12orf75 3.223784 0.4297757 2.305767
#> RARRES3  4.341699 1.7550597 3.794566
#> PCMT1    4.576310 2.8893691 2.223141
#> LAMP1    3.919129 0.9441022 1.565788
#> SPON2    3.670497 1.3182933 3.200796
#> S100B    3.069210 0.0000000 0.000000

If given additional assay_name, output ref will include features from the designated slots (such as for CITE-seq data).

new_ref_matrix_v2 <- seurat_ref(
  seurat_object = s_small,
  cluster_col = "res.1",
  assay_name = c("ADT", "ADT2")
)

new_ref_matrix_v3 <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1",
  assay_name = c("ADT", "ADT2")
)

tail(new_ref_matrix_v3)
#>   0 1 2
#> A 1 1 1
#> B 1 1 1
#> C 1 1 1
#> D 1 1 1

Building reference matrix from Recount2

Bulk RNA-Seq data can be obtained from any input source; in this example we will obtain a dataset from the recount2 database. This database provides > 2000 human RNA-Seq experiments that have been processed using a consistent pipeline. We have written a wrapper function to download a count matrix from recount2, given an SRA ID.

library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(recount2)

dl_recount <- function(sra_id) {
  if (!file.exists(file.path(sra_id, "rse_gene.Rdata"))) {
    download_study(sra_id)
  }

  load(file.path(sra_id, "rse_gene.Rdata"))

  # no longer need to downloaded data
  unlink(sra_id, recursive = TRUE)
  rse <- scale_counts(rse_gene)
  read_counts <- assay(rse, "counts")
  gene_ids <- rownames(read_counts)

  # get gene symbols, which are stored in rowData
  id2symbol <- data_frame(
    ids = rowData(rse_gene)$gene_id,
    symbols = rowData(rse_gene)$symbol@listData
  ) %>%
    mutate(symbols = map_chr(symbols, ~ .x[1]))

  # clean up metadata into a dataframe
  print("cleaning up meta")
  mdata <- colData(rse)
  mdata_cols <- lapply(
    mdata$characteristics,
    function(x) {
      str_match(x, "^([^:]+):")[, 2]
    }
  ) %>%
    unique() %>%
    unlist()

  mdata <- data_frame(
    run = mdata$run,
    all_data = as.list(mdata$characteristics)
  ) %>%
    mutate(out = purrr::map_chr(all_data, ~ str_c(.x, collapse = "::"))) %>%
    tidyr::separate(
      out,
      sep = "::",
      into = mdata_cols
    ) %>%
    select(-all_data) %>%
    mutate_at(
      .vars = vars(-matches("run")),
      .funs = function(x) str_match(x, ": (.+)")[, 2]
    )

  # convert ids to symbols
  row_ids_to_symbols <- left_join(data_frame(ids = gene_ids), id2symbol, by = "ids")

  if (length(gene_ids) != nrow(row_ids_to_symbols)) {
    warning("gene id mapping to symbols produce more or less ids")
  }

  row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols))

  out_df <- read_counts %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_id") %>%
    left_join(., row_ids_to_symbols, by = c("gene_id" = "ids")) %>%
    dplyr::select(-gene_id) %>%
    dplyr::select(symbols, everything()) %>%
    filter(!is.na(symbols))

  out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>%
    group_by(symbols, library) %>%
    summarize(expr = sum(expr)) %>%
    tidyr::spread(library, expr) %>%
    as.data.frame()
  if (tibble::has_rownames(out_matrix)) {
    out_matrix <- tibble::remove_rownames(out_matrix)
  }
  out_matrix <- out_matrix %>% tibble::column_to_rownames("symbols") %>%
    as.matrix()

  list(
    read_counts = out_matrix,
    meta_data = mdata
  )
}

gtex_data <- dl_recount("SRP012682")

gtex_data$read_counts[1:5, 1:5]
gtex_data$meta_data[1:5, ]

Building reference matrix from microarray data

clustifyr works with microarray data-derived gene expression matrix as well (see Benchmark page). An example of generating a matrix for sorted immune cell type can be found below. The matrix can be found at clustifyrdata::hema_microarray_matrix

library(tidyverse)
# read series matrix for column names
GSE24759 <- read_tsv("GSE24759/GSE24759_series_matrix.txt", skip = 25, n_max = 1)

# read series matrix for full dataset; apply column names
GSE24759 <- read_tsv("GSE24759//GSE24759_series_matrix.txt", skip = 57, col_names = colnames(GSE24759)) %>% rename(ID = `!Sample_title`)

# read array metadata table, remove control probes and missing gene symbols
GPL4685 <- read_tsv("/Users/rf/microarray/GSE24759/GPL4685-15513.txt", skip = 14) %>%
  filter(!is.na(`Gene Symbol`), `Sequence Type` != "Control sequence") %>%
  separate(`Gene Symbol`, into = "gene_symbol", sep = " ") %>%
  select(ID, gene_symbol)

# join data and metadata, collapse gene symbols
GSE24759 <- inner_join(GSE24759, GPL4685) %>%
  select(-ID) %>%
  group_by(gene_symbol) %>%
  mutate_at(.vars = vars(-gene_symbol), .funs = list(~ log2(mean(2^.))))

# convert to matrix, add rownames
GSE24759_mat <- ungroup(GSE24759) %>%
  select(-gene_symbol) %>%
  as.matrix()
row.names(GSE24759_mat) <- GSE24759$gene_symbol

# merge samples
ref_hema_microarray <- GSE24759_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  mutate(sample = str_remove(sample, ", .*")) %>%
  group_by(sample) %>%
  summarise_all(.funs = list(~ log2(mean(2^.))))

if (tibble::has_rownames(ref_hema_microarray)) {
  ref_hema_microarray <- tibble::remove_rownames(ref_hema_microarray)
}
ref_hema_microarray <- ref_hema_microarray %>%
  column_to_rownames("sample") %>%
  t()

Essentially, any data in matrix form is accepted by clustifyR.

clustifyrdata::ref_hema_microarray[1:5, 1:5]
#>        Basophils CD4+ Central Memory CD4+ Effector Memory CD8+ Central Memory
#> DDR1    6.084244            5.967502             5.933039            6.005278
#> RFC2    6.280044            6.028615             6.047005            5.992979
#> HSPA6   6.535444            5.811475             5.746326            5.928349
#> PAX8    6.669153            5.896401             6.118577            6.270870
#> GUCA1A  5.239230            5.232116             5.206960            5.227415
#>        CD8+ Effector Memory
#> DDR1               5.895926
#> RFC2               5.942426
#> HSPA6              5.942670
#> PAX8               6.323922
#> GUCA1A             5.090882

Building reference gene list

clustify_lists and clustify_nudge uses dataframe or matrix of candidate genes for classification. Example formatting are shown below.

# directly from seurat function
head(pbmc_markers)
#>               p_val avg_logFC pct.1 pct.2     p_val_adj cluster  gene
#> RPS12 2.008629e-140 0.5029988 1.000 0.991 2.754633e-136       0 RPS12
#> RPS27 2.624075e-140 0.5020359 0.999 0.992 3.598656e-136       0 RPS27
#> RPS6  1.280169e-138 0.4673635 1.000 0.995 1.755623e-134       0  RPS6
#> RPL32 4.358823e-135 0.4242773 0.999 0.995 5.977689e-131       0 RPL32
#> RPS14 3.618793e-128 0.4283480 1.000 0.994 4.962812e-124       0 RPS14
#> RPS25 5.612007e-124 0.5203411 0.997 0.975 7.696306e-120       0 RPS25

# manually enter into dataframe format
betam <- c("INS", "IGF2", "IAPP", "MAFA", "NPTX2")
alpham <- c("GCG", "PDK4", "LOXL2", "IRX2", "GC")
deltam <- c("SST", "RBP4", "HHEX", "PCSK1", "LEPR")
data.frame(alpham, betam, deltam)
#>   alpham betam deltam
#> 1    GCG   INS    SST
#> 2   PDK4  IGF2   RBP4
#> 3  LOXL2  IAPP   HHEX
#> 4   IRX2  MAFA  PCSK1
#> 5     GC NPTX2   LEPR

# use matrixize_markers function to convert
mm <- matrixize_markers(
  marker_df = pbmc_markers,
  remove_rp = TRUE,
  unique = TRUE,
  n = 10
)
head(mm)
#>           0        1      2         3      4      5      6        7          8
#> 1    EEF1A1     AQP3 S100A9     CD79A   GZMK CDKN1C  SPON2   FCER1A        GP9
#> 2      CCR7   CD40LG S100A8     MS4A1   CD8A   HES4 AKR1C3 SERPINF1     ITGA2B
#> 3      NPM1    TRADD   CD14     CD79B   CD8B    CKB   XCL2  CLEC10A     TMEM40
#> 4 PRKCQ-AS1   TTC39C  CSF3R LINC00926  DUSP2 LILRA3  CLIC3     ENHO AP001189.4
#> 5   GLTSCR2      JUN  ASGR1     TCL1A MT-CYB MS4A4A  KLRD1    CLIC2     LY6G6F
#> 6      BTG1 ARHGAP15  FOLR3    VPREB3   DDX5 LILRB1  TTC38     CD1C      SEPT5

# get markers from ref_matrix, and then matrixize_markers
cbmc_m <- matrixize_markers(
  marker_df = ref_marker_select(cbmc_ref),
  remove_rp = TRUE,
  unique = TRUE,
  n = 3
)

# parse `garnett` marker files, see function `file_marker_parse`

Prebuilt references, in clustifyr

A few reference datasets are built into clustifyr for vignettes and testing: pbmc_bulk_matrix, cbmc_ref, pbmc_markers, cbmc_m

Prebuilt references, in clustifyrdata

More reference data, including tabula muris, and code used to generate them are available at https://github.com/rnabioco/clustifyrdata.

Also see list for individual downloads at https://rnabioco.github.io/clustifyr/articles/download_refs.html

# install.packages("devtools")
devtools::install_github("rnabioco/clustifyrdata")