In its simplest form, a reference matrix is built by averaging expression (also includes an option to take the median) of a single cell RNA-seq expression matrix by cluster. Both log transformed or raw count matrices are supported.
library(clustifyr) library(clustifyrdata) new_ref_matrix <- average_clusters( mat = pbmc_matrix, metadata = pbmc_meta$classified, # or use metadata = pbmc_meta, cluster_col = "classified" if_log = TRUE ) head(new_ref_matrix) #> B CD14+ Mono CD8 T DC FCGR3A+ Mono #> AL627309.1 0.00000000 0.04740195 0.02033764 0.0000000 0.00000000 #> AP006222.2 0.00000000 0.01082590 0.01184445 0.0000000 0.00000000 #> RP11-206L10.2 0.02043998 0.00000000 0.00000000 0.0812375 0.00000000 #> RP11-206L10.9 0.00000000 0.01044641 0.00000000 0.0000000 0.01192865 #> LINC00115 0.03814842 0.03685000 0.01929541 0.0000000 0.01458683 #> NOC2L 0.46155233 0.24101294 0.44279234 0.2841343 0.40564359 #> Memory CD4 T Naive CD4 T NK Platelet #> AL627309.1 0.005909767 0.006109960 0.0000000 0 #> AP006222.2 0.008172591 0.000000000 0.0000000 0 #> RP11-206L10.2 0.000000000 0.007425455 0.0000000 0 #> RP11-206L10.9 0.000000000 0.000000000 0.0000000 0 #> LINC00115 0.024390599 0.018938463 0.0569015 0 #> NOC2L 0.307346059 0.403772470 0.5311082 0
seurat
objectFor further convenience, a shortcut function for generating reference matrix from seurat
object is used here.
new_ref_matrix_v2 <- seurat_ref( seurat_object = s_small, cluster_col = "res.1" ) new_ref_matrix_v3 <- seurat_ref( seurat_object = s_small3, cluster_col = "RNA_snn_res.1" ) tail(new_ref_matrix_v3) #> 0 1 2 #> C12orf75 3.223784 0.4297757 2.305767 #> RARRES3 4.341699 1.7550597 3.794566 #> PCMT1 4.576310 2.8893691 2.223141 #> LAMP1 3.919129 0.9441022 1.565788 #> SPON2 3.670497 1.3182933 3.200796 #> S100B 3.069210 0.0000000 0.000000
If given additional assay_name
, output ref will include features from the designated slots (such as for CITE-seq data).
new_ref_matrix_v2 <- seurat_ref( seurat_object = s_small, cluster_col = "res.1", assay_name = c("ADT", "ADT2") ) new_ref_matrix_v3 <- seurat_ref( seurat_object = s_small3, cluster_col = "RNA_snn_res.1", assay_name = c("ADT", "ADT2") ) tail(new_ref_matrix_v3) #> 0 1 2 #> A 1 1 1 #> B 1 1 1 #> C 1 1 1 #> D 1 1 1
Bulk RNA-Seq data can be obtained from any input source; in this example we will obtain a dataset from the recount2 database. This database provides > 2000 human RNA-Seq experiments that have been processed using a consistent pipeline. We have written a wrapper function to download a count matrix from recount2
, given an SRA ID.
library(dplyr) library(purrr) library(tidyr) library(stringr) library(recount2) dl_recount <- function(sra_id) { if (!file.exists(file.path(sra_id, "rse_gene.Rdata"))) { download_study(sra_id) } load(file.path(sra_id, "rse_gene.Rdata")) # no longer need to downloaded data unlink(sra_id, recursive = TRUE) rse <- scale_counts(rse_gene) read_counts <- assay(rse, "counts") gene_ids <- rownames(read_counts) # get gene symbols, which are stored in rowData id2symbol <- data_frame( ids = rowData(rse_gene)$gene_id, symbols = rowData(rse_gene)$symbol@listData ) %>% mutate(symbols = map_chr(symbols, ~ .x[1])) # clean up metadata into a dataframe print("cleaning up meta") mdata <- colData(rse) mdata_cols <- lapply( mdata$characteristics, function(x) { str_match(x, "^([^:]+):")[, 2] } ) %>% unique() %>% unlist() mdata <- data_frame( run = mdata$run, all_data = as.list(mdata$characteristics) ) %>% mutate(out = purrr::map_chr(all_data, ~ str_c(.x, collapse = "::"))) %>% tidyr::separate( out, sep = "::", into = mdata_cols ) %>% select(-all_data) %>% mutate_at( .vars = vars(-matches("run")), .funs = function(x) str_match(x, ": (.+)")[, 2] ) # convert ids to symbols row_ids_to_symbols <- left_join(data_frame(ids = gene_ids), id2symbol, by = "ids") if (length(gene_ids) != nrow(row_ids_to_symbols)) { warning("gene id mapping to symbols produce more or less ids") } row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols)) out_df <- read_counts %>% as.data.frame() %>% tibble::rownames_to_column("gene_id") %>% left_join(., row_ids_to_symbols, by = c("gene_id" = "ids")) %>% dplyr::select(-gene_id) %>% dplyr::select(symbols, everything()) %>% filter(!is.na(symbols)) out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>% group_by(symbols, library) %>% summarize(expr = sum(expr)) %>% tidyr::spread(library, expr) %>% as.data.frame() if (tibble::has_rownames(out_matrix)) { out_matrix <- tibble::remove_rownames(out_matrix) } out_matrix <- out_matrix %>% tibble::column_to_rownames("symbols") %>% as.matrix() list( read_counts = out_matrix, meta_data = mdata ) } gtex_data <- dl_recount("SRP012682") gtex_data$read_counts[1:5, 1:5] gtex_data$meta_data[1:5, ]
clustifyr
works with microarray data-derived gene expression matrix as well (see Benchmark page). An example of generating a matrix for sorted immune cell type can be found below. The matrix can be found at clustifyrdata::hema_microarray_matrix
library(tidyverse) # read series matrix for column names GSE24759 <- read_tsv("GSE24759/GSE24759_series_matrix.txt", skip = 25, n_max = 1) # read series matrix for full dataset; apply column names GSE24759 <- read_tsv("GSE24759//GSE24759_series_matrix.txt", skip = 57, col_names = colnames(GSE24759)) %>% rename(ID = `!Sample_title`) # read array metadata table, remove control probes and missing gene symbols GPL4685 <- read_tsv("/Users/rf/microarray/GSE24759/GPL4685-15513.txt", skip = 14) %>% filter(!is.na(`Gene Symbol`), `Sequence Type` != "Control sequence") %>% separate(`Gene Symbol`, into = "gene_symbol", sep = " ") %>% select(ID, gene_symbol) # join data and metadata, collapse gene symbols GSE24759 <- inner_join(GSE24759, GPL4685) %>% select(-ID) %>% group_by(gene_symbol) %>% mutate_at(.vars = vars(-gene_symbol), .funs = list(~ log2(mean(2^.)))) # convert to matrix, add rownames GSE24759_mat <- ungroup(GSE24759) %>% select(-gene_symbol) %>% as.matrix() row.names(GSE24759_mat) <- GSE24759$gene_symbol # merge samples ref_hema_microarray <- GSE24759_mat %>% t() %>% as.data.frame() %>% rownames_to_column("sample") %>% mutate(sample = str_remove(sample, ", .*")) %>% group_by(sample) %>% summarise_all(.funs = list(~ log2(mean(2^.)))) if (tibble::has_rownames(ref_hema_microarray)) { ref_hema_microarray <- tibble::remove_rownames(ref_hema_microarray) } ref_hema_microarray <- ref_hema_microarray %>% column_to_rownames("sample") %>% t()
Essentially, any data in matrix form is accepted by clustifyR
.
clustifyrdata::ref_hema_microarray[1:5, 1:5] #> Basophils CD4+ Central Memory CD4+ Effector Memory CD8+ Central Memory #> DDR1 6.084244 5.967502 5.933039 6.005278 #> RFC2 6.280044 6.028615 6.047005 5.992979 #> HSPA6 6.535444 5.811475 5.746326 5.928349 #> PAX8 6.669153 5.896401 6.118577 6.270870 #> GUCA1A 5.239230 5.232116 5.206960 5.227415 #> CD8+ Effector Memory #> DDR1 5.895926 #> RFC2 5.942426 #> HSPA6 5.942670 #> PAX8 6.323922 #> GUCA1A 5.090882
clustify_lists
and clustify_nudge
uses dataframe or matrix of candidate genes for classification. Example formatting are shown below.
# directly from seurat function head(pbmc_markers) #> p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene #> RPS12 2.008629e-140 0.5029988 1.000 0.991 2.754633e-136 0 RPS12 #> RPS27 2.624075e-140 0.5020359 0.999 0.992 3.598656e-136 0 RPS27 #> RPS6 1.280169e-138 0.4673635 1.000 0.995 1.755623e-134 0 RPS6 #> RPL32 4.358823e-135 0.4242773 0.999 0.995 5.977689e-131 0 RPL32 #> RPS14 3.618793e-128 0.4283480 1.000 0.994 4.962812e-124 0 RPS14 #> RPS25 5.612007e-124 0.5203411 0.997 0.975 7.696306e-120 0 RPS25 # manually enter into dataframe format betam <- c("INS", "IGF2", "IAPP", "MAFA", "NPTX2") alpham <- c("GCG", "PDK4", "LOXL2", "IRX2", "GC") deltam <- c("SST", "RBP4", "HHEX", "PCSK1", "LEPR") data.frame(alpham, betam, deltam) #> alpham betam deltam #> 1 GCG INS SST #> 2 PDK4 IGF2 RBP4 #> 3 LOXL2 IAPP HHEX #> 4 IRX2 MAFA PCSK1 #> 5 GC NPTX2 LEPR # use matrixize_markers function to convert mm <- matrixize_markers( marker_df = pbmc_markers, remove_rp = TRUE, unique = TRUE, n = 10 ) head(mm) #> 0 1 2 3 4 5 6 7 8 #> 1 EEF1A1 AQP3 S100A9 CD79A GZMK CDKN1C SPON2 FCER1A GP9 #> 2 CCR7 CD40LG S100A8 MS4A1 CD8A HES4 AKR1C3 SERPINF1 ITGA2B #> 3 NPM1 TRADD CD14 CD79B CD8B CKB XCL2 CLEC10A TMEM40 #> 4 PRKCQ-AS1 TTC39C CSF3R LINC00926 DUSP2 LILRA3 CLIC3 ENHO AP001189.4 #> 5 GLTSCR2 JUN ASGR1 TCL1A MT-CYB MS4A4A KLRD1 CLIC2 LY6G6F #> 6 BTG1 ARHGAP15 FOLR3 VPREB3 DDX5 LILRB1 TTC38 CD1C SEPT5 # get markers from ref_matrix, and then matrixize_markers cbmc_m <- matrixize_markers( marker_df = ref_marker_select(cbmc_ref), remove_rp = TRUE, unique = TRUE, n = 3 ) # parse `garnett` marker files, see function `file_marker_parse`
A few reference datasets are built into clustifyr for vignettes and testing: pbmc_bulk_matrix
, cbmc_ref
, pbmc_markers
, cbmc_m
More reference data, including tabula muris, and code used to generate them are available at https://github.com/rnabioco/clustifyrdata.
Also see list for individual downloads at https://rnabioco.github.io/clustifyr/articles/download_refs.html
# install.packages("devtools")
devtools::install_github("rnabioco/clustifyrdata")