Compare scRNA-seq data to reference data.
Usage
clustify(input, ...)
# Default S3 method
clustify(
input,
ref_mat,
metadata = NULL,
cluster_col = NULL,
query_genes = NULL,
n_genes = 1000,
per_cell = FALSE,
n_perm = 0,
compute_method = "spearman",
pseudobulk_method = "mean",
verbose = TRUE,
lookuptable = NULL,
rm0 = FALSE,
obj_out = TRUE,
seurat_out = obj_out,
vec_out = FALSE,
rename_prefix = NULL,
threshold = "auto",
low_threshold_cell = 0,
exclude_genes = c(),
if_log = TRUE,
organism = "hsapiens",
plot_name = NULL,
rds_name = NULL,
expand_unassigned = FALSE,
...
)
# S3 method for class 'Seurat'
clustify(
input,
ref_mat,
cluster_col = NULL,
query_genes = NULL,
n_genes = 1000,
per_cell = FALSE,
n_perm = 0,
compute_method = "spearman",
pseudobulk_method = "mean",
use_var_genes = TRUE,
dr = "umap",
obj_out = TRUE,
seurat_out = obj_out,
vec_out = FALSE,
threshold = "auto",
verbose = TRUE,
rm0 = FALSE,
rename_prefix = NULL,
exclude_genes = c(),
metadata = NULL,
organism = "hsapiens",
plot_name = NULL,
rds_name = NULL,
expand_unassigned = FALSE,
...
)
# S3 method for class 'SingleCellExperiment'
clustify(
input,
ref_mat,
cluster_col = NULL,
query_genes = NULL,
per_cell = FALSE,
n_perm = 0,
compute_method = "spearman",
pseudobulk_method = "mean",
use_var_genes = TRUE,
dr = "umap",
obj_out = TRUE,
seurat_out = obj_out,
vec_out = FALSE,
threshold = "auto",
verbose = TRUE,
rm0 = FALSE,
rename_prefix = NULL,
exclude_genes = c(),
metadata = NULL,
organism = "hsapiens",
plot_name = NULL,
rds_name = NULL,
expand_unassigned = FALSE,
...
)
Arguments
- input
single-cell expression matrix or Seurat object
- ...
additional arguments to pass to compute_method function
- ref_mat
reference expression matrix
- metadata
cell cluster assignments, supplied as a vector or data.frame. If data.frame is supplied then
cluster_col
needs to be set. Not required if running correlation per cell.- cluster_col
column in metadata that contains cluster ids per cell. Will default to first column of metadata if not supplied. Not required if running correlation per cell.
- query_genes
A vector of genes of interest to compare. If NULL, then common genes between the expr_mat and ref_mat will be used for comparision.
- n_genes
number of genes limit for Seurat variable genes, by default 1000, set to 0 to use all variable genes (generally not recommended)
- per_cell
if true run per cell, otherwise per cluster.
- n_perm
number of permutations, set to 0 by default
- compute_method
method(s) for computing similarity scores
- pseudobulk_method
method used for summarizing clusters, options are mean (default), median, truncate (10% truncated mean), or trimean, max, min
- verbose
whether to report certain variables chosen and steps
- lookuptable
if not supplied, will look in built-in table for object parsing
- rm0
consider 0 as missing data, recommended for per_cell
- obj_out
whether to output object instead of cor matrix
- seurat_out
output cor matrix or called seurat object (deprecated, use obj_out instead)
- vec_out
only output a result vector in the same order as metadata
- rename_prefix
prefix to add to type and r column names
- threshold
identity calling minimum correlation score threshold, only used when obj_out = TRUE
- low_threshold_cell
option to remove clusters with too few cells
- exclude_genes
a vector of gene names to throw out of query
- if_log
input data is natural log, averaging will be done on unlogged data
- organism
for GO term analysis, organism name: human - 'hsapiens', mouse - 'mmusculus'
- plot_name
name for saved pdf, if NULL then no file is written (default)
- rds_name
name for saved rds of rank_diff, if NULL then no file is written (default)
- expand_unassigned
test all ref clusters for unassigned results
- use_var_genes
if providing a seurat object, use the variable genes (stored in seurat_object@var.genes) as the query_genes.
- dr
stored dimension reduction
Value
single cell object with identity assigned in metadata, or matrix of correlation values, clusters from input as row names, cell types from ref_mat as column names
Examples
# Annotate a matrix and metadata
clustify(
input = pbmc_matrix_small,
metadata = pbmc_meta,
ref_mat = cbmc_ref,
query_genes = pbmc_vargenes,
cluster_col = "RNA_snn_res.0.5",
verbose = TRUE
)
#> using # of genes: 599
#> similarity computation completed, matrix of 9 x 13, preparing output
#> B CD14+ Mono CD16+ Mono CD34+ CD4 T CD8 T DC
#> 0 0.6443057 0.5213433 0.5625255 0.6480900 0.8891488 0.8707703 0.5419174
#> 1 0.6268758 0.5373744 0.5866177 0.6265186 0.8610826 0.8347482 0.5541873
#> 2 0.5279272 0.9145428 0.8919954 0.5103056 0.4636934 0.4381450 0.7643448
#> 3 0.9093577 0.5347126 0.5908755 0.6248639 0.6411407 0.6269530 0.6036723
#> 4 0.5791953 0.4853982 0.5498177 0.5663773 0.7657468 0.7692801 0.5187820
#> 5 0.5380921 0.8429769 0.9294914 0.5331927 0.4857454 0.4643569 0.7235944
#> 6 0.5042912 0.4399003 0.5092065 0.5350808 0.6755032 0.6926741 0.4817286
#> 7 0.6088123 0.7415535 0.7520484 0.5694074 0.4951613 0.4789794 0.8493399
#> 8 0.1456751 0.2571662 0.2867421 0.2455947 0.1422081 0.1208273 0.1898078
#> Eryth Memory CD4 T Mk Naive CD4 T NK pDCs
#> 0 0.5528441 0.7277404 0.2668581 0.6967073 0.6880495 0.4631464
#> 1 0.5395170 0.7128879 0.2635539 0.6871755 0.6999959 0.4696025
#> 2 0.4559350 0.4553454 0.4227047 0.4564707 0.5917870 0.4738376
#> 3 0.4817829 0.4803367 0.2241017 0.5407149 0.5484213 0.5823418
#> 4 0.5007841 0.6323214 0.2961827 0.6036230 0.8257838 0.4536589
#> 5 0.4604308 0.4614474 0.3937726 0.4574654 0.6028156 0.4797367
#> 6 0.4578726 0.5665250 0.2783269 0.5505877 0.8940904 0.4185162
#> 7 0.4470039 0.4764929 0.2948557 0.4475783 0.5643991 0.6825388
#> 8 0.3068445 0.1457308 0.7319575 0.1609963 0.1396979 0.2152614
# Annotate using a different method
clustify(
input = pbmc_matrix_small,
metadata = pbmc_meta,
ref_mat = cbmc_ref,
query_genes = pbmc_vargenes,
cluster_col = "RNA_snn_res.0.5",
compute_method = "cosine"
)
#> using # of genes: 599
#> similarity computation completed, matrix of 9 x 13, preparing output
#> B CD14+ Mono CD16+ Mono CD34+ CD4 T CD8 T DC
#> 0 0.7986227 0.7404425 0.7704809 0.8194404 0.9493149 0.9444251 0.7508430
#> 1 0.7865337 0.7486511 0.7797697 0.8075621 0.9318837 0.9235744 0.7594695
#> 2 0.7527223 0.9627275 0.9460084 0.7577338 0.7096110 0.7061112 0.8919577
#> 3 0.9669867 0.7149343 0.7720125 0.8004738 0.7635724 0.7657181 0.8062767
#> 4 0.7510063 0.7070527 0.7467587 0.7677927 0.8562280 0.8674614 0.7351337
#> 5 0.7590829 0.9172636 0.9762851 0.7746692 0.7286717 0.7274581 0.8800476
#> 6 0.6784059 0.6671718 0.7107553 0.7238537 0.7777913 0.7973584 0.6864934
#> 7 0.8230905 0.8842235 0.9086642 0.8037140 0.7246647 0.7260570 0.9560634
#> 8 0.5463488 0.5932935 0.6068613 0.5953661 0.5558004 0.5516889 0.5730797
#> Eryth Memory CD4 T Mk Naive CD4 T NK pDCs
#> 0 0.7181980 0.8877208 0.7125300 0.8847456 0.8097839 0.7246787
#> 1 0.7101214 0.8855087 0.7122113 0.8731255 0.8102022 0.7241459
#> 2 0.6951238 0.7557225 0.7911713 0.7542713 0.7675894 0.7495702
#> 3 0.6514282 0.7123319 0.6594086 0.7494337 0.6957714 0.7863873
#> 4 0.6826181 0.8232194 0.7188642 0.8123971 0.9018633 0.7085898
#> 5 0.6880012 0.7601400 0.7722167 0.7561998 0.7776798 0.7477358
#> 6 0.6500106 0.7578212 0.6960715 0.7544227 0.9480188 0.6812692
#> 7 0.6874076 0.7519049 0.7448677 0.7491931 0.7595771 0.8503681
#> 8 0.5898023 0.5655621 0.8775257 0.5455913 0.5569351 0.5756617
# Annotate a SingleCellExperiment object
sce <- sce_pbmc()
clustify(
sce,
cbmc_ref,
cluster_col = "clusters",
obj_out = TRUE,
per_cell = FALSE,
dr = "umap"
)
#> object data retrieval complete, moving to similarity computation
#> Variable features not available, using all genes instead
#> consider supplying variable features to `query_genes` argument.
#> using # of genes: 599
#> similarity computation completed, matrix of 9 x 13, preparing output
#> using threshold of 0.7
#> class: SingleCellExperiment
#> dim: 2000 2638
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(2000): PPBP LYZ ... CLIC2 HEMGN
#> rowData names(0):
#> colnames(2638): AAACATACAACCAC AAACATTGAGCTAC ... TTTGCATGAGAGGC
#> TTTGCATGCCTCAC
#> colData names(8): cell_source sum ... type r
#> reducedDimNames(1): UMAP
#> mainExpName: NULL
#> altExpNames(0):
# Annotate a Seurat object
so <- so_pbmc()
clustify(
so,
cbmc_ref,
cluster_col = "seurat_clusters",
obj_out = TRUE,
per_cell = FALSE,
dr = "umap"
)
#> object data retrieval complete, moving to similarity computation
#> using # of genes: 356
#> similarity computation completed, matrix of 9 x 13, preparing output
#> using threshold of 0.7
#> An object of class Seurat
#> 2000 features across 2638 samples within 1 assay
#> Active assay: RNA (2000 features, 2000 variable features)
#> 2 layers present: counts, data
#> 1 dimensional reduction calculated: umap
# Annotate (and return) a Seurat object per-cell
clustify(
input = so,
ref_mat = cbmc_ref,
cluster_col = "seurat_clusters",
obj_out = TRUE,
per_cell = TRUE,
dr = "umap"
)
#> object data retrieval complete, moving to similarity computation
#> using # of genes: 356
#> similarity computation completed, matrix of 2638 x 13, preparing output
#> using threshold of 0.55
#> An object of class Seurat
#> 2000 features across 2638 samples within 1 assay
#> Active assay: RNA (2000 features, 2000 variable features)
#> 2 layers present: counts, data
#> 1 dimensional reduction calculated: umap