Skip to contents

Compare scRNA-seq data to reference data.

Usage

clustify(input, ...)

# Default S3 method
clustify(
  input,
  ref_mat,
  metadata = NULL,
  cluster_col = NULL,
  query_genes = NULL,
  n_genes = 1000,
  per_cell = FALSE,
  n_perm = 0,
  compute_method = "spearman",
  pseudobulk_method = "mean",
  verbose = TRUE,
  lookuptable = NULL,
  rm0 = FALSE,
  obj_out = TRUE,
  seurat_out = obj_out,
  vec_out = FALSE,
  rename_prefix = NULL,
  threshold = "auto",
  low_threshold_cell = 0,
  exclude_genes = c(),
  if_log = TRUE,
  organism = "hsapiens",
  plot_name = NULL,
  rds_name = NULL,
  expand_unassigned = FALSE,
  ...
)

# S3 method for class 'Seurat'
clustify(
  input,
  ref_mat,
  cluster_col = NULL,
  query_genes = NULL,
  n_genes = 1000,
  per_cell = FALSE,
  n_perm = 0,
  compute_method = "spearman",
  pseudobulk_method = "mean",
  use_var_genes = TRUE,
  dr = "umap",
  obj_out = TRUE,
  seurat_out = obj_out,
  vec_out = FALSE,
  threshold = "auto",
  verbose = TRUE,
  rm0 = FALSE,
  rename_prefix = NULL,
  exclude_genes = c(),
  metadata = NULL,
  organism = "hsapiens",
  plot_name = NULL,
  rds_name = NULL,
  expand_unassigned = FALSE,
  ...
)

# S3 method for class 'SingleCellExperiment'
clustify(
  input,
  ref_mat,
  cluster_col = NULL,
  query_genes = NULL,
  per_cell = FALSE,
  n_perm = 0,
  compute_method = "spearman",
  pseudobulk_method = "mean",
  use_var_genes = TRUE,
  dr = "umap",
  obj_out = TRUE,
  seurat_out = obj_out,
  vec_out = FALSE,
  threshold = "auto",
  verbose = TRUE,
  rm0 = FALSE,
  rename_prefix = NULL,
  exclude_genes = c(),
  metadata = NULL,
  organism = "hsapiens",
  plot_name = NULL,
  rds_name = NULL,
  expand_unassigned = FALSE,
  ...
)

Arguments

input

single-cell expression matrix or Seurat object

...

additional arguments to pass to compute_method function

ref_mat

reference expression matrix

metadata

cell cluster assignments, supplied as a vector or data.frame. If data.frame is supplied then cluster_col needs to be set. Not required if running correlation per cell.

cluster_col

column in metadata that contains cluster ids per cell. Will default to first column of metadata if not supplied. Not required if running correlation per cell.

query_genes

A vector of genes of interest to compare. If NULL, then common genes between the expr_mat and ref_mat will be used for comparision.

n_genes

number of genes limit for Seurat variable genes, by default 1000, set to 0 to use all variable genes (generally not recommended)

per_cell

if true run per cell, otherwise per cluster.

n_perm

number of permutations, set to 0 by default

compute_method

method(s) for computing similarity scores

pseudobulk_method

method used for summarizing clusters, options are mean (default), median, truncate (10% truncated mean), or trimean, max, min

verbose

whether to report certain variables chosen and steps

lookuptable

if not supplied, will look in built-in table for object parsing

rm0

consider 0 as missing data, recommended for per_cell

obj_out

whether to output object instead of cor matrix

seurat_out

output cor matrix or called seurat object (deprecated, use obj_out instead)

vec_out

only output a result vector in the same order as metadata

rename_prefix

prefix to add to type and r column names

threshold

identity calling minimum correlation score threshold, only used when obj_out = TRUE

low_threshold_cell

option to remove clusters with too few cells

exclude_genes

a vector of gene names to throw out of query

if_log

input data is natural log, averaging will be done on unlogged data

organism

for GO term analysis, organism name: human - 'hsapiens', mouse - 'mmusculus'

plot_name

name for saved pdf, if NULL then no file is written (default)

rds_name

name for saved rds of rank_diff, if NULL then no file is written (default)

expand_unassigned

test all ref clusters for unassigned results

use_var_genes

if providing a seurat object, use the variable genes (stored in seurat_object@var.genes) as the query_genes.

dr

stored dimension reduction

Value

single cell object with identity assigned in metadata, or matrix of correlation values, clusters from input as row names, cell types from ref_mat as column names

Examples

# Annotate a matrix and metadata
clustify(
    input = pbmc_matrix_small,
    metadata = pbmc_meta,
    ref_mat = cbmc_ref,
    query_genes = pbmc_vargenes,
    cluster_col = "RNA_snn_res.0.5",
    verbose = TRUE
)
#> using # of genes: 599
#> similarity computation completed, matrix of 9 x 13, preparing output
#>           B CD14+ Mono CD16+ Mono     CD34+     CD4 T     CD8 T        DC
#> 0 0.6443057  0.5213433  0.5625255 0.6480900 0.8891488 0.8707703 0.5419174
#> 1 0.6268758  0.5373744  0.5866177 0.6265186 0.8610826 0.8347482 0.5541873
#> 2 0.5279272  0.9145428  0.8919954 0.5103056 0.4636934 0.4381450 0.7643448
#> 3 0.9093577  0.5347126  0.5908755 0.6248639 0.6411407 0.6269530 0.6036723
#> 4 0.5791953  0.4853982  0.5498177 0.5663773 0.7657468 0.7692801 0.5187820
#> 5 0.5380921  0.8429769  0.9294914 0.5331927 0.4857454 0.4643569 0.7235944
#> 6 0.5042912  0.4399003  0.5092065 0.5350808 0.6755032 0.6926741 0.4817286
#> 7 0.6088123  0.7415535  0.7520484 0.5694074 0.4951613 0.4789794 0.8493399
#> 8 0.1456751  0.2571662  0.2867421 0.2455947 0.1422081 0.1208273 0.1898078
#>       Eryth Memory CD4 T        Mk Naive CD4 T        NK      pDCs
#> 0 0.5528441    0.7277404 0.2668581   0.6967073 0.6880495 0.4631464
#> 1 0.5395170    0.7128879 0.2635539   0.6871755 0.6999959 0.4696025
#> 2 0.4559350    0.4553454 0.4227047   0.4564707 0.5917870 0.4738376
#> 3 0.4817829    0.4803367 0.2241017   0.5407149 0.5484213 0.5823418
#> 4 0.5007841    0.6323214 0.2961827   0.6036230 0.8257838 0.4536589
#> 5 0.4604308    0.4614474 0.3937726   0.4574654 0.6028156 0.4797367
#> 6 0.4578726    0.5665250 0.2783269   0.5505877 0.8940904 0.4185162
#> 7 0.4470039    0.4764929 0.2948557   0.4475783 0.5643991 0.6825388
#> 8 0.3068445    0.1457308 0.7319575   0.1609963 0.1396979 0.2152614

# Annotate using a different method
clustify(
    input = pbmc_matrix_small,
    metadata = pbmc_meta,
    ref_mat = cbmc_ref,
    query_genes = pbmc_vargenes,
    cluster_col = "RNA_snn_res.0.5",
    compute_method = "cosine"
)
#> using # of genes: 599
#> similarity computation completed, matrix of 9 x 13, preparing output
#>           B CD14+ Mono CD16+ Mono     CD34+     CD4 T     CD8 T        DC
#> 0 0.7986227  0.7404425  0.7704809 0.8194404 0.9493149 0.9444251 0.7508430
#> 1 0.7865337  0.7486511  0.7797697 0.8075621 0.9318837 0.9235744 0.7594695
#> 2 0.7527223  0.9627275  0.9460084 0.7577338 0.7096110 0.7061112 0.8919577
#> 3 0.9669867  0.7149343  0.7720125 0.8004738 0.7635724 0.7657181 0.8062767
#> 4 0.7510063  0.7070527  0.7467587 0.7677927 0.8562280 0.8674614 0.7351337
#> 5 0.7590829  0.9172636  0.9762851 0.7746692 0.7286717 0.7274581 0.8800476
#> 6 0.6784059  0.6671718  0.7107553 0.7238537 0.7777913 0.7973584 0.6864934
#> 7 0.8230905  0.8842235  0.9086642 0.8037140 0.7246647 0.7260570 0.9560634
#> 8 0.5463488  0.5932935  0.6068613 0.5953661 0.5558004 0.5516889 0.5730797
#>       Eryth Memory CD4 T        Mk Naive CD4 T        NK      pDCs
#> 0 0.7181980    0.8877208 0.7125300   0.8847456 0.8097839 0.7246787
#> 1 0.7101214    0.8855087 0.7122113   0.8731255 0.8102022 0.7241459
#> 2 0.6951238    0.7557225 0.7911713   0.7542713 0.7675894 0.7495702
#> 3 0.6514282    0.7123319 0.6594086   0.7494337 0.6957714 0.7863873
#> 4 0.6826181    0.8232194 0.7188642   0.8123971 0.9018633 0.7085898
#> 5 0.6880012    0.7601400 0.7722167   0.7561998 0.7776798 0.7477358
#> 6 0.6500106    0.7578212 0.6960715   0.7544227 0.9480188 0.6812692
#> 7 0.6874076    0.7519049 0.7448677   0.7491931 0.7595771 0.8503681
#> 8 0.5898023    0.5655621 0.8775257   0.5455913 0.5569351 0.5756617

# Annotate a SingleCellExperiment object
sce <- sce_pbmc()
clustify(
    sce,
    cbmc_ref,
    cluster_col = "clusters",
    obj_out = TRUE,
    per_cell = FALSE,
    dr = "umap"
)
#> object data retrieval complete, moving to similarity computation
#> Variable features not available, using all genes instead
#> consider supplying variable features to `query_genes` argument.
#> using # of genes: 599
#> similarity computation completed, matrix of 9 x 13, preparing output
#> using threshold of 0.7
#> class: SingleCellExperiment 
#> dim: 2000 2638 
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(2000): PPBP LYZ ... CLIC2 HEMGN
#> rowData names(0):
#> colnames(2638): AAACATACAACCAC AAACATTGAGCTAC ... TTTGCATGAGAGGC
#>   TTTGCATGCCTCAC
#> colData names(8): cell_source sum ... type r
#> reducedDimNames(1): UMAP
#> mainExpName: NULL
#> altExpNames(0):

# Annotate a Seurat object
so <- so_pbmc()
clustify(
    so,
    cbmc_ref,
    cluster_col = "seurat_clusters",
    obj_out = TRUE,
    per_cell = FALSE,
    dr = "umap"
)
#> object data retrieval complete, moving to similarity computation
#> using # of genes: 356
#> similarity computation completed, matrix of 9 x 13, preparing output
#> using threshold of 0.7
#> An object of class Seurat 
#> 2000 features across 2638 samples within 1 assay 
#> Active assay: RNA (2000 features, 2000 variable features)
#>  2 layers present: counts, data
#>  1 dimensional reduction calculated: umap

# Annotate (and return) a Seurat object per-cell
clustify(
    input = so,
    ref_mat = cbmc_ref,
    cluster_col = "seurat_clusters",
    obj_out = TRUE,
    per_cell = TRUE,
    dr = "umap"
)
#> object data retrieval complete, moving to similarity computation
#> using # of genes: 356
#> similarity computation completed, matrix of 2638 x 13, preparing output
#> using threshold of 0.55
#> An object of class Seurat 
#> 2000 features across 2638 samples within 1 assay 
#> Active assay: RNA (2000 features, 2000 variable features)
#>  2 layers present: counts, data
#>  1 dimensional reduction calculated: umap