Skip to contents

clustifyrdatahub

clustifyrdatahub provides external reference data sets for cell-type assignment with clustifyr.

Installation

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("clustifyrdatahub")

Available references include

knitr::kable(dplyr::select(
  read.csv(system.file("extdata", "metadata.csv", package = "clustifyrdatahub")),
  c(1, 9, 2:7)))
Title Species Description RDataPath BiocVersion Genome SourceType SourceUrl
ref_MCA Mus musculus Mouse Cell Atlas clustifyrdatahub/ref_MCA.rda 3.12 mm10 Zip https://ndownloader.figshare.com/files/10756795
ref_tabula_muris_drop Mus musculus Tabula Muris (10X) clustifyrdatahub/ref_tabula_muris_drop.rda 3.12 mm10 Zip https://ndownloader.figshare.com/articles/5821263
ref_tabula_muris_facs Mus musculus Tabula Muris (SmartSeq2) clustifyrdatahub/ref_tabula_muris_facs.rda 3.12 mm10 Zip https://ndownloader.figshare.com/articles/5821263
ref_mouse.rnaseq Mus musculus Mouse RNA-seq from 28 cell types clustifyrdatahub/ref_mouse.rnaseq.rda 3.12 mm10 RDA https://github.com/dviraran/SingleR/tree/master/data
ref_moca_main Mus musculus Mouse Organogenesis Cell Atlas (main cell types) clustifyrdatahub/ref_moca_main.rda 3.12 mm10 RDA https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads
ref_immgen Mus musculus Mouse sorted immune cells clustifyrdatahub/ref_immgen.rda 3.12 mm10 RDA https://github.com/dviraran/SingleR/tree/master/data
ref_hema_microarray Homo sapiens Human hematopoietic cell microarray clustifyrdatahub/ref_hema_microarray.rda 3.12 hg38 TXT https://ftp.ncbi.nlm.nih.gov/geo/series/GSE24nnn/GSE24759/matrix/GSE24759_series_matrix.txt.gz
ref_cortex_dev Homo sapiens Human cortex development scRNA-seq clustifyrdatahub/ref_cortex_dev.rda 3.12 hg38 TSV https://cells.ucsc.edu/cortex-dev/exprMatrix.tsv.gz
ref_pan_indrop Homo sapiens Human pancreatic cell scRNA-seq (inDrop) clustifyrdatahub/ref_pan_indrop.rda 3.12 hg38 RDA https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/baron-human.rds
ref_pan_smartseq2 Homo sapiens Human pancreatic cell scRNA-seq (SmartSeq2) clustifyrdatahub/ref_pan_smartseq2.rda 3.12 hg38 RDA https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/segerstolpe.rds
ref_mouse_atlas Mus musculus Mouse Atlas scRNA-seq from 321 cell types clustifyrdatahub/ref_mouse_atlas.rda 3.12 mm10 RDA https://github.com/rnabioco/scRNA-seq-Cell-Ref-Matrix/blob/master/atlas/musMusculus/MouseAtlas.rda

To use clustifyrdatahub

library(ExperimentHub)
eh <- ExperimentHub()

## query
refs <- query(eh, "clustifyrdatahub")
refs
#> ExperimentHub with 11 records
#> # snapshotDate(): 2022-04-26
#> # $dataprovider: figshare, S3, GitHub, GEO, washington.edu, UCSC
#> # $species: Mus musculus, Homo sapiens
#> # $rdataclass: data.frame
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
#> #   tags, rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["EH3444"]]' 
#> 
#>            title                
#>   EH3444 | ref_MCA              
#>   EH3445 | ref_tabula_muris_drop
#>   EH3446 | ref_tabula_muris_facs
#>   EH3447 | ref_mouse.rnaseq     
#>   EH3448 | ref_moca_main        
#>   ...      ...                  
#>   EH3450 | ref_hema_microarray  
#>   EH3451 | ref_cortex_dev       
#>   EH3452 | ref_pan_indrop       
#>   EH3453 | ref_pan_smartseq2    
#>   EH3779 | ref_mouse_atlas
## either by index or id
ref_hema_microarray <- refs[[7]]         ## load the 7th resource in the list (EH3450)
ref_hema_microarray <- refs[["EH3450"]]  ## load by EH id

## or list and load
refs <- listResources(eh, "clustifyrdatahub")
ref_hema_microarray <- loadResources(
    eh, 
    "clustifyrdatahub",
    "ref_hema_microarray"
    )[[1]]

## use for classification of cell types
res <- clustifyr::clustify(
    input = clustifyr::pbmc_matrix_small,
    metadata = clustifyr::pbmc_meta$classified,
    ref_mat = ref_hema_microarray,
    query_genes = clustifyr::pbmc_vargenes
)
## or load refs by function name (after loading the clustifyrdatahub library)
library(clustifyrdatahub)
ref_hema_microarray()[1:5, 1:5]           ## data are loaded
#>        Basophils CD4+ Central Memory CD4+ Effector Memory
#> DDR1    6.084244            5.967502             5.933039
#> RFC2    6.280044            6.028615             6.047005
#> HSPA6   6.535444            5.811475             5.746326
#> PAX8    6.669153            5.896401             6.118577
#> GUCA1A  5.239230            5.232116             5.206960
#>        CD8+ Central Memory CD8+ Effector Memory
#> DDR1              6.005278             5.895926
#> RFC2              5.992979             5.942426
#> HSPA6             5.928349             5.942670
#> PAX8              6.270870             6.323922
#> GUCA1A            5.227415             5.090882
ref_hema_microarray(metadata = TRUE)      ## only metadata
#> ExperimentHub with 1 record
#> # snapshotDate(): 2022-04-26
#> # names(): EH3450
#> # package(): clustifyrdatahub
#> # $dataprovider: GEO
#> # $species: Homo sapiens
#> # $rdataclass: data.frame
#> # $rdatadateadded: 2020-05-14
#> # $title: ref_hema_microarray
#> # $description: Human hematopoietic cell microarray
#> # $taxonomyid: 9606
#> # $genome: hg38
#> # $sourcetype: TXT
#> # $sourceurl: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE24nnn/GSE2475...
#> # $sourcesize: NA
#> # $tags: c("SingleCellData", "SequencingData", "MicroarrayData",
#> #   "ExperimentHub") 
#> # retrieve record with 'object[["EH3450"]]'

Session info

sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.5 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] clustifyrdatahub_0.99.4 ExperimentHub_2.4.0    
#> [3] AnnotationHub_3.4.0     BiocFileCache_2.4.0    
#> [5] dbplyr_2.2.1            BiocGenerics_0.42.0    
#> [7] BiocStyle_2.24.0       
#> 
#> loaded via a namespace (and not attached):
#>   [1] matrixStats_0.62.0            bitops_1.0-7                 
#>   [3] fs_1.5.2                      bit64_4.0.5                  
#>   [5] filelock_1.0.2                httr_1.4.4                   
#>   [7] rprojroot_2.0.3               GenomeInfoDb_1.32.4          
#>   [9] tools_4.2.2                   bslib_0.4.1                  
#>  [11] utf8_1.2.2                    R6_2.5.1                     
#>  [13] DBI_1.1.3                     colorspace_2.0-3             
#>  [15] withr_2.5.0                   gridExtra_2.3                
#>  [17] tidyselect_1.2.0              bit_4.0.4                    
#>  [19] curl_4.3.3                    compiler_4.2.2               
#>  [21] textshaping_0.3.6             cli_3.4.1                    
#>  [23] Biobase_2.56.0                DelayedArray_0.22.0          
#>  [25] desc_1.4.2                    entropy_1.3.1                
#>  [27] bookdown_0.29                 sass_0.4.2                   
#>  [29] scales_1.2.1                  rappdirs_0.3.3               
#>  [31] pkgdown_2.0.6                 systemfonts_1.0.4            
#>  [33] stringr_1.4.1                 digest_0.6.30                
#>  [35] rmarkdown_2.17                XVector_0.36.0               
#>  [37] pkgconfig_2.0.3               htmltools_0.5.3              
#>  [39] MatrixGenerics_1.8.1          fastmap_1.1.0                
#>  [41] highr_0.9                     rlang_1.0.6                  
#>  [43] RSQLite_2.2.18                shiny_1.7.3                  
#>  [45] jquerylib_0.1.4               generics_0.1.3               
#>  [47] jsonlite_1.8.3                BiocParallel_1.30.4          
#>  [49] dplyr_1.0.10                  clustifyr_1.8.0              
#>  [51] RCurl_1.98-1.9                magrittr_2.0.3               
#>  [53] GenomeInfoDbData_1.2.8        Matrix_1.5-1                 
#>  [55] Rcpp_1.0.9                    munsell_0.5.0                
#>  [57] S4Vectors_0.34.0              fansi_1.0.3                  
#>  [59] lifecycle_1.0.3               stringi_1.7.8                
#>  [61] yaml_2.3.6                    SummarizedExperiment_1.26.1  
#>  [63] zlibbioc_1.42.0               grid_4.2.2                   
#>  [65] blob_1.2.3                    parallel_4.2.2               
#>  [67] promises_1.2.0.1              crayon_1.5.2                 
#>  [69] lattice_0.20-45               cowplot_1.1.1                
#>  [71] Biostrings_2.64.1             KEGGREST_1.36.3              
#>  [73] knitr_1.40                    pillar_1.8.1                 
#>  [75] fgsea_1.22.0                  GenomicRanges_1.48.0         
#>  [77] codetools_0.2-18              stats4_4.2.2                 
#>  [79] fastmatch_1.1-3               glue_1.6.2                   
#>  [81] BiocVersion_3.15.2            evaluate_0.17                
#>  [83] data.table_1.14.4             BiocManager_1.30.19          
#>  [85] png_0.1-7                     vctrs_0.5.0                  
#>  [87] httpuv_1.6.6                  tidyr_1.2.1                  
#>  [89] gtable_0.3.1                  purrr_0.3.5                  
#>  [91] assertthat_0.2.1              cachem_1.0.6                 
#>  [93] ggplot2_3.3.6                 xfun_0.34                    
#>  [95] mime_0.12                     xtable_1.8-4                 
#>  [97] later_1.3.0                   ragg_1.2.4                   
#>  [99] SingleCellExperiment_1.18.1   tibble_3.1.8                 
#> [101] AnnotationDbi_1.58.0          memoise_2.0.1                
#> [103] IRanges_2.30.1                ellipsis_0.3.2               
#> [105] interactiveDisplayBase_1.34.0