Import V(D)J data

Usage

import_vdj(
  input = NULL,
  vdj_dir = NULL,
  prefix = "",
  data_cols = NULL,
  filter_chains = TRUE,
  filter_paired = FALSE,
  define_clonotypes = NULL,
  include_mutations = FALSE,
  include_constant = FALSE,
  aggr_dir = NULL,
  quiet = FALSE,
  sep = ";"
)

Arguments

input

Object containing single cell data. If set to NULL, a data.frame containing V(D)J results will be returned.

vdj_dir

Directory containing the output from cellranger vdj. A vector or named vector can be given to load data from multiple runs. If a named vector is given, the cell barcodes will be prefixed with the provided names. This mimics the behavior of Seurat::Read10X().

prefix

Prefix to add to new columns

data_cols

Additional columns from filtered_contig_annotations.csv to include in object.

filter_chains

Only include chains with at least one productive and full length contig.

filter_paired

Only include clonotypes with paired chains. For TCR data each clonotype must have at least one TRA chain and at least one TRB chain, for BCR data each clonotype must have at least one IGH chain and at least one IGK or IGL chain.

define_clonotypes

Define clonotype IDs based on V(D)J data. This is useful if the V(D)J datasets being loaded do not have consistent clonotype IDs, i.e., clonotype1 is not the same across samples. Possible values are:

'cdr3aa', define clonotypes based on the CDR3 amino acid sequence
'cdr3nt', define clonotypes based on the CDR3 nucleotide sequence
'cdr3_gene', define clonotypes based on the combination of the CDR3 nucleotide sequence and the V(D)J genes

When defining clonotypes, only productive full length chains will be used. Set to NULL (default) to use the clonotype IDs already present in the input data.

include_mutations

Include information about the number of insertions/deletions/mismatches for each chain. This requires the concat_ref.bam file from cellranger vdj to be present in the directory provided to vdj_dir. If include_mutations is TRUE, filter_chains is also automatically set to TRUE since indel data is only available for productive chains.

include_constant

Whether the constant region should be included in the "all" mutation count. If TRUE, the constant region will be included in the "all" mutation count and the length of the V + J + D + C regions will be used to calculate the "all_freq". If FALSE (the default), any mutations in the C region will not be counted in the "all" mutation count and only the length of the V + J + D regions will be used to calculate the frequency.

aggr_dir

Path to cellranger aggr output. To include mutation information for each chain, also provide paths to the original cellranger vdj output directories using the vdj_dir argument.

To correctly match cell barcodes to those in the object, gene expression data for each sample must be loaded in the same order as the samples were specified in the cellranger aggr config file. In addition, if loading mutation data, sample paths provided to the vdj_dir argument must also be in the same order as the samples were specified in the cellranger aggr config file.

quiet

If TRUE, progress updates will not be displayed.

sep

Separator to use for storing per cell V(D)J data

Value

Single cell object or data.frame with added V(D)J data

Examples

# Load GEX data
data_dir <- system.file("extdata/splen", package = "djvdj")

gex_dirs <- c(
  BL6 = file.path(data_dir, "BL6_GEX/filtered_feature_bc_matrix"),
  MD4 = file.path(data_dir, "MD4_GEX/filtered_feature_bc_matrix")
)

splen_so <- gex_dirs |>
  Seurat::Read10X() |>
  Seurat::CreateSeuratObject()

# Loading multiple datasets
# to ensure cell barcodes for the V(D)J data match those in the object
# load the datasets in the same order as the gene expression data
vdj_dirs <- c(
  file.path(data_dir, "BL6_BCR"),
  file.path(data_dir, "MD4_BCR")
)

res <- splen_so |>
  import_vdj(vdj_dir = vdj_dirs)
#> ℹ Loading V(D)J data
#> ✔ Loading V(D)J data [205ms]
#> 
#> ℹ Formatting V(D)J data
#> ✔ Formatting V(D)J data [139ms]
#> 
#> ──────────────────────────────────────────────────────────────────────────
#>       # cells   # VDJ   # paired   # overlap   % overlap 
#> ✔ 1 |     500 |   170 |       67 |       170 |      100%
#> ✔ 2 |     500 |   159 |        7 |       159 |      100%
#> ──────────────────────────────────────────────────────────────────────────

head(slot(res, "meta.data"), 1)
#>                        orig.ident nCount_RNA nFeature_RNA clonotype_id
#> BL6_AAACGGGGTTCTGTTT-1        BL6        202           25         <NA>
#>                        exact_subclonotype_id chains n_chains cdr3 cdr3_nt
#> BL6_AAACGGGGTTCTGTTT-1                    NA   <NA>       NA <NA>    <NA>
#>                        cdr3_length cdr3_nt_length v_gene d_gene j_gene
#> BL6_AAACGGGGTTCTGTTT-1        <NA>           <NA>   <NA>   <NA>   <NA>
#>                        c_gene isotype reads umis productive full_length
#> BL6_AAACGGGGTTCTGTTT-1   <NA>    <NA>  <NA> <NA>       <NA>        <NA>
#>                        paired
#> BL6_AAACGGGGTTCTGTTT-1     NA

# Specifying cell prefixes using vector names
# cell barcode prefixes can also be specified by passing a named vector
vdj_dirs <- c(
  BL6 = file.path(data_dir, "BL6_BCR"),
  MD4 = file.path(data_dir, "MD4_BCR")
)

res <- splen_so |>
  import_vdj(vdj_dir = vdj_dirs)
#> ℹ Loading V(D)J data
#> ✔ Loading V(D)J data [80ms]
#> 
#> ℹ Formatting V(D)J data
#> ✔ Formatting V(D)J data [138ms]
#> 
#> ──────────────────────────────────────────────────────────────────────────
#>          # cells   # VDJ   # paired   # overlap   % overlap 
#> ✔ BL6_ |     500 |   170 |       67 |       170 |      100%
#> ✔ MD4_ |     500 |   159 |        7 |       159 |      100%
#> ──────────────────────────────────────────────────────────────────────────

head(slot(res, "meta.data"), 1)
#>                        orig.ident nCount_RNA nFeature_RNA clonotype_id
#> BL6_AAACGGGGTTCTGTTT-1        BL6        202           25         <NA>
#>                        exact_subclonotype_id chains n_chains cdr3 cdr3_nt
#> BL6_AAACGGGGTTCTGTTT-1                    NA   <NA>       NA <NA>    <NA>
#>                        cdr3_length cdr3_nt_length v_gene d_gene j_gene
#> BL6_AAACGGGGTTCTGTTT-1        <NA>           <NA>   <NA>   <NA>   <NA>
#>                        c_gene isotype reads umis productive full_length
#> BL6_AAACGGGGTTCTGTTT-1   <NA>    <NA>  <NA> <NA>       <NA>        <NA>
#>                        paired
#> BL6_AAACGGGGTTCTGTTT-1     NA

# Only include V(D)J data for paired chains
res <- splen_so |>
  import_vdj(
    vdj_dir = vdj_dirs,
    filter_paired = TRUE
  )
#> ℹ Loading V(D)J data
#> ✔ Loading V(D)J data [93ms]
#> 
#> ℹ Formatting V(D)J data
#> ✔ Formatting V(D)J data [118ms]
#> 
#> ──────────────────────────────────────────────────────────────────────────
#>          # cells   # VDJ   # paired   # overlap   % overlap 
#> ✔ BL6_ |     500 |   170 |       67 |       170 |      100%
#> ✔ MD4_ |     500 |   159 |        7 |       159 |      100%
#> ──────────────────────────────────────────────────────────────────────────

head(slot(res, "meta.data"), 1)
#>                        orig.ident nCount_RNA nFeature_RNA clonotype_id
#> BL6_AAACGGGGTTCTGTTT-1        BL6        202           25         <NA>
#>                        exact_subclonotype_id chains n_chains cdr3 cdr3_nt
#> BL6_AAACGGGGTTCTGTTT-1                    NA   <NA>       NA <NA>    <NA>
#>                        cdr3_length cdr3_nt_length v_gene d_gene j_gene
#> BL6_AAACGGGGTTCTGTTT-1        <NA>           <NA>   <NA>   <NA>   <NA>
#>                        c_gene isotype reads umis productive full_length
#> BL6_AAACGGGGTTCTGTTT-1   <NA>    <NA>  <NA> <NA>       <NA>        <NA>
#>                        paired
#> BL6_AAACGGGGTTCTGTTT-1     NA

# Defining clonotypes
# this is useful if the original clonotype IDs are not consistent across
# datasets, i.e. clonotype1 is not the same for all samples
res <- splen_so |>
  import_vdj(
    vdj_dir = vdj_dirs,
    define_clonotypes = "cdr3_gene"
  )
#> ℹ Loading V(D)J data
#> ✔ Loading V(D)J data [78ms]
#> 
#> ℹ Formatting V(D)J data
#> ✔ Formatting V(D)J data [123ms]
#> 
#> ℹ Defining clonotypes
#> ✔ Defining clonotypes [151ms]
#> 
#> ──────────────────────────────────────────────────────────────────────────
#>          # cells   # VDJ   # paired   # overlap   % overlap 
#> ✔ BL6_ |     500 |   170 |       67 |       170 |      100%
#> ✔ MD4_ |     500 |   159 |        7 |       159 |      100%
#> ──────────────────────────────────────────────────────────────────────────

head(slot(res, "meta.data"), 1)
#>                        orig.ident nCount_RNA nFeature_RNA
#> BL6_AAACGGGGTTCTGTTT-1        BL6        202           25
#>                        exact_subclonotype_id chains n_chains cdr3 cdr3_nt
#> BL6_AAACGGGGTTCTGTTT-1                    NA   <NA>       NA <NA>    <NA>
#>                        cdr3_length cdr3_nt_length v_gene d_gene j_gene
#> BL6_AAACGGGGTTCTGTTT-1        <NA>           <NA>   <NA>   <NA>   <NA>
#>                        c_gene isotype reads umis productive full_length
#> BL6_AAACGGGGTTCTGTTT-1   <NA>    <NA>  <NA> <NA>       <NA>        <NA>
#>                        paired clonotype_id
#> BL6_AAACGGGGTTCTGTTT-1     NA         <NA>

# Include mutation information for each chain
# this information will be included if the file concat_ref.bam is present
# including mutation information will cause data import to be slower
res <- splen_so |>
  import_vdj(
    vdj_dir = vdj_dirs,
    include_mutations = TRUE
  )
#> ℹ Loading V(D)J data
#> ✔ Loading V(D)J data [349ms]
#> 
#> ℹ Calculating mutation frequencies
#> ✔ Calculating mutation frequencies [490ms]
#> 
#> ℹ Formatting V(D)J data
#> ✔ Formatting V(D)J data [234ms]
#> 
#> ──────────────────────────────────────────────────────────────────────────
#>          # cells   # VDJ   # paired   # overlap   % overlap 
#> ✔ BL6_ |     500 |   170 |       67 |       170 |      100%
#> ✔ MD4_ |     500 |   159 |        7 |       159 |      100%
#> ──────────────────────────────────────────────────────────────────────────

head(slot(res, "meta.data"), 1)
#>                        orig.ident nCount_RNA nFeature_RNA clonotype_id
#> BL6_AAACGGGGTTCTGTTT-1        BL6        202           25         <NA>
#>                        exact_subclonotype_id chains n_chains cdr3 cdr3_nt
#> BL6_AAACGGGGTTCTGTTT-1                    NA   <NA>       NA <NA>    <NA>
#>                        cdr3_length cdr3_nt_length v_gene d_gene j_gene
#> BL6_AAACGGGGTTCTGTTT-1        <NA>           <NA>   <NA>   <NA>   <NA>
#>                        c_gene isotype reads umis productive full_length
#> BL6_AAACGGGGTTCTGTTT-1   <NA>    <NA>  <NA> <NA>       <NA>        <NA>
#>                        paired v_ins v_del v_mis d_ins d_del d_mis j_ins
#> BL6_AAACGGGGTTCTGTTT-1     NA  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#>                        j_del j_mis c_ins c_del c_mis all_ins all_del
#> BL6_AAACGGGGTTCTGTTT-1  <NA>  <NA>  <NA>  <NA>  <NA>    <NA>    <NA>
#>                        all_mis vd_ins vd_del dj_ins dj_del v_mis_freq
#> BL6_AAACGGGGTTCTGTTT-1    <NA>   <NA>   <NA>   <NA>   <NA>       <NA>
#>                        d_mis_freq j_mis_freq c_mis_freq all_mis_freq
#> BL6_AAACGGGGTTCTGTTT-1       <NA>       <NA>       <NA>         <NA>