<- biomaRt::useMart(
mart "ENSEMBL_MART_ENSEMBL",
dataset = "mmusculus_gene_ensembl"
)
<- biomaRt::getBM(
t2g attributes = c(
"ensembl_transcript_id",
"ensembl_gene_id",
"external_gene_name"
),mart = mart
|>
) as_tibble()
# maps systematic to common gene names
<- t2g |>
gene_name_map ::select(-ensembl_transcript_id) |>
dplyr# kill redundant ??()
RNAseq DE
Prepare t2g
We are going to imagine that we have only two conditions: DIV0 and DIV7 with DESeq2
.
The first thing we need to do is read in the data again and move from transcript-level expression values to gene-level expression values with tximport
. Let’s use biomaRt
to get a table that relates gene and transcript IDs.
Prepare metdata
and import
Now we can read in the transcript-level data and collapse to gene-level data with tximport
<- data.frame(
metadata sample_id = list.files(here("??"),
pattern = "quant.sf"),
salmon_dirs = list.files(here("??"),
recursive = T,
pattern = ".gz$",
full.names = T)
|>
) separate(col = sample_id, into = c("timepoint","rep"), sep = "\\.", remove = F)
$rep <- gsub(pattern = "Rep", replacement = "", metadata$rep)
metadata
rownames(metadata) <- metadata$sample_id
#keep only samples we want
<- metadata |>
metadata filter(timepoint %in% c("??","??"))
<- metadata$??
salmdir names(salmdir) <- metadata$??
<- tximport(
txi files = salmdir,
type = "salmon",
tx2gene = t2g,
dropInfReps = TRUE,
countsFromAbundance = "lengthScaledTPM"
)
Filter lowly expressed genes
Create DESeq object
There are essentially two steps to using DESeq2
. The first involves creating a DESeqDataSet
from your data. Luckily, if you have a tximport
object, which we do in the form of txi
, then this becomes easy.
# metadata$timepoint <- as.factor(metadata$timepoint)
<- DESeqDataSetFromTximport(
ddsTxi
??,colData = metadata,
design = ~??
)
# keep genes with sufficient expession
<- ddsTxi[??,] ddsTxi
Design formula
You can see that DESeqDataSetFromTximport
wants three things. The first is our tximport
object. The second is the dataframe we made that relates samples and conditions (or in this case timepoints). The last is something called a design formula. A design formula contains all of the variables that will go into DESeq2
’s model. The formula starts with a tilde and then has variables separated by a plus sign think lm()
. It is common practice, and in fact basically required with DESeq2
, to put the variable of interest last. In our case, that’s trivial because we only have one: timepoint. So our design formula is very simple:
design = ~ timepoint
Your design formula should ideally include all of the sources of variation in your data. For example, let’s say that here we thought there was a batch effect with the replicates. Maybe all of the Rep1 samples were prepped and sequenced on a different day than the Rep2 samples and so on. We could potentially account for this in DESeq2
’s model with the following forumula:
design = ~ rep + timepoint
Here, timepoint is still the variable of interest, but we are controlling for differences that arise due to differences in replicates.
Run DESeq2
We can see here that DESeq2
is taking the counts produced by tximport
for gene quantifications. There are 52346 genes (rows) here and 7 samples (columns). Now using this ddsTxi object, we can run DESeq2
.
<- DESeq(??) dds
There are many useful things in this dds
object. Take a look at the vignette for a full explanation. Including info on many more tests and analyses that can be done with DESeq2
.
The results can be accessed using the results()
function. We will use the contrast
argument here. DESeq2
reports changes in RNA abundance between two samples as a log2FoldChange
. But, it’s often not clear what the numerator and denominator of that fold change ratio…it could be either DIV7/DIV0 or DIV0/DIV7.
The lexographically first condition will be the numerator. I find it easier to explicitly specify what the numerator and denominator of this ratio are using the contrast
argument. The contrast
argument can be used to implement more complicated design formula. Remember our design formula that accounted for potential differences due to Replicate batch effects:
~ replicate + timepoint
DESeq2
will account for differences between replicates here to find differences between timepoints.
Contrasts to get results
# For contrast, we give three strings: the factor we are interested in, the numerator, and the denominator
results(dds, contrast = c("timepoint", "??", "??"))
The columns we are most interested in are log2FoldChange and padj.
log2FoldChange is self-explanatory. padj is the Benjamini-Hochberg corrected pvalue for a test asking if the expression of this gene is different between the two conditions.
Cleanup results
Let’s do a little work on this data frame to make it slightly cleaner and more informative.
<-
diff results(
dds,contrast = c("timepoint", "DIV7", "DIV0")
|>
) # Change this into a dataframe
|>
??() # Move ensembl gene IDs into their own column
var = "ensembl_gene_id") |>
??(# drop unused columns
::select(-c(baseMean, lfcSE, stat, pvalue)) |>
dplyr# Merge this with a table relating ensembl_gene_id with gene symbols
inner_join(??) |>
# Rename external_gene_name column
::rename(gene = external_gene_name) |>
dplyras_tibble()
How many are significant
OK now we have a table of gene expression results. How many genes are significantly up/down regulated between these two timepoints? We will use 0.01 as an FDR (p.adj) cutoff.
# number of upregulated genes
nrow(filter(diff, padj < ?? & log2FoldChange > ??))
# number of downregulated genes
nrow(filter(diff, padj < ?? & log2FoldChange < ??))
Volcano plot of differential expression results
Let’s make a volcano plot of these results.
# meets the FDR cutoff
<-
diff_sig mutate(
diff,sig = case_when(
< ?? ~ "yes",
padj .default = "no"
)|>
) # if a gene did not meet expression cutoffs that DESeq2 automatically does, it gets a pvalue of NA
drop_na()
ggplot(
diff_sig,aes(
x = ??,
y = ??,
color = sig
)+
) geom_point(alpha = 0.2) +
labs(
x = "DIV7 expression / DIV0 expression, log2",
y = "-log10(FDR)"
+
) scale_color_manual(
values = c("black", "red"),
labels = c("NS", "FDR < 0.01"),
name = ""
+
) theme_cowplot()
Change the LFC threshold
In addition to an FDR cutoff, let’s also apply a log2FoldChange cutoff. This will of course be more conservative, but will probably give you a more confident set of genes.
# Is the expression of the gene at least 3-fold different?
<-
diff_lfc results(dds,
contrast = c("timepoint", "DIV7", "DIV0"),
lfcThreshold = log2(??)
|>
) # Change this into a dataframe
as.data.frame() |>
# Move ensembl gene IDs into their own column
rownames_to_column(var = "ensembl_gene_id") |>
# drop unused columns
::select(-c(baseMean, lfcSE, stat, pvalue)) |>
dplyr# Merge this with a table relating ensembl_gene_id with gene short names
inner_join(gene_name_map) |>
# Rename external_gene_name column
::rename(gene = external_gene_name) |>
dplyras_tibble()
# number of upregulated genes
nrow(
filter(
< 0.01 & log2FoldChange > 0
diff_lfc, padj
)
)
# number of downregulated genes
nrow(
filter(
< 0.01 & log2FoldChange < 0
diff_lfc, padj
) )
Change the LFC threshold
<-
diff_lfc_sig mutate(
diff_lfc,sig = case_when(
< 0.01 ~ "yes",
padj .default = "no"
)|>
) drop_na()
# look at some specific genes
|>
diff_lfc_sig filter(?? %in%
c("Bdnf","Dlg4","Klf4","Sox2")) |>
gt()
Filtered Volcano
Plotting the expression of single genes
Sometimes we will have particular marker genes that we might want to highlight to give confidence that the experiment worked as expected. We can plot the expression of these genes in each replicate. Let’s plot the expression of two pluripotency genes (which we expect to decrease) and two neuronal genes (which we expect to increase).
So what is the value that we would plot? We could use the ‘normalized counts’ value provided by DESeq2
. However, remember there is not length calculation so it is difficult to compare accross genes.
A more interpretable value to plot might be TPM, since TPM is length-normalized. Let’s say a gene was expressed at 500 TPM. Right off the bat, I know generally what kind of expression that reflects (pretty high).
Get TPMs
Let’s plot the expression of Klf4, Sox2, Bdnf, and Dlg4 in our samples.
<- txi$abundance |>
tpms as.data.frame() |>
rownames_to_column(var = "ensembl_gene_id") |>
inner_join(??) |> # add symbols
::rename(gene = external_gene_name) |>
dplyr# Filter for genes we are interested in
filter(gene %in% c("Klf4", "Sox2", "Bdnf", "Dlg4")) |>
pivot_longer(-c(ensembl_gene_id, gene)) |>
separate(col = name, into = c("condition","rep"), sep = "\\.")
gt(tpms)
Now plot
ggplot(
tpms,aes(
x = ??,
y = ??,
color = ??
)+
) geom_jitter(size = 2, width = .25) +
labs(
x = "",
y = "TPM"
+
) theme_cowplot() +
scale_color_manual(values = c("blue", "red")) +
facet_wrap(~gene, scales = "free_y")
How about pathways?
Say that instead of plotting individual genes we wanted to ask whether a whole class of genes are going up or down. We can do that by retrieving all genes that belong to a particular gene ontology term.
There are three classes of genes we will look at here:
- Maintenance of pluripotency (GO:0019827)
- Positive regulation of the cell cycle (GO:0045787)
- Neuronal differentitaion (GO:0030182)
Retrieve pathway information
We can use biomaRt
to get all genes that belong to each of these categories. Think of it like doing a gene ontology enrichment analysis in reverse.
pluripotencygenes <- getBM(
attributes = c("ensembl_gene_id"),
filters = c("go_parent_term"),
values = c("GO:0019827"),
mart = mart
)
cellcyclegenes <- getBM(
attributes = c("ensembl_gene_id"),
filters = c("go_parent_term"),
values = c("GO:0045787"),
mart = mart
)
neurongenes <- getBM(
attributes = c("ensembl_gene_id"),
filters = c("go_parent_term"),
values = c("GO:0030182"),
mart = mart
)
# pathway <- bind_rows(pluripotencygenes,
# cellcyclegenes,
# neurongenes
# )
#
# pathway$path <- c(
# rep("pluri",nrow(pluripotencygenes)),
# rep("cellcycle",nrow(cellcyclegenes)),
# rep("neuron",nrow(neurongenes))
# )
#
# write_csv(x = pathway, file = here("data","block-rna","pathwaygenes.csv.gz"))
Add pathway information to results
You can see that these items are one-column dataframes that have the column name ‘ensembl_gene_id’. We can now go through our results dataframe and add an annotation column that marks whether the gene is in any of these categories.
<-
diff_paths |>
diff_lfc mutate(
annot = case_when(
%in% pluripotencygenes$ensembl_gene_id ~ "pluripotency",
?? %in% cellcyclegenes$ensembl_gene_id ~ "cellcycle",
?? %in% neurongenes$ensembl_gene_id ~ "neurondiff",
?? .default = "none"
)|>
) drop_na() # drop na
# Reorder these for plotting purposes
$annot <-
diff_pathsfactor(
$annot,
diff_pathslevels = c("none", "cellcycle",
"pluripotency", "neurondiff")
)
Are there significant differences?
OK we’ve got our table, now we are going to ask if the log2FoldChange values for the genes in each of these classes are different that what we would expect. So what is the expected value? Well, we have a distribution of log2 fold changes for all the genes that are not in any of these categories. So we will ask if the distribution of log2 fold changes for each gene category is different than that null distribution.
<- rstatix::wilcox_test(data = ??,
pvals ~ ??, ref.group = "none")
??
<- pvals |>
p.pluripotency filter(group2 == "pluripotency") |>
pull(p.adj)
<- pvals |>
p.cellcycle filter(group2 == "cellcycle") |>
pull(p.adj)
<- pvals |>
p.neurondiff filter(group2 == "neurondiff") |>
pull(p.adj)
plot pathway differences
ggplot(
??,aes(
x = ??,
y = ??,
fill = ??
)+
) labs(
x = "Gene class",
y = "DIV7/DIV0, log2"
+
) geom_hline(
yintercept = 0,
color = "gray",
linetype = "dashed"
+
) geom_boxplot(
notch = TRUE,
outlier.shape = NA
+
) theme_cowplot() +
scale_fill_manual(values = c("gray", "red", "blue", "purple"), guide = F) +
scale_x_discrete(
labels = c(
"none", "Cell cycle", "Pluripotency", "Neuron\ndifferentiation"
)+
) ylim(-5, 7) +
# hacky significance bars
annotate("segment", x = 1, xend = 2, y = 4, yend = 4) +
annotate("segment", x = 1, xend = 3, y = 5, yend = 5) +
annotate("segment", x = 1, xend = 4, y = 6, yend = 6) +
annotate("text", x = 1.5, y = 4.4, label = paste0("p = ", p.cellcycle)) +
annotate("text", x = 2, y = 5.4, label = paste0("p = ", p.pluripotency)) +
annotate("text", x = 2.5, y = 6.4, label = paste0("p = ", p.neurondiff))
What if we want to look at pathways in an unbiased way?
We will use Gene Set Enrichment Analysis (GSEA) to determine if pre-defined gene sets (pathways, GO terms, experimentally defined genes) are coordinately up-regulated or down-regulated between the two conditions you are comparing. To run gsea you need 2 things. 1. You list of expressed genes ranked by fold change. 2. Pre-defined gene sets. See MSigDb
PMID: 12808457, 16199517
GSEA examples
Top = upregulated
Bottom = downregulated
Prep GSEA
We need to make a list of all genes and their LFC.
We need to find interesting gene sets.
Run GSEA
# retrieve hallmark gene set from msigdb
<- msigdbr(species = "??") %>%
mouse_hallmark filter(gs_cat == "?") %>% ## halmark
::select(gs_name, gene_symbol)
dplyr
# create a list of gene LFCs
<- diff_lfc %>% pull(??)
rankedgenes
# add symbols as names of the list
names(rankedgenes) <- diff$??
# sort by LFC
<- sort(rankedgenes, decreasing = TRUE)
rankedgenes
# deduplicate
<- rankedgenes[!duplicated(names(rankedgenes))] rankedgenes
Run GSEA
# run gsea
<- GSEA(geneList = ??,
div7vs0 eps = 0,
pAdjustMethod = "fdr",
pvalueCutoff = .05,
minGSSize = 20,
maxGSSize = 1000,
TERM2GENE = ??)
@result |>
div7vs0select(??,??,??) |>
gt()
Plot GSEA
# plot "HALLMARK_G2M_CHECKPOINT"
gseaplot(x = div7vs0, geneSetID = "HALLMARK_G2M_CHECKPOINT")