library(biomaRt)
library(tximport)
library(tidyverse)
library(DESeq2)
library(ggrepel)
library(here)
library(cowplot)
library(rstatix)
library(clusterProfiler)
library(enrichplot)
library(msigdbr)
# run the code below to generate t2g file and gene id-symbol mapping file
mart <- biomaRt::useMart(
"ENSEMBL_MART_ENSEMBL",
dataset = "mmusculus_gene_ensembl"
)
t2g <- biomaRt::getBM(
attributes = c(
"ensembl_transcript_id",
"ensembl_gene_id",
"external_gene_name"
),
mart = mart
) |>
as_tibble()
# maps systematic to common gene names
gene_name_map <- t2g |>
dplyr::select(-ensembl_transcript_id) |>
unique()
RNA Block - Problem Set 24
Problem Set
Total points: 20. Q1 - 10 pts, Q2,3 - 5 points each.
Perform differential expression analysis comparing DIV28 vs DIV0 (10 pts)
Are axonogenesis and cell cycle genes significantly differentially expressed? If so, in what direction (up/down-regulated)?
Perform GSEA analysis using the Hallmark gene set. Make enrichment plot of
HALLMARK_G2M_CHECKPOINT
geneset.
Load libraries and generate gene information files (0 pts)
Make sure to run the code chunk below to load required libraries and generate t2g
(file for tximport
) and gene id to symbol mapping file.
Q1 Perform differential expression analysis comparing DIV28 vs DIV0
Prepare metadata
and use tximport
metadata <- data.frame(
sample_id = list.files(here("data/block-rna/salmonouts"),
pattern = "^DIV"
),
salmon_dirs = list.files(here("data/block-rna/salmonouts"),
recursive = T,
pattern = ".gz$",
full.names = T
)
) |>
separate(col = sample_id, into = c("timepoint", "rep"), sep = "\\.", remove = F)
metadata$rep <- gsub(pattern = "Rep", replacement = "", metadata$rep)
rownames(metadata) <- metadata$sample_id
metadata <- metadata |>
filter(timepoint %in% c("DIV0", "DIV28"))
salmdir <- metadata$salmon_dirs
names(salmdir) <- metadata$sample_id
txi <- tximport(
files = salmdir,
type = "salmon",
tx2gene = t2g,
dropInfReps = TRUE,
countsFromAbundance = "lengthScaledTPM"
)
Filter genes and perform DESeq2
# examine distribution of TPMs
hist(log2(1 + rowSums(txi$abundance)), breaks = 40)
# decide a cutoff
keepG <- txi$abundance[log2(1 + rowSums(txi$abundance)) > 4.5, ] |>
rownames()
ddsTxi <- DESeqDataSetFromTximport(
txi,
colData = metadata,
design = ~timepoint
)
# keep genes with sufficient expession
ddsTxi <- ddsTxi[keepG, ]
# run DESeq2
dds <- DESeq(ddsTxi)
# create a dataframe containing results and join w/gene symbols
diff <-
results(
dds,
contrast = c("timepoint", "DIV28", "DIV0")
) |>
# Change this into a dataframe
as.data.frame() |>
# Move ensembl gene IDs into their own column
rownames_to_column(var = "ensembl_gene_id") |>
# drop unused columns
dplyr::select(-c(baseMean, lfcSE, stat, pvalue)) |>
# Merge this with a table relating ensembl_gene_id with gene short names
inner_join(gene_name_map) |>
# Rename external_gene_name column
dplyr::rename(gene = external_gene_name) |>
as_tibble()
Q2. Are axonogenesis and cell cycle genes significantly differentially expressed? If so, in what direction (up/down-regulated)?
cellcyclegenes <- getBM(
attributes = c("ensembl_gene_id"),
filters = c("go_parent_term"),
values = c("GO:0045787"),
mart = mart
)
axongenes <- getBM(
attributes = c("ensembl_gene_id"),
filters = c("go_parent_term"),
values = c("GO:0007409"),
mart = mart
)
diff_paths <-
diff |>
mutate(
annot = case_when(
ensembl_gene_id %in% axongenes$ensembl_gene_id ~ "axonogenesis",
ensembl_gene_id %in% cellcyclegenes$ensembl_gene_id ~ "cellcycle",
.default = "none"
)
) |>
drop_na() # drop na
# Reorder these for plotting purposes
diff_paths$annot <-
factor(
diff_paths$annot,
levels = c("none", "axonogenesis", "cellcycle")
)
# calculate and report p-value using wilcox test
pvals <- wilcox_test(
data = diff_paths,
log2FoldChange ~ annot,
ref.group = "none"
)
# make a plot of the LFC (y-axis) ~ pathways (x-axis)
ggplot(
diff_paths,
aes(
x = annot,
y = log2FoldChange,
fill = annot
)
) +
labs(
x = "Gene class",
y = "DIV28/DIV0, log2"
) +
geom_hline(
yintercept = 0,
color = "gray",
linetype = "dashed"
) +
geom_boxplot(
notch = TRUE,
outlier.shape = NA
) +
ylim(-5, 5) +
theme_cowplot()
Q3. Perform GSEA analysis using the Hallmark gene set.
# retrieve mouse hallmark gene set from msigdb
mouse_hallmark <- msigdbr(species = "Mus musculus") %>%
filter(gs_cat == "H") %>%
dplyr::select(gs_name, gene_symbol)
# create a list of gene LFCs
rankedgenes <- diff %>% pull(log2FoldChange)
# add symbols as names of the list
names(rankedgenes) <- diff$gene
# sort by LFC
rankedgenes <- sort(rankedgenes, decreasing = TRUE)
# deduplicate
rankedgenes <- rankedgenes[!duplicated(names(rankedgenes))]
# run gsea
div28vs0 <- GSEA(
geneList = rankedgenes,
eps = 0,
pAdjustMethod = "fdr",
pvalueCutoff = .05,
minGSSize = 20,
maxGSSize = 1000,
TERM2GENE = mouse_hallmark
)
# plot "HALLMARK_G2M_CHECKPOINT"
gseaplot(x = div28vs0, geneSetID = "HALLMARK_G2M_CHECKPOINT")