RNA Block - Problem Set 24

Published

October 21, 2024

Problem Set

Total points: 20. Q1 - 10 pts, Q2,3 - 5 points each.

Perform differential expression analysis comparing DIV28 vs DIV0 (10 pts)
Are axonogenesis and cell cycle genes significantly differentially expressed? If so, in what direction (up/down-regulated)?
Perform GSEA analysis using the Hallmark gene set. Make enrichment plot of HALLMARK_G2M_CHECKPOINT geneset.

Load libraries and generate gene information files (0 pts)

Make sure to run the code chunk below to load required libraries and generate t2g (file for tximport) and gene id to symbol mapping file.

library(biomaRt)
library(tximport)
library(tidyverse)
library(DESeq2)
library(ggrepel)
library(here)
library(cowplot)
library(rstatix)
library(clusterProfiler)
library(enrichplot)
library(msigdbr)
# run the code below to generate t2g file and gene id-symbol mapping file
mart <- biomaRt::useMart(
  "ENSEMBL_MART_ENSEMBL",
  dataset = "mmusculus_gene_ensembl"
)

t2g <- biomaRt::getBM(
  attributes = c(
    "ensembl_transcript_id",
    "ensembl_gene_id",
    "external_gene_name"
  ),
  mart = mart
) |>
  as_tibble()

# maps systematic to common gene names
gene_name_map <- t2g |>
  dplyr::select(-ensembl_transcript_id) |>
  unique()

Q1 Perform differential expression analysis comparing DIV28 vs DIV0

Prepare `metadata` and use `tximport`

metadata <- data.frame(
  sample_id = list.files(here("data/block-rna/salmonouts"),
    pattern = "^DIV"
  ),
  salmon_dirs = list.files(here("data/block-rna/salmonouts"),
    recursive = T,
    pattern = ".gz$",
    full.names = T
  )
) |>
  separate(col = sample_id, into = c("timepoint", "rep"), sep = "\\.", remove = F)

metadata$rep <- gsub(pattern = "Rep", replacement = "", metadata$rep)

rownames(metadata) <- metadata$sample_id

metadata <- metadata |>
  filter(timepoint %in% c("DIV0", "DIV28"))

salmdir <- metadata$salmon_dirs
names(salmdir) <- metadata$sample_id

txi <- tximport(
  files = salmdir,
  type = "salmon",
  tx2gene = t2g,
  dropInfReps = TRUE,
  countsFromAbundance = "lengthScaledTPM"
)

Filter genes and perform `DESeq2`

# examine distribution of TPMs
hist(log2(1 + rowSums(txi$abundance)), breaks = 40)

# decide a cutoff
keepG <- txi$abundance[log2(1 + rowSums(txi$abundance)) > 4.5, ] |>
  rownames()

ddsTxi <- DESeqDataSetFromTximport(
  txi,
  colData = metadata,
  design = ~timepoint
)

# keep genes with sufficient expession
ddsTxi <- ddsTxi[keepG, ]

# run DESeq2
dds <- DESeq(ddsTxi)

# create a dataframe containing results and join w/gene symbols
diff <-
  results(
    dds,
    contrast = c("timepoint", "DIV28", "DIV0")
  ) |>
  # Change this into a dataframe
  as.data.frame() |>
  # Move ensembl gene IDs into their own column
  rownames_to_column(var = "ensembl_gene_id") |>
  # drop unused columns
  dplyr::select(-c(baseMean, lfcSE, stat, pvalue)) |>
  # Merge this with a table relating ensembl_gene_id with gene short names
  inner_join(gene_name_map) |>
  # Rename external_gene_name column
  dplyr::rename(gene = external_gene_name) |>
  as_tibble()

Q2. Are axonogenesis and cell cycle genes significantly differentially expressed? If so, in what direction (up/down-regulated)?

cellcyclegenes <- getBM(
  attributes = c("ensembl_gene_id"),
  filters = c("go_parent_term"),
  values = c("GO:0045787"),
  mart = mart
)


axongenes <- getBM(
  attributes = c("ensembl_gene_id"),
  filters = c("go_parent_term"),
  values = c("GO:0007409"),
  mart = mart
)

diff_paths <-
  diff |>
  mutate(
    annot = case_when(
      ensembl_gene_id %in% axongenes$ensembl_gene_id ~ "axonogenesis",
      ensembl_gene_id %in% cellcyclegenes$ensembl_gene_id ~ "cellcycle",
      .default = "none"
    )
  ) |>
  drop_na() # drop na

# Reorder these for plotting purposes
diff_paths$annot <-
  factor(
    diff_paths$annot,
    levels = c("none", "axonogenesis", "cellcycle")
  )

# calculate and report p-value using wilcox test
pvals <- wilcox_test(
  data = diff_paths,
  log2FoldChange ~ annot,
  ref.group = "none"
)

# make a plot of the LFC (y-axis) ~ pathways (x-axis)
ggplot(
  diff_paths,
  aes(
    x = annot,
    y = log2FoldChange,
    fill = annot
  )
) +
  labs(
    x = "Gene class",
    y = "DIV28/DIV0, log2"
  ) +
  geom_hline(
    yintercept = 0,
    color = "gray",
    linetype = "dashed"
  ) +
  geom_boxplot(
    notch = TRUE,
    outlier.shape = NA
  ) +
  ylim(-5, 5) +
  theme_cowplot()

Q3. Perform GSEA analysis using the Hallmark gene set.

# retrieve mouse hallmark gene set from msigdb
mouse_hallmark <- msigdbr(species = "Mus musculus") %>%
  filter(gs_cat == "H") %>%
  dplyr::select(gs_name, gene_symbol)


# create a list of gene LFCs
rankedgenes <- diff %>% pull(log2FoldChange)

# add symbols as names of the list
names(rankedgenes) <- diff$gene

# sort by LFC
rankedgenes <- sort(rankedgenes, decreasing = TRUE)

# deduplicate
rankedgenes <- rankedgenes[!duplicated(names(rankedgenes))]


# run gsea
div28vs0 <- GSEA(
  geneList = rankedgenes,
  eps = 0,
  pAdjustMethod = "fdr",
  pvalueCutoff = .05,
  minGSSize = 20,
  maxGSSize = 1000,
  TERM2GENE = mouse_hallmark
)


# plot "HALLMARK_G2M_CHECKPOINT"
gseaplot(x = div28vs0, geneSetID = "HALLMARK_G2M_CHECKPOINT")