RNA Block - Problem Set 28

Published

October 21, 2024

library(BSgenome.Hsapiens.UCSC.hg19)
Loading required package: BSgenome
Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
    tapply, union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4

Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':

    findMatches
The following objects are masked from 'package:base':

    expand.grid, I, unname
Loading required package: IRanges
Loading required package: GenomeInfoDb
Loading required package: GenomicRanges
Loading required package: Biostrings
Loading required package: XVector

Attaching package: 'Biostrings'
The following object is masked from 'package:base':

    strsplit
Loading required package: BiocIO
Loading required package: rtracklayer
Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by
'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment'

Attaching package: 'rtracklayer'
The following object is masked from 'package:BiocIO':

    FileForFormat
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
Loading required package: GenomicFeatures
Loading required package: AnnotationDbi
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.
library(annotatr)
library(org.Hs.eg.db)
Loading required package: ggplot2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ dplyr::collapse()     masks Biostrings::collapse(), IRanges::collapse()
✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
✖ purrr::compact()      masks XVector::compact()
✖ dplyr::desc()         masks IRanges::desc()
✖ tidyr::expand()       masks S4Vectors::expand()
✖ dplyr::filter()       masks stats::filter()
✖ dplyr::first()        masks S4Vectors::first()
✖ dplyr::lag()          masks stats::lag()
✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename()       masks S4Vectors::rename()
✖ lubridate::second()   masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::select()       masks AnnotationDbi::select()
✖ dplyr::slice()        masks XVector::slice(), IRanges::slice()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Attaching package: 'gt'

The following object is masked from 'package:cowplot':

    as_gtable
here() starts at /home/runner/work/molb-7950/molb-7950

Problem Set

Total points: 20. Q1 - 10 pts, Q2 - 10 points each.

Exercises: 1. Identify the transcript regions most enriched for binding by the RBP ZFP36. 2. Within that preferentially bound region, identify which 5mers ZFP36 most likes to bind to.

Remember all the data is available here: rag-tag ENCODE

We are looking for an ZFP36 PAR-CLIP corresponding to this SRA (short-read archive) ID: SRR1046759

1. Determine the annotation region most enriched for ZFP36 binding sites.

To do so you will need to annotate the ZFP36 binding sites and a randomized set of binding sites (based on ZFP36 binding sites) to determine the which binding region is preferred by ZFP36 binding.

1a. Build annotation database

# What annotation categories are available?
possible_annotations <- builtin_annotations()

# Keep only those containing "hg19"
hg19_annots <- grep("hg19_genes", possible_annotations, value = T)


# let's keep 5' utr, cds, intron, 3' utr and intergenic
my_hg19_annots <- hg19_annots[c(3, 4, 7, 10, 11)]


# build the annotation database
annotations <- build_annotations(genome = "hg19", annotations = my_hg19_annots)
'select()' returned 1:1 mapping between keys and columns
Building promoters...
Building 1to5kb upstream of TSS...
Building intergenic...
Building cds...
Building 5UTRs...
Building 3UTRs...
Building exons...
Building introns...

1b. Annotate the ZFP36 binding sites

# We are importing a bed file directly from github for zfp36 binding sites
zfp36_regions <- read_regions(
  con = "https://raw.githubusercontent.com/BIMSBbioinfo/RCAS_meta-analysis/master/rbp-binding-sites/SRR1046759.clusters.bed",
  genome = "hg19", format = "bed"
)


# annotate zfp36 binding sites and convert to df
zfp36_annot <-
  annotate_regions(
    regions = zfp36_regions,
    annotations = annotations,
    ignore.strand = FALSE,
    quiet = FALSE
  ) %>%
  data.frame()
Annotating...
# We will keep only columns we need and collapse the redundant info
myInfo <- c("seqnames", "start", "end", "width", "strand", "annot.symbol", "annot.type")

zfp36_annot <- zfp36_annot[, myInfo] %>%
  unique()

# Just getting rid of the "hg19_genes_" string to simplify `annot.type`
zfp36_annot$annot.type <- gsub("hg19_genes_", "", zfp36_annot$annot.type)

table(zfp36_annot$annot.type)

     3UTRs      5UTRs        cds intergenic    introns 
      1358         20         41        373        520 

1c. Annotate the randomized binding sites

random_regions <- randomize_regions(regions = zfp36_regions, allow.overlaps = TRUE, per.chromosome = TRUE)
Randomizing regions...
random_annot <- annotate_regions(
  regions = random_regions,
  annotations = annotations,
  ignore.strand = FALSE,
  quiet = FALSE
) %>%
  data.frame()
Annotating...
random_annot <- random_annot[, myInfo] %>%
  unique()

random_annot$annot.type <- gsub("hg19_genes_", "", random_annot$annot.type)

1d. Determine which regions exhibit more than expected ZFP36 binding sites.

site_dist <- bind_cols(
  region = names(table(zfp36_annot$annot.type)),
  observed = table(zfp36_annot$annot.type),
  expected = table(random_annot$annot.type)
)

site_dist$enrichment <- site_dist$observed / site_dist$expected


site_dist_long <- pivot_longer(site_dist, cols = c(observed, expected, enrichment))
colnames(site_dist_long) <- c("region", "type", "value")

ggplot(site_dist_long %>% filter(type == "enrichment"), aes(y = value, x = region)) +
  geom_bar(stat = "identity") +
  ylab("Observed vs Expected") +
  theme_cowplot()
Don't know how to automatically pick scale for object of type <table>.
Defaulting to continuous.

2. Determine the preferred seqeunces bound by ZFP36.

You have determined which region ZFP36 binding sites are preferred to reside - the 3’ UTR :).
Focusing on 3’ UTR binding sites, determine which sequences ZFP36 binds to compared to background sequences for that region USING 5mers.

Remember the workflow for determine k-mer composition (USING 5mers) for any set of intervals (binding sites or annotation categories).

  1. Create a Granges object for a given annotation category.
  2. Remove duplicated intervals (from diff transcript ids) with reduce.
  3. Retrieve seqeunces using getSeqs
  4. Create a dataframe containing the count and frequency of each 5mer.

2a. Calculate 5mers in ZFP36 binding sites in 3’ UTRs.

# create a Grange for 3' UTRs

zfp36_3utr <- makeGRangesFromDataFrame(df = zfp36_annot %>% filter(annot.type == "3UTRs"), ignore.strand = F, seqnames.field = "seqnames", start.field = "start", end.field = "end", strand.field = "strand", keep.extra.columns = T)


# get sequencees for those coordinates
zfp36_3utr_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg19, zfp36_3utr)

# count  all 5mer instances per sequence, add all instances, and turn into a dataframe with column name counts

zfp36_3utr_5mer <- oligonucleotideFrequency(x = zfp36_3utr_seqs, width = 5, as.prob = F, simplify.as = "matrix") %>%
  colSums(.) %>%
  as.data.frame()

colnames(zfp36_3utr_5mer) <- "zfp36_utr_count"

zfp36_3utr_5mer$zfp36_utr_freq <- zfp36_3utr_5mer$zfp36_utr_count / sum(zfp36_3utr_5mer$zfp36_utr_count)

2b. Calculate 5mers in 3’ UTRs.

# create a Grange for 3' UTRs
threeUTR <- GenomicRanges::reduce(annotations[annotations$type %in% my_hg19_annots[4]])

# get  sequencees for those coordinates
threeUTR_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg19, threeUTR)

# count  all 5mer instances per sequence, add all instances, and turn into a dataframe with column name counts
utr3_5mer <- oligonucleotideFrequency(x = threeUTR_seqs, width = 5, as.prob = F, simplify.as = "matrix") %>%
  colSums(.) %>%
  as.data.frame()

colnames(utr3_5mer) <- "utr_count"

utr3_5mer$utr_freq <- utr3_5mer$utr_count / sum(utr3_5mer$utr_count)

2c. Compare 5mer frequencies in ZFP36 binding sites vs background.

Label 5mers that are 8x, remember log2(8) = 3, enriched above background.

# check if rownames are identical and stop if they are  not
stopifnot(identical(rownames(zfp36_3utr_5mer), rownames(utr3_5mer)))

utr3_df <- bind_cols(zfp36_3utr_5mer, utr3_5mer)

utr3_df$zfp36_enrich <- utr3_df$zfp36_utr_freq / utr3_df$utr_freq


# scatter plot
p_scat <- ggplot(data = utr3_df, aes(y = zfp36_utr_freq, x = utr_freq)) +
  geom_point(color = ifelse(utr3_df$zfp36_enrich > 8, "red", "black")) +
  ylab("ZFP36 3'UTR") +
  xlab("3'UTR") +
  geom_abline(intercept = 0, slope = 1) +
  geom_text_repel(aes(label = ifelse(zfp36_enrich > 8,
    rownames(utr3_df),
    ""
  ))) +
  theme_cowplot()

# print top 10
utr3_df %>%
  rownames_to_column(var = "5mer") %>%
  arrange(-zfp36_enrich) %>%
  top_n(n = 10) %>%
  gt()
Selecting by zfp36_enrich
5mer zfp36_utr_count zfp36_utr_freq utr_count utr_freq zfp36_enrich
TTATT 1015 0.032996327 83943 0.002537075 13.005656
ATTTA 856 0.027827444 71276 0.002154230 12.917580
TATTT 1278 0.041546114 109891 0.003321322 12.508909
TTTAT 1015 0.032996327 91054 0.002751996 11.989960
CTATT 304 0.009882644 34851 0.001053329 9.382294
TATAT 539 0.017522187 61994 0.001873693 9.351684
ATTAT 449 0.014596405 52041 0.001572876 9.280074
TATTA 411 0.013361074 47670 0.001440768 9.273579
ATATT 578 0.018790026 67348 0.002035511 9.231108
TTATA 426 0.013848705 52238 0.001578830 8.771498