RNA Block - Problem Set 27

Author

Your Name Here

Published

October 21, 2024

library(BSgenome.Hsapiens.UCSC.hg19)
Loading required package: BSgenome
Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
    tapply, union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4

Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':

    findMatches
The following objects are masked from 'package:base':

    expand.grid, I, unname
Loading required package: IRanges
Loading required package: GenomeInfoDb
Loading required package: GenomicRanges
Loading required package: Biostrings
Loading required package: XVector

Attaching package: 'Biostrings'
The following object is masked from 'package:base':

    strsplit
Loading required package: BiocIO
Loading required package: rtracklayer
Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by
'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment'

Attaching package: 'rtracklayer'
The following object is masked from 'package:BiocIO':

    FileForFormat
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
Loading required package: GenomicFeatures
Loading required package: AnnotationDbi
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.
library(annotatr)
library(org.Hs.eg.db)
Loading required package: ggplot2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ dplyr::collapse()     masks Biostrings::collapse(), IRanges::collapse()
✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
✖ purrr::compact()      masks XVector::compact()
✖ dplyr::desc()         masks IRanges::desc()
✖ tidyr::expand()       masks S4Vectors::expand()
✖ dplyr::filter()       masks stats::filter()
✖ dplyr::first()        masks S4Vectors::first()
✖ dplyr::lag()          masks stats::lag()
✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename()       masks S4Vectors::rename()
✖ lubridate::second()   masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::select()       masks AnnotationDbi::select()
✖ dplyr::slice()        masks XVector::slice(), IRanges::slice()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Attaching package: 'gt'

The following object is masked from 'package:cowplot':

    as_gtable
here() starts at /home/runner/work/molb-7950/molb-7950

Problem Set

Total points: 20. Q1 - 10 pts, Q2 - 10 points each.

Exercises: 1. Identify the transcript regions most enriched for binding by the RBP ZFP36. 2. Within that preferentially bound region, identify which 5mers ZFP36 most likes to bind to.

Remember all the data is available here: rag-tag ENCODE

We are looking for an ZFP36 PAR-CLIP corresponding to this SRA (short-read archive) ID: SRR1046759

1. Determine the annotation region most enriched for ZFP36 binding sites.

To do so you will need to annotate the ZFP36 binding sites and a randomized set of binding sites (based on ZFP36 binding sites) to determine the which binding region is preferred by ZFP36 binding.

1a. Build annotation database

# What annotation categories are available?
possible_annotations <- builtin_annotations()

# Keep only those containing "hg19"
hg19_annots <- grep("hg19_genes", possible_annotations, value = T)


# let's keep 5' utr, cds, intron, 3' utr and intergenic
my_hg19_annots <- hg19_annots[c(3,4,7,10,11)]


# build the annotation database
annotations <- build_annotations(genome = 'hg19', annotations = my_hg19_annots)

1b. Annotate the ZFP36 binding sites

# We are importing a bed file directly from github for zfp36 binding sites
zfp36_regions <- read_regions(con = "https://raw.githubusercontent.com/BIMSBbioinfo/RCAS_meta-analysis/master/rbp-binding-sites/SRR1046759.clusters.bed",
                     genome = 'hg19', format = 'bed')


# annotate zfp36 binding sites and convert to df
zfp36_annot <-
  annotate_regions(??) %>%
  data.frame()

# We will keep only columns we need and collapse the redundant info
myInfo <- c("seqnames","start","end","width","strand","annot.symbol","annot.type")

zfp36_annot <- zfp36_annot[,??] %>%
  unique() 

# Just getting rid of the "hg19_genes_" string to simplify `annot.type`

zfp36_annot$annot.type <- gsub("hg19_genes_","",zfp36_annot$annot.type)

table(zfp36_annot$annot.type)

1c. Annotate the randomized binding sites

random_regions <- randomize_regions(??)

random_annot <- annotate_regions(??) %>%
  data.frame()

random_annot <- random_annot[,??] %>%
  unique() 

random_annot$annot.type <- gsub("hg19_genes_","",random_annot$annot.type)

1d. Determine which regions exhibit more than expected ZFP36 binding sites.

site_dist <- bind_cols(
  region=??(??(??$annot.type)),
  observed=??(??$annot.type),
  expected=??(??$annot.type)
          )

site_dist$enrichment <- site_dist$observed/site_dist$expected


site_dist_long <- pivot_longer(site_dist,cols = c(??,??,??))

colnames(site_dist_long) <- c("region","type","value")

ggplot(?? %>% filter(type=="??"), aes(y = ??, x = ??)) +
  geom_bar(stat="identity") +
  ylab("Observed vs Expected") +
  theme_cowplot()

2. Determine the preferred seqeunces bound by ZFP36.

You have determined which region ZFP36 binding sites are preferred to reside - the 3’ UTR :).
Focusing on 3’ UTR binding sites, determine which sequences ZFP36 binds to compared to background sequences for that region USING 5mers.

Remember the workflow for determine k-mer composition (USING 5mers) for any set of intervals (binding sites or annotation categories).

  1. Create a Granges object for a given annotation category.
  2. Remove duplicated intervals (from diff transcript ids) with reduce.
  3. Retrieve seqeunces using getSeqs
  4. Create a dataframe containing the count and frequency of each 5mer.

2a. Calculate 5mers in ZFP36 binding sites in 3’ UTRs.

# create a Grange for zfp36 3' UTR sites 

zfp36_3utr <- makeGRangesFromDataFrame(df = ?? %>% filter(annot.type=="??"), ignore.strand = F, seqnames.field = "seqnames", start.field = "start", end.field = "end", strand.field = "strand", keep.extra.columns = T)


# get sequencees for those coordinates
zfp36_3utr_seqs <- getSeq(??, ??)

# count  all 5mer instances per sequence, add all instances, and turn into a dataframe with column name counts

zfp36_3utr_5mer <- ??(x = ??,
                      width = ??,
                      as.prob = F,
                      simplify.as="matrix") %>%
  colSums(.) %>%
  as.data.frame()

colnames(zfp36_3utr_5mer) <- "zfp36_utr_count"

zfp36_3utr_5mer$zfp36_utr_freq <- zfp36_3utr_5mer$zfp36_utr_count/sum(zfp36_3utr_5mer$zfp36_utr_count)

2b. Calculate 5mers in 3’ UTRs.

# create a Grange for 3' UTRs 
threeUTR <- GenomicRanges::reduce(annotations[annotations$type %in% my_hg19_annots[??]])

# get  sequencees for those coordinates
threeUTR_seqs <- getSeq(??, ??)

# count  all 5mer instances per sequence, add all instances, and turn into a dataframe with column name counts
utr3_5mer <- oligonucleotideFrequency(
  x = ??,
  width = ??,
  as.prob = F,
  simplify.as="matrix") %>%
  colSums(.) %>%
  as.data.frame()

colnames(utr3_5mer) <- "utr_count"

utr3_5mer$utr_freq <- utr3_5mer$utr_count/sum(utr3_5mer$utr_count)

2c. Compare 5mer frequencies in ZFP36 binding sites vs background.

Label 5mers that are 8x, remember log2(8) = 3, enriched above background.

# check if rownames are identical and stop if they are  not
stopifnot(identical(rownames(zfp36_3utr_5mer),rownames(utr3_5mer)))

utr3_df <- bind_cols(??,??)

utr3_df$zfp36_enrich <- utr3_df$zfp36_utr_freq/utr3_df$utr_freq


# scatter plot
ggplot(data = ??,
       aes(y = ??, x = ??)) +
  geom_point(color = ifelse(utr3_df$?? > 8, "red", "black")) +
  ylab("ZFP36 3'UTR") + xlab("3'UTR") +
  geom_abline(intercept = 0, slope = 1) +
  geom_text_repel(aes(label=ifelse(?? > 8,
                                   rownames(utr3_df),
                             ""))) +
  theme_cowplot()  

# print top 10 with gt
?? %>%
  rownames_to_column(var = "5mer") %>%
  arrange(-??) %>%
  top_n(n = 10) %>%
  gt()