RNA Block - Problem Set 28

Published

October 21, 2024

library(BSgenome.Hsapiens.UCSC.hg19)

Loading required package: BSgenome

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
    tapply, union, unique, unsplit, which.max, which.min

Loading required package: S4Vectors

Loading required package: stats4


Attaching package: 'S4Vectors'

The following object is masked from 'package:utils':

    findMatches

The following objects are masked from 'package:base':

    expand.grid, I, unname

Loading required package: IRanges

Loading required package: GenomeInfoDb

Loading required package: GenomicRanges

Loading required package: Biostrings

Loading required package: XVector


Attaching package: 'Biostrings'

The following object is masked from 'package:base':

    strsplit

Loading required package: BiocIO

Loading required package: rtracklayer

Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by
'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment'


Attaching package: 'rtracklayer'

The following object is masked from 'package:BiocIO':

    FileForFormat

library(TxDb.Hsapiens.UCSC.hg19.knownGene)

Loading required package: GenomicFeatures

Loading required package: AnnotationDbi

Loading required package: Biobase

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

library(annotatr)
library(org.Hs.eg.db)

library(ggrepel)

Loading required package: ggplot2

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ dplyr::collapse()     masks Biostrings::collapse(), IRanges::collapse()
✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
✖ purrr::compact()      masks XVector::compact()
✖ dplyr::desc()         masks IRanges::desc()
✖ tidyr::expand()       masks S4Vectors::expand()
✖ dplyr::filter()       masks stats::filter()
✖ dplyr::first()        masks S4Vectors::first()
✖ dplyr::lag()          masks stats::lag()
✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename()       masks S4Vectors::rename()
✖ lubridate::second()   masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::select()       masks AnnotationDbi::select()
✖ dplyr::slice()        masks XVector::slice(), IRanges::slice()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(cowplot)


Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp

library(scales)


Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

library(gt)


Attaching package: 'gt'

The following object is masked from 'package:cowplot':

    as_gtable

library(here)

here() starts at /home/runner/work/molb-7950/molb-7950

Problem Set

Total points: 20. Q1 - 10 pts, Q2 - 10 points each.

Exercises: 1. Identify the transcript regions most enriched for binding by the RBP ZFP36. 2. Within that preferentially bound region, identify which 5mers ZFP36 most likes to bind to.

Remember all the data is available here: rag-tag ENCODE

We are looking for an ZFP36 PAR-CLIP corresponding to this SRA (short-read archive) ID: SRR1046759

1. Determine the annotation region most enriched for ZFP36 binding sites.

To do so you will need to annotate the ZFP36 binding sites and a randomized set of binding sites (based on ZFP36 binding sites) to determine the which binding region is preferred by ZFP36 binding.

1a. Build annotation database

# What annotation categories are available?
possible_annotations <- builtin_annotations()

# Keep only those containing "hg19"
hg19_annots <- grep("hg19_genes", possible_annotations, value = T)


# let's keep 5' utr, cds, intron, 3' utr and intergenic
my_hg19_annots <- hg19_annots[c(3, 4, 7, 10, 11)]


# build the annotation database
annotations <- build_annotations(genome = "hg19", annotations = my_hg19_annots)

'select()' returned 1:1 mapping between keys and columns

Building promoters...

Building 1to5kb upstream of TSS...

Building intergenic...

Building cds...

Building 5UTRs...

Building 3UTRs...

Building exons...

Building introns...

1b. Annotate the ZFP36 binding sites

# We are importing a bed file directly from github for zfp36 binding sites
zfp36_regions <- read_regions(
  con = "https://raw.githubusercontent.com/BIMSBbioinfo/RCAS_meta-analysis/master/rbp-binding-sites/SRR1046759.clusters.bed",
  genome = "hg19", format = "bed"
)


# annotate zfp36 binding sites and convert to df
zfp36_annot <-
  annotate_regions(
    regions = zfp36_regions,
    annotations = annotations,
    ignore.strand = FALSE,
    quiet = FALSE
  ) %>%
  data.frame()

Annotating...

# We will keep only columns we need and collapse the redundant info
myInfo <- c("seqnames", "start", "end", "width", "strand", "annot.symbol", "annot.type")

zfp36_annot <- zfp36_annot[, myInfo] %>%
  unique()

# Just getting rid of the "hg19_genes_" string to simplify `annot.type`
zfp36_annot$annot.type <- gsub("hg19_genes_", "", zfp36_annot$annot.type)

table(zfp36_annot$annot.type)


     3UTRs      5UTRs        cds intergenic    introns 
      1358         20         41        373        520

1c. Annotate the randomized binding sites

random_regions <- randomize_regions(regions = zfp36_regions, allow.overlaps = TRUE, per.chromosome = TRUE)

Randomizing regions...

random_annot <- annotate_regions(
  regions = random_regions,
  annotations = annotations,
  ignore.strand = FALSE,
  quiet = FALSE
) %>%
  data.frame()

Annotating...

random_annot <- random_annot[, myInfo] %>%
  unique()

random_annot$annot.type <- gsub("hg19_genes_", "", random_annot$annot.type)

1d. Determine which regions exhibit more than expected ZFP36 binding sites.

site_dist <- bind_cols(
  region = names(table(zfp36_annot$annot.type)),
  observed = table(zfp36_annot$annot.type),
  expected = table(random_annot$annot.type)
)

site_dist$enrichment <- site_dist$observed / site_dist$expected


site_dist_long <- pivot_longer(site_dist, cols = c(observed, expected, enrichment))
colnames(site_dist_long) <- c("region", "type", "value")

ggplot(site_dist_long %>% filter(type == "enrichment"), aes(y = value, x = region)) +
  geom_bar(stat = "identity") +
  ylab("Observed vs Expected") +
  theme_cowplot()

Don't know how to automatically pick scale for object of type <table>.
Defaulting to continuous.

2. Determine the preferred seqeunces bound by ZFP36.

You have determined which region ZFP36 binding sites are preferred to reside - the 3’ UTR :).
Focusing on 3’ UTR binding sites, determine which sequences ZFP36 binds to compared to background sequences for that region USING 5mers.

Remember the workflow for determine k-mer composition (USING 5mers) for any set of intervals (binding sites or annotation categories).

Create a Granges object for a given annotation category.
Remove duplicated intervals (from diff transcript ids) with reduce.
Retrieve seqeunces using getSeqs
Create a dataframe containing the count and frequency of each 5mer.

2a. Calculate 5mers in ZFP36 binding sites in 3’ UTRs.

# create a Grange for 3' UTRs

zfp36_3utr <- makeGRangesFromDataFrame(df = zfp36_annot %>% filter(annot.type == "3UTRs"), ignore.strand = F, seqnames.field = "seqnames", start.field = "start", end.field = "end", strand.field = "strand", keep.extra.columns = T)


# get sequencees for those coordinates
zfp36_3utr_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg19, zfp36_3utr)

# count  all 5mer instances per sequence, add all instances, and turn into a dataframe with column name counts

zfp36_3utr_5mer <- oligonucleotideFrequency(x = zfp36_3utr_seqs, width = 5, as.prob = F, simplify.as = "matrix") %>%
  colSums(.) %>%
  as.data.frame()

colnames(zfp36_3utr_5mer) <- "zfp36_utr_count"

zfp36_3utr_5mer$zfp36_utr_freq <- zfp36_3utr_5mer$zfp36_utr_count / sum(zfp36_3utr_5mer$zfp36_utr_count)

2b. Calculate 5mers in 3’ UTRs.

# create a Grange for 3' UTRs
threeUTR <- GenomicRanges::reduce(annotations[annotations$type %in% my_hg19_annots[4]])

# get  sequencees for those coordinates
threeUTR_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg19, threeUTR)

# count  all 5mer instances per sequence, add all instances, and turn into a dataframe with column name counts
utr3_5mer <- oligonucleotideFrequency(x = threeUTR_seqs, width = 5, as.prob = F, simplify.as = "matrix") %>%
  colSums(.) %>%
  as.data.frame()

colnames(utr3_5mer) <- "utr_count"

utr3_5mer$utr_freq <- utr3_5mer$utr_count / sum(utr3_5mer$utr_count)

2c. Compare 5mer frequencies in ZFP36 binding sites vs background.

Label 5mers that are 8x, remember log2(8) = 3, enriched above background.

# check if rownames are identical and stop if they are  not
stopifnot(identical(rownames(zfp36_3utr_5mer), rownames(utr3_5mer)))

utr3_df <- bind_cols(zfp36_3utr_5mer, utr3_5mer)

utr3_df$zfp36_enrich <- utr3_df$zfp36_utr_freq / utr3_df$utr_freq


# scatter plot
p_scat <- ggplot(data = utr3_df, aes(y = zfp36_utr_freq, x = utr_freq)) +
  geom_point(color = ifelse(utr3_df$zfp36_enrich > 8, "red", "black")) +
  ylab("ZFP36 3'UTR") +
  xlab("3'UTR") +
  geom_abline(intercept = 0, slope = 1) +
  geom_text_repel(aes(label = ifelse(zfp36_enrich > 8,
    rownames(utr3_df),
    ""
  ))) +
  theme_cowplot()

# print top 10
utr3_df %>%
  rownames_to_column(var = "5mer") %>%
  arrange(-zfp36_enrich) %>%
  top_n(n = 10) %>%
  gt()

Selecting by zfp36_enrich

5mer	zfp36_utr_count	zfp36_utr_freq	utr_count	utr_freq	zfp36_enrich
TTATT	1015	0.032996327	83943	0.002537075	13.005656
ATTTA	856	0.027827444	71276	0.002154230	12.917580
TATTT	1278	0.041546114	109891	0.003321322	12.508909
TTTAT	1015	0.032996327	91054	0.002751996	11.989960
CTATT	304	0.009882644	34851	0.001053329	9.382294
TATAT	539	0.017522187	61994	0.001873693	9.351684
ATTAT	449	0.014596405	52041	0.001572876	9.280074
TATTA	411	0.013361074	47670	0.001440768	9.273579
ATATT	578	0.018790026	67348	0.002035511	9.231108
TTATA	426	0.013848705	52238	0.001578830	8.771498