RBP-RNA part 2

Neelanjan Mukherjee

RNA Bioscience Initiative | CU Anschutz

2024-10-21

mature mRNA regulatory decisions

Just a review of how RBPs control the fate of mRNAs in the cytoplasm.

Mayya and Duchaine

HuR and ARE decay

Remember our analysis of HuR PAR-CLIP data from last class. The model we were testing is that HuR binds to AU-rich elements (ARE) in 3’ UTRs of mRNAs to promote mRNA stability.

Model predictions:

  1. HuR binds to the 3’ UTR and introns.
  2. HuR binds to AU-rich sequences (AUUUA) and U-rich sequences.
  3. HuR binding promotes target RNA stabilization (and binding by the other RBPs to the ARE promotes destabilization).

Set up annotation database

possible_annotations <- builtin_annotations()

# grep to keep those containing "hg19"
hg19_annots <- grep("hg19_genes", possible_annotations, value = T)

# WHY DID WE PICK hg19?

# let's keep 5' utr, cds, intron, 3' utr and intergenic
my_hg19_annots <- hg19_annots[c(3, 4, 7, 10, 11)]

# build the annotation database
annotations <- build_annotations(genome = "hg19", annotations = my_hg19_annots)

annotations
GRanges object with 1446460 ranges and 5 metadata columns:
                  seqnames        ranges strand |               id       tx_id
                     <Rle>     <IRanges>  <Rle> |      <character> <character>
        [1]           chr1   12190-12227      + |            CDS:1  uc010nxq.1
        [2]           chr1   12595-12721      + |            CDS:2  uc010nxq.1
        [3]           chr1   13403-13639      + |            CDS:3  uc010nxq.1
        [4]           chr1   69091-70008      + |            CDS:4  uc001aal.1
        [5]           chr1 324343-324345      + |            CDS:5  uc009vjk.2
        ...            ...           ...    ... .              ...         ...
  [1446456] chrUn_gl000246       1-38154      * | intergenic:17023        <NA>
  [1446457] chrUn_gl000247        1-5786      * | intergenic:17024        <NA>
  [1446458] chrUn_gl000247   10817-36422      * | intergenic:17025        <NA>
  [1446459] chrUn_gl000248       1-39786      * | intergenic:17026        <NA>
  [1446460] chrUn_gl000249       1-38502      * | intergenic:17027        <NA>
                gene_id      symbol                  type
            <character> <character>           <character>
        [1]   100287102     DDX11L1        hg19_genes_cds
        [2]   100287102     DDX11L1        hg19_genes_cds
        [3]   100287102     DDX11L1        hg19_genes_cds
        [4]       79501       OR4F5        hg19_genes_cds
        [5]   100133331        <NA>        hg19_genes_cds
        ...         ...         ...                   ...
  [1446456]        <NA>        <NA> hg19_genes_intergenic
  [1446457]        <NA>        <NA> hg19_genes_intergenic
  [1446458]        <NA>        <NA> hg19_genes_intergenic
  [1446459]        <NA>        <NA> hg19_genes_intergenic
  [1446460]        <NA>        <NA> hg19_genes_intergenic
  -------
  seqinfo: 93 sequences (1 circular) from hg19 genome

Annotate PAR-CLIP data

hur_regions <- read_regions(
  con = "https://raw.githubusercontent.com/BIMSBbioinfo/RCAS_meta-analysis/master/rbp-binding-sites/SRR248532.clusters.bed",
  genome = "hg19", format = "bed"
)

# let's annotate
hur_annot <- annotate_regions(
  regions = hur_regions,
  annotations = annotations,
  ignore.strand = FALSE,
  quiet = FALSE
) %>%
  data.frame()

# keep only columns we need
myInfo <- c("seqnames", "start", "end", "width", "strand", "annot.symbol", "annot.type")

hur_annot <- hur_annot[, myInfo] %>%
  unique()

# getting rid of the "hg19_genes_" string to simplify `annot.type`
hur_annot$annot.type <- gsub("hg19_genes_", "", hur_annot$annot.type)

table(hur_annot$annot.type)

     3UTRs      5UTRs        cds intergenic    introns 
     20936        621       1081      17084      71755 

Summarize PAR-CLIP data to gene level

Now we want to get the following info: 1. The # of HuR binding sites per gene. 2. The # of HuR binding sites per region per gene.

# count the # sites per gene and annotation cat
hur_gene_clip <- hur_annot %>%
  filter(annot.type != "intergenic") %>%
  group_by(annot.symbol, annot.type) %>%
  dplyr::count() %>%
  pivot_wider(names_from = annot.type, values_from = n)

hur_gene_clip <- hur_gene_clip %>% mutate_if(is.numeric, replace_na, replace = 0)

# new column w/total # sites
hur_gene_clip$total <- rowSums(hur_gene_clip[, -1])

# remove symbols that are NA
hur_gene_clip <- hur_gene_clip %>%
  filter(annot.symbol != "NA")

# rename cols
colnames(hur_gene_clip)[1] <- "Symbol"
colnames(hur_gene_clip)[3:4] <- c("utr3", "utr5")

Binding sites per gene

hur_gene_clip_long <- hur_gene_clip %>%
  pivot_longer(-Symbol)

colnames(hur_gene_clip_long) <- c("symbol", "region", "sites")

ggplot(hur_gene_clip_long %>% filter(region != "total"), aes(sites, fill = region)) +
  geom_histogram() +
  scale_x_log10() +
  theme_cowplot()

Explore intron vs 3’ UTR sites

# how many genes have both intron and/or 3' UTR sites
site_combo <- hur_gene_clip[, 2:5] %>%
  mutate(
    type = case_when(
      introns > 0 & utr3 > 0 ~ "intron_utr3",
      introns > 0 & utr3 == 0 ~ "intron",
      introns == 0 & utr3 > 0 ~ "utr3",
      TRUE ~ "other"
    )
  )

ggplot(site_combo, aes(type)) +
  geom_bar(stat = "count") +
  theme_cowplot()

HuR binds to both introns and 3’ UTRs.

Genes more often have intronic and 3’ UTR binding sites than either alone.

Load HuR knockdown RNA-seq data

  1. primary (unspliced precursor) transcripts in siGFP treated cells.
  2. mature transcripts in siGFP treated cells.
  3. primary (unspliced precursor) transcripts in siHuR treated cells.
  4. mature transcripts in siHuR treated cells.
# load object called HuR.R

# HuR siRNA RNAseq
load(here("data/block-rna/HuR.rds"))


# gene information
gene_info <- read_csv(here("data/block-rna/geneInfo.csv.zip"))

HuR$Gene <- rownames(HuR) # new column gene ids

HuR <- left_join(HuR, gene_info[c(2, 11)], by = "Gene") # Symbol

# Filter for expression
hur_filt_rnaseq <- HuR %>%
  dplyr::filter(rowMeans(HuR[, 1:4]) > 1) %>%
  dplyr::select(-Gene)

integrate siRNA and clip-seq data

# join kd and clip data
kd_clip <- left_join(hur_filt_rnaseq, hur_gene_clip, by = "Symbol")

# convert all NA to 0
kd_clip <- kd_clip %>% mutate_if(is.numeric, replace_na, replace = 0)

# calculate log fold changes
kd_clip <- kd_clip %>%
  mutate(
    lfc_mature = log2(Mature_siGFP) - log2(Mature_siHuR),
    lfc_primary = log2(Primary_siGFP) - log2(Primary_siHuR)
  )

What is the relationship between HuR binding sites and change expression?

Does HuR promote the stability of its mRNA targets?

Does the number of HuR binding influence the degree of stabilization?

Does the region of HuR binding influence stabilization?

To target or not to target?

# create new column for target or not target

kd_clip$target <- case_when(
  kd_clip$total > 0 ~ "target",
  TRUE ~ "not target"
)


ggplot(kd_clip, aes(lfc_mature, color = target)) +
  stat_ecdf() +
  xlim(-2, 2) +
  theme_cowplot() +
  ylab("cumulative fraction of LFC") +
  ggtitle("HuR Target vs Not Target") +
  geom_hline(yintercept = .75, color = "grey") +
  geom_hline(yintercept = .5, color = "grey") +
  geom_hline(yintercept = .25, color = "grey")

To target a little or a lot?

Does the number of HuR binding influence the degree of stabilization?

# hist(kd_clip$utr3)
# let's make bins for # of sites per  3' UTR

kd_clip$utr3_bin <- cut2(x = kd_clip$utr3, c(0, 1, 2, 4, 8, 1000))

# table(kd_clip$utr3_bin)

# let's make bins for UTR
kd_clip$utr3_bin <- recode_factor(kd_clip$utr3_bin,
  "   0" = "0",
  "   1" = "1",
  "[   2,   4)" = "2-3",
  "[   4,   8)" = "4-7",
  "[   8,1000]" = "8+"
)


ggplot(data = kd_clip, aes(x = lfc_mature, color = utr3_bin)) +
  stat_ecdf() +
  xlim(-2, 2) +
  theme_cowplot() +
  ylab("cumulative fraction- LFCs") +
  ggtitle("# of HuR binding sites in 3' UTR") +
  geom_hline(yintercept = .75, color = "grey", linetype = "dashed") +
  geom_hline(yintercept = .5, color = "grey", linetype = "dashed") +
  geom_hline(yintercept = .25, color = "grey", linetype = "dashed") +
  geom_segment(aes(x = 0, y = 0, xend = 0, yend = .5, colour = "segment"), color = "black", linetype = "dashed")

More binding -> more stabilization?

TEST: Does the number of HuR binding influence the degree of stabilization?

# keep only finite values for lfc_mature and lfc_primary
kd_clip_targets <- kd_clip[is.finite(kd_clip$lfc_mature), ]

# IMPORTANT relevel indicating what everything will be compared to
kd_clip_targets$utr3_bin <- relevel(factor(kd_clip_targets$utr3_bin), ref = "0")


# calculate fit using `lm`
fit_bins <- lm(
  data = kd_clip_targets,
  formula = lfc_mature ~ utr3_bin
)

# examine estimates and p-vals
tidy(fit_bins) %>%
  gt()
term estimate std.error statistic p.value
(Intercept) 0.08788192 0.006970151 12.60832 3.057791e-36
utr3_bin1 0.22661067 0.016347699 13.86193 2.153701e-43
utr3_bin2-3 0.40806066 0.016755400 24.35398 3.252151e-128
utr3_bin4-7 0.56621272 0.019438822 29.12793 6.881968e-181
utr3_bin8+ 0.63471928 0.025913270 24.49399 1.225419e-129

More binding -> more stabilization?

TEST: Does the number of HuR binding influence the degree of stabilization?

# mean lfc of mRNAs with no 3' UTR binding sites
ref_mean <- kd_clip_targets %>%
  filter(utr3_bin == "0") %>%
  pull(lfc_mature) %>%
  mean()


# examine estimates and p-vals
tidy(fit_bins) %>%
  ggplot(
    data = .,
    aes(
      x = term,
      y = estimate,
      fill = -log10(p.value)
    )
  ) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_hline(
    yintercept = ref_mean,
    color = "red"
  ) +
  theme_cowplot()