# What annotation categories are available?
possible_annotations <- builtin_annotations()
# Keep only those containing "hg19"
hg19_annots <- grep("hg19_genes", possible_annotations, value = T)
# let's keep 5' utr, cds, intron, 3' utr and intergenic
my_hg19_annots <- hg19_annots[c(3,4,7,10,11)]
# build the annotation database
annotations <- build_annotations(genome = 'hg19', annotations = my_hg19_annots)
RNA Block - Problem Set 28
Problem Set
Total points: 20. Q1 - 15 pts, Q2 - 5 points each.
Exercises: We want to visualize 1) if RNAs that have HuR binding sites introns only, 3’ UTRs only, or both exhibit different degrees of stabilization upon HuR knockdown AND 2) which of those categories have a statistically significant different LFC than non-targeted RNAs.
The model we were tesing is that HuR binds to AU-rich elements (ARE) in 3’ UTRs of mRNAs to promote mRNA stability.
This model makes a few specific prediction:
- HuR binds to the 3’ UTR and introns.
- HuR binds to AU-rich sequences (AUUUA) and U-rich sequences.
- HuR binding promotes target RNA stabilization (and binding by the other RBPs to the ARE promotes destabilization).
More specifically, we have been exploring the following questions about the relationship between HuR binding sites and change expression:
Does HuR promote the stability of its mRNA targets?
Does the number of HuR binding influence the degree of stabilization?
Does the region of HuR binding influence stabilization? and Which regions are statistically significant?
We want to visualize 1) if RNAs that have HuR binding sites introns, 3’ UTRs, or both exhibit different degrees of stabilization upon HuR knockdown AND 2) which of those categories have a statistically significant different LFC than non-targeted RNAs. For simplicity, you will categorize genes as either: 1. intron (but not 3’ UTR, ignore other annotation categories)
2. utr3 (but not 3’ intron, ignore other annotation categories)
3. intron and utr3 (ignore other annotation categories) 4. other (genes w/binding sites but none in intron either 3’ UTR)
5. not a target (this is the reference comparison)
1. Annotate HuR binding sites.
1a. Build annotation database
1b. Annotate the HuR binding sites
# read hur binding sites
hur_regions <- read_regions(con = "https://raw.githubusercontent.com/BIMSBbioinfo/RCAS_meta-analysis/master/rbp-binding-sites/SRR248532.clusters.bed",
genome = 'hg19', format = 'bed')
# let's annotate
hur_annot <- annotate_regions(
regions = hur_regions,
annotations = annotations,
ignore.strand = FALSE,
quiet = FALSE) %>%
data.frame()
# keep only columns we need
myInfo <- c("seqnames","start","end","width","strand","annot.symbol","annot.type")
hur_annot <- hur_annot[,myInfo] %>%
unique()
# getting rid of the "hg19_genes_" string to simplify `annot.type`
hur_annot$annot.type <- gsub("hg19_genes_","",hur_annot$annot.type)
1c. Calculate a summary table for HuR binding sites per gene per region.
Now we want to assemble the # of HuR binding sites per region per gene.
# count the # sites per gene and annotation cat
hur_gene_clip <- hur_annot %>%
filter() %>%
group_by() %>%
dplyr::count() %>%
pivot_wider()
hur_gene_clip <- hur_gene_clip %>%
mutate_if(is.numeric, replace_na, replace = 0)
# new column w/total # sites
hur_gene_clip$total <- rowSums(hur_gene_clip[,-1])
# remove symbols that are NA
hur_gene_clip <- hur_gene_clip %>%
filter(annot.symbol != "NA")
# rename cols
colnames(hur_gene_clip)[1] <- "Symbol"
colnames(hur_gene_clip)[3:4] <- c("utr3","utr5")
# for each gene use case_when to figure out categories of intron and/or 3' UTR binding
site_combo <- hur_gene_clip %>%
mutate(
type = case_when(
)
)
1c. import kd data and join w/clip data
# load object called HuR.R
# HuR siRNA RNAseq
load(here("data/block-rna/HuR.R"))
# gene information
<- read_csv(here("data/block-rna/geneInfo.csv.zip"))
gene_info
$Gene <- rownames(HuR) # new column gene ids
HuR
<- left_join(HuR, gene_info[c(2,11)], by="Gene") # Symbol
HuR
# Filter for expression
<- HuR %>%
hur_filt_rnaseq ::filter(rowMeans(HuR[,1:4]) > 1) %>%
dplyr::select(-Gene)
dplyr
<- left_join(hur_filt_rnaseq, ??, by = "Symbol")
kd_clip
# convert all NA to 0
$type[is.na(kd_clip$type)] <- "not_target"
kd_clip
# calculate log fold changes for mature
<- kd_clip %>%
kd_clip mutate(
lfc_mature = log2(Mature_siGFP) - log2(Mature_siHuR)
)
# relevel so "not_target" is ref
$?? <- relevel(x = factor(??),
kd_clipref = "not_target")
1d. Visualize binding region types and LFC
# make cdf plot of LFC colored by binding region categories
2. Statistical test
Is there a statistically significant different in LFC for the different binding region categories compared tonon-targeted RNAs.
# keep only finite values for lfc_mature
<- kd_clip[is.finite(kd_clip$lfc_mature),]
kd_clip
# calculate fit using `lm`
<- lm(
fit_bins data = kd_clip,
formula = ?? ~ ??)
# plot estimates and p-vals