Interval statistics

Jay Hesselberth

November 21 2016

valr can be used to explore relationships between sets of intervals. Here we explore the relationship between transcription start sites and repetitive elements in the human genome.

library(valr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(broom)

# load repeats and genes. Data in the valr package is restricted to chr22; the entire
# files can be downloaded from UCSC.
rpts <- read_bed(valr_example('hg19.rmsk.chr22.bed.gz'), n_fields = 6) 
gens <- read_bed(valr_example('hg19.refGene.chr22.bed.gz'), n_fields = 12)

# load chrom sizes
chrs <- read_genome(valr_example('hg19.chrom.sizes.gz'))

# create 1bp intervals representing transcription start sites
tss <- mutate(gens,
              .start = ifelse(strand == '+', start, end),
              .end = .start + 1) %>%
  select(chrom, start = .start, end = .end, name)

tss
#> # A tibble: 1,267 × 4
#>    chrom    start      end         name
#>    <chr>    <int>    <dbl>        <chr>
#> 1  chr22 16193009 16193010    NR_122113
#> 2  chr22 16157078 16157079    NR_133911
#> 3  chr22 16162065 16162066    NR_073459
#> 4  chr22 16162065 16162066    NR_073460
#> 5  chr22 16231289 16231290    NR_132385
#> 6  chr22 16287937 16287938 NM_001136213
#> 7  chr22 16274608 16274609    NR_046571
#> 8  chr22 16449804 16449805 NM_001005239
#> 9  chr22 17073700 17073701    NM_014406
#> 10 chr22 17082800 17082801    NR_001591
#> # ... with 1,257 more rows

First we define a function that takes x and y intervals and computes distance statistics (using bed_reldist() and bed_absdist()) for specified groups.

gen_stats <- function(x, y, genome, grp, type = NA) {
  group_by_(x, .dots = grp) %>%
    do(reldist = bed_reldist(., y, detail = TRUE) %>%
         select(.value = .reldist),
       absdist = bed_absdist(., y, genome) %>%
         select(.value = .absdist_scaled)
       ) %>%
    gather_('stat', 'value', setdiff(names(.), list(grp))) %>%
    mutate(type = type)
} 
obs_stats <- gen_stats(rpts, tss, chrs, 'name', 'obs')

shfs <- bed_shuffle(rpts, chrs, within = TRUE)
shf_stats <- gen_stats(shfs, tss, chrs, 'name', 'shuf')

res <- bind_rows(obs_stats, shf_stats) %>%
  unnest(value) %>% 
  group_by(name, stat, type) %>%
  mutate(.id = row_number()) %>%
  spread(type, .value) %>%
  na.omit()

res
#> Source: local data frame [16,788 x 5]
#> Groups: name, stat [1,911]
#> 
#>     name    stat   .id         obs         shuf
#>    <chr>   <chr> <int>       <dbl>        <dbl>
#> 1   (A)n absdist     1 126.4610794 361.25652664
#> 2   (A)n absdist     2   5.5929738 337.75391617
#> 3   (A)n absdist     3   0.7715170 328.45763096
#> 4   (A)n absdist     4   0.2205569 299.49234101
#> 5   (A)n absdist     5   1.0203258 154.04617842
#> 6   (A)n absdist     6   0.5561215   0.66609128
#> 7   (A)n absdist     7   0.3747072   0.08801532
#> 8   (A)n absdist     8   0.5429340   0.66337476
#> 9   (A)n absdist     9   0.3323295   0.16933812
#> 10  (A)n absdist    10   3.3093910   0.08154506
#> # ... with 16,778 more rows

Now that the data are formatted, we can use ks.test() to determine whether there are significant differences between the observed and shuffled data for each group.

pvals <- res %>% do(twosided = broom::tidy(ks.test(.$obs, .$shuf)),
                    less = broom::tidy(ks.test(.$obs, .$shuf, alternative = 'less')),
                    greater = broom::tidy(ks.test(.$obs, .$shuf, alternative = 'greater'))) %>%
  gather(alt, type, -name, -stat) %>%
  unnest(type) %>%
  select(name:p.value) %>%
  arrange(p.value)

ggplot(pvals, aes(p.value)) +
  geom_histogram(binwidth = 0.05) +
  facet_grid(stat ~ alt) + theme_bw()

We can also assess false discovery rates (q.values) using p.adjust().

pvals <- group_by(pvals, stat, alt) %>%
  mutate(q.value = p.adjust(p.value)) %>%
  ungroup() %>%
  arrange(q.value)

Finally we can visualize these results using stat_ecdf().

res_fold <- res %>%
  gather(type, value, -name, -stat, -.id)

signif <- head(pvals, 25)
res_signif <- signif %>% left_join(res_fold, by = c('name','stat'))

ggplot(res_signif, aes(x = value, color = type)) +
  stat_ecdf() + 
  facet_wrap(name ~ stat) + theme_classic() + scale_x_log10()
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Removed 2 rows containing non-finite values (stat_ecdf).


bed_projection() is an additional statistical approach to assess the relationship between two intervals, based on the binomial distribution. Here, we’ll investigate the distribution of repetitive elements within the promoters coding or non-coding genes.

First we’ll extract 5kb regions upstream of the transcription start sites to represent the promoter regions for coding and non-coding genes.

# Using the same data as before
gens
#> # A tibble: 1,267 × 12
#>    chrom    start      end         name score strand cds_start  cds_end
#>    <chr>    <int>    <int>        <chr> <chr>  <chr>     <int>    <int>
#> 1  chr22 16150528 16193009    NR_122113     0      -  16193009 16193009
#> 2  chr22 16157078 16172265    NR_133911     0      +  16172265 16172265
#> 3  chr22 16162065 16172265    NR_073459     0      +  16172265 16172265
#> 4  chr22 16162065 16172265    NR_073460     0      +  16172265 16172265
#> 5  chr22 16199673 16231289    NR_132385     0      -  16231289 16231289
#> 6  chr22 16256331 16287937 NM_001136213     0      -  16258185 16287885
#> 7  chr22 16274608 16277577    NR_046571     0      +  16277577 16277577
#> 8  chr22 16448823 16449804 NM_001005239     0      -  16448823 16449804
#> 9  chr22 17071647 17073700    NM_014406     0      -  17071766 17073440
#> 10 chr22 17082800 17129720    NR_001591     0      +  17129720 17129720
#> # ... with 1,257 more rows, and 4 more variables: item_rgb <chr>,
#> #   exon_count <int>, exon_sizes <chr>, exon_starts <chr>
rpts
#> # A tibble: 10,000 × 6
#>    chrom    start      end     name score strand
#>    <chr>    <int>    <int>    <chr> <chr>  <chr>
#> 1  chr22 10522608 10522644  (AATA)n    39      +
#> 2  chr22 10529446 10529561    AluJo   771      +
#> 3  chr22 10534866 10534924     L1MD   356      -
#> 4  chr22 10536350 10536483    L1MEd   362      -
#> 5  chr22 10536811 10537128   AluYb8  3025      +
#> 6  chr22 10543083 10543417 Tigger3a  1540      +
#> 7  chr22 10549961 10550092    MSTB1   782      -
#> 8  chr22 10554305 10554514    LTR29   835      -
#> 9  chr22 10562009 10562402    L1MCa   858      -
#> 10 chr22 10563539 10563681     L1M5   325      -
#> # ... with 9,990 more rows


# create intervals 5kb upstream of tss representing promoters
promoters <- mutate(gens,
              .start = ifelse(strand == '+', start - 5000, end - 1),
              .end   = ifelse(strand == '+', start + 1, end + 5000),
              name   = ifelse(grepl("NR_", name), "non-coding", "coding")) %>%
  select(chrom, start = .start, end = .end, name, score, strand)
  
# select coding and non-coding promoters
nc_promoters <- filter(promoters, name == "non-coding")
coding_promoters <- filter(promoters, name == "coding")
nc_promoters
#> # A tibble: 294 × 6
#>    chrom    start      end       name score strand
#>    <chr>    <dbl>    <dbl>      <chr> <chr>  <chr>
#> 1  chr22 16193008 16198009 non-coding     0      -
#> 2  chr22 16152078 16157079 non-coding     0      +
#> 3  chr22 16157065 16162066 non-coding     0      +
#> 4  chr22 16157065 16162066 non-coding     0      +
#> 5  chr22 16231288 16236289 non-coding     0      -
#> 6  chr22 16269608 16274609 non-coding     0      +
#> 7  chr22 17077800 17082801 non-coding     0      +
#> 8  chr22 17156429 17161430 non-coding     0      -
#> 9  chr22 17229327 17234328 non-coding     0      -
#> 10 chr22 17303363 17308364 non-coding     0      +
#> # ... with 284 more rows
coding_promoters
#> # A tibble: 973 × 6
#>    chrom    start      end   name score strand
#>    <chr>    <dbl>    <dbl>  <chr> <chr>  <chr>
#> 1  chr22 16287936 16292937 coding     0      -
#> 2  chr22 16449803 16454804 coding     0      -
#> 3  chr22 17073699 17078700 coding     0      -
#> 4  chr22 17302588 17307589 coding     0      -
#> 5  chr22 17302588 17307589 coding     0      -
#> 6  chr22 17489111 17494112 coding     0      -
#> 7  chr22 17560848 17565849 coding     0      +
#> 8  chr22 17560848 17565849 coding     0      +
#> 9  chr22 17602256 17607257 coding     0      -
#> 10 chr22 17602212 17607213 coding     0      -
#> # ... with 963 more rows

Next we’ll apply the bed_projection() test for each repeat class for both coding and non-coding regions.


# function to apply bed_projection to groups
gen_stats <- function(x, y, genome, grp, type = NA) {
  group_by_(x, .dots = grp) %>%
    do(repeat_counts = nrow(.),
       projection = bed_projection(., y, genome) 
       ) %>%
    mutate(type = type)
} 

pvals_nc <- gen_stats(rpts, nc_promoters, chrs, "name", "non_coding")
pvals_cd <- gen_stats(rpts, coding_promoters, chrs, "name", "coding")

pvals <- bind_rows(pvals_nc, pvals_cd) %>% 
  unnest() %>%
  select(-chrom)

#filter for repeat classes with at least 10 intervals
pvals <- filter(pvals, 
                repeat_counts > 10,
                obs_exp_ratio != 0)
# adjust pvalues 
pvals <- pvals %>%
  mutate(q.value = p.adjust(p.value))

pvals
#> # A tibble: 179 × 7
#>      name       type repeat_counts     p.value obs_exp_ratio lower_tail
#>     <chr>      <chr>         <int>       <dbl>         <dbl>      <chr>
#> 1    (A)n non_coding            28 0.003531508     4.7141679      FALSE
#> 2   (AT)n non_coding            48 0.298021071     0.9166438      FALSE
#> 3   (CA)n non_coding            31 0.156181108     1.4193194      FALSE
#> 4   (GT)n non_coding            42 0.247323981     1.0475929      FALSE
#> 5    (T)n non_coding            61 0.404997861     0.7212935      FALSE
#> 6   (TG)n non_coding            40 0.062261083     2.1999450      FALSE
#> 7  A-rich non_coding            54 0.348149650     0.8147945      FALSE
#> 8     Alu non_coding            15 0.044577839     2.9332600      FALSE
#> 9   AluJb non_coding           271 0.022516393     1.7859332      FALSE
#> 10  AluJo non_coding           208 0.021590030     1.9037986      FALSE
#> # ... with 169 more rows, and 1 more variables: q.value <dbl>

The projection test is a two-tailed statistical test. Significant p.values indicate either enrichment or depletion of query intervals in the reference interval sets. lower_tail = TRUE indicates that the query intervals are depleted, whereas lower_tail = FALSE indicates that the query intervals are enriched.


# show top 5 most significant repeats
sig <- pvals %>% 
  arrange(q.value) %>%
  group_by(type) %>%
  top_n(-5, q.value) %>%
  arrange(type)

knitr::kable(sig,
             caption = "The most significant repeats overlapping coding and non-coding gene promoters")
The most significant repeats overlapping coding and non-coding gene promoters
name type repeat_counts p.value obs_exp_ratio lower_tail q.value
AluSx1 coding 364 0.0000114 2.105784 FALSE 0.0020433
AluSx3 coding 95 0.0001425 2.913617 FALSE 0.0252231
AluJb coding 271 0.0002226 2.042757 FALSE 0.0385082
Charlie4z coding 11 0.0012298 5.806859 FALSE 0.2053719
AluSz coding 259 0.0013212 1.890779 FALSE 0.2193255
MER3 non_coding 27 0.0000270 8.147944 FALSE 0.0048130
L1MC4a non_coding 23 0.0001449 7.651983 FALSE 0.0254988
L1ME1 non_coding 38 0.0002036 5.789329 FALSE 0.0356315
G-rich non_coding 54 0.0002174 4.888767 FALSE 0.0378348
L1MC5 non_coding 29 0.0004565 6.068814 FALSE 0.0785198