Last updated on 2024-05-02 13:50:14 CEST.
Flavor | Version | Tinstall | Tcheck | Ttotal | Status | Flags |
---|---|---|---|---|---|---|
r-devel-linux-x86_64-debian-clang | 0.2.6 | 80.09 | 838.48 | 918.57 | ERROR | |
r-devel-linux-x86_64-debian-gcc | 0.2.6 | 61.26 | 1101.72 | 1162.98 | ERROR | |
r-devel-linux-x86_64-fedora-clang | 0.2.6 | 1208.36 | ERROR | |||
r-devel-linux-x86_64-fedora-gcc | 0.2.6 | 2761.61 | NOTE | |||
r-devel-windows-x86_64 | 0.2.6 | 66.00 | 531.00 | 597.00 | NOTE | |
r-patched-linux-x86_64 | 0.2.6 | 77.88 | 1434.26 | 1512.14 | ERROR | |
r-release-linux-x86_64 | 0.2.6 | 76.99 | 1312.59 | 1389.58 | ERROR | |
r-release-macos-arm64 | 0.2.6 | 369.00 | NOTE | |||
r-release-macos-x86_64 | 0.2.6 | 1070.00 | NOTE | |||
r-release-windows-x86_64 | 0.2.6 | 74.00 | 538.00 | 612.00 | NOTE | |
r-oldrel-macos-arm64 | 0.2.6 | 411.00 | NOTE | |||
r-oldrel-macos-x86_64 | 0.2.6 | 849.00 | NOTE | |||
r-oldrel-windows-x86_64 | 0.2.6 | 87.00 | 1204.00 | 1291.00 | NOTE |
Version: 0.2.6
Check: for GNU extensions in Makefiles
Result: NOTE
GNU make is a SystemRequirements.
Flavors: r-devel-linux-x86_64-debian-clang, r-devel-linux-x86_64-debian-gcc, r-devel-linux-x86_64-fedora-clang, r-devel-linux-x86_64-fedora-gcc, r-devel-windows-x86_64, r-patched-linux-x86_64, r-release-linux-x86_64, r-release-macos-arm64, r-release-macos-x86_64, r-release-windows-x86_64, r-oldrel-macos-arm64, r-oldrel-macos-x86_64, r-oldrel-windows-x86_64
Version: 0.2.6
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [17s/21s]
Running ‘test_RadixTree.R’ [642s/395s]
Running ‘test_pairwise.R’ [122s/88s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> library(seqtrie)
> library(stringi)
> library(stringdist)
> library(Biostrings)
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, rank, rbind, rownames, sapply, setdiff, table, tapply,
union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
> library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
>
> # Use 2 threads on github actions and CRAN, 4 threads locally
> IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
> NTHREADS <- ifelse(IS_LOCAL, 4, 2)
> NITER <- ifelse(IS_LOCAL, 4, 1)
> NSEQS <- 10000
> MAXSEQLEN <- 200
> CHARSET <- "ACGT"
>
> random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
>
> mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
>
> # Biostrings notes:
> # subject (target) must be of length 1 or equal to pattern (query)
> # To get a distance matrix, iterate over target and perform a column bind
> # special_zero_case -- if both query and target are empty, Biostrings fails with an error
> pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
>
> biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': Could not load package pwalign. Is it installed?
Note that Starting with BioC 3.19, calling pairwiseAlignment() requires
the pwalign package. Please install it with:
BiocManager::install("pwalign")
Calls: <Anonymous> ... <Anonymous> -> .call_fun_in_pwalign -> .load_package_gracefully
Execution halted
Flavor: r-devel-linux-x86_64-debian-clang
Version: 0.2.6
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [34s/37s]
Running ‘test_RadixTree.R’ [14m/11m]
Running ‘test_pairwise.R’ [185s/160s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> library(seqtrie)
> library(stringi)
> library(stringdist)
> library(Biostrings)
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, rank, rbind, rownames, sapply, setdiff, table, tapply,
union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
> library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
>
> # Use 2 threads on github actions and CRAN, 4 threads locally
> IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
> NTHREADS <- ifelse(IS_LOCAL, 4, 2)
> NITER <- ifelse(IS_LOCAL, 4, 1)
> NSEQS <- 10000
> MAXSEQLEN <- 200
> CHARSET <- "ACGT"
>
> random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
>
> mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
>
> # Biostrings notes:
> # subject (target) must be of length 1 or equal to pattern (query)
> # To get a distance matrix, iterate over target and perform a column bind
> # special_zero_case -- if both query and target are empty, Biostrings fails with an error
> pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
>
> biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': Could not load package pwalign. Is it installed?
Note that Starting with BioC 3.19, calling pairwiseAlignment() requires
the pwalign package. Please install it with:
BiocManager::install("pwalign")
Calls: <Anonymous> ... <Anonymous> -> .call_fun_in_pwalign -> .load_package_gracefully
Execution halted
Flavor: r-devel-linux-x86_64-debian-gcc
Version: 0.2.6
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [26s/54s]
Running ‘test_RadixTree.R’ [14m/23m]
Running ‘test_pairwise.R’ [169s/479s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> library(seqtrie)
> library(stringi)
> library(stringdist)
> library(Biostrings)
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, rank, rbind, rownames, sapply, setdiff, table, tapply,
union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
> library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
>
> # Use 2 threads on github actions and CRAN, 4 threads locally
> IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
> NTHREADS <- ifelse(IS_LOCAL, 4, 2)
> NITER <- ifelse(IS_LOCAL, 4, 1)
> NSEQS <- 10000
> MAXSEQLEN <- 200
> CHARSET <- "ACGT"
>
> random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
>
> mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
>
> # Biostrings notes:
> # subject (target) must be of length 1 or equal to pattern (query)
> # To get a distance matrix, iterate over target and perform a column bind
> # special_zero_case -- if both query and target are empty, Biostrings fails with an error
> pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
>
> biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': Could not load package pwalign. Is it installed?
Note that starting with BioC 3.19, calling pairwiseAlignment() requires
the pwalign package. Please install it with:
BiocManager::install("pwalign")
Calls: <Anonymous> ... <Anonymous> -> .call_fun_in_pwalign -> .load_package_gracefully
Execution halted
Flavor: r-devel-linux-x86_64-fedora-clang
Version: 0.2.6
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [31s/29s]
Running ‘test_RadixTree.R’ [19m/12m]
Running ‘test_pairwise.R’ [208s/148s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> library(seqtrie)
> library(stringi)
> library(stringdist)
> library(Biostrings)
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, rank, rbind, rownames, sapply, setdiff, table, tapply,
union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
> library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
>
> # Use 2 threads on github actions and CRAN, 4 threads locally
> IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
> NTHREADS <- ifelse(IS_LOCAL, 4, 2)
> NITER <- ifelse(IS_LOCAL, 4, 1)
> NSEQS <- 10000
> MAXSEQLEN <- 200
> CHARSET <- "ACGT"
>
> random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
>
> mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
>
> # Biostrings notes:
> # subject (target) must be of length 1 or equal to pattern (query)
> # To get a distance matrix, iterate over target and perform a column bind
> # special_zero_case -- if both query and target are empty, Biostrings fails with an error
> pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
>
> biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': Could not load package pwalign. Is it installed?
Note that Starting with BioC 3.19, calling pairwiseAlignment() requires
the pwalign package. Please install it with:
BiocManager::install("pwalign")
Calls: <Anonymous> ... <Anonymous> -> .call_fun_in_pwalign -> .load_package_gracefully
Execution halted
Flavor: r-patched-linux-x86_64
Version: 0.2.6
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [34s/33s]
Running ‘test_RadixTree.R’ [17m/10m]
Running ‘test_pairwise.R’ [217s/149s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> library(seqtrie)
> library(stringi)
> library(stringdist)
> library(Biostrings)
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, rank, rbind, rownames, sapply, setdiff, table, tapply,
union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
> library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
>
> # Use 2 threads on github actions and CRAN, 4 threads locally
> IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
> NTHREADS <- ifelse(IS_LOCAL, 4, 2)
> NITER <- ifelse(IS_LOCAL, 4, 1)
> NSEQS <- 10000
> MAXSEQLEN <- 200
> CHARSET <- "ACGT"
>
> random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
>
> mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
>
> # Biostrings notes:
> # subject (target) must be of length 1 or equal to pattern (query)
> # To get a distance matrix, iterate over target and perform a column bind
> # special_zero_case -- if both query and target are empty, Biostrings fails with an error
> pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
>
> biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
>
> biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
>
> for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': Could not load package pwalign. Is it installed?
Note that Starting with BioC 3.19, calling pairwiseAlignment() requires
the pwalign package. Please install it with:
BiocManager::install("pwalign")
Calls: <Anonymous> ... <Anonymous> -> .call_fun_in_pwalign -> .load_package_gracefully
Execution halted
Flavor: r-release-linux-x86_64
Version: 0.2.6
Check: installed package size
Result: NOTE
installed size is 5.8Mb
sub-directories of 1Mb or more:
data 1.1Mb
libs 4.2Mb
Flavors: r-release-macos-arm64, r-release-macos-x86_64, r-oldrel-macos-arm64, r-oldrel-macos-x86_64
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.