In this vignette I show comparisons between namedCapture::df_match_variable and its closest cousin in the R package universe, tidyr::extract. The two packages can be used to compute the same result, but the code/syntax is different.
In this first comparison we use a syntax with each group name on the same line as its pattern. Here are some observations from the comparison:
namedCapture::df_match_variable
Type conversion in tidyr is done with the convert argument of tidyr::extract, which uses utils::type.convert. Because type.convert does not know how to convert strings like 111,000 to integer, we first need to use remove.commas to create a new data.frame to use as input to tidyr::extract. In contrast, namedCapture supports arbitrary group-specific type conversion functions; we specify to.int on the same line as the corresponding name/pattern for the chromStart/chromEnd groups.
## First define data.
## Example input table: two character columns, each one parsed by its own
## regular expression in the code that follows.
sacct.df <- data.frame(
  position = c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # a missing value does not match either.
    "chr1:110-111 chr2:220-222"), # contains two candidate matches.
  JobID = c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors = FALSE)
sacct.df # display the input table.
#> position JobID
#> 1 chr10:213,054,000-213,055,000 13937810_25
#> 2 chrM:111,000-222,000 13937810_25.batch
#> 3 this will not match 13937810_25.extern
#> 4 <NA> 14022192_[1-3]
#> 5 chr1:110-111 chr2:220-222 14022204_[4]
## Delete every comma from each element of x, e.g. "111,000" -> "111000".
## Missing values stay missing.
remove.commas <- function(x) {
  gsub(",", "", x, fixed = TRUE)
}
## Container for the results of the first comparison, one element per package.
long.list <- list()
## Long syntax with namedCapture. Each pattern is a list in which a
## named string defines a capture group (name = group name, value =
## regex for that group), an unnamed string is a piece of the regex
## that is not captured, a function that follows a named string is the
## type conversion applied to that group, and a nested list
## (range.list, task.list) splices in a sub-pattern defined earlier.
## The result is printed, with one new column per group, named
## <input column>.<group name>.
## namedCapture: 29 lines of code.
range.list <- list(
"\\[",
task1="[0-9]+", as.integer,
"(?:-",#begin optional end of range.
taskN="[0-9]+", as.integer,
")?", #end is optional.
"\\]")
task.list <- list(
"(?:",#begin alternate
task="[0-9]+", as.integer,
"|",#either one task(above) or range(below)
range.list,
")")#end alternate
to.int <- function(x)as.integer(remove.commas(x))#"111,000" -> 111000L
(long.list$namedCapture <- namedCapture::df_match_variable(
sacct.df,
JobID=list(
job="[0-9]+", as.integer,
"_",
task.list,
"(?:[.]",#begin optional type suffix after a literal dot.
type=".*",
")?"),
position=list(
chrom="chr.*?",
":",
chromStart=".*?", to.int,#commas are removed before integer conversion.
"-",
chromEnd="[0-9,]*", to.int)))
#> position JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000 13937810_25 13937810 25
#> 2 chrM:111,000-222,000 13937810_25.batch 13937810 25
#> 3 this will not match 13937810_25.extern 13937810 25
#> 4 <NA> 14022192_[1-3] 14022192 NA
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 14022204 NA
#> JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1 NA NA chr10 213054000
#> 2 NA NA batch chrM 111000
#> 3 NA NA extern <NA> NA
#> 4 1 3 <NA> NA
#> 5 4 NA chr1 110
#> position.chromEnd
#> 1 213055000
#> 2 222000
#> 3 NA
#> 4 NA
#> 5 111
## Long syntax with tidyr. The regex pieces are stored in character
## vectors; named elements are the capture groups and unnamed
## elements are pieces of the regex that are not captured. The loop
## below assembles one regex per input column, calls tidyr::extract,
## and keeps the extracted columns renamed to <input column>.<group name>.
## tidyr: 46 lines of code.
range.vec <- c(
"\\[",
task1="[0-9]+",
"(?:-",#begin optional end of range.
taskN="[0-9]+",
")?", #end is optional.
"\\]")
task.vec <- c(
"(?:",#begin alternate
task="[0-9]+",
"|",#either one task(above) or range(below)
range.vec,
")")#end alternate
regex.list <- list(
JobID=c(
job="[0-9]+",
"_",
task.vec,
"(?:[.]",
type=".*",
")?"),
position=c(
chrom="chr.*?",
":",
chromStart=".*?",
"-",
chromEnd="[0-9,]*"))
## Copy of the data with commas removed from position, so that
## convert=TRUE can turn chromStart/chromEnd into integers.
tidyr.input <- transform(
sacct.df,
position=remove.commas(position))
## The first element is the original data (commas kept), so the final
## table starts with the unmodified input columns.
tidyr.df.list <- list(sacct.df)
for(col.name in names(regex.list)){
regex.vec <- regex.list[[col.name]]
is.group <- names(regex.vec)!=""#TRUE for named elements = capture groups.
format.vec <- ifelse(is.group, "(%s)", "%s")#parentheses only around groups.
group.vec <- sprintf(format.vec, regex.vec)
regex <- paste(group.vec, collapse="")
group.names <- names(regex.vec)[is.group]
result <- tidyr::extract(
tidyr.input, col.name, group.names, regex, convert=TRUE)
to.save <- result[, group.names, drop=FALSE]#keep only the extracted columns.
names(to.save) <- paste0(col.name, ".", group.names)
tidyr.df.list[[col.name]] <- to.save
}
## NOTE(review): presumably the list names are dropped so that cbind
## does not add them as a prefix to the column names -- confirm.
names(tidyr.df.list) <- NULL
long.list$tidyr <- do.call(cbind, tidyr.df.list)
## Check that both results have the same column names in the same
## order: one row of column names per package.
do.call(rbind, lapply(long.list, names))
#> [,1] [,2] [,3] [,4] [,5]
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> tidyr "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> [,6] [,7] [,8]
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom"
#> tidyr "JobID.taskN" "JobID.type" "position.chrom"
#> [,9] [,10]
#> namedCapture "position.chromStart" "position.chromEnd"
#> tidyr "position.chromStart" "position.chromEnd"
## One row per package, one entry per column giving the column class.
do.call(rbind, lapply(long.list, sapply, class))
#> position JobID JobID.job JobID.task JobID.task1
#> namedCapture "character" "character" "integer" "integer" "integer"
#> tidyr "character" "character" "integer" "integer" "integer"
#> JobID.taskN JobID.type position.chrom position.chromStart
#> namedCapture "integer" "character" "character" "integer"
#> tidyr "integer" "character" "character" "integer"
#> position.chromEnd
#> namedCapture "integer"
#> tidyr "integer"
## Rows without a type suffix are NA in the tidyr result and "" in the
## namedCapture result; use "" in both so the tables can be compared exactly.
long.list$tidyr$JobID.type[is.na(long.list$tidyr$JobID.type)] <- ""
identical(long.list$tidyr, long.list$namedCapture)
#> [1] TRUE
This second comparison uses a syntax with the entire regex on one line. In my opinion this syntax makes the regular expressions more difficult to read/understand. Complicated regular expressions like the one used for matching the JobID column are not maintainable/understandable at all using this syntax.
## Input table for the second comparison: two character columns, each
## one parsed by its own regular expression.
sacct.df <- data.frame(
  position = c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # a missing value does not match either.
    "chr1:110-111 chr2:220-222"), # contains two candidate matches.
  JobID = c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors = FALSE)
sacct.df # display the input table.
#> position JobID
#> 1 chr10:213,054,000-213,055,000 13937810_25
#> 2 chrM:111,000-222,000 13937810_25.batch
#> 3 this will not match 13937810_25.extern
#> 4 <NA> 14022192_[1-3]
#> 5 chr1:110-111 chr2:220-222 14022204_[4]
## Container for the results of the second comparison, one element per package.
short.list <- list()
## tidyr alternate (13 lines total)
## e(col.name, group.names, pattern): run tidyr::extract on the global
## sacct.df for one input column and return only the extracted
## columns, renamed to <col.name>.<group name>.
##   col.name: name of the column of sacct.df to parse.
##   group.names: names for the capture groups of pattern, in order.
##   pattern: regex, written on one line, with one group per name.
## The commas in position are not removed here, so chromStart and
## chromEnd remain character after convert=TRUE.
e <- function(col.name, group.names, pattern){
result <- tidyr::extract(
sacct.df, col.name, group.names, pattern, convert=TRUE)
to.save <- result[, group.names, drop=FALSE]
names(to.save) <- paste0(col.name, ".", group.names)
to.save
}
short.list$tidyr <- do.call(cbind, list(
sacct.df,
e("JobID", c("job", "task", "task1", "taskN", "type"),
"([0-9]+)_(?:([0-9]+)|\\[([0-9]+)(?:-([0-9]+))?\\])(?:[.](.*))?"),
e("position", c("chrom", "chromStart", "chromEnd"),
"(chr.*?):(.*?)-([0-9,]*)")))
## Short syntax with namedCapture: each column gets a single regex
## string in which the groups are named inline with (?P<name>...).
## No type conversion functions are given, so every extracted column
## is character in the printed result; conversion is done afterwards
## with type.convert.
## namedCapture alternate (7 lines total)
(short.list$namedCapture <- namedCapture::df_match_variable(
sacct.df,
JobID="(?P<job>[0-9]+)_(?:(?P<task>[0-9]+)|\\[(?P<task1>[0-9]+)(?:-(?P<taskN>[0-9]+))?\\])(?:[.](?P<type>.*))?",
position="(?P<chrom>chr.*?):(?P<chromStart>.*?)-(?P<chromEnd>[0-9,]*)"))
#> position JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000 13937810_25 13937810 25
#> 2 chrM:111,000-222,000 13937810_25.batch 13937810 25
#> 3 this will not match 13937810_25.extern 13937810 25
#> 4 <NA> 14022192_[1-3] 14022192
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 14022204
#> JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1 chr10 213,054,000
#> 2 batch chrM 111,000
#> 3 extern <NA> <NA>
#> 4 1 3 <NA> <NA>
#> 5 4 chr1 110
#> position.chromEnd
#> 1 213,055,000
#> 2 222,000
#> 3 <NA>
#> 4 <NA>
#> 5 111
short.list$namedCapture[] <- lapply( # convert every column, keeping the data.frame.
  short.list$namedCapture,
  type.convert, as.is=TRUE)
## Check that both results have the same column names in the same
## order: one row of column names per package.
do.call(rbind, lapply(short.list, names))
#> [,1] [,2] [,3] [,4] [,5]
#> tidyr "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> [,6] [,7] [,8]
#> tidyr "JobID.taskN" "JobID.type" "position.chrom"
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom"
#> [,9] [,10]
#> tidyr "position.chromStart" "position.chromEnd"
#> namedCapture "position.chromStart" "position.chromEnd"
## One row per package, one entry per column giving the column class.
do.call(rbind, lapply(short.list, sapply, class))
#> position JobID JobID.job JobID.task JobID.task1
#> tidyr "character" "character" "integer" "integer" "integer"
#> namedCapture "character" "character" "integer" "integer" "integer"
#> JobID.taskN JobID.type position.chrom position.chromStart
#> tidyr "integer" "character" "character" "character"
#> namedCapture "integer" "character" "character" "character"
#> position.chromEnd
#> tidyr "character"
#> namedCapture "character"
## Rows without a type suffix are NA in the tidyr result; replace them
## with "" so the two tables can be compared exactly.
short.list$tidyr$JobID.type[is.na(short.list$tidyr$JobID.type)] <- ""
identical(short.list$tidyr, short.list$namedCapture)
#> [1] TRUE