R mapping

John Mount

2019-06-14

rquery re-maps a number of symbols during SQL translation.

During expression parsing the internal rquery function tokenize_call_for_SQL() implements the following re-mappings from R idioms to SQL notation.

library("rquery")
library("wrapr")

# Render the SQL translation rquery produces for each R expression string.
#
# @param strings character vector of R expressions, one expression per element.
# @return character vector holding one formatted token string per element
#   of `strings`.
show_translation <- function(strings) {
  translate_one <- function(expr_text) {
    # Parse the text into a language object, then tokenize it with no
    # column names supplied.
    tokenized <- rquery::tokenize_for_SQL(str2lang(expr_text), colnames = NULL)
    format(tokenized$parsed_toks)
  }
  vapply(strings, translate_one, FUN.VALUE = character(1))
}

# Tabulate a set of R idioms next to the SQL text rquery emits for each.
mapping_table <- data.frame(
  example = c(
    "!x", "is.na(x)", "ifelse(a, b, c)", "a^b", "a%%b",
    "a==b", "a&&b", "a&b", "a||b", "a|b",
    "pmin(a, b)", "pmax(a, b)"
  ),
  stringsAsFactors = FALSE  # keep the examples as plain character strings
)
mapping_table$translation <- show_translation(mapping_table$example)
knitr::kable(mapping_table)
example translation
!x ( NOT ( x ) )
is.na(x) ( ( x ) IS NULL )
ifelse(a, b, c) ( CASE WHEN ( a ) THEN ( b ) WHEN NOT ( a ) THEN ( c ) ELSE NULL END )
a^b POWER ( a , b )
a%%b MOD ( a , b )
a==b a = b
a&&b a AND b
a&b a AND b
a||b a OR b
a|b a OR b
pmin(a, b) ( CASE WHEN ( a ) IS NULL THEN ( b ) WHEN ( b ) IS NULL THEN ( a ) WHEN ( a ) <= ( b ) THEN ( a ) ELSE ( b ) END )
pmax(a, b) ( CASE WHEN ( a ) IS NULL THEN ( b ) WHEN ( b ) IS NULL THEN ( a ) WHEN ( a ) >= ( b ) THEN ( a ) ELSE ( b ) END )

Note: not all possible mappings are implemented. For example, we currently do not re-map %in%, preferring that the user work with set_indicator() directly.

In addition to this, the database connectors can specify additional re-mappings. These can be found by building a formal connector and inspecting its re-mappings.

# Open an in-memory SQLite database and initialize RSQLite's extensions on it.
raw_connection <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
RSQLite::initExtension(raw_connection)

# Describe the connection to rquery, attaching the per-connection options
# returned by rq_connection_tests().
db <- rquery_db_info(
  connection = raw_connection,
  is_dbi = TRUE,
  connection_options = rq_connection_tests(raw_connection)
)

# RSQLite has a non-standard modulo operator, so emit "( a % b )" instead of
# "MOD ( a , b )". The numbers 3 and 5 appear to select the 3rd and 5th tokens
# of the original "MOD ( a , b )" sequence, i.e. the two operands.
db$expr_map[["MOD"]] <- list(
  pre_sql_token("("),
  3,
  pre_sql_token("%"),
  5,
  pre_sql_token(")")
)

# Look up the connection-specific function-name mapping option and display it.
fn_name_map <- db$connection_options[[
  paste0("rquery.", rq_connection_name(db), ".", "fn_name_map")
]]
fn_name_map
##  mean 
## "avg"

We see above that “mean” is re-mapped to “avg”.

In all cases we can see what re-mappings happen by examining a query.

# Build a small local example data frame as a build_frame() table literal:
# the first row lists the column names and each `|` ends a row.
d_local <- build_frame(
   "subjectID", "surveyCategory"     , "assessmentTotal", "irrelevantCol1", "irrelevantCol2" |
   1L         , "withdrawal behavior", 5                , "irrel1"        , "irrel2"         |
   1L         , "positive re-framing", 2                , "irrel1"        , "irrel2"         |
   3L         , "withdrawal behavior", 3                , "irrel1"        , "irrel2"         |
   3L         , "positive re-framing", 4                , "irrel1"        , "irrel2"         )
# Copy the local data into the database as table 'd' (temporary, replacing any
# existing table of that name) and keep the returned table handle.
table_handle <- rq_copy_to(db, 'd',
            d_local,
            temporary = TRUE, 
            overwrite = TRUE)
# Show the handle: the table name and its columns.
print(table_handle)
## [1] "table(`d`; subjectID, surveyCategory, assessmentTotal, irrelevantCol1, irrelevantCol2)"
# Per-subject aggregate: average of assessmentTotal after clamping it below
# at 0 with pmax().
ops <- table_handle %.>%
  project(
    .,
    avg_total := avg(pmax(0, assessmentTotal)),
    groupby = "subjectID"
  )

# Print the SQL generated for this pipeline on the `db` connection, so the
# re-mapped pmax() expression can be inspected.
cat(to_sql(ops, db))
## SELECT `subjectID`, avg ( ( CASE WHEN ( 0 ) IS NULL THEN ( `assessmentTotal` ) WHEN ( `assessmentTotal` ) IS NULL THEN ( 0 ) WHEN ( 0 ) >= ( `assessmentTotal` ) THEN ( 0 ) ELSE ( `assessmentTotal` ) END ) ) AS `avg_total` FROM (
##  SELECT
##   `subjectID`,
##   `assessmentTotal`
##  FROM
##   `d`
##  ) tsql_35868337402762883376_0000000000
## GROUP BY
##  `subjectID`
# Run the operator pipeline against the database and render the result as a
# table.
ops %.>%
  execute(db, .) %.>%
  knitr::kable(.)
subjectID avg_total
1 3.5
3 3.5

Additional function re-mappings can be specified by user code. One such example is re-writing MOD as % for RSQLite, as was done above by assigning db$expr_map[["MOD"]].

# Display the function re-mappings currently registered for `db` as a table.
knitr::kable(rquery::rq_function_mappings(db))
fn_name sql_mapping simple_name_mapping
mean avg TRUE
as.Date to_date ( .(3) , 'YYYY-MM-DD' ) FALSE
MOD ( .(3) % .(5) ) FALSE
# Add a column z = subjectID modulo 3. Left-assignment is used so this matches
# the earlier `ops <- ...` definition; the pipeline itself is unchanged.
ops <- table_handle %.>%
  extend(., z := subjectID %% 3)

# Print the generated SQL to see how %% is rendered for this connection.
cat(to_sql(ops, db))
## SELECT
##  `subjectID`,
##  `surveyCategory`,
##  `assessmentTotal`,
##  `irrelevantCol1`,
##  `irrelevantCol2`,
##  ( `subjectID` % 3 )  AS `z`
## FROM (
##  SELECT
##   `subjectID`,
##   `surveyCategory`,
##   `assessmentTotal`,
##   `irrelevantCol1`,
##   `irrelevantCol2`
##  FROM
##   `d`
##  ) tsql_14279028501846047745_0000000000
# Execute the pipeline on the database connection and print the resulting rows.
execute(db, ops)
##   subjectID      surveyCategory assessmentTotal irrelevantCol1
## 1         1 withdrawal behavior               5         irrel1
## 2         1 positive re-framing               2         irrel1
## 3         3 withdrawal behavior               3         irrel1
## 4         3 positive re-framing               4         irrel1
##   irrelevantCol2 z
## 1         irrel2 1
## 2         irrel2 1
## 3         irrel2 0
## 4         irrel2 0
# Close the SQLite connection opened at the start of the examples.
DBI::dbDisconnect(raw_connection)