In this vignette, we compare the computational requirements (time and memory) of common operations using data.table, tidyverse, and base R functions.

Setup

library(data.table)
#> data.table 1.14.7 IN DEVELOPMENT built 2022-12-07 19:18:30 UTC using 6 threads (see ?getDTthreads).  Latest news: r-datatable.com
## number of CPUs allocated by SLURM, if rendering on a cluster.
as.integer(Sys.getenv("SLURM_JOB_CPUS_PER_NODE", "1"))
#> [1] 1
max.threads <- parallel::detectCores()
## compare single-threaded, half of the cores, and all cores.
threads.vec <- unique(as.integer(c(1, max.threads/2, max.threads)))
is.toby <- system("whoami", intern=TRUE) %in% c("tdhock", "nau\\th798")
##seconds.limit <- if(is.toby)1 else 0.01
## stop increasing N for an expression once its median time exceeds this many seconds.
seconds.limit <- 1
aplot <- function(atime.list, my.title, xmax, max.seconds, xlab, color.vec=NULL){
  best.list <- atime::references_best(atime.list)
  both.dt <- best.list$meas
  ## blank data ensures the seconds panel extends at least to max.seconds.
  blank.dt <- data.table(x=both.dt$N[1], y=max.seconds, unit="seconds")
  if(require(ggplot2)){
    ## grey horizontal line showing the time limit used to stop the benchmarks.
    hline.df <- with(atime.list, data.frame(seconds.limit, unit="seconds"))
    gg <- ggplot()+
      ggtitle(my.title)+
      theme_bw()+
      geom_blank(aes(
        x, y),
        data=blank.dt)+
      facet_grid(unit ~ ., scales="free")+
      geom_hline(aes(
        yintercept=seconds.limit),
        color="grey",
        data=hline.df)+
      geom_line(aes(
        N, empirical, color=expr.name),
        data=best.list$meas)+
      ## min/max band around the median timings.
      geom_ribbon(aes(
        N, ymin=min, ymax=max, fill=expr.name),
        data=best.list$meas[unit=="seconds"],
        alpha=0.5)+
      scale_x_log10(xlab)+
      scale_y_log10("median line, min/max band")
    if(!is.null(color.vec)){
      gg <- gg+
        scale_color_manual(values=color.vec)+
        scale_fill_manual(values=color.vec)
    }
    if(require(directlabels)){
      ## label each curve directly, instead of using a legend.
      gg+
        directlabels::geom_dl(aes(
          N, empirical, color=expr.name, label=expr.name),
          method="right.polygons",
          data=best.list$meas)+
        theme(legend.position="none")+
        coord_cartesian(xlim=c(NA,xmax))
    }else{
      gg
    }
  }
}
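
The aplot helper assumes the structure returned by atime::references_best: the meas table has one row per expression, data size, and unit (seconds and kilobytes), with empirical, min, and max columns. Below is a minimal sketch which shows that structure on a toy benchmark (the two expressions are arbitrary examples, and the exact set of columns is an assumption about the atime version in use).

tiny <- atime::atime(
  N=2^seq(2, 10),
  setup={
    x <- rnorm(N)
  },
  seconds.limit=seconds.limit,
  "base::sort"=sort(x),
  "base::mean"=mean(x))
str(atime::references_best(tiny)$meas)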

Writing CSV

First we define a function which will be used in all of the write benchmarks,

atime_write <- function(make.mat.fun, fmt){
  atime::atime(
    N=10^seq(1, 7),
    setup={
      mat <- make.mat.fun(N)
      name.list <- list()
      for(fun in c("fwrite", "write_csv", "write.csv")){
        name.list[[fun]] <- file.path(
          tempdir(), sprintf(fmt, fun, N))
      }
      dt <- data.table(mat)
    },
    seconds.limit = seconds.limit,
    result=FALSE,
    expr.list=atime::atime_grid(
      list(THREADS=threads.vec),
      "data.table::fwrite"={
        data.table::setDTthreads(THREADS)
        data.table::fwrite(dt, name.list$fwrite, showProgress = FALSE)
      }, 
      "readr::write_csv"={
        readr::write_csv(
          dt, name.list$write_csv, progress = FALSE, num_threads = THREADS)
      }),
    "utils::write.csv"=utils::write.csv(dt, name.list$write.csv))
}
## interactive code for exploring color palettes (not run).
if(FALSE){
  RColorBrewer::display.brewer.all()
  dput(RColorBrewer::brewer.pal(Inf, "Set2"))
  dput(RColorBrewer::brewer.pal(Inf, "RdGy"))
}
PRGn <- c(
  "#40004B", "#762A83", "#9970AB", "#C2A5CF", "#E7D4E8",
  "#F7F7F7", 
  "#D9F0D3", "#A6DBA0", "#5AAE61", "#1B7837", "#00441B")
## RdGy palette: reds for fwrite, greys for write_csv (darker shades
## mean more threads), plus blue for write.csv.
expr.colors <- c(
  "#67001F",#dark red
  "#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
  "#FFFFFF",#white
  "#E0E0E0", "#BABABA", "#878787", "#4D4D4D",
  "#1A1A1A",#almost black
  "utils::write.csv"="deepskyblue")
names(expr.colors)[c(3,2,1)] <- paste0(
  "data.table::fwrite THREADS=",threads.vec)
names(expr.colors)[c(9,10,11)] <- paste0(
  "readr::write_csv THREADS=",threads.vec)
## keep only the colors which have been assigned an expression name.
write.colors <- expr.colors[names(expr.colors)!=""]
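
The names assigned above must match the expression names generated by atime_grid; below is a quick sketch to check the naming scheme (the exact format is an assumption about atime_grid defaults).

names(atime::atime_grid(
  list(THREADS=c(1L, 2L)),
  "data.table::fwrite"=data.table::fwrite(dt)))
## expected names like "data.table::fwrite THREADS=1" (assumption).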

The code below is for real numbers with a constant number of columns, and a variable number of rows.

random_real <- function(N.rows, N.cols){
  set.seed(1)
  matrix(rnorm(N.rows*N.cols), N.rows, N.cols)
}
write.real.vary.rows <- atime_write(
  function(N.rows, N.cols=10)random_real(N.rows, N.cols),
  "10_real_cols_%s_%d.csv")
aplot(write.real.vary.rows, "Write CSV with 10 random normal real columns", 1e7, 1e1, "Number of rows", write.colors)
#> Loading required package: ggplot2
#> Loading required package: directlabels
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis

The code below writes real numbers with a constant number of rows, and a variable number of columns.

write.real.vary.cols <- atime_write(
  function(N.cols, N.rows=10)random_real(N.rows, N.cols),
  "10_real_rows_%s_%d.csv")
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
aplot(write.real.vary.cols, "Write CSV with 10 random normal real rows", 1e7, 1e1, "Number of columns", write.colors)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Transformation introduced infinite values in continuous y-axis

The code below is for character data with a constant number of columns, and a variable number of rows.

chr_mat <- function(N.rows, N.cols){
  matrix("'quoted data'", N.rows, N.cols)
}
write.chr.vary.rows <- atime_write(
  function(N.rows,N.cols=10)chr_mat(N.rows, N.cols),
  "10_chr_cols_%s_%d.csv")
aplot(write.chr.vary.rows, "Write CSV with 10 character columns", 1e7, 1e1, "Number of rows", write.colors)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis

The code below is for character data with a constant number of rows, and a variable number of columns.

write.chr.vary.cols <- atime_write(
  function(N.cols, N.rows=10)chr_mat(N.rows, N.cols),
  "10_chr_rows_%s_%d.csv")
aplot(write.chr.vary.cols, "Write CSV with 10 character rows", 1e7, 1e1, "Number of columns", write.colors)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis

The comparisons above show significant advantages for data.table when writing CSV data with a large number of columns.

Reading CSV

First we define a list of expressions and a function which we will use for all of the read benchmarks,

read.expr.list <- c(atime::atime_grid(
  list(LAZY=c(TRUE, FALSE), THREADS=threads.vec),
  "readr::read_csv"=readr::read_csv(
    f.csv, num_threads = THREADS, lazy = LAZY,
    show_col_types=FALSE, progress=FALSE)),
  atime::atime_grid(
    list(THREADS=threads.vec),
    "data.table::fread"={
      data.table::setDTthreads(THREADS)
      data.table::fread(f.csv, showProgress=FALSE)
    }),
  "utils::read.csv"=quote(utils::read.csv(f.csv)))
atime_read <- function(glob){  
  fmt <- sub("[*]", "%d", glob)
  csv.dt <- nc::capture_first_vec(
    Sys.glob(file.path(tempdir(), glob)),
    N="[0-9]+", as.integer,
    ".csv")[order(N)]
  atime::atime(
    N=csv.dt$N,
    setup={
      f.csv <- file.path(tempdir(), sprintf(fmt, N))
    },
    seconds.limit = seconds.limit,
    result=FALSE,
    expr.list=read.expr.list)
}
PRGn <- c(
  "#40004B", "#762A83", "#9970AB", "#C2A5CF", "#E7D4E8",#5
  "#F7F7F7", #6
  "#D9F0D3", "#A6DBA0", "#5AAE61", "#1B7837", "#00441B",#11
  "#67001F",#dark red 12
  "#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
  "#FFFFFF",#white 17
  "#E0E0E0", "#BABABA", "#878787", "#4D4D4D",
  "#1A1A1A",#almost black 22
  "utils::read.csv"="deepskyblue")
## assign expression names to colors: purples and greens for the
## read_csv variants, reds for fread, blue for read.csv.
names(PRGn)[c(3,2,1,9,10,11,14,13,12,23)] <- names(read.expr.list)
read.colors <- PRGn[names(PRGn)!=""]

Below we read real numbers with a constant number of columns, and a variable number of rows.

read.real.vary.rows <- atime_read("10_real_cols_fwrite_*.csv")
aplot(read.real.vary.rows, "Read CSV with 10 real columns", 1e7, 1e1, "Number of rows", read.colors)

Below we read real numbers with a constant number of rows, and a variable number of columns.

read.real.vary.cols <- atime_read("10_real_rows_fwrite_*.csv")
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
aplot(read.real.vary.cols, "Read CSV with 10 real rows", 1e7, 1e1, "Number of columns", read.colors)

Below we read character data with a constant number of columns, and a variable number of rows.

read.chr.vary.rows <- atime_read("10_chr_cols_fwrite_*.csv")
aplot(read.chr.vary.rows, "Read CSV with 10 character columns", 1e7, 1e1, "Number of rows", read.colors)

Below we read character data with a constant number of rows, and a variable number of columns.

read.chr.vary.cols <- atime_read("10_chr_rows_fwrite_*.csv")
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
aplot(read.chr.vary.cols, "Read CSV with 10 character rows", 1e7, 1e1, "Number of columns", read.colors)

The comparisons above show that data.table::fread uses much less time and memory than the alternatives.

Summarize by group

The next problem is motivated by a common operation in machine learning code: computing the mean/SD over cross-validation folds.
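
To make the target computation concrete, the sketch below runs the same length/mean/SD summary on a tiny hypothetical table (tiny.loss is not used in the benchmarks; values depend on the random seed).

tiny.loss <- data.table(
  set=rep(c("subtrain","validation"), each=6),
  epoch=rep(1:2, each=3),
  loss=rnorm(12))
tiny.loss[, .(
  loss_length=.N,
  loss_mean=mean(loss),
  loss_sd=sd(loss)
), by=.(set, epoch)]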

summary.expr.list <- c(atime::atime_grid(
  list(THREADS=threads.vec),
  "[.data.table"={
    data.table::setDTthreads(THREADS)
    loss.dt[, .(
      loss_length=.N,
      loss_mean=mean(loss),
      loss_sd=sd(loss)
    ), by=.(set, epoch)]
  }),
  atime::atime_grid(
    "base::by"={
      base::by(
        loss.dt$loss, 
        list(loss.dt$set, loss.dt$epoch), 
        function(values)c(
          loss_length=length(values),
          loss_mean=mean(values), 
          loss_sd=sd(values)))
    },
    "base::tapply"={
      base::tapply(
        loss.dt$loss, 
        list(loss.dt$set, loss.dt$epoch), 
        function(values)c(
          loss_length=length(values),
          loss_mean=mean(values), 
          loss_sd=sd(values)))
    }, 
    "stats::aggregate"={
      res <- stats::aggregate(
        loss ~ set + epoch, 
        loss.dt, 
        function(values)list(c(
          loss_length=length(values),
          loss_mean=mean(values), 
          loss_sd=sd(values))))
      data.frame(
        subset(res, select=-loss), 
        do.call(rbind, res$loss))
    },
    "data.table::dcast"={
      dcast(
        loss.dt,
        set + epoch ~ .,
        list(length, mean, sd),
        value.var="loss")
    }, 
    "dplyr::summarise"={
      loss.dt |> 
      dplyr::group_by(set, epoch) |> 
      dplyr::summarise(
        loss_length=length(loss),
        loss_mean=mean(loss), 
        loss_sd=sd(loss))
    }, 
    "tidyr::pivot_wider"={
      loss.dt |> 
      tidyr::pivot_wider(
        id_cols = c(set,epoch), 
        values_from=loss, 
        names_from=name, 
        values_fn=function(values)list(c(
          loss_length=length(values),
          loss_mean=mean(values), 
          loss_sd=sd(values)))) |> 
          tidyr::unnest_wider(loss)
    }))
summary.colors <- c(
  "#D6604D",
  "#B2182B",
  "#67001F",#dark red
  "#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", 
  "#E5C494", "#B3B3B3")
names(summary.colors)[1:length(summary.expr.list)] <- names(summary.expr.list)
summary.colors <- summary.colors[names(summary.colors)!=""]
options(dplyr.summarise.inform=FALSE)#silence summarise() grouping messages
summary.atime.list <- atime::atime(
  N=as.integer(10^seq(0, 7, by=0.5)),
  setup={
    n.folds <- 10
    loss.dt <- data.table(
      name="loss", 
      fold=rep(1:n.folds, each=2*N),
      loss=rnorm(2*N*n.folds),
      set=rep(c("subtrain","validation"),each=N),
      epoch=1:N,
      key=c("set","epoch","fold"))
  },
  result=FALSE,
  seconds.limit=seconds.limit,
  expr.list=summary.expr.list)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.

#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
aplot(summary.atime.list, "Length, Mean, SD over 10 folds for each epoch and set", 1e7, 1e1, "Number of epochs", summary.colors)

The comparison above shows that using [.data.table is by far the fastest way to compute the mean and SD over folds.

Join / merge

i <- 1:100
DT <- CJ(letter=LETTERS, i)[, x := rnorm(.N)]
setkey(DT, letter, i)
DF <- data.frame(DT)
rownames(DF) <- with(DF, paste0(letter, i))
atime.list <- atime::atime(
  N=10^seq(1, 7),
  setup={
    select.dt <- data.table(
      letter=sample(LETTERS, N, replace=TRUE),
      i=sample(i, N, replace=TRUE),
      y=rnorm(N))
    setkey(select.dt, letter, i)
    select.df <- data.frame(select.dt)
  },
  seconds.limit=seconds.limit,
  "data.table::`[.data.table`"=DT[select.dt, x+y],
  "data.table::merge"=data.table::merge.data.table(DT,select.dt)[, x+y],
  "base::merge.data.frame"=with(base::merge.data.frame(DF, select.df, by=c('letter','i')), x+y),
  "[+paste0"=with(select.df, DF[paste0(letter,i),"x"]+y),
  "dplyr::inner_join"=with(dplyr::inner_join(DT, select.dt, by=c('letter','i')), x+y))
aplot(atime.list, "Join and sum", 1e7, 1e1, "Size of output vector")
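
Note that DT[select.dt, x+y] above performs the join and the computation in one step: the j expression is evaluated on the joined rows. A minimal sketch of this idiom, using small hypothetical tables:

small <- data.table(
  letter=c("A","B"), i=c(1L,1L), x=c(10,20), key=c("letter","i"))
query <- data.table(letter="B", i=1L, y=0.5)
small[query, x+y]
#> [1] 20.5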

Join and summarize

i <- 1:100
DT <- CJ(letter=LETTERS, i)[, x := rnorm(.N)]
setkey(DT, letter, i)
DF <- data.frame(DT)
rownames(DF) <- with(DF, paste0(letter, i))
atime.list <- atime::atime(
  N=as.integer(10^seq(0, 7, by=0.5)),
  setup={
    select.dt <- data.table(
      letter=sample(LETTERS, N, replace=TRUE),
      i=sample(i, N, replace=TRUE),
      y=rnorm(N))
    setkey(select.dt, letter, i)
    select.df <- data.frame(select.dt)
  },
  seconds.limit=seconds.limit,
  "data.table::`[.data.table`"={
    select.dt[DT, .(rows=.N, diff=mean(y)-x), by=.EACHI, nomatch=0L]
  },
  "base::by"={
    do.call(rbind, base::by(
      select.df, 
      with(select.df, paste0(letter, i)), 
      function(sdf){
        srow <- sdf[1,]
        data.frame(
          srow[,c("letter","i")],
          rows=nrow(sdf), 
          diff=mean(sdf$y)-DF[with(srow,paste0(letter,i)),"x"])
      }))
  }, 
  "dplyr::inner_join"={
    dplyr::inner_join(DT, select.dt, by=c('letter','i')) |> 
      dplyr::group_by(letter, i) |> 
      dplyr::summarise(rows=length(y), diff=mean(y)-x[1])
  })
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.

#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
aplot(atime.list, "Join and summarize", 1e10, 1e1, "Rows in join table")
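
The data.table expression above uses by=.EACHI, which evaluates j once for each row of the table inside the square brackets. A minimal sketch with hypothetical tables:

left <- data.table(g=c("a","a","b"), y=c(1,3,5), key="g")
right <- data.table(g=c("a","b"), x=c(10,20), key="g")
left[right, .(rows=.N, diff=mean(y)-x), by=.EACHI, nomatch=0L]
## expected: g="a" gives rows=2, diff=2-10=-8; g="b" gives rows=1, diff=5-20=-15.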

Rolling join

This situation arises when you want to compute summary statistics (mean, SD, etc.) on a regular grid, over some irregularly spaced numbers.
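
To see what roll="nearest" does on its own, consider the minimal sketch below (irreg and reg are hypothetical tables): each irregularly spaced X value is matched to the row of the regular grid whose X is nearest.

irreg <- data.table(X=c(0.12, 0.28), Y=c(1.2, 2.8), key="X")
reg <- data.table(grid=c(0.1, 0.2, 0.3), X=c(0.1, 0.2, 0.3), key="X")
reg[irreg, roll="nearest"]
## expected: X=0.12 matched to grid=0.1, X=0.28 matched to grid=0.3.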

digits <- 1
grid.space <- 10^(-digits)
offset <- grid.space/2
atime.list <- atime::atime(
  N=10^seq(1, 7),
  setup={
    set.seed(1)
    X <- runif(N)
    Y <- 10*X+rnorm(N)
  }, 
  seconds.limit=seconds.limit,
  "data.table::[roll=nearest"={
    irreg.dt <- data.table(X, Y, key="X")
    grid <- seq(offset, 1-offset, by=grid.space)
    reg.dt <- data.table(grid, X=grid, key="X")
    join.dt <- reg.dt[irreg.dt, roll="nearest"]
    join.dt[, .(Y.N=.N, Y.mean=mean(Y), Y.sd=sd(Y)), by=grid]
  },
  "round,data.table"={
    data.table(
      grid=round(X+offset, digits=digits)-offset,
      Y
    )[, .(
      Y.N=.N, 
      Y.mean=mean(Y), 
      Y.sd=sd(Y)
    ), by=grid]
  },
  "round,aggregate"={
    grid <- round(X+offset, digits=digits)-offset
    aggregate(Y ~ grid, FUN=function(values)c(
      N=length(values),
      mean=mean(values),
      sd=sd(values)))
  })
aplot(atime.list, "Rolling join and summarize", 1e10, 1e1, "Number of irregularly spaced data points")