Quickstart

quickstart example for drake

William Michael Landau

2017-03-06

Quickstart examples

Drake has small self-contained built-in examples. To see the names of the available examples, use

examples_drake()
## [1] "basic"

Then use example_drake() to write the files for the example to your working directory. This vignette walks through the "basic" example, for which you can get the code with

example_drake("basic")

Setting up the basic example

Load your libraries first. Drake will detect loaded packages and reload them on all your compute nodes, if applicable.

library(knitr)
library(rmarkdown)
library(drake)

This example is a simulation study, and we’re using a function to simulate random datasets. Enter your simulation function and drake will import it automatically.

# Simulate a dataset with `n` rows: column `x` holds standard normal draws
# and column `y` holds Poisson draws with mean 1.
simulate <- function(n) {
  x <- rnorm(n)
  y <- rpois(n, lambda = 1)
  data.frame(x = x, y = y)
}

For each dataset we simulate, we’ll apply a bunch of methods of analysis.

# Fit a simple linear regression of column `y` on column `x` of the data
# frame `d` and return the fitted `lm` object.
reg1 <- function(d) {
  # The formula previously read `y ~ + x`; the stray unary plus did not change
  # the fit but was carried into the stored formula.
  lm(y ~ x, data = d)
}

# Regress `y` on the square of `x`: a squared column `x2` is added to the
# local copy of `d` before fitting, and the fitted `lm` object is returned.
reg2 <- function(d) {
  squared <- d$x^2
  d$x2 <- squared
  lm(y ~ x2, data = d)
}

After we’re done, we’ll want to knit a dynamic R Markdown report with a bunch of results.

# Knit `file` with knitr. Anything supplied through `...` is accepted but not
# forwarded to knit().
my_knit <- function(file, ...) {
  knit(input = file) # drake knows you loaded the knitr package
}

# Render `file` with rmarkdown. Anything supplied through `...` is accepted
# but not forwarded to render().
my_render <- function(file, ...) {
  render(input = file) # drake knows you loaded the rmarkdown package
}

The example provides an example report.Rmd, which uses readd() and loadd() to load objects we’ll generate with drake.

# R Markdown source for a dynamic knitr report: a YAML header, one sentence of
# text, and a single chunk that reads targets back with readd() and loadd().
lines <- c(
  "---",
  "title: Example Report",
  "author: You",
  "output: html_document",
  "---",
  "",
  "Look how I read outputs from the drake cache.",
  "",
  "```{r example_chunk}",
  "library(drake)",
  "readd(small)",
  "readd(coef_regression2_small)",
  "loadd(large)",
  "head(large)",
  "```"
)

# Save the report source as report.Rmd in the working directory.
writeLines(text = lines, con = "report.Rmd")

The workflow plan

In drake, your workflow plan is organized into a data frame. Each row represents a target, which is either a variable or a file that will be produced with a single command. Here is the part of the plan that generates our datasets.

# Workflow plan rows for the two simulated datasets.
datasets <- plan(
  small = simulate(5),
  large = simulate(50)
)
datasets
##   target      command
## 1  small  simulate(5)
## 2  large simulate(50)

Commands need not be function calls. They can be any kind of R expression except for formulas with ~ and function definitions. If I want multiple replicates, I can just use expand(), but let’s just stick to our two datasets here.

expand(datasets, values = c("rep1", "rep2"))
##       target      command
## 1 small_rep1  simulate(5)
## 2 small_rep2  simulate(5)
## 3 large_rep1 simulate(50)
## 4 large_rep2 simulate(50)

To plan our analyses, we first declare the methods we will use.

# One row per analysis method; ..dataset.. is a wildcard placeholder.
methods <- plan(
  regression1 = reg1(..dataset..),
  regression2 = reg2(..dataset..)
)
methods
##        target           command
## 1 regression1 reg1(..dataset..)
## 2 regression2 reg2(..dataset..)

The wildcard placeholder ..dataset.. says to substitute the names of our datasets one at a time in our actual analysis plan.

# Build the analysis plan from the declared methods and the datasets plan.
analyses <- analyses(
  methods,
  data = datasets
)
analyses
##              target     command
## 1 regression1_small reg1(small)
## 2 regression1_large reg1(large)
## 3 regression2_small reg2(small)
## 4 regression2_large reg2(large)

Now, we should summarize each analysis of each dataset a few different ways.

# One row per kind of summary; ..analysis.. is a wildcard placeholder.
summary_types <- plan(
  summ = summary(..analysis..),
  coef = coef(..analysis..)
)
summary_types
##   target               command
## 1   summ summary(..analysis..)
## 2   coef    coef(..analysis..)
# Build the summaries plan; gather = NULL skips grouping the summaries by type.
results <- summaries(
  summary_types, analyses, datasets,
  gather = NULL
)
results
##                   target                    command
## 1 summ_regression1_small summary(regression1_small)
## 2 summ_regression1_large summary(regression1_large)
## 3 summ_regression2_small summary(regression2_small)
## 4 summ_regression2_large summary(regression2_large)
## 5 coef_regression1_small    coef(regression1_small)
## 6 coef_regression1_large    coef(regression1_large)
## 7 coef_regression2_small    coef(regression2_small)
## 8 coef_regression2_large    coef(regression2_large)

The gather argument of summaries() is used to group summaries together by type, and I am skipping it here to make the workflow plan data frames more readable. The ..analysis.. wildcard acts similarly to the ..dataset.. wildcard. Functions analyses() and summaries() make use of evaluate() and gather() behind the scenes, which you can also use directly for added flexibility.

For the dynamic report, we have to tell drake which targets will be loaded into the embedded R chunks. That way, when the targets change, the report will automatically rebuild.

# Single target listing the objects that report.Rmd reads from the cache.
load_in_report <- plan(
  report_dependencies = c(small, large, coef_regression2_small)
)
load_in_report
##                target                                 command
## 1 report_dependencies c(small, large, coef_regression2_small)

In the commands to render the report, keep in mind the rule for working with files: use single quotes to declare external file targets and dependencies, and use double quotes to remove any special meaning from character strings.

# File targets for the knitted and rendered report. The single quotes around
# the file names are deliberate: they declare external files.
report <- plan(
  report.md = my_knit('report.Rmd', report_dependencies),
  report.html = my_render('report.md', report_dependencies),
  file_targets = TRUE,
  strings_in_dots = "filenames"
)
report
##          target                                     command
## 1   'report.md'  my_knit('report.Rmd', report_dependencies)
## 2 'report.html' my_render('report.md', report_dependencies)

To finish planning your full workflow, use rbind() to piece all your commands together. Row order does not matter here. Drake knows which commands to run first.

# Stack the partial plans into the full workflow plan.
plan <- rbind(
  report,
  datasets,
  load_in_report,
  analyses,
  results
)
plan
##                    target                                     command
## 1             'report.md'  my_knit('report.Rmd', report_dependencies)
## 2           'report.html' my_render('report.md', report_dependencies)
## 3                   small                                 simulate(5)
## 4                   large                                simulate(50)
## 5     report_dependencies     c(small, large, coef_regression2_small)
## 6       regression1_small                                 reg1(small)
## 7       regression1_large                                 reg1(large)
## 8       regression2_small                                 reg2(small)
## 9       regression2_large                                 reg2(large)
## 10 summ_regression1_small                  summary(regression1_small)
## 11 summ_regression1_large                  summary(regression1_large)
## 12 summ_regression2_small                  summary(regression2_small)
## 13 summ_regression2_large                  summary(regression2_large)
## 14 coef_regression1_small                     coef(regression1_small)
## 15 coef_regression1_large                     coef(regression1_large)
## 16 coef_regression2_small                     coef(regression2_small)
## 17 coef_regression2_large                     coef(regression2_large)

Optionally, check your workflow plan for obvious errors and pitfalls.

check(plan)

Run the workflow in a single process

Use make(plan) to run your workflow.

make(plan)
## import 'report.Rmd'
## import c
## import summary
## import coef
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
## build small
## build large
## build regression1_small
## build regression1_large
## build regression2_small
## build regression2_large
## build summ_regression1_small
## build summ_regression1_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression1_small
## build coef_regression1_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'
## processing file: report.Rmd
## output file: report.md
## build 'report.html'
## /lrlhps/apps/pandoc/pandoc +RTS -K512m -RTS report.utf8.md --to html --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output report.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template /home/c240390/.R/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /node/scratch/RtmpMkZD68/rmarkdown-str21a315f1bbb4.html --mathjax --variable 'mathjax-url:https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
## 
## Output created: report.html

Use readd() and loadd() to see the targets you generated. (They are stored in the hidden .drake/ folder using storr.) Other functions let you interact with and view the cache.

readd(coef_regression2_large)
## (Intercept)          x2 
##  0.89247217 -0.06069947
loadd(small)
head(small)
##            x y
## 1  0.3410611 2
## 2 -0.1728483 1
## 3 -0.6123971 1
## 4 -0.3395540 2
## 5 -0.9519455 1
cached(small, large)
## small large 
##  TRUE  TRUE
cached()
##  [1] "'report.Rmd'"           "'report.html'"         
##  [3] "'report.md'"            "c"                     
##  [5] "coef"                   "coef_regression1_large"
##  [7] "coef_regression1_small" "coef_regression2_large"
##  [9] "coef_regression2_small" "data.frame"            
## [11] "knit"                   "large"                 
## [13] "lm"                     "my_knit"               
## [15] "my_render"              "reg1"                  
## [17] "reg2"                   "regression1_large"     
## [19] "regression1_small"      "regression2_large"     
## [21] "regression2_small"      "render"                
## [23] "report_dependencies"    "rnorm"                 
## [25] "rpois"                  "simulate"              
## [27] "small"                  "summ_regression1_large"
## [29] "summ_regression1_small" "summ_regression2_large"
## [31] "summ_regression2_small" "summary"
built()
##  [1] "'report.html'"          "'report.md'"           
##  [3] "coef_regression1_large" "coef_regression1_small"
##  [5] "coef_regression2_large" "coef_regression2_small"
##  [7] "large"                  "regression1_large"     
##  [9] "regression1_small"      "regression2_large"     
## [11] "regression2_small"      "report_dependencies"   
## [13] "small"                  "summ_regression1_large"
## [15] "summ_regression1_small" "summ_regression2_large"
## [17] "summ_regression2_small"
imported()
##  [1] "'report.Rmd'" "c"            "coef"         "data.frame"  
##  [5] "knit"         "lm"           "my_knit"      "my_render"   
##  [9] "reg1"         "reg2"         "render"       "rnorm"       
## [13] "rpois"        "simulate"     "summary"
head(read_plan())
##                target                                     command
## 1         'report.md'  my_knit('report.Rmd', report_dependencies)
## 2       'report.html' my_render('report.md', report_dependencies)
## 3               small                                 simulate(5)
## 4               large                                simulate(50)
## 5 report_dependencies     c(small, large, coef_regression2_small)
## 6   regression1_small                                 reg1(small)
# plot_graph() # plots the tree structure of your workflow plan
head(status()) # last call to make()
##           'report.Rmd'          'report.html'            'report.md' 
##             "finished"             "finished"             "finished" 
##                      c                   coef coef_regression1_large 
##             "finished"             "finished"             "finished"
status(large)
##      large 
## "finished"

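The next time you run make(plan), nothing will be built because drake knows everything is up to date. (The warning in the output below appears because the earlier call to loadd(small) placed an object named small in the workspace, so it now shares its name with the small target.)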

make(plan)
## Warning in assert_unique_names(imports = names(imports), targets = plan$target): There are targets in your workflow plan that share names with imported objects from your environment/workspace. Behavior may be unpredictable. Duplicates found:
##   small
## import 'report.Rmd'
## import c
## import summary
## import coef
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit

But if you change one of your functions, commands, or other dependencies, drake will update the affected parts of the workflow. Let’s say we want to change the quadratic term to a cubic term in our reg2() function.

# Regress `y` on the cube of `x`: a cubed column `x3` is added to the local
# copy of `d` before fitting, and the fitted `lm` object is returned.
reg2 <- function(d) {
  cubed <- d$x^3
  d$x3 <- cubed
  lm(y ~ x3, data = d)
}

Voila! Targets depending on reg2() are updated, and those depending only on reg1() are left alone.

make(plan)
## Warning in assert_unique_names(imports = names(imports), targets = plan$target): There are targets in your workflow plan that share names with imported objects from your environment/workspace. Behavior may be unpredictable. Duplicates found:
##   small
## import 'report.Rmd'
## import c
## import summary
## import coef
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
## build regression2_small
## build regression2_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'
## processing file: report.Rmd
## output file: report.md
## build 'report.html'
## /lrlhps/apps/pandoc/pandoc +RTS -K512m -RTS report.utf8.md --to html --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output report.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template /home/c240390/.R/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /node/scratch/RtmpMkZD68/rmarkdown-str21a3dcaaf66.html --mathjax --variable 'mathjax-url:https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
## 
## Output created: report.html

Need to add new work on the fly?

Just append rows to the workflow plan. If the rest of your workflow is up to date, only the new work is run.

# Simulate a dataset with `n` rows whose columns `x` and `y` are both
# standard normal draws (`x` is drawn first).
new_simulation <- function(n) {
  x <- rnorm(n)
  y <- rnorm(n)
  data.frame(x = x, y = y)
}

# One extra target to append to the existing workflow plan.
additions <- plan(
  new_data = new_simulation(36) + sqrt(10)
)
additions
##     target                       command
## 1 new_data new_simulation(36) + sqrt(10)
plan = rbind(plan, additions)
plan
##                    target                                     command
## 1             'report.md'  my_knit('report.Rmd', report_dependencies)
## 2           'report.html' my_render('report.md', report_dependencies)
## 3                   small                                 simulate(5)
## 4                   large                                simulate(50)
## 5     report_dependencies     c(small, large, coef_regression2_small)
## 6       regression1_small                                 reg1(small)
## 7       regression1_large                                 reg1(large)
## 8       regression2_small                                 reg2(small)
## 9       regression2_large                                 reg2(large)
## 10 summ_regression1_small                  summary(regression1_small)
## 11 summ_regression1_large                  summary(regression1_large)
## 12 summ_regression2_small                  summary(regression2_small)
## 13 summ_regression2_large                  summary(regression2_large)
## 14 coef_regression1_small                     coef(regression1_small)
## 15 coef_regression1_large                     coef(regression1_large)
## 16 coef_regression2_small                     coef(regression2_small)
## 17 coef_regression2_large                     coef(regression2_large)
## 18               new_data               new_simulation(36) + sqrt(10)
make(plan)
## Warning in assert_unique_names(imports = names(imports), targets = plan$target): There are targets in your workflow plan that share names with imported objects from your environment/workspace. Behavior may be unpredictable. Duplicates found:
##   small
## import 'report.Rmd'
## import c
## import summary
## import coef
## import sqrt
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
## import new_simulation
## build new_data

To clean up, use clean(). Any targets removed from the cache will have to be rebuilt on the next call to make(), so only clean if you are sure you will not lose anything important.

clean(small, reg1) # uncaches individual targets and imported objects
clean() # cleans all targets out of the cache
clean(destroy = TRUE) # removes the cache entirely

High-performance parallel computing

Within a single R session and a single compute node, you can spread your work over multiple parallel processes. Note: this does not work on Windows because drake uses parallel::mclapply() in the backend.

make(plan, parallelism = "mclapply", jobs = 2) # "mclapply" is default.
readd(coef_regression2_large)

Alternatively, set parallelism = "Makefile" to spread targets over multiple parallel R sessions. This gets into true distributed computing. Windows users will need to download and install Rtools. The following are equivalent.

make(plan, parallelism = "Makefile", jobs = 4, verbose = FALSE)
make(plan, parallelism = "Makefile", command = "make", args = "--jobs=4 --silent") 

To distribute those Makefile jobs over multiple nodes on a cluster or supercomputer, you may need a shell.sh file like the following:

#!/bin/bash
shift
echo "module load R/3.3.2; $*" | qsub -sync y -cwd -j y

Note that you may need to replace R/3.3.2 with your version of R. Next, put your main code, including your call to make(plan, ...), inside an R script such as script.R. To run your workflow on the cluster, use the Linux terminal to enter the following.

nohup nice -19 R CMD BATCH script.R &

Even after you log out, a background process will remain on the login node to submit new jobs through Make as new targets become ready.

A warning about the Makefile

The Makefile generated by make(plan, parallelism = "Makefile") is not standalone. Do not run it outside of drake::make(). Drake uses dummy timestamp files to tell the Makefile what to do, and running make in the terminal will most likely give incorrect results.

Flexible generation of workflow plans

More flexibility for generating workflow plans

If your workflow does not fit the rigid datasets/analyses/summaries framework, check out functions expand(), evaluate(), and gather().

df = plan(data = simulate(center = MU, scale = SIGMA))
df
##   target                              command
## 1   data simulate(center = MU, scale = SIGMA)
df = expand(df, values = c("rep1", "rep2"))
df
##      target                              command
## 1 data_rep1 simulate(center = MU, scale = SIGMA)
## 2 data_rep2 simulate(center = MU, scale = SIGMA)
evaluate(df, wildcard = "MU", values = 1:2)
##        target                             command
## 1 data_rep1_1 simulate(center = 1, scale = SIGMA)
## 2 data_rep1_2 simulate(center = 2, scale = SIGMA)
## 3 data_rep2_1 simulate(center = 1, scale = SIGMA)
## 4 data_rep2_2 simulate(center = 2, scale = SIGMA)
evaluate(df, wildcard = "MU", values = 1:2, expand = FALSE)
##      target                             command
## 1 data_rep1 simulate(center = 1, scale = SIGMA)
## 2 data_rep2 simulate(center = 2, scale = SIGMA)
evaluate(df, rules = list(MU = 1:2, SIGMA = c(0.1, 1)), expand = FALSE)
##      target                           command
## 1 data_rep1 simulate(center = 1, scale = 0.1)
## 2 data_rep2   simulate(center = 2, scale = 1)
evaluate(df, rules = list(MU = 1:2, SIGMA = c(0.1, 1, 10)))
##             target                           command
## 1  data_rep1_1_0.1 simulate(center = 1, scale = 0.1)
## 2    data_rep1_1_1   simulate(center = 1, scale = 1)
## 3   data_rep1_1_10  simulate(center = 1, scale = 10)
## 4  data_rep1_2_0.1 simulate(center = 2, scale = 0.1)
## 5    data_rep1_2_1   simulate(center = 2, scale = 1)
## 6   data_rep1_2_10  simulate(center = 2, scale = 10)
## 7  data_rep2_1_0.1 simulate(center = 1, scale = 0.1)
## 8    data_rep2_1_1   simulate(center = 1, scale = 1)
## 9   data_rep2_1_10  simulate(center = 1, scale = 10)
## 10 data_rep2_2_0.1 simulate(center = 2, scale = 0.1)
## 11   data_rep2_2_1   simulate(center = 2, scale = 1)
## 12  data_rep2_2_10  simulate(center = 2, scale = 10)
gather(df)
##   target                                            command
## 1 target list(data_rep1 = data_rep1, data_rep2 = data_rep2)
gather(df, target = "my_summaries", gather = "rbind")
##         target                                             command
## 1 my_summaries rbind(data_rep1 = data_rep1, data_rep2 = data_rep2)