Drake has small self-contained built-in examples. To see the names of the available examples, use
examples_drake()
## [1] "basic"
Then use example_drake()
to write the files for the example to your working directory. This vignette walks through the "basic"
example, for which you can get the code with
example_drake("basic")
Load your libraries first. Drake will detect loaded packages and reload them on all your compute nodes, if applicable.
library(knitr)
library(rmarkdown)
library(drake)
This example is a simulation study, and we’re using a function to simulate random datasets. Enter your simulation function and drake will import it automatically.
# Simulate a random dataset of n observations with a standard-normal
# predictor x and a Poisson(1) response y.
# @param n Number of rows to simulate.
# @return A data.frame with numeric columns x and y.
simulate <- function(n){
  data.frame(
    x = rnorm(n),
    y = rpois(n, 1)
  )
}
For each dataset we simulate, we’ll apply a bunch of methods of analysis.
# Analysis method 1: linear regression of y on x.
# @param d A data.frame with columns x and y.
# @return A fitted "lm" object.
reg1 <- function(d){
  lm(y ~ x, data = d)  # was "y ~ + x"; the stray unary "+" was a typo
}
# Analysis method 2: linear regression of y on the quadratic term x^2.
# @param d A data.frame with columns x and y.
# @return A fitted "lm" object.
reg2 <- function(d){
  d$x2 <- d$x^2
  lm(y ~ x2, data = d)
}
After we’re done, we’ll want to knit a dynamic R Markdown report with a bunch of results.
# Knit a knitr source file. The ... arguments are ignored at run time;
# they exist so extra targets (e.g. report_dependencies) can be declared
# as dependencies of the knitting step.
my_knit <- function(file, ...){
  knit(file) # drake knows you loaded the knitr package
}
# Render a Markdown file to its final output. The ... arguments are
# ignored at run time; they exist so extra targets can be declared as
# dependencies of the rendering step.
my_render <- function(file, ...){
  render(file) # drake knows you loaded the rmarkdown package
}
The example provides an example report.Rmd
, which uses readd()
and loadd()
to load objects we’ll generate with drake.
# Write the R Markdown source for a dynamic knitr report.
# The example chunk reads targets from the drake cache via readd()/loadd().
lines <- c(
  "---",
  "title: Example Report",
  "author: You",
  "output: html_document",
  "---",
  "",
  "Look how I read outputs from the drake cache.",
  "",
  "```{r example_chunk}",
  "library(drake)",
  "readd(small)",
  "readd(coef_regression2_small)",
  "loadd(large)",
  "head(large)",
  "```"
)
writeLines(lines, "report.Rmd")
In drake, your workflow plan is organized into a data frame. Each row represents a target, which is either a variable or a file that will be produced with a single command. Here is the part of the plan that generates our datasets.
# Workflow plan rows that generate the datasets: one target per row.
datasets <- plan(
  small = simulate(5),
  large = simulate(50))
datasets
## target command
## 1 small simulate(5)
## 2 large simulate(50)
Commands need not be function calls. They can be any kind of R expression except for formulas with ~
and function definitions. If we want multiple replicates, we can just use expand
, but let’s just stick to our two datasets here.
expand(datasets, values = c("rep1", "rep2"))
## target command
## 1 small_rep1 simulate(5)
## 2 small_rep2 simulate(5)
## 3 large_rep1 simulate(50)
## 4 large_rep2 simulate(50)
To plan the analyses, we first declare the methods we will use.
# Declare the analysis methods. The ..dataset.. wildcard is substituted
# with each dataset name later by analyses().
methods <- plan(
  regression1 = reg1(..dataset..),
  regression2 = reg2(..dataset..))
methods
## target command
## 1 regression1 reg1(..dataset..)
## 2 regression2 reg2(..dataset..)
The wildcard placeholder ..dataset..
says to substitute the names of our datasets one at a time in our actual analysis plan.
# Expand the method templates over every dataset. (The variable name
# shadows drake's analyses() function, but later snippets reference it,
# so it is kept.)
analyses <- analyses(methods, data = datasets)
analyses
## target command
## 1 regression1_small reg1(small)
## 2 regression1_large reg1(large)
## 3 regression2_small reg2(small)
## 4 regression2_large reg2(large)
Now, we should summarize each analysis of each dataset a few different ways.
# Declare how each analysis should be summarized; the ..analysis..
# wildcard stands in for each fitted model target.
summary_types <- plan(summ = summary(..analysis..),
  coef = coef(..analysis..))
summary_types
## target command
## 1 summ summary(..analysis..)
## 2 coef coef(..analysis..)
# Expand the summary templates over every analysis of every dataset.
# gather = NULL skips the grouping targets to keep the plan readable.
results <- summaries(summary_types, analyses, datasets,
  gather = NULL)
results
## target command
## 1 summ_regression1_small summary(regression1_small)
## 2 summ_regression1_large summary(regression1_large)
## 3 summ_regression2_small summary(regression2_small)
## 4 summ_regression2_large summary(regression2_large)
## 5 coef_regression1_small coef(regression1_small)
## 6 coef_regression1_large coef(regression1_large)
## 7 coef_regression2_small coef(regression2_small)
## 8 coef_regression2_large coef(regression2_large)
The gather
argument of summaries
is used to group summaries together by type, and I am skipping it here to make the workflow plan data frames more readable. The ..analysis..
wildcard acts similarly to the ..dataset..
wildcard. Functions analyses()
and summaries()
make use of evaluate()
and gather()
behind the scenes, and you can use them directly for added flexibility.
For the dynamic report, we have to tell drake which targets will be loaded into the embedded R chunks. That way, when the targets change, the report will automatically rebuild.
# Declare which targets the report's R chunks will load, so the report
# rebuilds whenever any of them change.
load_in_report <- plan(
  report_dependencies = c(small, large, coef_regression2_small))
load_in_report
## target command
## 1 report_dependencies c(small, large, coef_regression2_small)
In the commands to render the report, keep in mind the rule for working with files: use single quotes to declare external file targets and dependencies, and use double quotes to remove any special meaning from character strings.
# Plan rows for the report files. Single quotes mark file targets and
# dependencies; strings_in_dots = "filenames" treats quoted strings in
# the commands as file names.
report <- plan(
  report.md = my_knit('report.Rmd', report_dependencies),
  report.html = my_render('report.md', report_dependencies),
  file_targets = TRUE, strings_in_dots = "filenames")
report
## target command
## 1 'report.md' my_knit('report.Rmd', report_dependencies)
## 2 'report.html' my_render('report.md', report_dependencies)
To finish planning your full workflow, use rbind()
to piece all your commands together. Row order does not matter here. Drake knows which commands to run first.
# Assemble the full workflow plan; drake infers the build order, so row
# order does not matter.
plan <- rbind(report, datasets, load_in_report, analyses, results)
plan
## target command
## 1 'report.md' my_knit('report.Rmd', report_dependencies)
## 2 'report.html' my_render('report.md', report_dependencies)
## 3 small simulate(5)
## 4 large simulate(50)
## 5 report_dependencies c(small, large, coef_regression2_small)
## 6 regression1_small reg1(small)
## 7 regression1_large reg1(large)
## 8 regression2_small reg2(small)
## 9 regression2_large reg2(large)
## 10 summ_regression1_small summary(regression1_small)
## 11 summ_regression1_large summary(regression1_large)
## 12 summ_regression2_small summary(regression2_small)
## 13 summ_regression2_large summary(regression2_large)
## 14 coef_regression1_small coef(regression1_small)
## 15 coef_regression1_large coef(regression1_large)
## 16 coef_regression2_small coef(regression2_small)
## 17 coef_regression2_large coef(regression2_large)
Optionally, check your workflow plan for obvious errors and pitfalls.
check(plan)
Use make(plan)
to run your workflow.
make(plan)
## import 'report.Rmd'
## import c
## import summary
## import coef
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
## build small
## build large
## build regression1_small
## build regression1_large
## build regression2_small
## build regression2_large
## build summ_regression1_small
## build summ_regression1_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression1_small
## build coef_regression1_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'
## processing file: report.Rmd
## output file: report.md
## build 'report.html'
## /lrlhps/apps/pandoc/pandoc +RTS -K512m -RTS report.utf8.md --to html --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output report.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template /home/c240390/.R/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /node/scratch/RtmpMkZD68/rmarkdown-str21a315f1bbb4.html --mathjax --variable 'mathjax-url:https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
##
## Output created: report.html
Use readd()
and loadd()
to see the targets you generated. (They are stored in the hidden .drake/
folder using storr.) Other functions interact with and view the cache.
readd(coef_regression2_large)
## (Intercept) x2
## 0.89247217 -0.06069947
loadd(small)
head(small)
## x y
## 1 0.3410611 2
## 2 -0.1728483 1
## 3 -0.6123971 1
## 4 -0.3395540 2
## 5 -0.9519455 1
cached(small, large)
## small large
## TRUE TRUE
cached()
## [1] "'report.Rmd'" "'report.html'"
## [3] "'report.md'" "c"
## [5] "coef" "coef_regression1_large"
## [7] "coef_regression1_small" "coef_regression2_large"
## [9] "coef_regression2_small" "data.frame"
## [11] "knit" "large"
## [13] "lm" "my_knit"
## [15] "my_render" "reg1"
## [17] "reg2" "regression1_large"
## [19] "regression1_small" "regression2_large"
## [21] "regression2_small" "render"
## [23] "report_dependencies" "rnorm"
## [25] "rpois" "simulate"
## [27] "small" "summ_regression1_large"
## [29] "summ_regression1_small" "summ_regression2_large"
## [31] "summ_regression2_small" "summary"
built()
## [1] "'report.html'" "'report.md'"
## [3] "coef_regression1_large" "coef_regression1_small"
## [5] "coef_regression2_large" "coef_regression2_small"
## [7] "large" "regression1_large"
## [9] "regression1_small" "regression2_large"
## [11] "regression2_small" "report_dependencies"
## [13] "small" "summ_regression1_large"
## [15] "summ_regression1_small" "summ_regression2_large"
## [17] "summ_regression2_small"
imported()
## [1] "'report.Rmd'" "c" "coef" "data.frame"
## [5] "knit" "lm" "my_knit" "my_render"
## [9] "reg1" "reg2" "render" "rnorm"
## [13] "rpois" "simulate" "summary"
head(read_plan())
## target command
## 1 'report.md' my_knit('report.Rmd', report_dependencies)
## 2 'report.html' my_render('report.md', report_dependencies)
## 3 small simulate(5)
## 4 large simulate(50)
## 5 report_dependencies c(small, large, coef_regression2_small)
## 6 regression1_small reg1(small)
# plot_graph() # plots the tree structure of your workflow plan
head(status()) # last call to make()
## 'report.Rmd' 'report.html' 'report.md'
## "finished" "finished" "finished"
## c coef coef_regression1_large
## "finished" "finished" "finished"
status(large)
## large
## "finished"
The next time you run make(plan)
, nothing will be built because drake knows everything is up to date.
make(plan)
## Warning in assert_unique_names(imports = names(imports), targets = plan$target): There are targets in your workflow plan that share names with imported objects from your environment/workspace. Behavior may be unpredictable. Duplicates found:
## small
## import 'report.Rmd'
## import c
## import summary
## import coef
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
But if you change one of your functions, commands, or other dependencies, drake will update the affected parts of the workflow. Let’s say we want to change the quadratic term to a cubic term in our reg2()
function.
# Redefine reg2() with a cubic term instead of a quadratic one, so that
# only the targets depending on reg2() become outdated.
reg2 <- function(d){
  d$x3 <- d$x^3
  lm(y ~ x3, data = d)
}
Voila! Targets depending on reg2()
are updated, and those depending only on reg1()
are left alone.
make(plan)
## Warning in assert_unique_names(imports = names(imports), targets = plan$target): There are targets in your workflow plan that share names with imported objects from your environment/workspace. Behavior may be unpredictable. Duplicates found:
## small
## import 'report.Rmd'
## import c
## import summary
## import coef
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
## build regression2_small
## build regression2_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'
## processing file: report.Rmd
## output file: report.md
## build 'report.html'
## /lrlhps/apps/pandoc/pandoc +RTS -K512m -RTS report.utf8.md --to html --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output report.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template /home/c240390/.R/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable 'theme:bootstrap' --include-in-header /node/scratch/RtmpMkZD68/rmarkdown-str21a3dcaaf66.html --mathjax --variable 'mathjax-url:https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
##
## Output created: report.html
Just append rows to the workflow plan. If the rest of your workflow is up to date, only the new work is run.
# A second simulation function: n observations with independent
# standard-normal x and y columns.
# @param n Number of rows to simulate.
# @return A data.frame with numeric columns x and y.
new_simulation <- function(n){
  data.frame(x = rnorm(n), y = rnorm(n))
}
# A new plan row to append to the existing workflow.
additions <- plan(
  new_data = new_simulation(36) + sqrt(10))
additions
## target command
## 1 new_data new_simulation(36) + sqrt(10)
# Append the new rows; only the new work will run if the rest is up to date.
plan <- rbind(plan, additions)
plan
## target command
## 1 'report.md' my_knit('report.Rmd', report_dependencies)
## 2 'report.html' my_render('report.md', report_dependencies)
## 3 small simulate(5)
## 4 large simulate(50)
## 5 report_dependencies c(small, large, coef_regression2_small)
## 6 regression1_small reg1(small)
## 7 regression1_large reg1(large)
## 8 regression2_small reg2(small)
## 9 regression2_large reg2(large)
## 10 summ_regression1_small summary(regression1_small)
## 11 summ_regression1_large summary(regression1_large)
## 12 summ_regression2_small summary(regression2_small)
## 13 summ_regression2_large summary(regression2_large)
## 14 coef_regression1_small coef(regression1_small)
## 15 coef_regression1_large coef(regression1_large)
## 16 coef_regression2_small coef(regression2_small)
## 17 coef_regression2_large coef(regression2_large)
## 18 new_data new_simulation(36) + sqrt(10)
make(plan)
## Warning in assert_unique_names(imports = names(imports), targets = plan$target): There are targets in your workflow plan that share names with imported objects from your environment/workspace. Behavior may be unpredictable. Duplicates found:
## small
## import 'report.Rmd'
## import c
## import summary
## import coef
## import sqrt
## import render
## import lm
## import data.frame
## import rnorm
## import rpois
## import knit
## import my_render
## import reg2
## import reg1
## import simulate
## import my_knit
## import new_simulation
## build new_data
To clean up, use clean()
. Any targets removed from the cache will have to be rebuilt on the next call to make()
, so only clean if you are sure you will not lose anything important.
clean(small, reg1) # uncaches individual targets and imported objects
clean() # cleans all targets out of the cache
clean(destroy = TRUE) # removes the cache entirely
Within a single R session and a single compute node, you can spread your work over multiple parallel processes. Note: this does not work on Windows because drake uses parallel::mclapply() in the backend.
make(plan, parallelism = "mclapply", jobs = 2) # "mclapply" is default.
readd(coef_regression2_large)
Alternatively, set parallelism = "Makefile"
to spread targets over multiple parallel R sessions. This gets into true distributed computing. Windows users will need to download and install Rtools
. The following are equivalent.
make(plan, parallelism = "Makefile", jobs = 4, verbose = FALSE)
make(plan, parallelism = "Makefile", command = "make", args = "--jobs=4 --silent")
To distribute those Makefile jobs over multiple nodes on a cluster or supercomputer, you may need a shell.sh
file like the following
#!/bin/bash
shift
echo "module load R/3.3.2; $*" | qsub -sync y -cwd -j y
Note that you may need to replace R/3.3.2
with your version of R. Next, put your main code, including your call to make(plan, ...)
, inside an R script such as script.R
. To run your workflow on the cluster, use the Linux terminal to enter the following.
nohup nice -19 R CMD BATCH script.R &
Even after you log out, a background process will remain on the login node to submit new jobs through Make as new targets become ready.
The Makefile generated by make(plan, parallelism = "Makefile")
is not standalone. Do not run it outside of drake::make()
. Drake uses dummy timestamp files to tell the Makefile what to do, and running make
in the terminal will most likely give incorrect results.
If your workflow does not fit the rigid datasets/analyses/summaries framework, check out functions expand()
, evaluate()
, and gather()
.
# A plan with two custom wildcards, MU and SIGMA, to be filled in later
# by evaluate().
df <- plan(data = simulate(center = MU, scale = SIGMA))
df
## target command
## 1 data simulate(center = MU, scale = SIGMA)
# Duplicate each row once per replicate label.
df <- expand(df, values = c("rep1", "rep2"))
df
## target command
## 1 data_rep1 simulate(center = MU, scale = SIGMA)
## 2 data_rep2 simulate(center = MU, scale = SIGMA)
evaluate(df, wildcard = "MU", values = 1:2)
## target command
## 1 data_rep1_1 simulate(center = 1, scale = SIGMA)
## 2 data_rep1_2 simulate(center = 2, scale = SIGMA)
## 3 data_rep2_1 simulate(center = 1, scale = SIGMA)
## 4 data_rep2_2 simulate(center = 2, scale = SIGMA)
evaluate(df, wildcard = "MU", values = 1:2, expand = FALSE)
## target command
## 1 data_rep1 simulate(center = 1, scale = SIGMA)
## 2 data_rep2 simulate(center = 2, scale = SIGMA)
evaluate(df, rules = list(MU = 1:2, SIGMA = c(0.1, 1)), expand = FALSE)
## target command
## 1 data_rep1 simulate(center = 1, scale = 0.1)
## 2 data_rep2 simulate(center = 2, scale = 1)
evaluate(df, rules = list(MU = 1:2, SIGMA = c(0.1, 1, 10)))
## target command
## 1 data_rep1_1_0.1 simulate(center = 1, scale = 0.1)
## 2 data_rep1_1_1 simulate(center = 1, scale = 1)
## 3 data_rep1_1_10 simulate(center = 1, scale = 10)
## 4 data_rep1_2_0.1 simulate(center = 2, scale = 0.1)
## 5 data_rep1_2_1 simulate(center = 2, scale = 1)
## 6 data_rep1_2_10 simulate(center = 2, scale = 10)
## 7 data_rep2_1_0.1 simulate(center = 1, scale = 0.1)
## 8 data_rep2_1_1 simulate(center = 1, scale = 1)
## 9 data_rep2_1_10 simulate(center = 1, scale = 10)
## 10 data_rep2_2_0.1 simulate(center = 2, scale = 0.1)
## 11 data_rep2_2_1 simulate(center = 2, scale = 1)
## 12 data_rep2_2_10 simulate(center = 2, scale = 10)
gather(df)
## target command
## 1 target list(data_rep1 = data_rep1, data_rep2 = data_rep2)
gather(df, target = "my_summaries", gather = "rbind")
## target command
## 1 my_summaries rbind(data_rep1 = data_rep1, data_rep2 = data_rep2)