README

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

filecacher

Installation

install.packages("filecacher")

if(!requireNamespace("remotes")) install.packages("remotes")

remotes::install_github("orgadish/filecacher")

Example

# Example files: iris table split by species into three files.
iris_files_by_species <- list.files(
  system.file("extdata", package = "filecacher"),
  pattern = "_only[.]csv$",
  full.names = TRUE
)
basename(iris_files_by_species)
#> [1] "iris_setosa_only.csv"     "iris_versicolor_only.csv"
#> [3] "iris_virginica_only.csv"


# Create a temporary directory to run these examples.
tf <- withr::local_tempfile()
dir.create(tf)


something_that_takes_a_while <- function(x) {
  Sys.sleep(0.5)
  return(x)
}

# Example standard pipeline without caching:
#   1. Read using a vectorized `read.csv`.
#   2. Perform some custom processing that takes a while (currently using sleep as an example).
normal_pipeline <- function(files, cache_dir = NULL) {
  files |>
    filecacher::vectorize_reader(read.csv)() |>
    suppressMessages() |>
    something_that_takes_a_while()
}

# Same pipeline, using `cached_read` which caches the contents and the file info for checking later:
pipeline_using_cached_read <- function(files, cache_dir) {
  files |>
    filecacher::cached_read(
      label = "processed_data_using_cached_read",
      read_fn = normal_pipeline,
      cache = cache_dir,
      type = "parquet"
    )
}

# Alternate syntax, with `with_cache`. Using `with_cache` only checks that the cache file
# exists, without any information about the file list.
pipeline_using_with_cache <- function(files, cache_dir) {
  normal_pipeline(files) |>
    filecacher::with_cache(
      label = "processed_data_using_with_cache",
      cache = cache_dir,
      type = "parquet"
    )
}

# Time each pipeline when repeated 3 times:
time_pipeline <- function(pipeline_fn) {
  function_name <- as.character(match.call()[2])
  print(function_name)

  # Create a separate directory for the cache for this function.
  cache_dir <- tempfile(function_name, tmpdir = tf)
  dir.create(cache_dir)

  gc()

  for (i in 1:3) {
    print(system.time(pipeline_fn(iris_files_by_species, cache_dir)))
  }
}

time_pipeline(normal_pipeline)
#> [1] "normal_pipeline"
#>    user  system elapsed 
#>    0.06    0.03    0.60 
#>    user  system elapsed 
#>    0.00    0.02    0.52 
#>    user  system elapsed 
#>     0.0     0.0     0.5
time_pipeline(pipeline_using_cached_read)
#> [1] "pipeline_using_cached_read"
#>    user  system elapsed 
#>    0.59    0.18    1.30 
#>    user  system elapsed 
#>    0.03    0.00    0.03 
#>    user  system elapsed 
#>    0.00    0.02    0.01
time_pipeline(pipeline_using_with_cache)
#> [1] "pipeline_using_with_cache"
#>    user  system elapsed 
#>    0.01    0.02    0.53 
#>    user  system elapsed 
#>    0.01    0.01    0.03 
#>    user  system elapsed 
#>    0.00    0.02    0.02

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.