Rebuilding Cached Random CDISC Data

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Getting Started

The following script is used to create, compare and save cached data to the data/ directory.

Starting in R 3.6.0 the default kind of under-the-hood random-number generator was changed. Now, in order to get the results from set.seed() to match, you have to first call the function RNGkind(sample.kind = "Rounding").

It throws the expected warning:

Warning: non-uniform 'Rounding' sampler used

Code Maintenance

Currently, when a random.cdisc.data data-generating function is created or modified, then the below code chunk must be run to build the new/updated cached dataset and add it to the data/ directory. If a dataset that is a dependency for another dataset has been updated then the dependent dataset will also be updated. To manually specify which datasets should be updated, edit the data_to_update vector below, entering the desired dataset names.

Update Cached Data

Note: Prior to running the following code chunk, please ensure that you have reinstalled the random.cdisc.data package after completing all dataset modifications.

# Helper functions
#
flatten_list_of_deps <- function(updated_data, data_deps) {
  # Get higher deps fnc
  get_higher_deps <- function(cur_dep, data_deps) {
    sapply(seq_along(data_deps), function(x) {
      if (any(cur_dep %in% data_deps[[x]])) {
        names(data_deps)[x]
      }
    })
  }

  # Get lower deps fnc
  get_lower_deps <- function(cur_dep, data_deps) {
    data_deps[sapply(cur_dep, function(x) which(x == names(data_deps)))]
  }

  # Sort data_deps
  sort_data_deps <- function(upd_data, data_deps) {
    iup <- upd_data
    for (ud in upd_data) {
      up <- unlist(get_lower_deps(ud, data_deps))
      if (any(up %in% upd_data)) {
        iup <- unique(unlist(c(up[up %in% upd_data], iup)))
      }
    }
    iup
  }

  # Firstly, lets sort by dependencies the initial updated data
  fin_up <- sort_data_deps(updated_data, data_deps)

  # Extracting higher dependencies for each value
  cnt <- 1
  while (cnt <= length(fin_up)) {
    cur_deps <- unlist(
      get_higher_deps(fin_up[cnt], data_deps)
    )
    if (!is.null(cur_deps)) {
      cur_deps <- sort_data_deps(cur_deps, data_deps)
      fin_up <- unique(c(fin_up[seq_len(cnt)], cur_deps, fin_up[-seq_len(cnt)]))
    }
    cnt <- cnt + 1
  }

  fin_up
}

rebuild_cached_data.R

library(random.cdisc.data)
library(diffdf)
library(dplyr)

# Call function to match random number generation from previous R versions
RNGkind(sample.kind = "Rounding")

# Datasets must be listed after all of their dependencies
# e.g. adsl is a dependency for all other datasets so it is listed first.

pkg_dir <- dirname(getwd())
# Listing source files and extraction of datasets' names
src_files <- list.files(paste0(pkg_dir, "/R"))
data_nms <- src_files[grepl("^ra*", src_files)] %>%
  stringr::str_remove(pattern = "^r") %>%
  stringr::str_remove(pattern = ".R$") %>%
  sort()
# Exception handling
data_nms <- data_nms[data_nms != "adsaftte"] # Unbuilt for now

# Construction of dependency tree based on formals
data_deps <- sapply(
  data_nms,
  function(x) {
    dat_args <- names(formals(paste0("r", x)))
    dat_args[dat_args %in% data_nms]
  }
)

git_call <- "git diff origin/main --name-only"
updated_files <- tryCatch(
  system(git_call, intern = TRUE),
  error = function(e) e
)
status_uf <- attr(updated_files, "status")
if (is(updated_files, "error") || (!is.null(status_uf) && status_uf == 1)) {
  message("Found following error in git call: ", git_call)
  message(e)
  message(
    "The calculation continues as default by recreating all datasets ",
    "and updating the cached data if any change is found."
  )
  updated_data <- data_nms
} else {
  updated_data <- updated_files[grepl("^R\\/", updated_files)] %>%
    stringr::str_remove("^R\\/") %>%
    stringr::str_remove(pattern = "^r") %>%
    stringr::str_remove(pattern = ".R$")
}

if (length(updated_data) != 0) {
  stopifnot(all(updated_data %in% names(data_deps)))

  data_to_update <- flatten_list_of_deps(updated_data, data_deps)
  default_args <- list(seed = 1, na_vars = list(), who_coding = TRUE, percent = 80, number = 2)

  # Generate and save updated cached datasets
  for (dat in data_to_update) {
    # Match arguments with defaults
    dat_args <- default_args[names(default_args) %in% names(formals(paste0("r", dat)))]

    # Get the data deps cache that is already there (if adsl returns list())
    dat_deps <- lapply(data_deps[[dat]], function(x) get(paste0("c", x)))

    # Main call to creation function
    cdataset <- do.call(paste0("r", dat), c(dat_args, dat_deps))

    # Preview differences
    cat("\nSaving cached data for dataset", paste0("*", dat, "*"), "with the following changes found (diffdf):\n")
    diff_test <- diffdf(get(paste0("c", dat)), cdataset)
    print(diff_test)

    # Check if there is any actual change to the data
    if (length(diff_test) > 0) { # If no difference -> list()
      # Save new cached dataset
      assign(paste0("c", dat), cdataset)
      fl_save <- paste0(dirname(getwd()), "/data/c", dat, ".RData")
      attr(cdataset, "creation date") <- lubridate::date() # This should NOT be updated if no changes in diffdf
      save(list = paste0("c", dat), file = fl_save, compress = "xz")
      cat("Cached dataset updated for", paste0("*", dat, "*"), "in", paste0("data/", basename(fl_save), "."), "\n")
    } else {
      message("No update detected on the final data. No cached data was updated for *", dat, "*.")
    }
  }
} else {
  message("No source files changed: no cached datasets currently require updates.")
}

rebuild_cached_data.R

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.