---
title: "Rebuilding Cached Random CDISC Data"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Rebuilding Cached Random CDISC Data}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

## Getting Started

The following script is used to create, compare, and save cached data to the `data/` directory.

Starting in R 3.6.0, the default algorithm used under the hood by the random number generator for sampling changed. To make results produced after `set.seed()` match those from earlier R versions, you must first call `RNGkind(sample.kind = "Rounding")`. This call produces the expected warning:

```
Warning: non-uniform 'Rounding' sampler used
```

## Code Maintenance

Currently, when a `random.cdisc.data` data-generating function is created or modified, the code chunk below must be run to build the new or updated cached dataset and add it to the `data/` directory. If a dataset that is a dependency of another dataset has been updated, then the dependent dataset is updated as well. To manually specify which datasets should be updated, edit the `data_to_update` vector, entering the desired dataset names (see the sketch at the end of this vignette).

## Update Cached Data

**Note:** Before running the following code chunk, please ensure that you have reinstalled the `random.cdisc.data` package after completing all dataset modifications.

```{r, eval=FALSE}
# Helper function: given the initially updated datasets and the full
# dependency tree, return every dataset that must be rebuilt, ordered so
# that each dataset appears after all of its dependencies.
flatten_list_of_deps <- function(updated_data, data_deps) {
  # Datasets that depend on cur_dep ("higher" dependencies)
  get_higher_deps <- function(cur_dep, data_deps) {
    sapply(seq_along(data_deps), function(x) {
      if (any(cur_dep %in% data_deps[[x]])) {
        names(data_deps)[x]
      }
    })
  }
  # Datasets that cur_dep depends on ("lower" dependencies)
  get_lower_deps <- function(cur_dep, data_deps) {
    data_deps[sapply(cur_dep, function(x) which(x == names(data_deps)))]
  }
  # Sort a set of datasets so that dependencies come first
  sort_data_deps <- function(upd_data, data_deps) {
    iup <- upd_data
    for (ud in upd_data) {
      up <- unlist(get_lower_deps(ud, data_deps))
      if (any(up %in% upd_data)) {
        iup <- unique(unlist(c(up[up %in% upd_data], iup)))
      }
    }
    iup
  }
  # First, sort the initially updated datasets by their dependencies
  fin_up <- sort_data_deps(updated_data, data_deps)
  # Then pull in the dependents ("higher" dependencies) of each dataset in turn
  cnt <- 1
  while (cnt <= length(fin_up)) {
    cur_deps <- unlist(
      get_higher_deps(fin_up[cnt], data_deps)
    )
    if (!is.null(cur_deps)) {
      cur_deps <- sort_data_deps(cur_deps, data_deps)
      fin_up <- unique(c(fin_up[seq_len(cnt)], cur_deps, fin_up[-seq_len(cnt)]))
    }
    cnt <- cnt + 1
  }
  fin_up
}
```

```{r, eval=FALSE}
library(random.cdisc.data)
library(diffdf)
library(dplyr)

# Call this function to match random number generation from R versions
# prior to 3.6.0.
RNGkind(sample.kind = "Rounding")
```
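With the legacy sampler selected as above, seeded draws reproduce those from R versions prior to 3.6.0. A minimal sketch of the check (the particular seed and `sample()` call here are arbitrary examples):

```{r, eval=FALSE}
# Under sample.kind = "Rounding", this draw matches what the same seed
# produced in R sessions before 3.6.0.
set.seed(1)
sample(10, 3)
```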
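Before running the main update script, it can also help to sanity-check `flatten_list_of_deps()` on a small dependency tree. The tree below is purely illustrative (hand-built names, not read from the package sources):

```{r, eval=FALSE}
# Toy dependency tree: adae and adtte each take adsl as an input.
toy_deps <- list(adsl = character(0), adae = "adsl", adtte = "adsl")

# Updating adsl schedules its dependents too, with adsl kept first:
flatten_list_of_deps("adsl", toy_deps)
# returns c("adsl", "adae", "adtte")
```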
```{r, eval=FALSE}
# Datasets must be processed after all of their dependencies, e.g. adsl is
# a dependency for all other datasets, so it is handled first.
pkg_dir <- dirname(getwd())

# List the source files and extract the dataset names
src_files <- list.files(paste0(pkg_dir, "/R"))
data_nms <- src_files[grepl("^ra", src_files)] %>%
  stringr::str_remove(pattern = "^r") %>%
  stringr::str_remove(pattern = "\\.R$") %>%
  sort()

# Exception handling
data_nms <- data_nms[data_nms != "adsaftte"] # Unbuilt for now

# Construct the dependency tree from each generating function's formals
data_deps <- sapply(
  data_nms,
  function(x) {
    dat_args <- names(formals(paste0("r", x)))
    dat_args[dat_args %in% data_nms]
  }
)

git_call <- "git diff origin/main --name-only"
updated_files <- tryCatch(
  system(git_call, intern = TRUE),
  error = function(e) e
)
status_uf <- attr(updated_files, "status")
if (is(updated_files, "error") || (!is.null(status_uf) && status_uf != 0)) {
  message("The following git call failed: ", git_call)
  if (is(updated_files, "error")) message(conditionMessage(updated_files))
  message(
    "The calculation continues as default by recreating all datasets ",
    "and updating the cached data if any change is found."
  )
  updated_data <- data_nms
} else {
  updated_data <- updated_files[grepl("^R\\/", updated_files)] %>%
    stringr::str_remove("^R\\/") %>%
    stringr::str_remove(pattern = "^r") %>%
    stringr::str_remove(pattern = "\\.R$")
}

if (length(updated_data) != 0) {
  stopifnot(all(updated_data %in% names(data_deps)))
  data_to_update <- flatten_list_of_deps(updated_data, data_deps)
  default_args <- list(
    seed = 1, na_vars = list(), who_coding = TRUE, percent = 80, number = 2
  )

  # Generate and save updated cached datasets
  for (dat in data_to_update) {
    # Match arguments with defaults
    dat_args <- default_args[names(default_args) %in% names(formals(paste0("r", dat)))]
    # Fetch the cached versions of this dataset's dependencies
    # (an empty list for adsl, which has none)
    dat_deps <- lapply(data_deps[[dat]], function(x) get(paste0("c", x)))
    # Main call to the creation function
    cdataset <- do.call(paste0("r", dat), c(dat_args, dat_deps))

    # Preview differences
    cat(
      "\nSaving cached data for dataset", paste0("*", dat, "*"),
      "with the following changes found (diffdf):\n"
    )
    diff_test <- diffdf(get(paste0("c", dat)), cdataset)
    print(diff_test)

    # Check if there is any actual change to the data
    if (length(diff_test) > 0) { # No difference -> empty list()
      # Refresh the creation date only when the data actually changed,
      # and set it before assigning so it is included in the saved object
      attr(cdataset, "creation date") <- lubridate::date()
      # Save the new cached dataset
      assign(paste0("c", dat), cdataset)
      fl_save <- paste0(pkg_dir, "/data/c", dat, ".RData")
      save(list = paste0("c", dat), file = fl_save, compress = "xz")
      cat(
        "Cached dataset updated for", paste0("*", dat, "*"),
        "in", paste0("data/", basename(fl_save), "."), "\n"
      )
    } else {
      message(
        "No update detected in the final data. No cached data was updated for *",
        dat, "*."
      )
    }
  }
} else {
  message("No source files changed: no cached datasets currently require updates.")
}
```
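As noted in the Code Maintenance section, the `git diff` detection can be bypassed by setting `data_to_update` manually before the generation loop. A minimal sketch, assuming the chunks above have already been run (the dataset names here are examples only):

```{r, eval=FALSE}
# Hypothetical manual selection: rebuild adsl and adae, plus every dataset
# that depends on them, preserving the dependency order.
updated_data <- c("adsl", "adae")
data_to_update <- flatten_list_of_deps(updated_data, data_deps)
```

The `for` loop from the main chunk can then be rerun as-is with this `data_to_update`.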