From 26e9271c9f93e45f26e6f85ebb100014509ab6e3 Mon Sep 17 00:00:00 2001
From: nperez
Date: Thu, 10 Sep 2020 11:14:25 +0200
Subject: [PATCH 01/12] cleaning added to Chunk.ecf

---
 DESCRIPTION             | 2 +-
 NEWS.md                 | 2 ++
 inst/chunking/Chunk.ecf | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a75b2cf..856bb18 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: startR
 Title: Automatically Retrieve Multidimensional Distributed Data Sets
-Version: 2.0.1
+Version: 2.0.2
 Authors@R: c(
     person("BSC-CNS", role = c("aut", "cph")),
     person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")),
diff --git a/NEWS.md b/NEWS.md
index c348dda..295a972 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+# startR v2.0.2 (Release date: 2020-09-10)
+- /dev/shm automatic cleaning on Compute()
 # startR v2.0.1 (Release date: 2020-08-25)
 - Bugfix for the function .chunk(). Its name was chunk() before v2.0.0, and there were two parts that were not renamed to .chunk() in v2.0.0.
diff --git a/inst/chunking/Chunk.ecf b/inst/chunking/Chunk.ecf
index 96b7645..26f910b 100644
--- a/inst/chunking/Chunk.ecf
+++ b/inst/chunking/Chunk.ecf
@@ -15,4 +15,7 @@ task_path=%REMOTE_ECF_HOME%/%ECF_NAME%
 Rscript load_process_save_chunk.R --args $task_path insert_indices
 #include_transfer_back_and_rm
+#clean temporary folder
+rm /dev/shm/*
+
 %include "./tail.h"
--
GitLab
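The files removed here are the POSIX shared memory segments that back bigmemory's shared matrices: each Start() call run by a chunk job creates one, and segments left behind by failed jobs accumulate until /dev/shm fills up. A minimal sketch of that mechanism, assuming a Linux system where bigmemory exposes its segments under /dev/shm (the accessor is the same one the later patches in this series use):

    library(bigmemory)

    # Same kind of object Start() builds internally to share results
    # across processes.
    m <- big.matrix(nrow = 10, ncol = 1)
    desc <- describe(m)

    # The descriptor carries the auto-generated segment name.
    name <- attr(desc, 'description')$sharedName
    print(name)

    # On typical Linux systems the segment is visible as a file of that name:
    file.exists(file.path('/dev/shm', name))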
"./tail.h" diff --git a/inst/chunking/clean_devshm.sh b/inst/chunking/clean_devshm.sh new file mode 100644 index 0000000..16c490f --- /dev/null +++ b/inst/chunking/clean_devshm.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Take the filename +remote=%REMOTE_ECF_HOME%/filename.txt +name=$( Date: Mon, 14 Sep 2020 18:33:13 +0200 Subject: [PATCH 03/12] print and change filename --- inst/chunking/clean_devshm.sh | 2 +- inst/chunking/load_process_save_chunk.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/chunking/clean_devshm.sh b/inst/chunking/clean_devshm.sh index 16c490f..dfebae8 100644 --- a/inst/chunking/clean_devshm.sh +++ b/inst/chunking/clean_devshm.sh @@ -1,6 +1,6 @@ #!/bin/bash # Take the filename -remote=%REMOTE_ECF_HOME%/filename.txt +remote=%REMOTE_ECF_HOME%/Chunkfilename.txt name=$( Date: Tue, 15 Sep 2020 09:28:58 +0200 Subject: [PATCH 04/12] fixing file reading --- inst/chunking/clean_devshm.sh | 12 ++++++++---- inst/chunking/load_process_save_chunk.R | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/inst/chunking/clean_devshm.sh b/inst/chunking/clean_devshm.sh index dfebae8..0f4843e 100644 --- a/inst/chunking/clean_devshm.sh +++ b/inst/chunking/clean_devshm.sh @@ -1,9 +1,12 @@ #!/bin/bash # Take the filename -remote=%REMOTE_ECF_HOME%/Chunkfilename.txt -name=$( Date: Tue, 15 Sep 2020 11:32:01 +0200 Subject: [PATCH 05/12] correct folders --- inst/chunking/Chunk.ecf | 2 +- inst/chunking/clean_devshm.sh | 4 +++- inst/chunking/load_process_save_chunk.R | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/inst/chunking/Chunk.ecf b/inst/chunking/Chunk.ecf index af07c48..ccc6338 100644 --- a/inst/chunking/Chunk.ecf +++ b/inst/chunking/Chunk.ecf @@ -16,6 +16,6 @@ Rscript load_process_save_chunk.R --args $task_path insert_indices #include_transfer_back_and_rm #clean temporal folder -bash clean_devshm.sh +bash %REMOTE_ECF_HOME%clean_devshm.sh $task_path %include "./tail.h" diff --git a/inst/chunking/clean_devshm.sh b/inst/chunking/clean_devshm.sh index 0f4843e..a2f317b 100644 --- a/inst/chunking/clean_devshm.sh +++ b/inst/chunking/clean_devshm.sh @@ -1,6 +1,8 @@ #!/bin/bash # Take the filename -remote=%REMOTE_ECF_HOME%/filename.txt +path=$1 +name=.filename.txt +remote=$1$name echo "$remote" while IFS= read -r line do diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R index 5d777d4..bae0b54 100644 --- a/inst/chunking/load_process_save_chunk.R +++ b/inst/chunking/load_process_save_chunk.R @@ -71,9 +71,10 @@ for (input in 1:length(data)) { start_call[['num_procs']] <- threads_load } data[[input]] <- eval(start_call) - warning(attributes(data[[input]])$ObjectBigmemory) - write.table(attributes(data)$ObjectBigmemory, file = paste0(out_dir, '/filename.txt'), - col.names = FALSE, row.names = FALSE, quote = FALSE, append = TRUE) + warning(paste(out_dir, attributes(data[[input]])$ObjectBigmemory)) + write.table(attributes(data[[input]])$ObjectBigmemory, + file = paste0(task_path, '.filename.txt'), + col.names = FALSE, row.names = FALSE, quote = FALSE) } t_end_load <- Sys.time() t_load <- as.numeric(difftime(t_end_load, t_begin_load, units = 'secs')) -- GitLab From 990b4d6343ddf780a5eba7f8ca5f5a66a3da466f Mon Sep 17 00:00:00 2001 From: nperez Date: Tue, 15 Sep 2020 11:49:59 +0200 Subject: [PATCH 06/12] Remove unnecessary warning --- inst/chunking/load_process_save_chunk.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/chunking/load_process_save_chunk.R 
From 990b4d6343ddf780a5eba7f8ca5f5a66a3da466f Mon Sep 17 00:00:00 2001
From: nperez
Date: Tue, 15 Sep 2020 11:49:59 +0200
Subject: [PATCH 06/12] Remove unnecessary warning

---
 inst/chunking/load_process_save_chunk.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R
index bae0b54..8b7e843 100644
--- a/inst/chunking/load_process_save_chunk.R
+++ b/inst/chunking/load_process_save_chunk.R
@@ -71,7 +71,7 @@ for (input in 1:length(data)) {
     start_call[['num_procs']] <- threads_load
   }
   data[[input]] <- eval(start_call)
-  warning(paste(out_dir, attributes(data[[input]])$ObjectBigmemory))
+  warning(attributes(data[[input]])$ObjectBigmemory)
   write.table(attributes(data[[input]])$ObjectBigmemory,
               file = paste0(task_path, '.filename.txt'),
               col.names = FALSE, row.names = FALSE, quote = FALSE)
--
GitLab
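The next patch, besides exposing the object name as a parameter, wraps the Start() call in a tryCatch() that dumps the current contents of /dev/shm when loading fails. A standalone version of that listing (it mirrors the handler added below and assumes a Unix file system):

    # Inspect what currently occupies /dev/shm, as the new error handler does.
    for (x in list.files('/dev/shm')) {
      info <- file.info(file.path('/dev/shm', x))
      message(paste('file:', rownames(info),
                    'size:', info$size,
                    'uname:', info$uname))
    }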
From 401a47c2e0cce5e74959372adac9eed19560a626 Mon Sep 17 00:00:00 2001
From: nperez
Date: Wed, 16 Sep 2020 19:10:26 +0200
Subject: [PATCH 07/12] adding parameter for bigmemory name file

---
 DESCRIPTION                             |    2 +-
 R/Start.R                               |    8 +
 inst/chunking/load_process_save_chunk.R |   16 +-
 man/AddStep.Rd                          |    1 -
 man/CDORemapper.Rd                      |    1 -
 man/Collect.Rd                          |    1 -
 man/Compute.Rd                          |   16 +-
 man/NcCloser.Rd                         |    1 -
 man/NcDataReader.Rd                     |   10 +-
 man/NcDimReader.Rd                      |   10 +-
 man/NcOpener.Rd                         |    1 -
 man/NcVarReader.Rd                      |   10 +-
 man/SelectorChecker.Rd                  |    4 +-
 man/Sort.Rd                             |    9 +-
 man/Start.Rd                            | 1244 ++++++++++++-----
 man/Step.Rd                             |   10 +-
 man/indices.Rd                          |    1 -
 man/values.Rd                           |    1 -
 18 files changed, 701 insertions(+), 645 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e531e9d..35d79cf 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -37,4 +37,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/
 BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues
 LazyData: true
 SystemRequirements: cdo
-RoxygenNote: 5.0.0
+RoxygenNote: 7.0.1
diff --git a/R/Start.R b/R/Start.R
index 668a0b2..a5b21f5 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -686,6 +686,9 @@
 #' multiple involved files in a call to Start(). If set to NULL,
 #' takes the number of available cores (as detected by detectCores() in
 #' the package 'future'). The default value is 1 (no parallel execution).
+#'@param ObjectBigmemory A character string to be included as part of the
+#'  bigmemory object name. This parameter is intended to be used internally
+#'  by the chunking capabilities of startR.
 #'@param silent A logical value of whether to display progress messages (FALSE)
 #' or not (TRUE). The default value is FALSE.
 #'@param debug A logical value of whether to return detailed messages on the
@@ -807,6 +810,7 @@ Start <- function(..., # dim = indices/selectors,
                   path_glob_permissive = FALSE,
                   retrieve = FALSE,
                   num_procs = 1,
+                  ObjectBigmemory = NULL,
                   silent = FALSE,
                   debug = FALSE) { #, config_file = NULL
   #dictionary_dim_names = ,
@@ -3717,6 +3721,9 @@ Start <- function(..., # dim = indices/selectors,
   # to the work pieces.
   data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1)
   shared_matrix_pointer <- bigmemory::describe(data_array)
+  if (!is.null(ObjectBigmemory)) {
+    attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory
+  }
   if (is.null(num_procs)) {
     num_procs <- future::availableCores()
   }
@@ -4282,6 +4289,7 @@ Start <- function(..., # dim = indices/selectors,
                           file_data_reader, synonims, transform, transform_params,
                           silent = FALSE, debug = FALSE) {
+  warning(attr(shared_matrix_pointer, 'description')$sharedName)
   # suppressPackageStartupMessages({library(bigmemory)})
   ### TODO: Specify dependencies as parameter
   # suppressPackageStartupMessages({library(ncdf4)})
diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R
index 8b7e843..9f9f35a 100644
--- a/inst/chunking/load_process_save_chunk.R
+++ b/inst/chunking/load_process_save_chunk.R
@@ -70,7 +70,21 @@ for (input in 1:length(data)) {
   if (!('num_procs' %in% names(start_call))) {
     start_call[['num_procs']] <- threads_load
   }
-  data[[input]] <- eval(start_call)
+  nameMemoryObject <- gsub("[^0-9.-]", "", gsub(out_dir, "", task_path))
+  start_call[['ObjectBigmemory']] <- nameMemoryObject
+  data[[input]] <- tryCatch(eval(start_call),
+               # Handler when an error occurs:
+               error = function(e) {
+                 message(paste("The data cannot be loaded."))
+                 message("See the original error message:")
+                 message(e)
+                 message("\n Current files in /dev/shm:")
+                 noreturn <- lapply(list.files("/dev/shm"), function (x) {
+                              info <- file.info(paste0("/dev/shm/", x))
+                              message(paste("file:", rownames(info),
+                                      "size:", info$size,
+                                      "uname:", info$uname))})
+               })
   warning(attributes(data[[input]])$ObjectBigmemory)
   write.table(attributes(data[[input]])$ObjectBigmemory,
               file = paste0(task_path, '.filename.txt'),
               col.names = FALSE, row.names = FALSE, quote = FALSE)
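The name derivation above keeps only the digits, dots and dashes of the task-relative path, so every chunk gets a short, file-system-safe identifier. A worked example; the paths are invented, the gsub() calls are the ones from the script:

    out_dir   <- '/scratch/user/startR_hpc'
    task_path <- paste0(out_dir, '/STARTR_CHUNKING_1600000000',
                        '/computation/lat_chunk_1/lon_chunk_2/Chunk')
    gsub("[^0-9.-]", "", gsub(out_dir, "", task_path))
    # [1] "160000000012"   # digits surviving from the suite id and chunk indices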
diff --git a/man/AddStep.Rd b/man/AddStep.Rd
index 3eece05..0d0ce46 100644
--- a/man/AddStep.Rd
+++ b/man/AddStep.Rd
@@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing.
               wf <- AddStep(data, step, pi_val = pi_short)

 }
-
diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd
index 4f56baa..763be77 100644
--- a/man/CDORemapper.Rd
+++ b/man/CDORemapper.Rd
@@ -65,4 +65,3 @@ perform the interpolation, hence CDO is required to be installed.
 \seealso{
 \code{\link[s2dverification]{CDORemap}}
 }
-
diff --git a/man/Collect.Rd b/man/Collect.Rd
index 44a7dee..97b529b 100644
--- a/man/Collect.Rd
+++ b/man/Collect.Rd
@@ -83,4 +83,3 @@ of results as one data array when the execution is done. See more details on
 }

 }
-
diff --git a/man/Compute.Rd b/man/Compute.Rd
index e07106a..7d6db4d 100644
--- a/man/Compute.Rd
+++ b/man/Compute.Rd
@@ -4,9 +4,18 @@
 \alias{Compute}
 \title{Specify the execution parameters and trigger the execution}
 \usage{
-Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1,
-  cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL,
-  silent = FALSE, debug = FALSE, wait = TRUE)
+Compute(
+  workflow,
+  chunks = "auto",
+  threads_load = 1,
+  threads_compute = 1,
+  cluster = NULL,
+  ecflow_suite_dir = NULL,
+  ecflow_server = NULL,
+  silent = FALSE,
+  debug = FALSE,
+  wait = TRUE
+)
 }
 \arguments{
 \item{workflow}{A list of the class 'startR_workflow' returned by function
@@ -104,4 +113,3 @@ arrays and additional metadata.
  res <- Compute(wf, chunks = list(longitude = 4, sdate = 2))

 }
-
diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd
index 65beab8..588f63a 100644
--- a/man/NcCloser.Rd
+++ b/man/NcCloser.Rd
@@ -32,4 +32,3 @@ NcCloser(connection)
 \code{\link{NcOpener}} \code{\link{NcDataReader}}
 \code{\link{NcDimReader}} \code{\link{NcVarReader}}
 }
-
diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd
index a6d32c7..9014789 100644
--- a/man/NcDataReader.Rd
+++ b/man/NcDataReader.Rd
@@ -4,8 +4,13 @@
 \alias{NcDataReader}
 \title{NetCDF file data reader for 'startR'}
 \usage{
-NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL,
-  inner_indices = NULL, synonims)
+NcDataReader(
+  file_path = NULL,
+  file_object = NULL,
+  file_selectors = NULL,
+  inner_indices = NULL,
+  synonims
+)
 }
 \arguments{
 \item{file_path}{A character string indicating the path to the data file to
@@ -61,4 +66,3 @@ in turn uses ncvar_get() in the package 'ncdf4'.
 \code{\link{NcOpener}} \code{\link{NcDimReader}}
 \code{\link{NcCloser}} \code{\link{NcVarReader}}
 }
-
diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd
index d539ffd..38dd870 100644
--- a/man/NcDimReader.Rd
+++ b/man/NcDimReader.Rd
@@ -4,8 +4,13 @@
 \alias{NcDimReader}
 \title{NetCDF dimension reader for 'startR'}
 \usage{
-NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL,
-  inner_indices = NULL, synonims)
+NcDimReader(
+  file_path = NULL,
+  file_object = NULL,
+  file_selectors = NULL,
+  inner_indices = NULL,
+  synonims
+)
 }
 \arguments{
 \item{file_path}{A character string indicating the path to the data file to
@@ -58,4 +63,3 @@ This function uses the function NcReadDims() in the package 'easyNCDF'.
 \code{\link{NcOpener}} \code{\link{NcDataReader}}
 \code{\link{NcCloser}} \code{\link{NcVarReader}}
 }
-
diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd
index e46384c..30885fc 100644
--- a/man/NcOpener.Rd
+++ b/man/NcOpener.Rd
@@ -34,4 +34,3 @@ NcCloser(connection)
 \code{\link{NcDimReader}} \code{\link{NcDataReader}}
 \code{\link{NcCloser}} \code{\link{NcVarReader}}
 }
-
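These helpers form the pluggable reader interface that Start() uses for NetCDF. A hedged round-trip sketch; the file path is hypothetical and 'synonims' is passed empty here:

    library(startR)

    connection <- NcOpener('/path/to/file.nc')   # returns NULL on failure
    if (!is.null(connection)) {
      dims <- NcDimReader(file_object = connection, synonims = list())
      print(dims)                                # named vector: dimension sizes
      NcCloser(connection)
    }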
diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd
index c601907..fb093ae 100644
--- a/man/NcVarReader.Rd
+++ b/man/NcVarReader.Rd
@@ -4,8 +4,13 @@
 \alias{NcVarReader}
 \title{NetCDF variable reader for 'startR'}
 \usage{
-NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL,
-  var_name = NULL, synonims)
+NcVarReader(
+  file_path = NULL,
+  file_object = NULL,
+  file_selectors = NULL,
+  var_name = NULL,
+  synonims
+)
 }
 \arguments{
 \item{file_path}{A character string indicating the path to the data file to
@@ -58,4 +63,3 @@ ncvar_get() in the package 'ncdf4'.
 \code{\link{NcOpener}} \code{\link{NcDataReader}}
 \code{\link{NcCloser}} \code{\link{NcDimReader}}
 }
-
diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd
index ef83575..e1cf112 100644
--- a/man/SelectorChecker.Rd
+++ b/man/SelectorChecker.Rd
@@ -4,8 +4,7 @@
 \alias{SelectorChecker}
 \title{Translate a set of selectors into a set of numeric indices}
 \usage{
-SelectorChecker(selectors, var = NULL, return_indices = TRUE,
-  tolerance = NULL)
+SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL)
 }
 \arguments{
 \item{selectors}{A vector or a list of two of numeric indices or variable
@@ -50,4 +49,3 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257]
 SelectorChecker(sub_array_of_selectors, sub_array_of_values)

 }
-
diff --git a/man/Sort.Rd b/man/Sort.Rd
index 9ab516e..25a92fe 100644
--- a/man/Sort.Rd
+++ b/man/Sort.Rd
@@ -1,8 +1,8 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/Sort.R
 \name{Sort}
-\alias{CircularSort}
 \alias{Sort}
+\alias{CircularSort}
 \title{Sort the coordinate variable values in a Start() call}
 \usage{
 Sort(...)
@@ -10,12 +10,12 @@ Sort(...)
 CircularSort(start, end, ...)
 }
 \arguments{
+\item{\dots}{Additional parameters to adjust the reordering. See function
+sort() for more details.}
+
 \item{start}{A numeric indicating the lower bound of the circular range.}

 \item{end}{A numeric indicating the upper bound of the circular range.}
-
-\item{\dots}{Additional parameters to adjust the reordering. See function
-sort() for more details.}
 }
 \value{
 A list of 2 containing:
@@ -57,4 +57,3 @@ range. This is useful for circular coordinates such as the Earth longitudes.
              retrieve = FALSE)

 }
-
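Sort() and CircularSort() are factories: they return the reordering function that Start() then applies to a coordinate variable. A small sketch of the circular case:

    library(startR)

    # Build a reordering function that wraps longitudes into [0, 360).
    circular <- CircularSort(0, 360)
    circular(c(-10, 5, 350))
    # Returns a list: $x with the values remapped and sorted, and $ix with
    # the permutation indices, as described in the \value section above.

    # Typical use inside a Start() call (illustrative):
    #   longitude_reorder = CircularSort(0, 360)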
diff --git a/man/Start.Rd b/man/Start.Rd
index c41c961..5598210 100644
--- a/man/Start.Rd
+++ b/man/Start.Rd
@@ -4,656 +4,677 @@
 \alias{Start}
 \title{Declare, discover, subset and retrieve multidimensional distributed data sets}
 \usage{
-Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener,
-  file_var_reader = NcVarReader, file_dim_reader = NcDimReader,
-  file_data_reader = NcDataReader, file_closer = NcCloser,
-  transform = NULL, transform_params = NULL, transform_vars = NULL,
-  transform_extra_cells = 2, apply_indices_after_transform = FALSE,
-  pattern_dims = NULL, metadata_dims = NULL,
-  selector_checker = SelectorChecker, merge_across_dims = FALSE,
-  merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE,
-  path_glob_permissive = FALSE, retrieve = FALSE, num_procs = 1,
-  silent = FALSE, debug = FALSE)
+Start(
+  ...,
+  return_vars = NULL,
+  synonims = NULL,
+  file_opener = NcOpener,
+  file_var_reader = NcVarReader,
+  file_dim_reader = NcDimReader,
+  file_data_reader = NcDataReader,
+  file_closer = NcCloser,
+  transform = NULL,
+  transform_params = NULL,
+  transform_vars = NULL,
+  transform_extra_cells = 2,
+  apply_indices_after_transform = FALSE,
+  pattern_dims = NULL,
+  metadata_dims = NULL,
+  selector_checker = SelectorChecker,
+  merge_across_dims = FALSE,
+  merge_across_dims_narm = FALSE,
+  split_multiselected_dims = FALSE,
+  path_glob_permissive = FALSE,
+  retrieve = FALSE,
+  num_procs = 1,
+  ObjectBigmemory = NULL,
+  silent = FALSE,
+  debug = FALSE
+)
 }
 \arguments{
-\item{return_vars}{A named list where the names are the names of the
-variables to be fetched in the files, and the values are vectors of
-character strings with the names of the file dimensions for which to retrieve each
-variable, or NULL if the variable has to be retrieved only once
-from any (the first) of the involved files.\cr\cr
-Apart from retrieving a multidimensional data array, retrieving auxiliary
-variables inside the files can also be needed. The parameter
-'return_vars' allows for requesting such variables, as long as a
-'file_var_reader' function is also specified in the call to
-Start() (see documentation on the corresponding parameter).
-\cr\cr
-In the case of the item sales example (see documentation on parameter
-\code{\dots}), the store location variable is requested with the parameter\cr
-\code{return_vars = list(store_location = NULL)}.\cr This will cause
-Start() to fetch once the variable 'store_location' and return it in
-the component\cr \code{$Variables$common$store_location},\cr and will be an
-array of character strings with the location names, with the dimensions
-\code{c('store' = 100)}. Although useless in this example, we could ask
-Start() to fetch and return such variable for each file along the
-items dimension as follows: \cr
-\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
-variable will be fetched once from a file of each of the items, and will be
-returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
-\cr\cr
-If a variable is requested along a file dimension that contains path pattern
-specifications ('source' in the example), the fetched variable values will be
-returned in the component\cr \code{$Variables$<dataset_name>$}.\cr
-For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr #   list(name = 'sourceA',
-\cr #        path = paste0('/sourceA/$variable$/',
-\cr #                      '$section$/$item$.data')),
-\cr #   list(name = 'sourceB',
-\cr #        path = paste0('/sourceB/$section$/',
-\cr #                      '$variable$/$item$.data'))
-\cr #   ),
-\cr #   variable = 'sales',
-\cr #   section = 'first',
-\cr #   item = indices(c(1, 3)),
-\cr #   item_depends = 'section',
-\cr #   store = 'Barcelona',
-\cr #   store_var = 'store_location',
-\cr #   month = 'all',
-\cr #   return_vars = list(store_location = c('source',
-\cr #                                         'item')))
-\cr # # Checking the structure of the returned variables
-\cr # str(found_data$Variables)
-\cr # Named list
-\cr # ..$common: NULL
-\cr # ..$sourceA: Named list
-\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
-\cr # ..$sourceB: Named list
-\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
-\cr # # Checking the dimensions of the returned variable
-\cr # # for the source A
-\cr # dim(found_data$Variables$sourceA)
-\cr # item store
-\cr #    3     3
+\item{\dots}{A selection of customized parameters depending on the data
+format. When we retrieve data from one or a collection of data sets,
+the involved data can be perceived as belonging to a large multi-dimensional
+array. For instance, let us consider an example case. We want to retrieve data
+from a source, which contains data for the number of monthly sales of various
+items, and also for their retail price each month. The data on source is
+stored as follows:\cr\cr
+\command{
+\cr # /data/
+\cr #  |-> sales/
+\cr #  |    |-> electronics
+\cr #  |    |    |-> item_a.data
+\cr #  |    |    |-> item_b.data
+\cr #  |    |    |-> item_c.data
+\cr #  |    |-> clothing
+\cr #  |         |-> item_d.data
+\cr #  |         |-> item_e.data
+\cr #  |         |-> item_f.data
+\cr #  |-> prices/
+\cr #       |-> electronics
+\cr #       |    |-> item_a.data
+\cr #       |    |-> item_b.data
+\cr #       |    |-> item_c.data
+\cr #       |-> clothing
+\cr #            |-> item_d.data
+\cr #            |-> item_e.data
+\cr #            |-> item_f.data
+}\cr\cr
+Each item file contains data, stored in whichever format, for the sales or
+prices over a time period, e.g. for the past 24 months, registered at 100
+different stores over the world. Whichever the format it is stored in, each
+file can be perceived as a container of a data array of 2 dimensions, time and
+store. Let us assume the '.data' format allows to keep a name for each of
+these dimensions, and the actual names are 'time' and 'store'.\cr\cr
+The different item files for sales or prices can be perceived as belonging to
+an 'item' dimension of length 3, and the two groups of three items to a
+'section' dimension of length 2, and the two groups of two sections (one with
+the sales and the other with the prices) can be perceived as belonging also to
+another dimension 'variable' of length 2. Even the source can be perceived as
+belonging to a dimension 'source' of length 1.\cr\cr
+All in all, in this example, the whole data could be perceived as belonging to
+a multidimensional 'large array' of dimensions\cr
+\command{
+\cr # source variable section item store month
+\cr #      1        2       2    3   100    24
+}
+\cr\cr
+The dimensions of this 'large array' can be classified in two types. The ones
+that group actual files (the file dimensions) and the ones that group data
+values inside the files (the inner dimensions). In the example, the file
+dimensions are 'source', 'variable', 'section' and 'item', whereas the inner
+dimensions are 'store' and 'month'.
+\cr\cr
+Having the dimensions of our target sources in mind, the parameter \code{\dots}
+expects to receive information on:
+  \itemize{
+    \item{
+The names of the expected dimensions of the 'large dataset' we want to
+retrieve data from
+    }
+    \item{
+The indices to take from each dimension (and other constraints)
+    }
+    \item{
+How to reorder the dimension if needed
+    }
+    \item{
+The location and organization of the files of the data sets
+    }
+  }
+For each dimension, the first 3 information items can be specified with a set
+of parameters to be provided through \code{\dots}. For a given dimension
+'dimname', six parameters can be specified:\cr
+\command{
+\cr # dimname = <indices to take>,  # 'all' / 'first' / 'last' /
+\cr #                               # indices(c(1, 10, 20)) /
+\cr #                               # indices(c(1:20)) /
+\cr #                               # indices(list(1, 20)) /
+\cr #                               # c(1, 10, 20) / c(1:20) /
+\cr #                               # list(1, 20)
+\cr # dimname_var = <name of associated coordinate variable>,
+\cr # dimname_tolerance = <tolerance value>,
+\cr # dimname_reorder = <reorder function>,
+\cr # dimname_depends = <name of another file dimension>,
+\cr # dimname_across = <name of another inner dimension>
+}
+The \bold{indices to take} can be specified in three possible formats (see
+code comments above for examples). The first format consists in using
+character tags, such as 'all' (take all the indices available for that
+dimension), 'first' (take only the first) and 'last' (only the last). The
+second format consists in using numeric indices, which have to be wrapped in a
+call to the indices() helper function. For the second format, either a
+vector of numeric indices can be provided, or a list with two numeric indices
+can be provided to take all the indices in the range between the two specified
+indices (both extremes inclusive). The third format consists in providing a
+vector of character strings (for file dimensions) or of values of whichever type
+(for inner dimensions). For the file dimensions, the provided character
+strings in the third format will be used as components to build up the final
+path to the files (read further). For inner dimensions, the provided values in
+the third format will be compared to the values of an associated coordinate
+variable (must be specified in '<dimname>_reorder', read further), and the
+indices of the closest values will be retrieved. When using the third format,
+a list with two values can also be provided to take all the indices of the
+values within the specified range.
+\cr\cr
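A compact R illustration of the three selector formats just described; the dimension names and values are hypothetical:

    # First format: character tags.
    #   month = 'all'                 # or 'first' / 'last'
    # Second format: numeric indices, wrapped in indices().
    #   month = indices(c(1, 6, 12))  # scattered positions
    #   month = indices(list(1, 12))  # every position from 1 to 12
    # Third format: values, matched against a coordinate variable (or, for
    # file dimensions, used as path components).
    #   variable = 'sales'
    #   month    = list(1, 12)        # all values falling within this range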
+The \bold{name of the associated coordinate variable} must be a character
+string with the name of an associated coordinate variable to be found in the
+data files (in all* of them). For this to work, a 'file_var_reader'
+function must be specified when calling Start() (see parameter
+'file_var_reader'). The coordinate variable must also be requested in the
+parameter 'return_vars' (see its section for details). This feature only
+works for inner dimensions.
+\cr\cr
+The \bold{tolerance value} is useful when indices for an inner dimension are
+specified in the third format (values of whichever type). In that case, the
+indices of the closest values in the coordinate variable are sought. However
+the closest value might be too distant and we would want to consider no real
+match exists for such provided value. This is possible via the tolerance,
+which allows to specify a threshold beyond which not to seek for matching
+values and mark that index as missing value.
+\cr\cr
+The \bold{reorder_function} is useful when indices for an inner dimension are
+specified in the third format, and the retrieved indices need to be reordered
+in function of their provided associated variable values. A function can be
+provided, which receives as input a vector of values, and returns as outputs a
+list with the components \code{$x} with the reordered values, and \code{$ix}
+with the permutation indices. Two reordering functions are included in
+startR, the Sort() and the CircularSort().
+\cr\cr
+The \bold{name of another dimension} to be specified in <dimname>_depends,
+only available for file dimensions, must be a character string with the name
+of another requested \bold{file dimension} in \code{\dots}, and will make
+Start() aware that the path components of a file dimension can vary in
+function of the path component of another file dimension. For instance, in the
+example above, specifying \code{item_depends = 'section'} will make
+Start() aware that the item names vary in function of the section, i.e.
+section 'electronics' has items 'a', 'b' and 'c' but section 'clothing' has
+items 'd', 'e', 'f'. Otherwise Start() would expect to find the same
+item names in all the sections.
+\cr\cr
+The \bold{name of another dimension} to be specified in '<dimname>_across',
+only available for inner dimensions, must be a character string with the name
+of another requested \bold{inner dimension} in \code{\dots}, and will make
+Start() aware that an inner dimension extends along multiple files. For
+instance, let us imagine that in the example above, the records for each item
+are so large that it becomes necessary to split them in multiple files each
+one containing the registers for a different period of time, e.g. in 10 files
+with 100 months each ('item_a_period1.data', 'item_a_period2.data', and so on).
+In that case, the data can be perceived as having an extra file dimension, the
+'period' dimension. The inner dimension 'month' would extend across multiple
+files, and providing the parameter \code{month = indices(1, 300)} would make
+Start() crash because it would perceive we have made a request out of
+bounds (each file contains 100 'month' indices, but we requested 1 to 300).
+This can be solved by specifying the parameter \code{month_across = period}
+(along with the full specification of the dimension 'period').
+\cr\cr
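A sketch of that situation in call form, with the records split into period files; the path and dimension names are invented for illustration:

    # Hypothetical call: 'month' spans 10 files of 100 months each.
    # data <- Start(source = paste0('/data/sales/electronics/',
    #                               'item_a_$period$.data'),
    #               period = 'all',
    #               month = indices(list(1, 300)),
    #               month_across = 'period',
    #               merge_across_dims = TRUE,
    #               store = 'all',
    #               retrieve = FALSE)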
+\bold{Defining the path pattern}
+\cr
+As mentioned above, the parameter \dots also expects to receive information
+with the location of the data files. In order to do this, a special dimension
+must be defined. In that special dimension, in place of specifying indices to
+take, a path pattern must be provided. The path pattern is a character string
+that encodes the way the files are organized in their source. It must be a
+path to one of the data set files in an accessible local or remote file system,
+or a URL to one of the files provided by a local or remote server. The regions
+of this path that vary across files (along the file dimensions) must be
+replaced by wildcards. The wildcards must match any of the defined file
+dimensions in the call to Start() and must be delimited with heading
+and trailing '$'. Shell globbing expressions can be used in the path pattern.
+See the next code snippet for an example of a path pattern.
+\cr\cr
+All in all, the call to Start() to load the entire data set in the
+example of store item sales, would look as follows:
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr #                               '$section$/$item$.data'),
+\cr #               variable = 'all',
+\cr #               section = 'all',
+\cr #               item = 'all',
+\cr #               item_depends = 'section',
+\cr #               store = 'all',
+\cr #               month = 'all')
+}
+\cr\cr
+Note that in this example it would still be pending to properly define the
+parameters 'file_opener', 'file_closer', 'file_dim_reader',
+'file_var_reader' and 'file_data_reader' for the '.data' file format
+(see the corresponding sections).
+\cr\cr
+The call to Start() will return a multidimensional R array with the
+following dimensions:
+\cr
+\command{
+\cr # source variable section item store month
+\cr #      1        2       2    3   100    24
+}
+\cr
+The dimension specifications in the \code{\dots} do not have to follow any
+particular order. The returned array will have the dimensions in the same order
+as they have been specified in the call.
-\item{file_var_reader}{A function with the header \code{file_path = NULL},
-  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
-  \code{synonims} that returns an array with auxiliary data (i.e. data from a
-  variable) inside a file. Start() will provide automatically either a
-  'file_path' or a 'file_object' to the 'file_var_reader'
-  function (the function has to be ready to work whichever of these two is
-  provided). The parameter 'file_selectors' will also be provided
-  automatically to the variable reader, containing a named list where the
-  names are the names of the file dimensions of the queried data set (see
-  documentation on \code{\dots}) and the values are single character strings
-  with the components used to build the path to the file being read (the one
-  provided in 'file_path' or 'file_object'). The parameter 'var_name'
-  will be filled in automatically by Start() also, with the name of one
-  of the variables to be read. The parameter 'synonims' will be filled in
-  with exactly the same value as provided in the parameter 'synonims' in
-  the call to Start(), and has to be used in the code of the variable
-  reader to check for alternative variable names inside the target file. The
-  'file_var_reader' must return a (multi)dimensional array with named
-  dimensions, and optionally with the attribute 'variables' with other
-  additional metadata on the retrieved variable.
-\cr\cr
-Usually, the 'file_var_reader' should be a degenerate case of the
-'file_data_reader' (see documentation on the corresponding parameter),
-so it is recommended to code the 'file_data_reader' in the first place.
-\cr\cr
-This parameter takes by default NcVarReader() (a variable reader function
-for NetCDF files).
-\cr\cr
-See NcVarReader() for a template to build a variable reader for your own
-file format.}
-\item{file_dim_reader}{A function with the header \code{file_path = NULL},
-  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
-  that returns a named numeric vector where the names are the names of the
-  dimensions of the multidimensional data array in the file and the values are
-  the sizes of such dimensions. Start() will provide automatically
-  either a 'file_path' or a 'file_object' to the
-  'file_dim_reader' function (the function has to be ready to work
-  whichever of these two is provided). The parameter 'file_selectors'
-  will also be provided automatically to the dimension reader, containing a
-  named list where the names are the names of the file dimensions of the
-  queried data set (see documentation on \code{\dots}) and the values are
-  single character strings with the components used to build the path to the
-  file being read (the one provided in 'file_path' or 'file_object').
-  The parameter 'synonims' will be filled in with exactly the same value
-  as provided in the parameter 'synonims' in the call to Start(),
-  and can optionally be used in advanced configurations.
-\cr\cr
-This parameter takes by default NcDimReader() (a dimension reader
-function for NetCDF files).
-\cr\cr
-See NcDimReader() for (an advanced) template to build a dimension reader
-for your own file format.}
-\item{file_data_reader}{A function with the header \code{file_path = NULL},
-  \code{file_object = NULL}, \code{file_selectors = NULL},
-  \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
-  multidimensional data array inside a file (even if internally it is not an
-  array). Start() will provide automatically either a 'file_path'
-  or a 'file_object' to the 'file_data_reader' function (the
-  function has to be ready to work whichever of these two is provided). The
-  parameter 'file_selectors' will also be provided automatically to the
-  data reader, containing a named list where the names are the names of the
-  file dimensions of the queried data set (see documentation on \code{\dots})
-  and the values are single character strings with the components used to
-  build the path to the file being read (the one provided in 'file_path' or
-  'file_object'). The parameter 'inner_indices' will be filled in
-  automatically by Start() also, with a named list of numeric vectors,
-  where the names are the names of all the expected inner dimensions in a file
-  to be read, and the numeric vectors are the indices to be taken from the
-  corresponding dimension (the indices may not be consecutive nor in order).
-  The parameter 'synonims' will be filled in with exactly the same value
-  as provided in the parameter 'synonims' in the call to Start(),
-  and has to be used in the code of the data reader to check for alternative
-  dimension names inside the target file. The 'file_data_reader' must
-  return a (multi)dimensional array with named dimensions, and optionally with
-  the attribute 'variables' with other additional metadata on the retrieved
-  data.
-\cr\cr
-Usually, 'file_data_reader' should use 'file_dim_reader'
-(see documentation on the corresponding parameter), so it is recommended to
-code 'file_dim_reader' in the first place.
-\cr\cr
-This parameter takes by default NcDataReader() (a data reader function
-for NetCDF files).
-\cr\cr
-See NcDataReader() for a template to build a data reader for your own
-file format.}
-\item{file_closer}{A function that receives as a single parameter
-  'file_object' an open connection (as returned by 'file_opener')
-  to one of the files to be read, optionally with header information, and
-  closes the open connection. Always returns NULL.
-\cr\cr
-This parameter takes by default NcCloser() (a closer function for NetCDF
-files).
-\cr\cr
-See NcCloser() for a template to build a file closer for your own file
-format.}
-\item{transform}{A function with the header \code{data_array},
-\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
-input, through the parameter \code{data_array}, a subset of a
-multidimensional array (as returned by 'file_data_reader'), applies a
-transformation to it and returns it, preserving the amount of dimensions but
-potentially modifying their size. This transformation may require data from
-other auxiliary variables, automatically provided to 'transform'
-through the parameter 'variables', in the form of a named list where
-the names are the variable names and the values are (multi)dimensional
-arrays. Which variables need to be sent to 'transform' can be specified
-with the parameter 'transform_vars' in Start(). The parameter
-'file_selectors' will also be provided automatically to
-'transform', containing a named list where the names are the names of
-the file dimensions of the queried data set (see documentation on
-\code{\dots}) and the values are single character strings with the
-components used to build the path to the file the subset being processed
-belongs to. The parameter \code{\dots} will be filled in with other
-additional parameters to adjust the transformation, exactly as provided in
-the call to Start() via the parameter 'transform_params'.}
-\item{transform_params}{A named list with additional parameters to be sent to
-the 'transform' function (if specified). See documentation on parameter
-'transform' for details.}
-\item{transform_vars}{A vector of character strings with the names of
-auxiliary variables to be sent to the 'transform' function (if
-specified). All the variables to be sent to 'transform' must also
-have been requested as return variables in the parameter 'return_vars'
-of Start().}
-\item{transform_extra_cells}{An integer of extra indices to retrieve from the
-data set, beyond the requested indices in \code{\dots}, in order for
-'transform' to dispose of additional information to properly apply
-whichever transformation (if needed). As many as
-'transform_extra_cells' will be retrieved beyond each of the limits for
-each of those inner dimensions associated to a coordinate variable and sent
-to 'transform' (i.e. present in 'transform_vars'). After
-'transform' has finished, Start() will take again and return a
-subset of the result, for the returned data to fall within the specified
-bounds in \code{\dots}. The default value is 2.}
-\item{apply_indices_after_transform}{A logical value indicating when a
-'transform' is specified in Start() and numeric indices are
-provided for any of the inner dimensions that depend on coordinate variables,
-these numeric indices can be made effective (retrieved) before applying the
-transformation or after. The boolean flag allows to adjust this behaviour.
+For example:
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr #                               '$section$/$item$.data'),
+\cr #               month = 'all',
+\cr #               store = 'all',
+\cr #               item = 'all',
+\cr #               item_depends = 'section',
+\cr #               section = 'all',
+\cr #               variable = 'all')
+}
+\cr\cr
+would return an array with the following dimensions:
+\cr
+\command{
+\cr # source month store item section variable
+\cr #      1    24   100    3       2        2
+}
+\cr\cr
+Next, a more advanced example to retrieve data for only the sales records, for
+the first section ('electronics'), for the 1st and 3rd items and for the
+stores located in Barcelona (assuming the files contain the variable
+'store_location' with the name of the city each of the 100 stores are located
+at):
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr #                               '$section$/$item$.data'),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = NULL))
+}
+\cr\cr
+The defined names for the dimensions do not necessarily have to match the
+names of the dimensions inside the file. Lists of alternative names to be
+sought can be defined in the parameter 'synonims'.
+\cr\cr
+If data from multiple sources (not necessarily following the same structure)
+has to be retrieved, it can be done by providing a vector of character strings
+with path pattern specifications, or, in the extended form, by providing a
+list of lists with the components 'name' and 'path', and the name of the
+dataset and path pattern as values, respectively.
-It takes FALSE by default (numeric indices are applied before sending
-data to 'transform').}
-\item{pattern_dims}{A character string indicating the name of the dimension
-with path pattern specifications (see \code{\dots} for details). If not
-specified, Start() assumes the first provided dimension is the pattern
-dimension, with a warning.}
-\item{metadata_dims}{A vector of character strings with the names of the file
-dimensions which to return metadata for. As noted in 'file_data_reader',
-the data reader can optionally return auxiliary data via the attribute
-'variables' of the returned array. Start() by default returns the
-auxiliary data read for only the first file of each source (or data set) in
-the pattern dimension (see \code{\dots} for info on what the pattern
-dimension is). However it can be configured to return the metadata for all
-the files along any set of file dimensions. The default value is NULL, and
-it will be assigned automatically as parameter 'pattern_dims'.}
-\item{selector_checker}{A function used internally by Start() to
-translate a set of selectors (values for a dimension associated to a
-coordinate variable) into a set of numeric indices. It takes by default
-SelectorChecker() and, in principle, it should not be required to
-change it for customized file formats. The option to replace it is left open
-for more versatility. See the code of SelectorChecker() for details on
-the inputs, functioning and outputs of a selector checker.}
-\item{merge_across_dims}{A logical value indicating whether to merge
-dimensions across which another dimension extends (according to the
-'<dimname>_across' parameters). Takes the value FALSE by default. For
-example, if the dimension 'time' extends across the dimension 'chunk' and
-\code{merge_across_dims = TRUE}, the resulting data array will contain
-only the dimension 'time' as long as all the chunks together.}
-\item{merge_across_dims_narm}{A logical value indicating whether to remove
-the additional NAs from data when parameter 'merge_across_dims' is TRUE.
-It is helpful when the length of the to-be-merged dimension is different
-across another dimension. For example, if the dimension 'time' extends
-across dimension 'chunk', and the time length along the first chunk is 2
-while along the second chunk is 10. Setting this parameter as TRUE can
-remove the additional 8 NAs at position 3 to 10. The default value is FALSE.}
-\item{split_multiselected_dims}{A logical value indicating whether to split a
-dimension that has been selected with a multidimensional array of selectors
-into as many dimensions as present in the selector array. The default value
-is FALSE.}
-\item{path_glob_permissive}{A logical value or an integer specifying how many
-  folder levels in the path pattern, beginning from the end, the shell glob
-  expressions must be preserved and worked out for each file. The default
-  value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
-When specifying a path pattern for a dataset, it might contain shell glob
-expressions. For each dataset, the first file matching the path pattern is
-found, and the found file is used to work out fixed values for the glob
-expressions that will be used for all the files of the dataset. However, in
-some cases, the values of the shell glob expressions may not be constant for
-all files in a dataset, and they need to be worked out for each file
-involved.\cr\cr
-For example, a path pattern could be as follows: \cr
-\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
-\code{path_glob_permissive = FALSE} will trigger automatic seek of the
-  contents to replace the asterisks (e.g. the first asterisk matches with
-  \code{'bar'} and the second with \code{'baz'}). The found contents will be
-  used for all files in the dataset (in the example, the path pattern will be
-  fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
-  any of the files in the dataset have other contents in the position of the
-  asterisks, Start() will not find them (in the example, a file like \cr
-  \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
-  found). Setting \code{path_glob_permissive = 1} would preserve glob
-  expressions in the latest level (in the example, the fixed path pattern
-  would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
-  problematic file mentioned before would be found), but of course this would
-  slow down the Start() call if the dataset involves a large number of
-  files. Setting \code{path_glob_permissive = 2} would leave the original path
-  pattern with the original glob expressions in the 1st and 2nd levels (in the
-  example, both asterisks would be preserved, thus would allow Start()
-  to recognize files such as \cr
-  \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
-Note that each glob expression can only represent one possibility (Start()
-chooses the first), because \code{*} is not a tag and thus cannot
-be a dimension of the output array. Therefore, only one possibility can be
-adopted. For example, if \cr
-\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
-has two matches:\cr
-\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
-\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
-only the first found file will be used.}
-\item{retrieve}{A logical value indicating whether to retrieve the data
-defined in the Start() call or to explore only its dimension lengths
-and names, and the values for the file and inner dimensions. The default
-value is FALSE.}
-\item{num_procs}{An integer of number of processes to be created for the
-parallel execution of the retrieval/transformation/arrangement of the
-multiple involved files in a call to Start(). If set to NULL,
-takes the number of available cores (as detected by detectCores() in
-the package 'future'). The default value is 1 (no parallel execution).}
-\item{silent}{A logical value of whether to display progress messages (FALSE)
-or not (TRUE). The default value is FALSE.}
-\item{debug}{A logical value of whether to return detailed messages on the
-progress and operations in a Start() call (TRUE) or not (FALSE). The
-default value is FALSE.}
-\item{\dots}{A selection of customized parameters depending on the data
-format. When we retrieve data from one or a collection of data sets,
-the involved data can be perceived as belonging to a large multi-dimensional
-array. For instance, let us consider an example case. We want to retrieve data
-from a source, which contains data for the number of monthly sales of various
-items, and also for their retail price each month. The data on source is
-stored as follows:\cr\cr
-\command{
-\cr # /data/
-\cr #  |-> sales/
-\cr #  |    |-> electronics
-\cr #  |    |    |-> item_a.data
-\cr #  |    |    |-> item_b.data
-\cr #  |    |    |-> item_c.data
-\cr #  |    |-> clothing
-\cr #  |         |-> item_d.data
-\cr #  |         |-> item_e.data
-\cr #  |         |-> item_f.data
-\cr #  |-> prices/
-\cr #       |-> electronics
-\cr #       |    |-> item_a.data
-\cr #       |    |-> item_b.data
-\cr #       |    |-> item_c.data
-\cr #       |-> clothing
-\cr #            |-> item_d.data
-\cr #            |-> item_e.data
-\cr #            |-> item_f.data
-}\cr\cr
-Each item file contains data, stored in whichever format, for the sales or
-prices over a time period, e.g. for the past 24 months, registered at 100
-different stores over the world. Whichever the format it is stored in, each
-file can be perceived as a container of a data array of 2 dimensions, time and
-store. Let us assume the '.data' format allows to keep a name for each of
-these dimensions, and the actual names are 'time' and 'store'.\cr\cr
-The different item files for sales or prices can be perceived as belonging to
-an 'item' dimension of length 3, and the two groups of three items to a
-'section' dimension of length 2, and the two groups of two sections (one with
-the sales and the other with the prices) can be perceived as belonging also to
-another dimension 'variable' of length 2. Even the source can be perceived as
-belonging to a dimension 'source' of length 1.\cr\cr
-All in all, in this example, the whole data could be perceived as belonging to
-a multidimensional 'large array' of dimensions\cr
-\command{
-\cr # source variable section item store month
-\cr #      1        2       2    3   100    24
-}
-\cr\cr
-The dimensions of this 'large array' can be classified in two types.
+For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #   list(name = 'sourceA',
+\cr #        path = paste0('/sourceA/$variable$/',
+\cr #                      '$section$/$item$.data')),
+\cr #   list(name = 'sourceB',
+\cr #        path = paste0('/sourceB/$section$/',
+\cr #                      '$variable$/$item$.data'))
+\cr #   ),
+\cr #   variable = 'sales',
+\cr #   section = 'first',
+\cr #   item = indices(c(1, 3)),
+\cr #   item_depends = 'section',
+\cr #   store = 'Barcelona',
+\cr #   store_var = 'store_location',
+\cr #   month = 'all',
+\cr #   return_vars = list(store_location = NULL))
+}
+\cr}
+\item{return_vars}{A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimensions for which to retrieve each
+variable, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.\cr\cr
+Apart from retrieving a multidimensional data array, retrieving auxiliary
+variables inside the files can also be needed. The parameter
+'return_vars' allows for requesting such variables, as long as a
+'file_var_reader' function is also specified in the call to
+Start() (see documentation on the corresponding parameter).
+\cr\cr
+In the case of the item sales example (see documentation on parameter
+\code{\dots}), the store location variable is requested with the parameter\cr
+\code{return_vars = list(store_location = NULL)}.\cr This will cause
+Start() to fetch once the variable 'store_location' and return it in
+the component\cr \code{$Variables$common$store_location},\cr and will be an
+array of character strings with the location names, with the dimensions
+\code{c('store' = 100)}. Although useless in this example, we could ask
+Start() to fetch and return such variable for each file along the
+items dimension as follows: \cr
+\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+variable will be fetched once from a file of each of the items, and will be
+returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+\cr\cr
+If a variable is requested along a file dimension that contains path pattern
+specifications ('source' in the example), the fetched variable values will be
+returned in the component\cr \code{$Variables$<dataset_name>$}.\cr
+For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #   list(name = 'sourceA',
+\cr #        path = paste0('/sourceA/$variable$/',
+\cr #                      '$section$/$item$.data')),
+\cr #   list(name = 'sourceB',
+\cr #        path = paste0('/sourceB/$section$/',
+\cr #                      '$variable$/$item$.data'))
+\cr #   ),
+\cr #   variable = 'sales',
+\cr #   section = 'first',
+\cr #   item = indices(c(1, 3)),
+\cr #   item_depends = 'section',
+\cr #   store = 'Barcelona',
+\cr #   store_var = 'store_location',
+\cr #   month = 'all',
+\cr #   return_vars = list(store_location = c('source',
+\cr #                                         'item')))
+\cr # # Checking the structure of the returned variables
+\cr # str(found_data$Variables)
+\cr # Named list
+\cr # ..$common: NULL
+\cr # ..$sourceA: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # ..$sourceB: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # # Checking the dimensions of the returned variable
+\cr # # for the source A
+\cr # dim(found_data$Variables$sourceA)
+\cr # item store
+\cr #    3     3
+}
+\cr\cr
+The names of the requested variables do not necessarily have to match the
+actual variable names inside the files. A list of alternative names to be
+sought can be specified via the parameter 'synonims'.}
+\item{synonims}{A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter is in order for Start() to
+properly identify the dimensions or variables with different names.
+\cr\cr
+In the example used in parameter 'return_vars', it may be the case that
+the two involved data sources follow slightly different naming conventions.
+For example, source A uses 'sect' as name for the sections dimension, whereas
+source B uses 'section'; source A uses 'store_loc' as variable name for the
+store locations, whereas source B uses 'store_location'. This can be taken
+into account as follows:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #   list(name = 'sourceA',
+\cr #        path = paste0('/sourceA/$variable$/',
+\cr #                      '$section$/$item$.data')),
+\cr #   list(name = 'sourceB',
+\cr #        path = paste0('/sourceB/$section$/',
+\cr #                      '$variable$/$item$.data'))
+\cr #   ),
+\cr #   variable = 'sales',
+\cr #   section = 'first',
+\cr #   item = indices(c(1, 3)),
+\cr #   item_depends = 'section',
+\cr #   store = 'Barcelona',
+\cr #   store_var = 'store_location',
+\cr #   month = 'all',
+\cr #   return_vars = list(store_location = c('source',
+\cr #                                         'item')),
+\cr #   synonims = list(
+\cr #     section = c('sec', 'section'),
+\cr #     store_location = c('store_loc',
+\cr #                        'store_location')
+\cr #   ))
+}
+\cr}
+\item{file_opener}{A function that receives as a single parameter
+  'file_path' a character string with the path to a file to be opened,
+  and returns an object with an open connection to the file (optionally with
+  header information) on success, or returns NULL on failure.
+\cr\cr
+This parameter takes by default NcOpener() (an opener function for NetCDF
+files).
+\cr\cr
+See NcOpener() for a template to build a file opener for your own file
+format.}
-The ones
-that group actual files (the file dimensions) and the ones that group data
-values inside the files (the inner dimensions). In the example, the file
-dimensions are 'source', 'variable', 'section' and 'item', whereas the inner
-dimensions are 'store' and 'month'.
-\cr\cr
-Having the dimensions of our target sources in mind, the parameter \code{\dots}
-expects to receive information on:
-  \itemize{
-    \item{
-The names of the expected dimensions of the 'large dataset' we want to
-retrieve data from
-    }
-    \item{
-The indices to take from each dimension (and other constraints)
-    }
-    \item{
-How to reorder the dimension if needed
-    }
-    \item{
-The location and organization of the files of the data sets
-    }
-  }
-For each dimension, the first 3 information items can be specified with a set
-of parameters to be provided through \code{\dots}. For a given dimension
-'dimname', six parameters can be specified:\cr
-\command{
-\cr # dimname = <indices to take>,  # 'all' / 'first' / 'last' /
-\cr #                               # indices(c(1, 10, 20)) /
-\cr #                               # indices(c(1:20)) /
-\cr #                               # indices(list(1, 20)) /
-\cr #                               # c(1, 10, 20) / c(1:20) /
-\cr #                               # list(1, 20)
-\cr # dimname_var = <name of associated coordinate variable>,
-\cr # dimname_tolerance = <tolerance value>,
-\cr # dimname_reorder = <reorder function>,
-\cr # dimname_depends = <name of another file dimension>,
-\cr # dimname_across = <name of another inner dimension>
-}
-The \bold{indices to take} can be specified in three possible formats (see
-code comments above for examples).
The first format consists in using -character tags, such as 'all' (take all the indices available for that -dimension), 'first' (take only the first) and 'last' (only the last). The -second format consists in using numeric indices, which have to be wrapped in a -call to the indices() helper function. For the second format, either a -vector of numeric indices can be provided, or a list with two numeric indices -can be provided to take all the indices in the range between the two specified -indices (both extremes inclusive). The third format consists in providing a -vector character strings (for file dimensions) or of values of whichever type -(for inner dimensions). For the file dimensions, the provided character -strings in the third format will be used as components to build up the final -path to the files (read further). For inner dimensions, the provided values in -the third format will be compared to the values of an associated coordinate -variable (must be specified in '_reorder', read further), and the -indices of the closest values will be retrieved. When using the third format, -a list with two values can also be provided to take all the indices of the -values within the specified range. -\cr\cr -The \bold{name of the associated coordinate variable} must be a character -string with the name of an associated coordinate variable to be found in the -data files (in all* of them). For this to work, a 'file_var_reader' -function must be specified when calling Start() (see parameter -'file_var_reader'). The coordinate variable must also be requested in the -parameter 'return_vars' (see its section for details). This feature only -works for inner dimensions. -\cr\cr -The \bold{tolerance value} is useful when indices for an inner dimension are -specified in the third format (values of whichever type). In that case, the -indices of the closest values in the coordinate variable are seeked. However -the closest value might be too distant and we would want to consider no real -match exists for such provided value. This is possible via the tolerance, -which allows to specify a threshold beyond which not to seek for matching -values and mark that index as missing value. +This parameter takes by default NcOpener() (an opener function for NetCDF +files). \cr\cr -The \bold{reorder_function} is useful when indices for an inner dimension are -specified in the third fromat, and the retrieved indices need to be reordered -in function of their provided associated variable values. A function can be -provided, which receives as input a vector of values, and returns as outputs a -list with the components \code{$x} with the reordered values, and \code{$ix} -with the permutation indices. Two reordering functions are included in -startR, the Sort() and the CircularSort(). +See NcOpener() for a template to build a file opener for your own file +format.} + +\item{file_var_reader}{A function with the header \code{file_path = NULL}, + \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, + \code{synonims} that returns an array with auxiliary data (i.e. data from a + variable) inside a file. Start() will provide automatically either a + 'file_path' or a 'file_object' to the 'file_var_reader' + function (the function has to be ready to work whichever of these two is + provided). 
The parameter 'file_selectors' will also be provided
+  automatically to the variable reader, containing a named list where the
+  names are the names of the file dimensions of the queried data set (see
+  documentation on \code{\dots}) and the values are single character strings
+  with the components used to build the path to the file being read (the one
+  provided in 'file_path' or 'file_object'). The parameter 'var_name'
+  will be filled in automatically by Start() also, with the name of one
+  of the variables to be read. The parameter 'synonims' will be filled in
+  with exactly the same value as provided in the parameter 'synonims' in
+  the call to Start(), and has to be used in the code of the variable
+  reader to check for alternative variable names inside the target file. The
+  'file_var_reader' must return a (multi)dimensional array with named
+  dimensions, and optionally with the attribute 'variables' with other
+  additional metadata on the retrieved variable.
 \cr\cr
-The \bold{name of another dimension} to be specified in _depends,
-only available for file dimensions, must be a character string with the name
-of another requested \bold{file dimension} in \code{\dots}, and will make
-Start() aware that the path components of a file dimension can vary in
-function of the path component of another file dimension. For instance, in the
-example above, specifying \code{item_depends = 'section'} will make
-Start() aware that the item names vary in function of the section, i.e.
-section 'electronics' has items 'a', 'b' and 'c' but section 'clothing' has
-items 'd', 'e', 'f'. Otherwise Start() would expect to find the same
-item names in all the sections.
+Usually, the 'file_var_reader' should be a degenerate case of the
+'file_data_reader' (see documentation on the corresponding parameter),
+so it is recommended to code the 'file_data_reader' first.
 \cr\cr
-The \bold{name of another dimension} to be specified in '_across',
-only available for inner dimensions, must be a character string with the name
-of another requested \bold{inner dimension} in \code{\dots}, and will make
-Start() aware that an inner dimension extends along multiple files. For
-instance, let us imagine that in the example above, the records for each item
-are so large that it becomes necessary to split them in multiple files each
-one containing the registers for a different period of time, e.g. in 10 files
-with 100 months each ('item_a_period1.data', 'item_a_period2.data', and so on).
-In that case, the data can be perceived as having an extra file dimension, the
-'period' dimension. The inner dimension 'month' would extend across multiple
-files, and providing the parameter \code{month = indices(1, 300)} would make
-Start() crash because it would perceive we have made a request out of
-bounds (each file contains 100 'month' indices, but we requested 1 to 300).
-This can be solved by specifying the parameter \code{month_across = period} (a
-long with the full specification of the dimension 'period').
+This parameter takes by default NcVarReader() (a variable reader function
+for NetCDF files).
 \cr\cr
-\bold{Defining the path pattern}
-\cr
-As mentioned above, the parameter \dots also expects to receive information
-with the location of the data files. In order to do this, a special dimension
-must be defined. In that special dimension, in place of specifying indices to
-take, a path pattern must be provided. The path pattern is a character string
-that encodes the way the files are organized in their source. 
It must be a
-path to one of the data set files in an accessible local or remote file system,
-or a URL to one of the files provided by a local or remote server. The regions
-of this path that vary across files (along the file dimensions) must be
-replaced by wildcards. The wildcards must match any of the defined file
-dimensions in the call to Start() and must be delimited with heading
-and trailing '$'. Shell globbing expressions can be used in the path pattern.
-See the next code snippet for an example of a path pattern.
+See NcVarReader() for a template to build a variable reader for your own
+file format.}
+
+\item{file_dim_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+  that returns a named numeric vector where the names are the names of the
+  dimensions of the multidimensional data array in the file and the values are
+  the sizes of such dimensions. Start() will provide automatically
+  either a 'file_path' or a 'file_object' to the
+  'file_dim_reader' function (the function has to be ready to work
+  whichever of these two is provided). The parameter 'file_selectors'
+  will also be provided automatically to the dimension reader, containing a
+  named list where the names are the names of the file dimensions of the
+  queried data set (see documentation on \code{\dots}) and the values are
+  single character strings with the components used to build the path to the
+  file being read (the one provided in 'file_path' or 'file_object').
+  The parameter 'synonims' will be filled in with exactly the same value
+  as provided in the parameter 'synonims' in the call to Start(),
+  and can optionally be used in advanced configurations.
 \cr\cr
-All in all, the call to Start() to load the entire data set in the
-example of store item sales, would look as follows:
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr #                               '$section$/$item$.data'),
-\cr #               variable = 'all',
-\cr #               section = 'all',
-\cr #               item = 'all',
-\cr #               item_depends = 'section',
-\cr #               store = 'all',
-\cr #               month = 'all')
-}
+This parameter takes by default NcDimReader() (a dimension reader
+function for NetCDF files).
 \cr\cr
-Note that in this example it would still be pending to properly define the
-parameters 'file_opener', 'file_closer', 'file_dim_reader',
-'file_var_reader' and 'file_data_reader' for the '.data' file format
-(see the corresponding sections).
+See NcDimReader() for (an advanced) template to build a dimension reader
+for your own file format.}
+
+\item{file_data_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL},
+  \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
+  multidimensional data array inside a file (even if internally it is not an
+  array). Start() will provide automatically either a 'file_path'
+  or a 'file_object' to the 'file_data_reader' function (the
+  function has to be ready to work whichever of these two is provided). The
+  parameter 'file_selectors' will also be provided automatically to the
+  data reader, containing a named list where the names are the names of the
+  file dimensions of the queried data set (see documentation on \code{\dots})
+  and the values are single character strings with the components used to
+  build the path to the file being read (the one provided in 'file_path' or
+  'file_object'). 
The parameter 'inner_indices' will be filled in
+  automatically by Start() also, with a named list of numeric vectors,
+  where the names are the names of all the expected inner dimensions in a file
+  to be read, and the numeric vectors are the indices to be taken from the
+  corresponding dimension (the indices may not be consecutive nor in order).
+  The parameter 'synonims' will be filled in with exactly the same value
+  as provided in the parameter 'synonims' in the call to Start(),
+  and has to be used in the code of the data reader to check for alternative
+  dimension names inside the target file. The 'file_data_reader' must
+  return a (multi)dimensional array with named dimensions, and optionally with
+  the attribute 'variables' with other additional metadata on the retrieved
+  data.
 \cr\cr
-The call to Start() will return a multidimensional R array with the
-following dimensions:
-\cr
-\command{
-\cr #  source variable  section      item    store    month
-\cr #       1        2        2         3      100       24
-}
-\cr
-The dimension specifications in the \code{\dots} do not have to follow any
-particular order. The returned array will have the dimensions in the same order
-as they have been specified in the call. For example, the following call:
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr #                               '$section$/$item$.data'),
-\cr #               month = 'all',
-\cr #               store = 'all',
-\cr #               item = 'all',
-\cr #               item_depends = 'section',
-\cr #               section = 'all',
-\cr #               variable = 'all')
-}
+Usually, 'file_data_reader' should use 'file_dim_reader'
+(see documentation on the corresponding parameter), so it is recommended to
+code 'file_dim_reader' first.
 \cr\cr
-would return an array with the following dimensions:
-\cr
-\command{
-\cr #  source    month    store     item  section variable
-\cr #       1       24      100        3        2        2
-}
+This parameter takes by default NcDataReader() (a data reader function
+for NetCDF files).
 \cr\cr
-Next, a more advanced example to retrieve data for only the sales records, for
-the first section ('electronics'), for the 1st and 3rd items and for the
-stores located in Barcelona (assuming the files contain the variable
-'store_location' with the name of the city each of the 100 stores are located
-at):
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr #                               '$section$/$item$.data'),
-\cr #               variable = 'sales',
-\cr #               section = 'first',
-\cr #               item = indices(c(1, 3)),
-\cr #               item_depends = 'section',
-\cr #               store = 'Barcelona',
-\cr #               store_var = 'store_location',
-\cr #               month = 'all',
-\cr #               return_vars = list(store_location = NULL))
-}
+See NcDataReader() for a template to build a data reader for your own
+file format.}
+
+\item{file_closer}{A function that receives as a single parameter
+  'file_object' an open connection (as returned by 'file_opener')
+  to one of the files to be read, optionally with header information, and
+  closes the open connection. Always returns NULL.
 \cr\cr
-The defined names for the dimensions do not necessarily have to match the
-names of the dimensions inside the file. Lists of alternative names to be
-seeked can be defined in the parameter 'synonims'.
+This parameter takes by default NcCloser() (a closer function for NetCDF
+files).
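+\cr\cr
+For illustration, a minimal opener/closer pair for a hypothetical plain-text
+'.data' format could be sketched as follows (the function names are
+assumptions; only the contracts described above are prescribed):
+\cr
+\command{
+\cr # my_data_opener <- function(file_path) {
+\cr #   # Return an open connection on success, NULL on failure.
+\cr #   tryCatch(file(file_path, open = 'r'),
+\cr #            error = function(e) NULL)
+\cr # }
+\cr # my_data_closer <- function(file_object) {
+\cr #   # Close the connection; always return NULL.
+\cr #   if (!is.null(file_object)) close(file_object)
+\cr #   NULL
+\cr # }
+\cr # # Usage sketch:
+\cr # # Start(..., file_opener = my_data_opener,
+\cr # #       file_closer = my_data_closer)
+}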
 \cr\cr
-If data from multiple sources (not necessarily following the same structure)
-has to be retrieved, it can be done by providing a vector of character strings
-with path pattern specifications, or, in the extended form, by providing a
-list of lists with the components 'name' and 'path', and the name of the
-dataset and path pattern as values, respectively. For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr #                list(name = 'sourceA',
-\cr #                     path = paste0('/sourceA/$variable$/',
-\cr #                                   '$section$/$item$.data')),
-\cr #                list(name = 'sourceB',
-\cr #                     path = paste0('/sourceB/$section$/',
-\cr #                                   '$variable$/$item$.data'))
-\cr #               ),
-\cr #               variable = 'sales',
-\cr #               section = 'first',
-\cr #               item = indices(c(1, 3)),
-\cr #               item_depends = 'section',
-\cr #               store = 'Barcelona',
-\cr #               store_var = 'store_location',
-\cr #               month = 'all',
-\cr #               return_vars = list(store_location = NULL))
-}
-\cr}
+See NcCloser() for a template to build a file closer for your own file
+format.}
+
+\item{transform}{A function with the header \code{data_array},
+\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+input, through the parameter \code{data_array}, a subset of a
+multidimensional array (as returned by 'file_data_reader'), applies a
+transformation to it and returns it, preserving the amount of dimensions but
+potentially modifying their size. This transformation may require data from
+other auxiliary variables, automatically provided to 'transform'
+through the parameter 'variables', in the form of a named list where
+the names are the variable names and the values are (multi)dimensional
+arrays. Which variables need to be sent to 'transform' can be specified
+with the parameter 'transform_vars' in Start(). The parameter
+'file_selectors' will also be provided automatically to
+'transform', containing a named list where the names are the names of
+the file dimensions of the queried data set (see documentation on
+\code{\dots}) and the values are single character strings with the
+components used to build the path to the file the subset being processed
+belongs to. The parameter \code{\dots} will be filled in with other
+additional parameters to adjust the transformation, exactly as provided in
+the call to Start() via the parameter 'transform_params'.}
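+
+For illustration, a minimal 'transform' function honouring this interface
+could be sketched as follows (the scaling operation, the parameter 'scale'
+and the name my_transform are assumptions for illustration only):
+\cr
+\command{
+\cr # my_transform <- function(data_array, variables,
+\cr #                          file_selectors = NULL, ...) {
+\cr #   params <- list(...)  # filled in from 'transform_params'
+\cr #   scale <- if (is.null(params$scale)) 1 else params$scale
+\cr #   # Return an array with the same number of dimensions.
+\cr #   data_array * scale
+\cr # }
+\cr # # Usage sketch:
+\cr # # Start(..., transform = my_transform,
+\cr # #       transform_params = list(scale = 0.5))
+}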
+
+\item{transform_params}{A named list with additional parameters to be sent to
+the 'transform' function (if specified). See documentation on parameter
+'transform' for details.}
+
+\item{transform_vars}{A vector of character strings with the names of
+auxiliary variables to be sent to the 'transform' function (if
+specified). All the variables to be sent to 'transform' must also
+have been requested as return variables in the parameter 'return_vars'
+of Start().}
+
+\item{transform_extra_cells}{An integer of extra indices to retrieve from the
+data set, beyond the requested indices in \code{\dots}, in order for
+'transform' to dispose of additional information to properly apply
+whichever transformation (if needed). As many extra cells as
+'transform_extra_cells' will be retrieved beyond each of the limits for
+each of those inner dimensions associated to a coordinate variable and sent
+to 'transform' (i.e. present in 'transform_vars'). After
+'transform' has finished, Start() will take again and return a
+subset of the result, for the returned data to fall within the specified
+bounds in \code{\dots}. The default value is 2.}
+
+\item{apply_indices_after_transform}{A logical value indicating, when a
+'transform' is specified in Start() and numeric indices are
+provided for any of the inner dimensions that depend on coordinate variables,
+whether these numeric indices should be made effective (retrieved) before
+applying the transformation or after. It takes FALSE by default (numeric
+indices are applied before sending data to 'transform').}
+
+\item{pattern_dims}{A character string indicating the name of the dimension
+with path pattern specifications (see \code{\dots} for details). If not
+specified, Start() assumes the first provided dimension is the pattern
+dimension, with a warning.}
+
+\item{metadata_dims}{A vector of character strings with the names of the file
+dimensions for which to return metadata. As noted in 'file_data_reader',
+the data reader can optionally return auxiliary data via the attribute
+'variables' of the returned array. Start() by default returns the
+auxiliary data read for only the first file of each source (or data set) in
+the pattern dimension (see \code{\dots} for info on what the pattern
+dimension is). However it can be configured to return the metadata for all
+the files along any set of file dimensions. The default value is NULL, in
+which case it is automatically set to the value of the parameter
+'pattern_dims'.}
+
+\item{selector_checker}{A function used internally by Start() to
+translate a set of selectors (values for a dimension associated to a
+coordinate variable) into a set of numeric indices. It takes by default
+SelectorChecker() and, in principle, it should not be required to
+change it for customized file formats. The option to replace it is left open
+for more versatility. See the code of SelectorChecker() for details on
+the inputs, functioning and outputs of a selector checker.}
+
+\item{merge_across_dims}{A logical value indicating whether to merge
+dimensions across which another dimension extends (according to the
+'_across' parameters). Takes the value FALSE by default. For
+example, if the dimension 'time' extends across the dimension 'chunk' and
+\code{merge_across_dims = TRUE}, the resulting data array will contain
+only the dimension 'time', as long as all the chunks together.}
+
+\item{merge_across_dims_narm}{A logical value indicating whether to remove
+the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+It is helpful when the length of the to-be-merged dimension is different
+across another dimension. For example, if the dimension 'time' extends
+across dimension 'chunk', and the time length along the first chunk is 2
+while along the second chunk it is 10, setting this parameter to TRUE
+removes the additional 8 NAs at positions 3 to 10. The default value is
+FALSE.}
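+
+For illustration, the merging behaviour described above could be requested as
+follows (a sketch; the path pattern and dimension names are assumptions):
+\cr
+\command{
+\cr # data <- Start(dat = '/path/to/$var$_$chunk$.nc',
+\cr #               var = 'tas',
+\cr #               chunk = 'all',
+\cr #               time = 'all',
+\cr #               time_across = 'chunk',
+\cr #               merge_across_dims = TRUE,
+\cr #               merge_across_dims_narm = TRUE,
+\cr #               retrieve = TRUE)
+\cr # # The 'time' segments of all chunks are concatenated into a single
+\cr # # 'time' dimension; NAs padded after shorter chunks are removed.
+}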
+
+\item{split_multiselected_dims}{A logical value indicating whether to split a
+dimension that has been selected with a multidimensional array of selectors
+into as many dimensions as present in the selector array. The default value
+is FALSE.}
+
+\item{path_glob_permissive}{A logical value or an integer specifying how many
+  folder levels in the path pattern, beginning from the end, the shell glob
+  expressions must be preserved and worked out for each file. The default
+  value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
+When specifying a path pattern for a dataset, it might contain shell glob
+expressions. For each dataset, the first file matching the path pattern is
+found, and the found file is used to work out fixed values for the glob
+expressions that will be used for all the files of the dataset. However, in
+some cases, the values of the shell glob expressions may not be constant for
+all files in a dataset, and they need to be worked out for each file
+involved.\cr\cr
+For example, a path pattern could be as follows: \cr
+\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+\code{path_glob_permissive = FALSE} will trigger automatic seek of the
+  contents to replace the asterisks (e.g. the first asterisk matches with
+  \code{'bar'} and the second with \code{'baz'}). The found contents will be
+  used for all files in the dataset (in the example, the path pattern will be
+  fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However,
+  if any of the files in the dataset have other contents in the position of
+  the asterisks, Start() will not find them (in the example, a file like \cr
+  \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+  found). Setting \code{path_glob_permissive = 1} would preserve glob
+  expressions in the last level (in the example, the fixed path pattern
+  would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+  problematic file mentioned before would be found), but of course this would
+  slow down the Start() call if the dataset involves a large number of
+  files. Setting \code{path_glob_permissive = 2} would leave the original path
+  pattern with the original glob expressions in the 1st and 2nd levels (in the
+  example, both asterisks would be preserved, thus would allow Start()
+  to recognize files such as \cr
+  \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+Note that each glob expression can only represent one possibility (Start()
+chooses the first): since \code{*} is not a wildcard tag delimited with '$',
+it cannot become a dimension of the output array, so only one match can be
+adopted. For example, if \cr
+\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+has two matches:\cr
+\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+only the first found file will be used.}
+
+\item{retrieve}{A logical value indicating whether to retrieve the data
+defined in the Start() call or to explore only its dimension lengths
+and names, and the values for the file and inner dimensions. The default
+value is FALSE.}
+
+\item{num_procs}{An integer indicating the number of processes to be created
+for the parallel execution of the retrieval/transformation/arrangement of the
+multiple involved files in a call to Start(). If set to NULL,
+takes the number of available cores (as detected by detectCores() in
+the package 'future'). The default value is 1 (no parallel execution).}
+
+\item{ObjectBigmemory}{A character string to be included as part of the
+bigmemory object name. This parameter is intended to be used internally by
+the chunking capabilities of startR.}
+
+\item{silent}{A logical value of whether to display progress messages (FALSE)
+or not (TRUE). The default value is FALSE.}
+
+\item{debug}{A logical value of whether to return detailed messages on the
+progress and operations in a Start() call (TRUE) or not (FALSE). The
+default value is FALSE.}
 }
 \value{
 If \code{retrieve = TRUE} the involved data is loaded into RAM memory
@@ -809,4 +830,3 @@ file format.
retrieve = FALSE) } - diff --git a/man/Step.Rd b/man/Step.Rd index 65f0c72..c473ccb 100644 --- a/man/Step.Rd +++ b/man/Step.Rd @@ -4,8 +4,13 @@ \alias{Step} \title{Define the operation applied on declared data.} \usage{ -Step(fun, target_dims, output_dims, use_libraries = NULL, - use_attributes = NULL) +Step( + fun, + target_dims, + output_dims, + use_libraries = NULL, + use_attributes = NULL +) } \arguments{ \item{fun}{A function in R format defining the operation to be applied to the @@ -70,4 +75,3 @@ to the expected order for this function. wf <- AddStep(data, step) } - diff --git a/man/indices.Rd b/man/indices.Rd index a3d85ea..6233b71 100644 --- a/man/indices.Rd +++ b/man/indices.Rd @@ -39,4 +39,3 @@ original data. See details in the documentation of the parameter \code{\dots} \seealso{ \code{\link{values}} } - diff --git a/man/values.Rd b/man/values.Rd index 3300f19..31ce95a 100644 --- a/man/values.Rd +++ b/man/values.Rd @@ -41,4 +41,3 @@ coordinate variable. See details in the documentation of the parameter \seealso{ \code{\link{indices}} } - -- GitLab From f2e3f6616b0b4b4c26b87491c372d91515f59f05 Mon Sep 17 00:00:00 2001 From: nperez Date: Wed, 16 Sep 2020 19:26:10 +0200 Subject: [PATCH 08/12] the best version until now. save the bigmemory filename and do the cleaning if all works --- R/Start.R | 8 ++++---- inst/chunking/load_process_save_chunk.R | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/Start.R b/R/Start.R index a5b21f5..73d924b 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3721,9 +3721,9 @@ Start <- function(..., # dim = indices/selectors, # to the work pieces. data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1) shared_matrix_pointer <- bigmemory::describe(data_array) - if (!is.null(ObjectBigmemory)) { - attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory - } + #if (!is.null(ObjectBigmemory)) { + # attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory + #} if (is.null(num_procs)) { num_procs <- future::availableCores() } @@ -4289,7 +4289,7 @@ Start <- function(..., # dim = indices/selectors, file_data_reader, synonims, transform, transform_params, silent = FALSE, debug = FALSE) { - warning(attr(shared_matrix_pointer, 'description')$sharedName) + #warning(attr(shared_matrix_pointer, 'description')$sharedName) # suppressPackageStartupMessages({library(bigmemory)}) ### TODO: Specify dependencies as parameter # suppressPackageStartupMessages({library(ncdf4)}) diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R index 9f9f35a..53a0dc9 100644 --- a/inst/chunking/load_process_save_chunk.R +++ b/inst/chunking/load_process_save_chunk.R @@ -70,8 +70,8 @@ for (input in 1:length(data)) { if (!('num_procs' %in% names(start_call))) { start_call[['num_procs']] <- threads_load } - nameMemoryObject <- gsub("[^0-9.-]", "", gsub(out_dir, "", task_path)) - start_call[['ObjectBigmemory']] <- nameMemoryObject + #nameMemoryObject <- gsub("[^0-9.-]", "", gsub(out_dir, "", task_path)) + #start_call[['ObjectBigmemory']] <- nameMemoryObject data[[input]] <- tryCatch(eval(start_call), # Handler when an error occurs: error = function(e) { -- GitLab From eed625071081b6f7e1d67d12c1de82c8d31eef19 Mon Sep 17 00:00:00 2001 From: nperez Date: Mon, 21 Sep 2020 13:48:49 +0200 Subject: [PATCH 09/12] change name and save in temp_dir folder the bigmemory obj --- R/ByChunks.R | 4 ++-- R/Start.R | 17 +++++++++++++++-- inst/chunking/Chunk.ecf | 2 +- inst/chunking/load_process_save_chunk.R | 14 
+++++++++----- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/R/ByChunks.R b/R/ByChunks.R index 9acc14f..dd10112 100644 --- a/R/ByChunks.R +++ b/R/ByChunks.R @@ -551,8 +551,8 @@ ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', ecflow_suite_dir_suite) file.copy(system.file('chunking/tail.h', package = 'startR'), ecflow_suite_dir_suite) - file.copy(system.file('chunking/clean_devshm.sh', package = 'startR'), - ecflow_suite_dir_suite) + #file.copy(system.file('chunking/clean_devshm.sh', package = 'startR'), + # ecflow_suite_dir_suite) } add_line <- function(suite, line, tabs) { diff --git a/R/Start.R b/R/Start.R index 73d924b..fdcffc4 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3719,8 +3719,21 @@ Start <- function(..., # dim = indices/selectors, # TODO: try performance of storing all in cols instead of rows # Create the shared memory array, and a pointer to it, to be sent # to the work pieces. - data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1) + if (is.null(ObjectBigmemory)) { + data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1) + } else { + data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1, + backingfile = ObjectBigmemory) + } shared_matrix_pointer <- bigmemory::describe(data_array) + if (is.null(ObjectBigmemory)) { + name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$sharedName + } else { + name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$filename + } + + #warning(paste("SharedName:", attr(shared_matrix_pointer, 'description')$sharedName)) + #warning(paste("Filename:", attr(shared_matrix_pointer, 'description')$filename)) #if (!is.null(ObjectBigmemory)) { # attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory #} @@ -4246,7 +4259,7 @@ Start <- function(..., # dim = indices/selectors, NotFoundFiles = array_of_not_found_files, FileSelectors = file_selectors, PatternDim = found_pattern_dim, - ObjectBigmemory = attr(shared_matrix_pointer, 'description')$sharedName) + ObjectBigmemory = name_bigmemory_obj) #attr(shared_matrix_pointer, 'description')$sharedName) ) attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class')) data_array diff --git a/inst/chunking/Chunk.ecf b/inst/chunking/Chunk.ecf index ccc6338..60bd051 100644 --- a/inst/chunking/Chunk.ecf +++ b/inst/chunking/Chunk.ecf @@ -16,6 +16,6 @@ Rscript load_process_save_chunk.R --args $task_path insert_indices #include_transfer_back_and_rm #clean temporal folder -bash %REMOTE_ECF_HOME%clean_devshm.sh $task_path +#bash %REMOTE_ECF_HOME%clean_devshm.sh $task_path %include "./tail.h" diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R index 53a0dc9..465357d 100644 --- a/inst/chunking/load_process_save_chunk.R +++ b/inst/chunking/load_process_save_chunk.R @@ -70,8 +70,8 @@ for (input in 1:length(data)) { if (!('num_procs' %in% names(start_call))) { start_call[['num_procs']] <- threads_load } - #nameMemoryObject <- gsub("[^0-9.-]", "", gsub(out_dir, "", task_path)) - #start_call[['ObjectBigmemory']] <- nameMemoryObject + nameMemoryObject <- gsub("[^0-9.-]", "", gsub(out_dir, "", task_path)) + start_call[['ObjectBigmemory']] <- nameMemoryObject data[[input]] <- tryCatch(eval(start_call), # Handler when an error occurs: error = function(e) { @@ -84,11 +84,15 @@ for (input in 1:length(data)) { message(paste("file:", rownames(info), "size:", info$size, "uname:", info$uname))}) + message(getwd()) + file.remove(nameMemoryObject) + 
file.remove(paste0(nameMemoryObject, ".desc"))
+      message(paste("Files", nameMemoryObject, "have been removed."))
     })
   warning(attributes(data[[input]])$ObjectBigmemory)
-  write.table(attributes(data[[input]])$ObjectBigmemory,
-              file = paste0(task_path, '.filename.txt'),
-              col.names = FALSE, row.names = FALSE, quote = FALSE)
+  #write.table(attributes(data[[input]])$ObjectBigmemory,
+  #            file = paste0(task_path, '.filename.txt'),
+  #            col.names = FALSE, row.names = FALSE, quote = FALSE)
 }
 t_end_load <- Sys.time()
 t_load <- as.numeric(difftime(t_end_load, t_begin_load, units = 'secs'))
-- 
GitLab


From dfa531aaa76c14ce83f26b2d0faa314d66a90bae Mon Sep 17 00:00:00 2001
From: nperez
Date: Mon, 21 Sep 2020 16:39:22 +0200
Subject: [PATCH 10/12] improved name of the temporal object when chunking

---
 inst/chunking/load_process_save_chunk.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R
index 465357d..aa21be7 100644
--- a/inst/chunking/load_process_save_chunk.R
+++ b/inst/chunking/load_process_save_chunk.R
@@ -70,7 +70,8 @@ for (input in 1:length(data)) {
   if (!('num_procs' %in% names(start_call))) {
     start_call[['num_procs']] <- threads_load
   }
-  nameMemoryObject <- gsub("[^0-9.-]", "", gsub(out_dir, "", task_path))
+  nameMemoryObject <- gsub("[^0-9.-]", "_", gsub(out_dir, "", task_path))
+  nameMemoryObject <- substr(nameMemoryObject, 2, nchar(nameMemoryObject))
   start_call[['ObjectBigmemory']] <- nameMemoryObject
   data[[input]] <- tryCatch(eval(start_call),
-- 
GitLab


From 52a781eafd7300fc192ac05a8b4f027db17bd7d1 Mon Sep 17 00:00:00 2001
From: nperez
Date: Mon, 21 Sep 2020 18:19:33 +0200
Subject: [PATCH 11/12] improved files names

---
 inst/chunking/load_process_save_chunk.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R
index aa21be7..49a94a9 100644
--- a/inst/chunking/load_process_save_chunk.R
+++ b/inst/chunking/load_process_save_chunk.R
@@ -70,8 +70,11 @@ for (input in 1:length(data)) {
   if (!('num_procs' %in% names(start_call))) {
     start_call[['num_procs']] <- threads_load
   }
+  # Create a name for the temporary file using the chunk numbers:
   nameMemoryObject <- gsub("[^0-9.-]", "_", gsub(out_dir, "", task_path))
   nameMemoryObject <- substr(nameMemoryObject, 2, nchar(nameMemoryObject))
+  removeRS <- function(str) paste(rle(strsplit(str, "")[[1]])$values, collapse = "")
+  nameMemoryObject <- removeRS(nameMemoryObject)
   start_call[['ObjectBigmemory']] <- nameMemoryObject
-- 
GitLab


From c80751eaff3a32c56441d73bc223adabe02f1d8c Mon Sep 17 00:00:00 2001
From: nperez
Date: Tue, 6 Oct 2020 11:47:35 +0200
Subject: [PATCH 12/12] doc created with R3.2.0

---
 DESCRIPTION            |   16 +-
 man/AddStep.Rd         |    1 +
 man/CDORemapper.Rd     |    1 +
 man/Collect.Rd         |    1 +
 man/Compute.Rd         |   16 +-
 man/NcCloser.Rd        |    1 +
 man/NcDataReader.Rd    |   10 +-
 man/NcDimReader.Rd     |   10 +-
 man/NcOpener.Rd        |    1 +
 man/NcVarReader.Rd     |   10 +-
 man/SelectorChecker.Rd |    4 +-
 man/Sort.Rd            |    9 +-
 man/Start.Rd           | 1248 ++++++++++++++++++++--------------------
 man/Step.Rd            |   10 +-
 man/indices.Rd         |    1 +
 man/values.Rd          |    1 +
 16 files changed, 655 insertions(+), 685 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 35d79cf..0a32038 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,15 +9,15 @@ Authors@R: c(
     person("Javier", "Vegas", , "javier.vegas@bsc.es", role = c("ctb")),
person("Pierre-Antoine", "Bretonniere", , "pierre-antoine.bretonniere@bsc.es", role = c("ctb")), person("Roberto", "Serrano", , "rsnotivoli@gmal.com", role = c("ctb"))) -Description: Tool to automatically fetch, transform and arrange subsets of multi- - dimensional data sets (collections of files) stored in local and/or remote - file systems or servers, using multicore capabilities where possible. The tool - provides an interface to perceive a collection of data sets as a single large - multidimensional data array, and enables the user to request for automatic +Description: Tool to automatically fetch, transform and arrange subsets of + multi- dimensional data sets (collections of files) stored in local and/or + remote file systems or servers, using multicore capabilities where possible. + The tool provides an interface to perceive a collection of data sets as a single + large multidimensional data array, and enables the user to request for automatic retrieval, processing and arrangement of subsets of the large array. Wrapper functions to add support for custom file formats can be plugged in/out, making - the tool suitable for any research field where large multidimensional data - sets are involved. + the tool suitable for any research field where large multidimensional data sets + are involved. Depends: R (>= 3.2.0) Imports: @@ -37,4 +37,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo -RoxygenNote: 7.0.1 +RoxygenNote: 5.0.0 diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 0d0ce46..3eece05 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,3 +54,4 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } + diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd index 763be77..4f56baa 100644 --- a/man/CDORemapper.Rd +++ b/man/CDORemapper.Rd @@ -65,3 +65,4 @@ perform the interpolation, hence CDO is required to be installed. \seealso{ \code{\link[s2dverification]{CDORemap}} } + diff --git a/man/Collect.Rd b/man/Collect.Rd index 97b529b..44a7dee 100644 --- a/man/Collect.Rd +++ b/man/Collect.Rd @@ -83,3 +83,4 @@ of results as one data array when the execution is done. See more details on } } + diff --git a/man/Compute.Rd b/man/Compute.Rd index 7d6db4d..e07106a 100644 --- a/man/Compute.Rd +++ b/man/Compute.Rd @@ -4,18 +4,9 @@ \alias{Compute} \title{Specify the execution parameters and trigger the execution} \usage{ -Compute( - workflow, - chunks = "auto", - threads_load = 1, - threads_compute = 1, - cluster = NULL, - ecflow_suite_dir = NULL, - ecflow_server = NULL, - silent = FALSE, - debug = FALSE, - wait = TRUE -) +Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1, + cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, + silent = FALSE, debug = FALSE, wait = TRUE) } \arguments{ \item{workflow}{A list of the class 'startR_workflow' returned by function @@ -113,3 +104,4 @@ arrays and additional metadata. 
res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) } + diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd index 588f63a..65beab8 100644 --- a/man/NcCloser.Rd +++ b/man/NcCloser.Rd @@ -32,3 +32,4 @@ NcCloser(connection) \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcDimReader}} \code{\link{NcVarReader}} } + diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd index 9014789..a6d32c7 100644 --- a/man/NcDataReader.Rd +++ b/man/NcDataReader.Rd @@ -4,13 +4,8 @@ \alias{NcDataReader} \title{NetCDF file data reader for 'startR'} \usage{ -NcDataReader( - file_path = NULL, - file_object = NULL, - file_selectors = NULL, - inner_indices = NULL, - synonims -) +NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL, + inner_indices = NULL, synonims) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -66,3 +61,4 @@ in turn uses nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDimReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } + diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd index 38dd870..d539ffd 100644 --- a/man/NcDimReader.Rd +++ b/man/NcDimReader.Rd @@ -4,13 +4,8 @@ \alias{NcDimReader} \title{NetCDF dimension reader for 'startR'} \usage{ -NcDimReader( - file_path = NULL, - file_object = NULL, - file_selectors = NULL, - inner_indices = NULL, - synonims -) +NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL, + inner_indices = NULL, synonims) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -63,3 +58,4 @@ This function uses the function NcReadDims() in the package 'easyNCDF'. \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } + diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd index 30885fc..e46384c 100644 --- a/man/NcOpener.Rd +++ b/man/NcOpener.Rd @@ -34,3 +34,4 @@ NcCloser(connection) \code{\link{NcDimReader}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } + diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd index fb093ae..c601907 100644 --- a/man/NcVarReader.Rd +++ b/man/NcVarReader.Rd @@ -4,13 +4,8 @@ \alias{NcVarReader} \title{NetCDF variable reader for 'startR'} \usage{ -NcVarReader( - file_path = NULL, - file_object = NULL, - file_selectors = NULL, - var_name = NULL, - synonims -) +NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL, + var_name = NULL, synonims) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -63,3 +58,4 @@ nc_var_get() in the package 'ncdf4'. 
 \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}}
 \code{\link{NcDimReader}}
 }
+
diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd
index e1cf112..ef83575 100644
--- a/man/SelectorChecker.Rd
+++ b/man/SelectorChecker.Rd
@@ -4,7 +4,8 @@
 \alias{SelectorChecker}
 \title{Translate a set of selectors into a set of numeric indices}
 \usage{
-SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL)
+SelectorChecker(selectors, var = NULL, return_indices = TRUE,
+  tolerance = NULL)
 }
 \arguments{
 \item{selectors}{A vector or a list of two of numeric indices or variable
@@ -49,3 +50,4 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257]
 SelectorChecker(sub_array_of_selectors, sub_array_of_values)
 
 }
+
diff --git a/man/Sort.Rd b/man/Sort.Rd
index 25a92fe..9ab516e 100644
--- a/man/Sort.Rd
+++ b/man/Sort.Rd
@@ -1,8 +1,8 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/Sort.R
 \name{Sort}
-\alias{Sort}
 \alias{CircularSort}
+\alias{Sort}
 \title{Sort the coordinate variable values in a Start() call}
 \usage{
 Sort(...)
@@ -10,12 +10,12 @@ Sort(...)
 CircularSort(start, end, ...)
 }
 \arguments{
-\item{\dots}{Additional parameters to adjust the reorderig. See function
-sort() for more details.}
-
 \item{start}{A numeric indicating the lower bound of the circular range.}
 
 \item{end}{A numeric indicating the upper bound of the circular range.}
+
+\item{\dots}{Additional parameters to adjust the reordering. See function
+sort() for more details.}
 }
 \value{
 A list of 2 containing:
@@ -57,3 +57,4 @@ range. This is useful for circular coordinates such as the Earth longitudes.
 retrieve = FALSE)
 
 }
+
diff --git a/man/Start.Rd b/man/Start.Rd
index 5598210..76510ad 100644
--- a/man/Start.Rd
+++ b/man/Start.Rd
@@ -4,677 +4,660 @@
 \alias{Start}
 \title{Declare, discover, subset and retrieve multidimensional distributed
data sets}
 \usage{
-Start(
-  ...,
-  return_vars = NULL,
-  synonims = NULL,
-  file_opener = NcOpener,
-  file_var_reader = NcVarReader,
-  file_dim_reader = NcDimReader,
-  file_data_reader = NcDataReader,
-  file_closer = NcCloser,
-  transform = NULL,
-  transform_params = NULL,
-  transform_vars = NULL,
-  transform_extra_cells = 2,
-  apply_indices_after_transform = FALSE,
-  pattern_dims = NULL,
-  metadata_dims = NULL,
-  selector_checker = SelectorChecker,
-  merge_across_dims = FALSE,
-  merge_across_dims_narm = FALSE,
-  split_multiselected_dims = FALSE,
-  path_glob_permissive = FALSE,
-  retrieve = FALSE,
-  num_procs = 1,
-  ObjectBigmemory = NULL,
-  silent = FALSE,
-  debug = FALSE
-)
+Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener,
+  file_var_reader = NcVarReader, file_dim_reader = NcDimReader,
+  file_data_reader = NcDataReader, file_closer = NcCloser,
+  transform = NULL, transform_params = NULL, transform_vars = NULL,
+  transform_extra_cells = 2, apply_indices_after_transform = FALSE,
+  pattern_dims = NULL, metadata_dims = NULL,
+  selector_checker = SelectorChecker, merge_across_dims = FALSE,
+  merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE,
+  path_glob_permissive = FALSE, retrieve = FALSE, num_procs = 1,
+  ObjectBigmemory = NULL, silent = FALSE, debug = FALSE)
 }
\arguments{
-\item{\dots}{A selection of custemized parameters depending on the data
-format. When we retrieve data from one or a collection of data sets,
-the involved data can be perceived as belonging to a large multi-dimensional
-array. For instance, let us consider an example case. 
We want to retrieve data
-from a source, which contains data for the number of monthly sales of various
-items, and also for their retail price each month. The data on source is
-stored as follows:\cr\cr
-\command{
-\cr # /data/
-\cr #  |-> sales/
-\cr #  |    |-> electronics
-\cr #  |    |    |-> item_a.data
-\cr #  |    |    |-> item_b.data
-\cr #  |    |    |-> item_c.data
-\cr #  |    |-> clothing
-\cr #  |         |-> item_d.data
-\cr #  |         |-> idem_e.data
-\cr #  |         |-> idem_f.data
-\cr #  |-> prices/
-\cr #       |-> electronics
-\cr #       |    |-> item_a.data
-\cr #       |    |-> item_b.data
-\cr #       |    |-> item_c.data
-\cr #       |-> clothing
-\cr #            |-> item_d.data
-\cr #            |-> item_e.data
-\cr #            |-> item_f.data
-}\cr\cr
-Each item file contains data, stored in whichever format, for the sales or
-prices over a time period, e.g. for the past 24 months, registered at 100
-different stores over the world. Whichever the format it is stored in, each
-file can be perceived as a container of a data array of 2 dimensions, time and
-store. Let us assume the '.data' format allows to keep a name for each of
-these dimensions, and the actual names are 'time' and 'store'.\cr\cr
-The different item files for sales or prices can be perceived as belonging to
-an 'item' dimension of length 3, and the two groups of three items to a
-'section' dimension of length 2, and the two groups of two sections (one with
-the sales and the other with the prices) can be perceived as belonging also to
-another dimension 'variable' of length 2. Even the source can be perceived as
-belonging to a dimension 'source' of length 1.\cr\cr
-All in all, in this example, the whole data could be perceived as belonging to
-a multidimensional 'large array' of dimensions\cr
+\item{return_vars}{A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimensions for which to retrieve
+each variable, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.\cr\cr
+Apart from retrieving a multidimensional data array, retrieving auxiliary
+variables inside the files can also be needed. The parameter
+'return_vars' allows for requesting such variables, as long as a
+'file_var_reader' function is also specified in the call to
+Start() (see documentation on the corresponding parameter).
+\cr\cr
+In the case of the item sales example (see documentation on parameter
+\code{\dots}), the store location variable is requested with the parameter\cr
+\code{return_vars = list(store_location = NULL)}.\cr This will cause
+Start() to fetch the variable 'store_location' once and return it in
+the component\cr \code{$Variables$common$store_location},\cr which will be an
+array of character strings with the location names, with the dimensions
+\code{c('store' = 100)}. Although useless in this example, we could ask
+Start() to fetch and return such variable for each file along the
+items dimension as follows: \cr
+\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+variable will be fetched once from a file of each of the items, and will be
+returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+\cr\cr
+If a variable is requested along a file dimension that contains path pattern
+specifications ('source' in the example), the fetched variable values will be
+returned in the component\cr \code{$Variables$$}.\cr
+For example:
+\cr
 \command{
-\cr #  source variable  section      item    store    month
-\cr #       1        2        2         3      100       24
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = c('source',
+\cr #                                                     'item')))
+\cr # # Checking the structure of the returned variables
+\cr # str(found_data$Variables)
+\cr # Named list
+\cr # ..$common: NULL
+\cr # ..$sourceA: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # ..$sourceB: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # # Checking the dimensions of the returned variable
+\cr # # for the source A
+\cr # dim(found_data$Variables$sourceA)
+\cr #   item  store
+\cr #      3      3
 }
 \cr\cr
-The dimensions of this 'large array' can be classified in two types. The ones
-that group actual files (the file dimensions) and the ones that group data
-values inside the files (the inner dimensions). In the example, the file
-dimensions are 'source', 'variable', 'section' and 'item', whereas the inner
-dimensions are 'store' and 'month'.
+The names of the requested variables do not necessarily have to match the
+actual variable names inside the files. A list of alternative names to be
+sought can be specified via the parameter 'synonims'.}
+
+\item{synonims}{A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter allows Start() to
+properly identify the dimensions or variables with different names.
 \cr\cr
-Having the dimensions of our target sources in mind, the parameter \code{\dots}
-expects to receive information on:
- \itemize{
-  \item{
-The names of the expected dimensions of the 'large dataset' we want to
-retrieve data from
-  }
-  \item{
-The indices to take from each dimension (and other constraints)
-  }
-  \item{
-How to reorder the dimension if needed
-  }
-  \item{
-The location and organization of the files of the data sets
-  }
- }
-For each dimension, the 3 first information items can be specified with a set
-of parameters to be provided through \code{\dots}. For a given dimension
-'dimname', six parameters can be specified:\cr
+In the example used in parameter 'return_vars', it may be the case that
+the two involved data sources follow slightly different naming conventions.
+For example, source A uses 'sect' as name for the sections dimension, whereas
+source B uses 'section'; source A uses 'store_loc' as variable name for the
+store locations, whereas source B uses 'store_location'. 
This can be taken
+into account as follows:
+\cr
 \command{
-\cr # dimname = ,  # 'all' / 'first' / 'last' /
-\cr #              #  indices(c(1, 10, 20)) /
-\cr #              #  indices(c(1:20)) /
-\cr #              #  indices(list(1, 20)) /
-\cr #              #  c(1, 10, 20) / c(1:20) /
-\cr #              #  list(1, 20)
-\cr # dimname_var = ,
-\cr # dimname_tolerance = ,
-\cr # dimname_reorder = ,
-\cr # dimname_depends = ,
-\cr # dimname_across = 
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = c('source',
+\cr #                                                     'item')),
+\cr #               synonims = list(
+\cr #                 section = c('sect', 'section'),
+\cr #                 store_location = c('store_loc',
+\cr #                                    'store_location')
+\cr #               ))
 }
+\cr}
+
+\item{file_opener}{A function that receives as a single parameter
+  'file_path' a character string with the path to a file to be opened,
+  and returns an object with an open connection to the file (optionally with
+  header information) on success, or returns NULL on failure.
 \cr\cr
-The \bold{indices to take} can be specified in three possible formats (see
-code comments above for examples). The first format consists in using
-character tags, such as 'all' (take all the indices available for that
-dimension), 'first' (take only the first) and 'last' (only the last). The
-second format consists in using numeric indices, which have to be wrapped in a
-call to the indices() helper function. For the second format, either a
-vector of numeric indices can be provided, or a list with two numeric indices
-can be provided to take all the indices in the range between the two specified
-indices (both extremes inclusive). The third format consists in providing a
-vector character strings (for file dimensions) or of values of whichever type
-(for inner dimensions). For the file dimensions, the provided character
-strings in the third format will be used as components to build up the final
-path to the files (read further). For inner dimensions, the provided values in
-the third format will be compared to the values of an associated coordinate
-variable (must be specified in '_reorder', read further), and the
-indices of the closest values will be retrieved. When using the third format,
-a list with two values can also be provided to take all the indices of the
-values within the specified range.
-\cr\cr
-The \bold{name of the associated coordinate variable} must be a character
-string with the name of an associated coordinate variable to be found in the
-data files (in all* of them). For this to work, a 'file_var_reader'
-function must be specified when calling Start() (see parameter
-'file_var_reader'). The coordinate variable must also be requested in the
-parameter 'return_vars' (see its section for details). This feature only
-works for inner dimensions.
-\cr\cr
-The \bold{tolerance value} is useful when indices for an inner dimension are
-specified in the third format (values of whichever type). In that case, the
-indices of the closest values in the coordinate variable are seeked. However
-the closest value might be too distant and we would want to consider no real
-match exists for such provided value. This is possible via the tolerance,
-which allows to specify a threshold beyond which not to seek for matching
-values and mark that index as missing value.
+This parameter takes by default NcOpener() (an opener function for NetCDF
+files).
 \cr\cr
-The \bold{reorder_function} is useful when indices for an inner dimension are
-specified in the third fromat, and the retrieved indices need to be reordered
-in function of their provided associated variable values. A function can be
-provided, which receives as input a vector of values, and returns as outputs a
-list with the components \code{$x} with the reordered values, and \code{$ix}
-with the permutation indices. Two reordering functions are included in
-startR, the Sort() and the CircularSort().
+See NcOpener() for a template to build a file opener for your own file
+format.}
+
+\item{file_var_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
+  \code{synonims} that returns an array with auxiliary data (i.e. 
data from a
+  variable) inside a file. Start() will provide automatically either a
+  'file_path' or a 'file_object' to the 'file_var_reader'
+  function (the function has to be ready to work whichever of these two is
+  provided). The parameter 'file_selectors' will also be provided
+  automatically to the variable reader, containing a named list where the
+  names are the names of the file dimensions of the queried data set (see
+  documentation on \code{\dots}) and the values are single character strings
+  with the components used to build the path to the file being read (the one
+  provided in 'file_path' or 'file_object'). The parameter 'var_name'
+  will be filled in automatically by Start() also, with the name of one
+  of the variables to be read. The parameter 'synonims' will be filled in
+  with exactly the same value as provided in the parameter 'synonims' in
+  the call to Start(), and has to be used in the code of the variable
+  reader to check for alternative variable names inside the target file. The
+  'file_var_reader' must return a (multi)dimensional array with named
+  dimensions, and optionally with the attribute 'variables' with other
+  additional metadata on the retrieved variable.
 \cr\cr
+Usually, the 'file_var_reader' should be a degenerate case of the
+'file_data_reader' (see documentation on the corresponding parameter),
+so it is recommended to code the 'file_data_reader' first.
 \cr\cr
+This parameter takes by default NcVarReader() (a variable reader function
+for NetCDF files).
 \cr\cr
-The \bold{name of another dimension} to be specified in _depends,
-only available for file dimensions, must be a character string with the name
-of another requested \bold{file dimension} in \code{\dots}, and will make
-Start() aware that the path components of a file dimension can vary in
-function of the path component of another file dimension. For instance, in the
-example above, specifying \code{item_depends = 'section'} will make
-Start() aware that the item names vary in function of the section, i.e.
-section 'electronics' has items 'a', 'b' and 'c' but section 'clothing' has
-items 'd', 'e', 'f'. Otherwise Start() would expect to find the same
-item names in all the sections.
-\cr\cr
-The \bold{name of another dimension} to be specified in '_across',
-only available for inner dimensions, must be a character string with the name
-of another requested \bold{inner dimension} in \code{\dots}, and will make
-Start() aware that an inner dimension extends along multiple files. 
For
-instance, let us imagine that in the example above, the records for each item
-are so large that it becomes necessary to split them in multiple files each
-one containing the registers for a different period of time, e.g. in 10 files
-with 100 months each ('item_a_period1.data', 'item_a_period2.data', and so on).
-In that case, the data can be perceived as having an extra file dimension, the
-'period' dimension. The inner dimension 'month' would extend across multiple
-files, and providing the parameter \code{month = indices(1, 300)} would make
-Start() crash because it would perceive we have made a request out of
-bounds (each file contains 100 'month' indices, but we requested 1 to 300).
-This can be solved by specifying the parameter \code{month_across = period} (a
-long with the full specification of the dimension 'period').
-\cr\cr
-\bold{Defining the path pattern}
-\cr
-As mentioned above, the parameter \dots also expects to receive information
-with the location of the data files. In order to do this, a special dimension
-must be defined. In that special dimension, in place of specifying indices to
-take, a path pattern must be provided. The path pattern is a character string
-that encodes the way the files are organized in their source. It must be a
-path to one of the data set files in an accessible local or remote file system,
-or a URL to one of the files provided by a local or remote server. The regions
-of this path that vary across files (along the file dimensions) must be
-replaced by wildcards. The wildcards must match any of the defined file
-dimensions in the call to Start() and must be delimited with heading
-and trailing '$'. Shell globbing expressions can be used in the path pattern.
-See the next code snippet for an example of a path pattern.
-\cr\cr
-All in all, the call to Start() to load the entire data set in the
-example of store item sales, would look as follows:
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr # '$section$/$item$.data'),
-\cr # variable = 'all',
-\cr # section = 'all',
-\cr # item = 'all',
-\cr # item_depends = 'section',
-\cr # store = 'all',
-\cr # month = 'all')
-}
-\cr\cr
-Note that in this example it would still be pending to properly define the
-parameters 'file_opener', 'file_closer', 'file_dim_reader',
-'file_var_reader' and 'file_data_reader' for the '.data' file format
-(see the corresponding sections).
+See NcVarReader() for a template to build a variable reader for your own
+file format.}
+
+\item{file_dim_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+ that returns a named numeric vector where the names are the names of the
+ dimensions of the multidimensional data array in the file and the values are
+ the sizes of such dimensions. Start() will provide automatically
+ either a 'file_path' or a 'file_object' to the
+ 'file_dim_reader' function (the function has to be ready to work
+ whichever of these two is provided). The parameter 'file_selectors'
+ will also be provided automatically to the dimension reader, containing a
+ named list where the names are the names of the file dimensions of the
+ queried data set (see documentation on \code{\dots}) and the values are
+ single character strings with the components used to build the path to the
+ file being read (the one provided in 'file_path' or 'file_object').
+ The parameter 'synonims' will be filled in with exactly the same value + as provided in the parameter 'synonims' in the call to Start(), + and can optionally be used in advanced configurations. \cr\cr -The call to Start() will return a multidimensional R array with the -following dimensions: -\cr -\command{ -\cr # source variable section item store month -\cr # 1 2 2 3 100 24 -} -\cr -The dimension specifications in the \code{\dots} do not have to follow any -particular order. The returned array will have the dimensions in the same order -as they have been specified in the call. For example, the following call: -\cr -\command{ -\cr # data <- Start(source = paste0('/data/$variable$/', -\cr # '$section$/$item$.data'), -\cr # month = 'all', -\cr # store = 'all', -\cr # item = 'all', -\cr # item_depends = 'section', -\cr # section = 'all', -\cr # variable = 'all') -} +This parameter takes by default NcDimReader() (a dimension reader +function for NetCDF files). \cr\cr -would return an array with the following dimensions: -\cr -\command{ -\cr # source month store item section variable -\cr # 1 24 100 3 2 2 -} +See NcDimReader() for (an advanced) template to build a dimension reader +for your own file format.} + +\item{file_data_reader}{A function with the header \code{file_path = NULL}, + \code{file_object = NULL}, \code{file_selectors = NULL}, + \code{inner_indices = NULL}, \code{synonims} that returns a subset of the + multidimensional data array inside a file (even if internally it is not an + array). Start() will provide automatically either a 'file_path' + or a 'file_object' to the 'file_data_reader' function (the + function has to be ready to work whichever of these two is provided). The + parameter 'file_selectors' will also be provided automatically to the + data reader, containing a named list where the names are the names of the + file dimensions of the queried data set (see documentation on \code{\dots}) + and the values are single character strings with the components used to + build the path to the file being read (the one provided in 'file_path' or + 'file_object'). The parameter 'inner_indices' will be filled in + automatically by Start() also, with a named list of numeric vectors, + where the names are the names of all the expected inner dimensions in a file + to be read, and the numeric vectors are the indices to be taken from the + corresponding dimension (the indices may not be consecutive nor in order). + The parameter 'synonims' will be filled in with exactly the same value + as provided in the parameter 'synonims' in the call to Start(), + and has to be used in the code of the data reader to check for alternative + dimension names inside the target file. The 'file_data_reader' must + return a (multi)dimensional array with named dimensions, and optionally with + the attribute 'variables' with other additional metadata on the retrieved + data. 
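+\cr\cr
+As a purely illustrative sketch (it assumes a hypothetical helper
+read_dot_data(), not part of startR, able to load one file of the '.data'
+format used in the examples as an array with the dimensions 'time' and
+'store'), a minimal 'file_data_reader' could look as follows:
+\cr
+\command{
+\cr # .data_data_reader <- function(file_path = NULL, file_object = NULL,
+\cr #                               file_selectors = NULL,
+\cr #                               inner_indices = NULL, synonims) \{
+\cr #   # A real reader must be ready to receive either 'file_path' or
+\cr #   # 'file_object'; for simplicity this sketch only uses 'file_path'.
+\cr #   # read_dot_data() is a hypothetical function returning the full
+\cr #   # 'time' x 'store' array stored in the file.
+\cr #   data <- read_dot_data(file_path)
+\cr #   # Keep only the requested indices; a complete reader should also
+\cr #   # check 'synonims' for alternative dimension names inside the file.
+\cr #   data <- data[inner_indices[['time']], inner_indices[['store']],
+\cr #                drop = FALSE]
+\cr #   names(dim(data)) <- c('time', 'store')
+\cr #   data
+\cr # \}
+}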
\cr\cr
-Next, a more advanced example to retrieve data for only the sales records, for
-the first section ('electronics'), for the 1st and 3rd items and for the
-stores located in Barcelona (assuming the files contain the variable
-'store_location' with the name of the city each of the 100 stores are located
-at):
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr # '$section$/$item$.data'),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = NULL))
-}
\cr\cr
+Usually, 'file_data_reader' should use 'file_dim_reader'
+(see documentation on the corresponding parameter), so it is recommended to
+code 'file_dim_reader' first.
\cr\cr
-The defined names for the dimensions do not necessarily have to match the
-names of the dimensions inside the file. Lists of alternative names to be
-seeked can be defined in the parameter 'synonims'.
+This parameter takes by default NcDataReader() (a data reader function
+for NetCDF files).
\cr\cr
-If data from multiple sources (not necessarily following the same structure)
-has to be retrieved, it can be done by providing a vector of character strings
-with path pattern specifications, or, in the extended form, by providing a
-list of lists with the components 'name' and 'path', and the name of the
-dataset and path pattern as values, respectively. For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr # list(name = 'sourceA',
-\cr # path = paste0('/sourceA/$variable$/',
-\cr # '$section$/$item$.data')),
-\cr # list(name = 'sourceB',
-\cr # path = paste0('/sourceB/$section$/',
-\cr # '$variable$/$item$.data'))
-\cr # ),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = NULL))
-}
-\cr}
+See NcDataReader() for a template to build a data reader for your own
+file format.}

-\item{return_vars}{A named list where the names are the names of the
-variables to be fetched in the files, and the values are vectors of
-character strings with the names of the file dimension which to retrieve each
-variable for, or NULL if the variable has to be retrieved only once
-from any (the first) of the involved files.\cr\cr
-Apart from retrieving a multidimensional data array, retrieving auxiliary
-variables inside the files can also be needed. The parameter
-'return_vars' allows for requesting such variables, as long as a
-'file_var_reader' function is also specified in the call to
-Start() (see documentation on the corresponding parameter).
+\item{file_closer}{A function that receives as a single parameter
+ 'file_object' an open connection (as returned by 'file_opener')
+ to one of the files to be read, optionally with header information, and
+ closes the open connection. Always returns NULL.
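+\cr\cr
+For illustration only (assuming the hypothetical '.data' format of the
+examples is stored in plain binary files), a matching pair of
+'file_opener' and 'file_closer' could be as simple as:
+\cr
+\command{
+\cr # # Return an open connection, or NULL if the file cannot be opened.
+\cr # .data_opener <- function(file_path) \{
+\cr #   tryCatch(file(file_path, open = 'rb'), error = function(e) NULL)
+\cr # \}
+\cr # # Close the connection provided by the opener; always return NULL.
+\cr # .data_closer <- function(file_object) \{
+\cr #   close(file_object)
+\cr #   NULL
+\cr # \}
+}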
\cr\cr
-In the case of the the item sales example (see documentation on parameter
-\code{\dots)}, the store location variable is requested with the parameter\cr
-\code{return_vars = list(store_location = NULL)}.\cr This will cause
-Start() to fetch once the variable 'store_location' and return it in
-the component\cr \code{$Variables$common$store_location},\cr and will be an
-array of character strings with the location names, with the dimensions
-\code{c('store' = 100)}. Although useless in this example, we could ask
-Start() to fetch and return such variable for each file along the
-items dimension as follows: \cr
-\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
-variable will be fetched once from a file of each of the items, and will be
-returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+This parameter takes by default NcCloser() (a closer function for NetCDF
+files).
\cr\cr
-If a variable is requested along a file dimension that contains path pattern
-specifications ('source' in the example), the fetched variable values will be
-returned in the component\cr \code{$Variables$$}.\cr
-For example:
-\cr
+See NcCloser() for a template to build a file closer for your own file
+format.}
+
+\item{transform}{A function with the header \code{data_array},
+\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+input, through the parameter \code{data_array}, a subset of a
+multidimensional array (as returned by 'file_data_reader'), applies a
+transformation to it and returns it, preserving the number of dimensions but
+potentially modifying their size. This transformation may require data from
+other auxiliary variables, automatically provided to 'transform'
+through the parameter 'variables', in the form of a named list where
+the names are the variable names and the values are (multi)dimensional
+arrays. Which variables need to be sent to 'transform' can be specified
+with the parameter 'transform_vars' in Start(). The parameter
+'file_selectors' will also be provided automatically to
+'transform', containing a named list where the names are the names of
+the file dimensions of the queried data set (see documentation on
+\code{\dots}) and the values are single character strings with the
+components used to build the path to the file the subset being processed
+belongs to. The parameter \code{\dots} will be filled in with other
+additional parameters to adjust the transformation, exactly as provided in
+the call to Start() via the parameter 'transform_params'.}
+
+\item{transform_params}{A named list with additional parameters to be sent to
+the 'transform' function (if specified). See documentation on parameter
+'transform' for details.}
+
+\item{transform_vars}{A vector of character strings with the names of
+auxiliary variables to be sent to the 'transform' function (if
+specified). All the variables to be sent to 'transform' must also
+have been requested as return variables in the parameter 'return_vars'
+of Start().}
+
+\item{transform_extra_cells}{An integer of extra indices to retrieve from the
+data set, beyond the requested indices in \code{\dots}, in order for
+'transform' to dispose of additional information to properly apply
+whichever transformation (if needed). As many as
+'transform_extra_cells' will be retrieved beyond each of the limits for
+each of those inner dimensions associated to a coordinate variable and sent
+to 'transform' (i.e. present in 'transform_vars').
After
+'transform' has finished, Start() will again take a subset of the
+result and return it, so that the returned data falls within the specified
+bounds in \code{\dots}. The default value is 2.}
+
+\item{apply_indices_after_transform}{A logical value indicating, when a
+'transform' is specified in Start() and numeric indices are
+provided for any of the inner dimensions that depend on coordinate variables,
+whether these numeric indices are made effective (retrieved) before applying
+the transformation or after. The boolean flag allows adjusting this behaviour.
+It takes FALSE by default (numeric indices are applied before sending
+data to 'transform').}
+
+\item{pattern_dims}{A character string indicating the name of the dimension
+with path pattern specifications (see \code{\dots} for details). If not
+specified, Start() assumes the first provided dimension is the pattern
+dimension, with a warning.}
+
+\item{metadata_dims}{A vector of character strings with the names of the file
+dimensions for which to return metadata. As noted in 'file_data_reader',
+the data reader can optionally return auxiliary data via the attribute
+'variables' of the returned array. Start() by default returns the
+auxiliary data read for only the first file of each source (or data set) in
+the pattern dimension (see \code{\dots} for info on what the pattern
+dimension is). However it can be configured to return the metadata for all
+the files along any set of file dimensions. The default value is NULL, in
+which case it is automatically assigned the value of the parameter
+'pattern_dims'.}
+
+\item{selector_checker}{A function used internally by Start() to
+translate a set of selectors (values for a dimension associated to a
+coordinate variable) into a set of numeric indices. It takes by default
+SelectorChecker() and, in principle, it should not be required to
+change it for customized file formats. The option to replace it is left open
+for more versatility. See the code of SelectorChecker() for details on
+the inputs, functioning and outputs of a selector checker.}
+
+\item{merge_across_dims}{A logical value indicating whether to merge
+dimensions across which another dimension extends (according to the
+'_across' parameters). Takes the value FALSE by default. For
+example, if the dimension 'time' extends across the dimension 'chunk' and
+\code{merge_across_dims = TRUE}, the resulting data array will contain
+only the dimension 'time', as long as all the chunks together.}
+
+\item{merge_across_dims_narm}{A logical value indicating whether to remove
+the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+It is helpful when the length of the to-be-merged dimension is different
+across another dimension. For example, if the dimension 'time' extends
+across dimension 'chunk', and the time length along the first chunk is 2
+while along the second chunk it is 10, setting this parameter to TRUE
+removes the additional 8 NAs at positions 3 to 10. The default value is
+FALSE.}
+
+\item{split_multiselected_dims}{A logical value indicating whether to split a
+dimension that has been selected with a multidimensional array of selectors
+into as many dimensions as present in the selector array. The default value
+is FALSE.}
+
+\item{path_glob_permissive}{A logical value or an integer specifying how many
+ folder levels in the path pattern, beginning from the end, the shell glob
+ expressions must be preserved and worked out for each file. The default
+ value is FALSE, which is equivalent to 0.
TRUE is equivalent to 1.\cr\cr
+When specifying a path pattern for a dataset, it might contain shell glob
+expressions. For each dataset, the first file matching the path pattern is
+found, and the found file is used to work out fixed values for the glob
+expressions that will be used for all the files of the dataset. However, in
+some cases, the values of the shell glob expressions may not be constant for
+all files in a dataset, and they need to be worked out for each file
+involved.\cr\cr
+For example, a path pattern could be as follows: \cr
+\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+\code{path_glob_permissive = FALSE} will trigger automatic seek of the
+ contents to replace the asterisks (e.g. the first asterisk matches with
+ \code{'bar'} and the second with \code{'baz'}). The found contents will be
+ used for all files in the dataset (in the example, the path pattern will be
+ fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
+ any of the files in the dataset have other contents in the position of the
+ asterisks, Start() will not find them (in the example, a file like \cr
+ \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+ found). Setting \code{path_glob_permissive = 1} would preserve glob
+ expressions in the last level (in the example, the fixed path pattern
+ would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+ problematic file mentioned before would be found), but of course this would
+ slow down the Start() call if the dataset involves a large number of
+ files. Setting \code{path_glob_permissive = 2} would leave the original path
+ pattern with the original glob expressions in the 1st and 2nd levels (in the
+ example, both asterisks would be preserved, thus would allow Start()
+ to recognize files such as \cr
+ \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+Note that each glob expression can only represent one possibility (Start()
+chooses the first): since \code{*} is not a '$'-delimited tag, it cannot
+define a dimension of the output array, and therefore only one match can be
+adopted. For example, if \cr
+\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+has two matches:\cr
+\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+only the first found file will be used.}
+
+\item{retrieve}{A logical value indicating whether to retrieve the data
+defined in the Start() call or to explore only its dimension lengths
+and names, and the values for the file and inner dimensions. The default
+value is FALSE.}
+
+\item{num_procs}{An integer indicating the number of processes to be created
+for the parallel execution of the retrieval/transformation/arrangement of the
+multiple involved files in a call to Start(). If set to NULL,
+takes the number of available cores (as detected by detectCores() in
+the package 'future'). The default value is 1 (no parallel execution).}
+
+\item{ObjectBigmemory}{A character string to be included as part of the
+bigmemory object name. This parameter is intended for internal use by the
+chunking capabilities of startR.}
+
+\item{silent}{A logical value of whether to display progress messages (FALSE)
+or not (TRUE). The default value is FALSE.}
+
+\item{debug}{A logical value of whether to return detailed messages on the
+progress and operations in a Start() call (TRUE) or not (FALSE).
The
+default value is FALSE.}
+
+\item{\dots}{A selection of customized parameters depending on the data
+format. When we retrieve data from one or a collection of data sets,
+the involved data can be perceived as belonging to a large multi-dimensional
+array. For instance, let us consider the following example. We want to
+retrieve data from a source, which contains data for the number of monthly
+sales of various items, and also for their retail price each month. The data
+on source is stored as follows:\cr\cr
\command{
-\cr # data <- Start(source = list(
-\cr # list(name = 'sourceA',
-\cr # path = paste0('/sourceA/$variable$/',
-\cr # '$section$/$item$.data')),
-\cr # list(name = 'sourceB',
-\cr # path = paste0('/sourceB/$section$/',
-\cr # '$variable$/$item$.data'))
-\cr # ),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = c('source',
-\cr # 'item')))
-\cr # # Checking the structure of the returned variables
-\cr # str(found_data$Variables)
-\cr # Named list
-\cr # ..$common: NULL
-\cr # ..$sourceA: Named list
-\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
-\cr # ..$sourceB: Named list
-\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
-\cr # # Checking the dimensions of the returned variable
-\cr # # for the source A
-\cr # dim(found_data$Variables$sourceA)
-\cr # item store
-\cr # 3 3
+\cr # /data/
+\cr # |-> sales/
+\cr # | |-> electronics
+\cr # | | |-> item_a.data
+\cr # | | |-> item_b.data
+\cr # | | |-> item_c.data
+\cr # | |-> clothing
+\cr # | |-> item_d.data
+\cr # | |-> item_e.data
+\cr # | |-> item_f.data
+\cr # |-> prices/
+\cr # |-> electronics
+\cr # | |-> item_a.data
+\cr # | |-> item_b.data
+\cr # | |-> item_c.data
+\cr # |-> clothing
+\cr # |-> item_d.data
+\cr # |-> item_e.data
+\cr # |-> item_f.data
}\cr\cr
+Each item file contains data, stored in whichever format, for the sales or
+prices over a time period, e.g. for the past 24 months, registered at 100
+different stores around the world. Whichever format it is stored in, each
+file can be perceived as a container of a data array of 2 dimensions, time and
+store. Let us assume the '.data' format allows keeping a name for each of
+these dimensions, and the actual names are 'time' and 'store'.\cr\cr
+The different item files for sales or prices can be perceived as belonging to
+an 'item' dimension of length 3, and the two groups of three items to a
+'section' dimension of length 2, and the two groups of two sections (one with
+the sales and the other with the prices) can be perceived as belonging also to
+another dimension 'variable' of length 2. Even the source can be perceived as
+belonging to a dimension 'source' of length 1.\cr\cr
+All in all, in this example, the whole data could be perceived as belonging to
+a multidimensional 'large array' of dimensions\cr
+\command{
+\cr # source variable section item store month
+\cr # 1 2 2 3 100 24
}
\cr\cr
-The names of the requested variables do not necessarily have to match the
-actual variable names inside the files.
A list of alternative names to be -seeked can be specified via the parameter 'synonims'.} - -\item{synonims}{A named list where the names are the requested variable or -dimension names, and the values are vectors of character strings with -alternative names to seek for such dimension or variable.\cr\cr -In some requests, data from different sources may follow different naming -conventions for the dimensions or variables, or even files in the same source -could have varying names. This parameter is in order for Start() to -properly identify the dimensions or variables with different names. +The dimensions of this 'large array' can be classified in two types. The ones +that group actual files (the file dimensions) and the ones that group data +values inside the files (the inner dimensions). In the example, the file +dimensions are 'source', 'variable', 'section' and 'item', whereas the inner +dimensions are 'store' and 'month'. \cr\cr -In the example used in parameter 'return_vars', it may be the case that -the two involved data sources follow slightly different naming conventions. -For example, source A uses 'sect' as name for the sections dimension, whereas -source B uses 'section'; source A uses 'store_loc' as variable name for the -store locations, whereas source B uses 'store_location'. This can be taken -into account as follows: -\cr +Having the dimensions of our target sources in mind, the parameter \code{\dots} +expects to receive information on: + \itemize{ + \item{ +The names of the expected dimensions of the 'large dataset' we want to +retrieve data from + } + \item{ +The indices to take from each dimension (and other constraints) + } + \item{ +How to reorder the dimension if needed + } + \item{ +The location and organization of the files of the data sets + } + } +For each dimension, the 3 first information items can be specified with a set +of parameters to be provided through \code{\dots}. For a given dimension +'dimname', six parameters can be specified:\cr \command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item')), -\cr # synonims = list( -\cr # section = c('sec', 'section'), -\cr # store_location = c('store_loc', -\cr # 'store_location') -\cr # )) +\cr # dimname = , # 'all' / 'first' / 'last' / +\cr # # indices(c(1, 10, 20)) / +\cr # # indices(c(1:20)) / +\cr # # indices(list(1, 20)) / +\cr # # c(1, 10, 20) / c(1:20) / +\cr # # list(1, 20) +\cr # dimname_var = , +\cr # dimname_tolerance = , +\cr # dimname_reorder = , +\cr # dimname_depends = , +\cr # dimname_across = } -\cr} - -\item{file_opener}{A function that receives as a single parameter - 'file_path' a character string with the path to a file to be opened, - and returns an object with an open connection to the file (optionally with - header information) on success, or returns NULL on failure. \cr\cr -This parameter takes by default NcOpener() (an opener function for NetCDF -files). +The \bold{indices to take} can be specified in three possible formats (see +code comments above for examples). 
The first format consists in using +character tags, such as 'all' (take all the indices available for that +dimension), 'first' (take only the first) and 'last' (only the last). The +second format consists in using numeric indices, which have to be wrapped in a +call to the indices() helper function. For the second format, either a +vector of numeric indices can be provided, or a list with two numeric indices +can be provided to take all the indices in the range between the two specified +indices (both extremes inclusive). The third format consists in providing a +vector character strings (for file dimensions) or of values of whichever type +(for inner dimensions). For the file dimensions, the provided character +strings in the third format will be used as components to build up the final +path to the files (read further). For inner dimensions, the provided values in +the third format will be compared to the values of an associated coordinate +variable (must be specified in '_reorder', read further), and the +indices of the closest values will be retrieved. When using the third format, +a list with two values can also be provided to take all the indices of the +values within the specified range. \cr\cr -See NcOpener() for a template to build a file opener for your own file -format.} - -\item{file_var_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, - \code{synonims} that returns an array with auxiliary data (i.e. data from a - variable) inside a file. Start() will provide automatically either a - 'file_path' or a 'file_object' to the 'file_var_reader' - function (the function has to be ready to work whichever of these two is - provided). The parameter 'file_selectors' will also be provided - automatically to the variable reader, containing a named list where the - names are the names of the file dimensions of the queried data set (see - documentation on \code{\dots}) and the values are single character strings - with the components used to build the path to the file being read (the one - provided in 'file_path' or 'file_object'). The parameter 'var_name' - will be filled in automatically by Start() also, with the name of one - of the variales to be read. The parameter 'synonims' will be filled in - with exactly the same value as provided in the parameter 'synonims' in - the call to Start(), and has to be used in the code of the variable - reader to check for alternative variable names inside the target file. The - 'file_var_reader' must return a (multi)dimensional array with named - dimensions, and optionally with the attribute 'variales' with other - additional metadata on the retrieved variable. +The \bold{name of the associated coordinate variable} must be a character +string with the name of an associated coordinate variable to be found in the +data files (in all* of them). For this to work, a 'file_var_reader' +function must be specified when calling Start() (see parameter +'file_var_reader'). The coordinate variable must also be requested in the +parameter 'return_vars' (see its section for details). This feature only +works for inner dimensions. \cr\cr -Usually, the 'file_var_reader' should be a degenerate case of the -'file_data_reader' (see documentation on the corresponding parameter), -so it is recommended to code the 'file_data_reder' in first place. +The \bold{tolerance value} is useful when indices for an inner dimension are +specified in the third format (values of whichever type). 
In that case, the +indices of the closest values in the coordinate variable are seeked. However +the closest value might be too distant and we would want to consider no real +match exists for such provided value. This is possible via the tolerance, +which allows to specify a threshold beyond which not to seek for matching +values and mark that index as missing value. \cr\cr -This parameter takes by default NcVarReader() (a variable reader function -for NetCDF files). +The \bold{reorder_function} is useful when indices for an inner dimension are +specified in the third fromat, and the retrieved indices need to be reordered +in function of their provided associated variable values. A function can be +provided, which receives as input a vector of values, and returns as outputs a +list with the components \code{$x} with the reordered values, and \code{$ix} +with the permutation indices. Two reordering functions are included in +startR, the Sort() and the CircularSort(). \cr\cr -See NcVarReader() for a template to build a variale reader for your own -file format.} - -\item{file_dim_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} - that returns a named numeric vector where the names are the names of the - dimensions of the multidimensional data array in the file and the values are - the sizes of such dimensions. Start() will provide automatically - either a 'file_path' or a 'file_object' to the - 'file_dim_reader' function (the function has to be ready to work - whichever of these two is provided). The parameter 'file_selectors' - will also be provided automatically to the dimension reader, containing a - named list where the names are the names of the file dimensions of the - queried data set (see documentation on \code{\dots}) and the values are - single character strings with the components used to build the path to the - file being read (the one provided in 'file_path' or 'file_object'). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and can optionally be used in advanced configurations. +The \bold{name of another dimension} to be specified in _depends, +only available for file dimensions, must be a character string with the name +of another requested \bold{file dimension} in \code{\dots}, and will make +Start() aware that the path components of a file dimension can vary in +function of the path component of another file dimension. For instance, in the +example above, specifying \code{item_depends = 'section'} will make +Start() aware that the item names vary in function of the section, i.e. +section 'electronics' has items 'a', 'b' and 'c' but section 'clothing' has +items 'd', 'e', 'f'. Otherwise Start() would expect to find the same +item names in all the sections. \cr\cr -This parameter takes by default NcDimReader() (a dimension reader -function for NetCDF files). +The \bold{name of another dimension} to be specified in '_across', +only available for inner dimensions, must be a character string with the name +of another requested \bold{inner dimension} in \code{\dots}, and will make +Start() aware that an inner dimension extends along multiple files. For +instance, let us imagine that in the example above, the records for each item +are so large that it becomes necessary to split them in multiple files each +one containing the registers for a different period of time, e.g. 
in 10 files +with 100 months each ('item_a_period1.data', 'item_a_period2.data', and so on). +In that case, the data can be perceived as having an extra file dimension, the +'period' dimension. The inner dimension 'month' would extend across multiple +files, and providing the parameter \code{month = indices(1, 300)} would make +Start() crash because it would perceive we have made a request out of +bounds (each file contains 100 'month' indices, but we requested 1 to 300). +This can be solved by specifying the parameter \code{month_across = period} (a +long with the full specification of the dimension 'period'). \cr\cr -See NcDimReader() for (an advanced) template to build a dimension reader -for your own file format.} - -\item{file_data_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, - \code{inner_indices = NULL}, \code{synonims} that returns a subset of the - multidimensional data array inside a file (even if internally it is not an - array). Start() will provide automatically either a 'file_path' - or a 'file_object' to the 'file_data_reader' function (the - function has to be ready to work whichever of these two is provided). The - parameter 'file_selectors' will also be provided automatically to the - data reader, containing a named list where the names are the names of the - file dimensions of the queried data set (see documentation on \code{\dots}) - and the values are single character strings with the components used to - build the path to the file being read (the one provided in 'file_path' or - 'file_object'). The parameter 'inner_indices' will be filled in - automatically by Start() also, with a named list of numeric vectors, - where the names are the names of all the expected inner dimensions in a file - to be read, and the numeric vectors are the indices to be taken from the - corresponding dimension (the indices may not be consecutive nor in order). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and has to be used in the code of the data reader to check for alternative - dimension names inside the target file. The 'file_data_reader' must - return a (multi)dimensional array with named dimensions, and optionally with - the attribute 'variables' with other additional metadata on the retrieved - data. +\bold{Defining the path pattern} +\cr +As mentioned above, the parameter \dots also expects to receive information +with the location of the data files. In order to do this, a special dimension +must be defined. In that special dimension, in place of specifying indices to +take, a path pattern must be provided. The path pattern is a character string +that encodes the way the files are organized in their source. It must be a +path to one of the data set files in an accessible local or remote file system, +or a URL to one of the files provided by a local or remote server. The regions +of this path that vary across files (along the file dimensions) must be +replaced by wildcards. The wildcards must match any of the defined file +dimensions in the call to Start() and must be delimited with heading +and trailing '$'. Shell globbing expressions can be used in the path pattern. +See the next code snippet for an example of a path pattern. 
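+\cr
+For instance, in the store item sales example, where 'variable', 'section'
+and 'item' are file dimensions, all the files described at the beginning of
+this section can be reached with the pattern:
+\cr
+\command{
+\cr # '/data/$variable$/$section$/$item$.data'
+}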
+\cr\cr +All in all, the call to Start() to load the entire data set in the +example of store item sales, would look as follows: +\cr +\command{ +\cr # data <- Start(source = paste0('/data/$variable$/', +\cr # '$section$/$item$.data'), +\cr # variable = 'all', +\cr # section = 'all', +\cr # item = 'all', +\cr # item_depends = 'section', +\cr # store = 'all', +\cr # month = 'all') +} \cr\cr -Usually, 'file_data_reader' should use 'file_dim_reader' -(see documentation on the corresponding parameter), so it is recommended to -code 'file_dim_reder' in first place. +Note that in this example it would still be pending to properly define the +parameters 'file_opener', 'file_closer', 'file_dim_reader', +'file_var_reader' and 'file_data_reader' for the '.data' file format +(see the corresponding sections). \cr\cr -This parameter takes by default NcDataReader() (a data reader function -for NetCDF files). +The call to Start() will return a multidimensional R array with the +following dimensions: +\cr +\command{ +\cr # source variable section item store month +\cr # 1 2 2 3 100 24 +} +\cr +The dimension specifications in the \code{\dots} do not have to follow any +particular order. The returned array will have the dimensions in the same order +as they have been specified in the call. For example, the following call: +\cr +\command{ +\cr # data <- Start(source = paste0('/data/$variable$/', +\cr # '$section$/$item$.data'), +\cr # month = 'all', +\cr # store = 'all', +\cr # item = 'all', +\cr # item_depends = 'section', +\cr # section = 'all', +\cr # variable = 'all') +} \cr\cr -See NcDataReader() for a template to build a data reader for your own -file format.} - -\item{file_closer}{A function that receives as a single parameter - 'file_object' an open connection (as returned by 'file_opener') - to one of the files to be read, optionally with header information, and - closes the open connection. Always returns NULL. +would return an array with the following dimensions: +\cr +\command{ +\cr # source month store item section variable +\cr # 1 24 100 3 2 2 +} \cr\cr -This parameter takes by default NcCloser() (a closer function for NetCDF -files). +Next, a more advanced example to retrieve data for only the sales records, for +the first section ('electronics'), for the 1st and 3rd items and for the +stores located in Barcelona (assuming the files contain the variable +'store_location' with the name of the city each of the 100 stores are located +at): +\cr +\command{ +\cr # data <- Start(source = paste0('/data/$variable$/', +\cr # '$section$/$item$.data'), +\cr # variable = 'sales', +\cr # section = 'first', +\cr # item = indices(c(1, 3)), +\cr # item_depends = 'section', +\cr # store = 'Barcelona', +\cr # store_var = 'store_location', +\cr # month = 'all', +\cr # return_vars = list(store_location = NULL)) +} \cr\cr -See NcCloser() for a template to build a file closer for your own file -format.} - -\item{transform}{A function with the header \code{dara_array}, -\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as -input, through the parameter \code{data_array}, a subset of a -multidimensional array (as returned by 'file_data_reader'), applies a -transformation to it and returns it, preserving the amount of dimensions but -potentially modifying their size. 
This transformation may require data from -other auxiliary variables, automatically provided to 'transform' -through the parameter 'variables', in the form of a named list where -the names are the variable names and the values are (multi)dimensional -arrays. Which variables need to be sent to 'transform' can be specified -with the parameter 'transform_vars' in Start(). The parameter -'file_selectors' will also be provided automatically to -'transform', containing a named list where the names are the names of -the file dimensions of the queried data set (see documentation on -\code{\dots}) and the values are single character strings with the -components used to build the path to the file the subset being processed -belongs to. The parameter \code{\dots} will be filled in with other -additional parameters to adjust the transformation, exactly as provided in -the call to Start() via the parameter 'transform_params'.} - -\item{transform_params}{A named list with additional parameters to be sent to -the 'transform' function (if specified). See documentation on parameter -'transform' for details.} - -\item{transform_vars}{A vector of character strings with the names of -auxiliary variables to be sent to the 'transform' function (if -specified). All the variables to be sent to 'transform' must also -have been requested as return variables in the parameter 'return_vars' -of Start().} - -\item{transform_extra_cells}{An integer of extra indices to retrieve from the -data set, beyond the requested indices in \code{\dots}, in order for -'transform' to dispose of additional information to properly apply -whichever transformation (if needed). As many as -'transform_extra_cells' will be retrieved beyond each of the limits for -each of those inner dimensions associated to a coordinate variable and sent -to 'transform' (i.e. present in 'transform_vars'). After -'transform' has finished, Start() will take again and return a -subset of the result, for the returned data to fall within the specified -bounds in \code{\dots}. The default value is 2.} - -\item{apply_indices_after_transform}{A logical value indicating when a -'transform' is specified in Start() and numeric indices are -provided for any of the inner dimensions that depend on coordinate variables, -these numeric indices can be made effective (retrieved) before applying the -transformation or after. The boolean flag allows to adjust this behaviour. -It takes FALSE by default (numeric indices are applied before sending -data to 'transform').} - -\item{pattern_dims}{A character string indicating the name of the dimension -with path pattern specifications (see \code{\dots} for details). If not -specified, Start() assumes the first provided dimension is the pattern -dimension, with a warning.} - -\item{metadata_dims}{A vector of character strings with the names of the file -dimensions which to return metadata for. As noted in 'file_data_reader', -the data reader can optionally return auxiliary data via the attribute -'variables' of the returned array. Start() by default returns the -auxiliary data read for only the first file of each source (or data set) in -the pattern dimension (see \code{\dots} for info on what the pattern -dimension is). However it can be configured to return the metadata for all -the files along any set of file dimensions. 
The default value is NULL, and -it will be assigned automatically as parameter 'pattern_dims'.} - -\item{selector_checker}{A function used internaly by Start() to -translate a set of selectors (values for a dimension associated to a -coordinate variable) into a set of numeric indices. It takes by default -SelectorChecker() and, in principle, it should not be required to -change it for customized file formats. The option to replace it is left open -for more versatility. See the code of SelectorChecker() for details on -the inputs, functioning and outputs of a selector checker.} - -\item{merge_across_dims}{A logical value indicating whether to merge -dimensions across which another dimension extends (according to the -'_across' parameters). Takes the value FALSE by default. For -example, if the dimension 'time' extends across the dimension 'chunk' and -\code{merge_across_dims = TRUE}, the resulting data array will only contain -only the dimension 'time' as long as all the chunks together.} - -\item{merge_across_dims_narm}{A logical value indicating whether to remove -the additional NAs from data when parameter 'merge_across_dims' is TRUE. -It is helpful when the length of the to-be-merged dimension is different -across another dimension. For example, if the dimension 'time' extends -across dimension 'chunk', and the time length along the first chunk is 2 -while along the second chunk is 10. Setting this parameter as TRUE can -remove the additional 8 NAs at position 3 to 10. The default value is FALSE.} - -\item{split_multiselected_dims}{A logical value indicating whether to split a -dimension that has been selected with a multidimensional array of selectors -into as many dimensions as present in the selector array. The default value -is FALSE.} - -\item{path_glob_permissive}{A logical value or an integer specifying how many - folder levels in the path pattern, beginning from the end, the shell glob - expressions must be preserved and worked out for each file. The default - value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr -When specifying a path pattern for a dataset, it might contain shell glob -experissions. For each dataset, the first file matching the path pattern is -found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. However, in -some cases, the values of the shell glob expressions may not be constant for -all files in a dataset, and they need to be worked out for each file -involved.\cr\cr -For example, a path pattern could be as follows: \cr -\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving -\code{path_glob_permissive = FALSE} will trigger automatic seek of the - contents to replace the asterisks (e.g. the first asterisk matches with - \code{'bar'} and the second with \code{'baz'}. The found contents will be - used for all files in the dataset (in the example, the path pattern will be - fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if - any of the files in the dataset have other contents in the position of the - asterisks, Start() will not find them (in the example, a file like \cr - \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be - found). 
Setting \code{path_glob_permissive = 1} would preserve global - expressions in the latest level (in the example, the fixed path pattern - would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the - problematic file mentioned before would be found), but of course this would - slow down the Start() call if the dataset involves a large number of - files. Setting \code{path_glob_permissive = 2} would leave the original path - pattern with the original glob expressions in the 1st and 2nd levels (in the - example, both asterisks would be preserved, thus would allow Start() - to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr -Note that each glob expression can only represent one possibility (Start() -chooses the first). Because /code{*} is not the tag, which means it cannot -be a dimension of the output array. Therefore, only one possibility can be -adopted. For example, if \cr -\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr -has two matches:\cr -\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr -\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr -only the first found file will be used.} - -\item{retrieve}{A logical value indicating whether to retrieve the data -defined in the Start() call or to explore only its dimension lengths -and names, and the values for the file and inner dimensions. The default -value is FALSE.} - -\item{num_procs}{An integer of number of processes to be created for the -parallel execution of the retrieval/transformation/arrangement of the -multiple involved files in a call to Start(). If set to NULL, -takes the number of available cores (as detected by detectCores() in -the package 'future'). The default value is 1 (no parallel execution).} - -\item{ObjectBigmemory}{a character string to be included as part of the -bigmemory object name. This parameter is thought to be used internally by the -chunking capabilities of startR.} - -\item{silent}{A logical value of whether to display progress messages (FALSE) -or not (TRUE). The default value is FALSE.} - -\item{debug}{A logical value of whether to return detailed messages on the -progress and operations in a Start() call (TRUE) or not (FALSE). The -default value is FALSE.} +The defined names for the dimensions do not necessarily have to match the +names of the dimensions inside the file. Lists of alternative names to be +seeked can be defined in the parameter 'synonims'. +\cr\cr +If data from multiple sources (not necessarily following the same structure) +has to be retrieved, it can be done by providing a vector of character strings +with path pattern specifications, or, in the extended form, by providing a +list of lists with the components 'name' and 'path', and the name of the +dataset and path pattern as values, respectively. For example: +\cr +\command{ +\cr # data <- Start(source = list( +\cr # list(name = 'sourceA', +\cr # path = paste0('/sourceA/$variable$/', +\cr # '$section$/$item$.data')), +\cr # list(name = 'sourceB', +\cr # path = paste0('/sourceB/$section$/', +\cr # '$variable$/$item$.data')) +\cr # ), +\cr # variable = 'sales', +\cr # section = 'first', +\cr # item = indices(c(1, 3)), +\cr # item_depends = 'section', +\cr # store = 'Barcelona', +\cr # store_var = 'store_location', +\cr # month = 'all', +\cr # return_vars = list(store_location = NULL)) +} +\cr} } \value{ If \code{retrieve = TRUE} the involved data is loaded into RAM memory @@ -830,3 +813,4 @@ file format. 
retrieve = FALSE) } + diff --git a/man/Step.Rd b/man/Step.Rd index c473ccb..65f0c72 100644 --- a/man/Step.Rd +++ b/man/Step.Rd @@ -4,13 +4,8 @@ \alias{Step} \title{Define the operation applied on declared data.} \usage{ -Step( - fun, - target_dims, - output_dims, - use_libraries = NULL, - use_attributes = NULL -) +Step(fun, target_dims, output_dims, use_libraries = NULL, + use_attributes = NULL) } \arguments{ \item{fun}{A function in R format defining the operation to be applied to the @@ -75,3 +70,4 @@ to the expected order for this function. wf <- AddStep(data, step) } + diff --git a/man/indices.Rd b/man/indices.Rd index 6233b71..a3d85ea 100644 --- a/man/indices.Rd +++ b/man/indices.Rd @@ -39,3 +39,4 @@ original data. See details in the documentation of the parameter \code{\dots} \seealso{ \code{\link{values}} } + diff --git a/man/values.Rd b/man/values.Rd index 31ce95a..3300f19 100644 --- a/man/values.Rd +++ b/man/values.Rd @@ -41,3 +41,4 @@ coordinate variable. See details in the documentation of the parameter \seealso{ \code{\link{indices}} } + -- GitLab