diff --git a/R/AddStep.R b/R/AddStep.R index 955ed847ac2db51eb568eac51b2f1b6005fa91eb..3ca5a3542399348b2509db84df549f83e2bbacf3 100644 --- a/R/AddStep.R +++ b/R/AddStep.R @@ -1,105 +1,105 @@ -AddStep <- function(inputs, step_fun, ...) { - # Check step_fun - if (!('startR_step_fun' %in% class(step_fun))) { - stop("Parameter 'step_fun' must be a startR step function as returned by Step.") - } - - # Check inputs - if (any(c('startR_cube', 'startR_workflow') %in% class(inputs))) { - inputs <- list(inputs) - names(inputs) <- 'input1' - } - else if (is.list(inputs)) { - if (any(!sapply(inputs, - function(x) any(c('startR_cube', - 'startR_workflow') %in% class(x))))) { - stop("Parameter 'inputs' must be one or a list of objects of the class ", - "'startR_cube' or 'startR_workflow'.") - } - } else { - stop("Parameter 'inputs' must be one or a list of objects of the class ", - "'startR_cube' or 'startR_workflow'.") - } - - # Consistency checks - if (!is.null(attr(step_fun, "UseAttributes"))) { - if (!all(names(inputs) == names(attr(step_fun, "UseAttributes")))) { - names(inputs) <- names(attr(step_fun, "UseAttributes")) - .warning(paste("The name of inputs is not assigned or differs from", - "name of use_attributes list in Step(). Force inputs", - "name to be consistent with use_attributes list")) - } - } - - if (length(inputs) != length(attr(step_fun, 'TargetDims'))) { - stop("The number of provided 'inputs' (", length(inputs), ") does not ", - "match the number of expected inputs by the provided 'step_fun' (", - length(attr(step_fun, 'TargetDims')), ").") - } - - # Work out the total target dims of the step - previous_target_dims <- NULL - all_input_dims <- NULL - for (input in 1:length(inputs)) { - dims_to_compare <- names(attr(inputs[[input]], 'Dimensions')) - if (!all(attr(step_fun, 'TargetDims')[[input]] %in% dims_to_compare)) { - stop("The target dimensions required by 'step_fun' for the input ", input, - " are not present in the corresponding provided object in 'inputs'.") - } - if ('startR_workflow' %in% class(inputs[[input]])) { - if (is.null(previous_target_dims)) { - previous_target_dims <- attr(inputs[[input]], 'TargetDims') - } else { - dims1 <- rep(1, length(previous_target_dims)) - names(dims1) <- previous_target_dims - dims2 <- rep(1, length(attr(inputs[[input]], 'TargetDims'))) - names(dims2) <- attr(inputs[[input]], 'TargetDims') - previous_target_dims <- names(startR:::.MergeArrayDims(dims1, dims2)[[1]]) - } - } - new_input_dims <- attr(inputs[[input]], 'Dimensions') - if (any(is.na(new_input_dims))) { - new_input_dims[which(is.na(new_input_dims))] <- rep(1, length(which(is.na(new_input_dims)))) - } - if (is.null(all_input_dims)) { - all_input_dims <- new_input_dims - } else { - all_input_dims <- startR:::.MergeArrayDims(all_input_dims, new_input_dims)[[1]] - } - } - - new_target_dims <- unique(unlist(attr(step_fun, 'TargetDims'))) - result <- list() - dims1 <- rep(1, length(previous_target_dims)) - names(dims1) <- previous_target_dims - dims2 <- rep(1, length(new_target_dims)) - names(dims2) <- new_target_dims - target_dims <- names(startR:::.MergeArrayDims(dims1, dims2)[[1]]) - for (output in 1:length(attr(step_fun, 'OutputDims'))) { - workflow <- list(inputs = inputs, - fun = step_fun, - params = list(...)) - if (!is.null(attr(step_fun, 'OutputDims')[[output]])) { - dimensions <- rep(NA, length(attr(step_fun, 'OutputDims')[[output]])) - names(dimensions) <- attr(step_fun, 'OutputDims')[[output]] - } else { - dimensions <- NULL - } - in_dims_to_remove <- which(names(all_input_dims) %in% new_target_dims) - if (length(in_dims_to_remove) > 0) { - dimensions <- c(dimensions, all_input_dims[-in_dims_to_remove]) - } else { - dimensions <- c(dimensions, all_input_dims) - } - attr(workflow, 'Dimensions') <- dimensions - attr(workflow, 'AllTargetDims') <- target_dims - class(workflow) <- 'startR_workflow' - result[[names(attr(step_fun, 'OutputDims'))[output]]] <- workflow - } - - if (length(result) == 1) { - result[[1]] - } else { - result - } -} +AddStep <- function(inputs, step_fun, ...) { + # Check step_fun + if (!('startR_step_fun' %in% class(step_fun))) { + stop("Parameter 'step_fun' must be a startR step function as returned by Step.") + } + + # Check inputs + if (any(c('startR_cube', 'startR_workflow') %in% class(inputs))) { + inputs <- list(inputs) + names(inputs) <- 'input1' + } + else if (is.list(inputs)) { + if (any(!sapply(inputs, + function(x) any(c('startR_cube', + 'startR_workflow') %in% class(x))))) { + stop("Parameter 'inputs' must be one or a list of objects of the class ", + "'startR_cube' or 'startR_workflow'.") + } + } else { + stop("Parameter 'inputs' must be one or a list of objects of the class ", + "'startR_cube' or 'startR_workflow'.") + } + + # Consistency checks + if (!is.null(attr(step_fun, "UseAttributes"))) { + if (!all(names(inputs) == names(attr(step_fun, "UseAttributes")))) { + names(inputs) <- names(attr(step_fun, "UseAttributes")) + .warning(paste("The name of inputs is not assigned or differs from", + "name of use_attributes list in Step(). Force inputs", + "name to be consistent with use_attributes list")) + } + } + + if (length(inputs) != length(attr(step_fun, 'TargetDims'))) { + stop("The number of provided 'inputs' (", length(inputs), ") does not ", + "match the number of expected inputs by the provided 'step_fun' (", + length(attr(step_fun, 'TargetDims')), ").") + } + + # Work out the total target dims of the step + previous_target_dims <- NULL + all_input_dims <- NULL + for (input in 1:length(inputs)) { + dims_to_compare <- names(attr(inputs[[input]], 'Dimensions')) + if (!all(attr(step_fun, 'TargetDims')[[input]] %in% dims_to_compare)) { + stop("The target dimensions required by 'step_fun' for the input ", input, + " are not present in the corresponding provided object in 'inputs'.") + } + if ('startR_workflow' %in% class(inputs[[input]])) { + if (is.null(previous_target_dims)) { + previous_target_dims <- attr(inputs[[input]], 'TargetDims') + } else { + dims1 <- rep(1, length(previous_target_dims)) + names(dims1) <- previous_target_dims + dims2 <- rep(1, length(attr(inputs[[input]], 'TargetDims'))) + names(dims2) <- attr(inputs[[input]], 'TargetDims') + previous_target_dims <- names(startR:::.MergeArrayDims(dims1, dims2)[[1]]) + } + } + new_input_dims <- attr(inputs[[input]], 'Dimensions') + if (any(is.na(new_input_dims))) { + new_input_dims[which(is.na(new_input_dims))] <- rep(1, length(which(is.na(new_input_dims)))) + } + if (is.null(all_input_dims)) { + all_input_dims <- new_input_dims + } else { + all_input_dims <- startR:::.MergeArrayDims(all_input_dims, new_input_dims)[[1]] + } + } + + new_target_dims <- unique(unlist(attr(step_fun, 'TargetDims'))) + result <- list() + dims1 <- rep(1, length(previous_target_dims)) + names(dims1) <- previous_target_dims + dims2 <- rep(1, length(new_target_dims)) + names(dims2) <- new_target_dims + target_dims <- names(startR:::.MergeArrayDims(dims1, dims2)[[1]]) + for (output in 1:length(attr(step_fun, 'OutputDims'))) { + workflow <- list(inputs = inputs, + fun = step_fun, + params = list(...)) + if (!is.null(attr(step_fun, 'OutputDims')[[output]])) { + dimensions <- rep(NA, length(attr(step_fun, 'OutputDims')[[output]])) + names(dimensions) <- attr(step_fun, 'OutputDims')[[output]] + } else { + dimensions <- NULL + } + in_dims_to_remove <- which(names(all_input_dims) %in% new_target_dims) + if (length(in_dims_to_remove) > 0) { + dimensions <- c(dimensions, all_input_dims[-in_dims_to_remove]) + } else { + dimensions <- c(dimensions, all_input_dims) + } + attr(workflow, 'Dimensions') <- dimensions + attr(workflow, 'AllTargetDims') <- target_dims + class(workflow) <- 'startR_workflow' + result[[names(attr(step_fun, 'OutputDims'))[output]]] <- workflow + } + + if (length(result) == 1) { + result[[1]] + } else { + result + } +} diff --git a/R/ByChunks.R b/R/ByChunks.R index 26d66627197af9dc8a05f274548004c1766a9406..22b8667fa612160a215111ff450f90d9be4e50fc 100644 --- a/R/ByChunks.R +++ b/R/ByChunks.R @@ -1,906 +1,906 @@ -ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', - threads_load = 2, threads_compute = 1, - cluster = NULL, - ecflow_suite_dir = NULL, - ecflow_server = NULL, - silent = FALSE, debug = FALSE, - wait = TRUE) { - # Build object to store profiling timings - t_begin_total <- Sys.time() - t_begin_bychunks_setup <- t_begin_total - timings <- list(nchunks = NULL, - concurrent_chunks = NULL, - cores_per_job = NULL, - threads_load = NULL, - threads_compute = NULL, - bychunks_setup = NULL, - transfer = NULL, - queue = NULL, - job_setup = NULL, - load = NULL, - compute = NULL, - transfer_back = NULL, - merge = NULL, - total = NULL) - - MergeArrays <- startR:::.MergeArrays - - # Check input headers - if ('startR_cube' %in% class(cube_headers)) { - cube_headers <- list(cube_headers) - } - if (!all(sapply(lapply(cube_headers, class), - function(x) 'startR_cube' %in% x))) { - stop("All objects passed in 'cube_headers' must be of class 'startR_cube', ", - "as returned by Start().") - } - - # Check step_fun - if (!is.function(step_fun)) { - stop("Parameter 'step_fun' must be a function.") - } - - # Check cores - if (!is.numeric(threads_load)) { - stop("Parameter 'threads_load' must be a numeric value.") - } - threads_load <- round(threads_load) - if (!is.numeric(threads_compute)) { - stop("Parameter 'threads_compute' must be a numeric value.") - } - threads_compute <- round(threads_compute) - timings[['threads_load']] <- threads_load - timings[['threads_compute']] <- threads_compute - - on_cluster <- !is.null(cluster) - - # Check ecflow_suite_dir - suite_id <- sample(10 ^ 10, 1) - ecflow_suite_dir_suite <- '' - if (on_cluster) { - if (is.null(ecflow_suite_dir)) { - stop("Parameter 'ecflow_suite_dir' must be specified when dispatching on a cluster.") - } - if (!is.character(ecflow_suite_dir)) { - stop("Parameter 'ecflow_suite_dir' must be a character string.") - } - ecflow_suite_dir_suite <- paste0(ecflow_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') - dir.create(ecflow_suite_dir_suite, recursive = TRUE) - if (!dir.exists(ecflow_suite_dir_suite)) { - stop("Could not find or create the directory in ", - "parameter 'ecflow_suite_dir'.") - } - } - - # Check cluster - default_cluster <- list(queue_host = NULL, - queue_type = 'slurm', - data_dir = NULL, - temp_dir = NULL, - lib_dir = NULL, - init_commands = list(''), - r_module = 'R', - CDO_module = NULL, - ecflow_module = 'ecFlow', - node_memory = NULL, - cores_per_job = NULL, - job_wallclock = '01:00:00', - max_jobs = 6, - extra_queue_params = list(''), - bidirectional = TRUE, - polling_period = 10, - special_setup = 'none') - if (on_cluster) { - if (!is.list(cluster)) { - stop("Parameter 'cluster' must be a named list.") - } - if (is.null(names(cluster))) { - stop("Parameter 'cluster' must be a named list.") - } - if (any(!(names(cluster) %in% c('queue_host', 'queue_type', 'data_dir', - 'temp_dir', 'lib_dir', 'init_commands', - 'r_module', 'CDO_module', - 'ecflow_module', 'node_memory', - 'cores_per_job', 'job_wallclock', 'max_jobs', - 'extra_queue_params', 'bidirectional', - 'polling_period', 'special_setup')))) { - stop("Found invalid component names in parameter 'cluster'.") - } - default_cluster[names(cluster)] <- cluster - } - localhost_name <- NULL - cluster <- default_cluster - remote_ecflow_suite_dir <- ecflow_suite_dir - is_data_dir_shared <- FALSE - is_ecflow_suite_dir_shared <- FALSE - if (on_cluster) { - #localhost_name <- Sys.info()[['nodename']] - localhost_name <- system('hostname -f', intern = TRUE) - if (Sys.which('ecflow_client') == '') { - stop("ecFlow must be installed in order to run the computation on clusters.") - } - if (is.null(cluster[['queue_host']])) { - queue_host <- localhost_name - } else if ((cluster[['queue_host']] %in% c('localhost', '127.0.0.1', localhost_name)) || - grepl(paste0('^', localhost_name), cluster[['queue_host']])) { - queue_host <- localhost_name - } - if (!(cluster[['queue_type']] %in% c('slurm', 'pbs', 'lsf', 'host'))) { - stop("The only supported 'queue_type's are 'slurm', 'pbs', 'lsf' and 'host'.") - } - if (is.null(cluster[['data_dir']])) { - is_data_dir_shared <- TRUE - } else { - if (!is.character(cluster[['data_dir']])) { - stop("The component 'data_dir' of the parameter 'cluster' must be a character string.") - } - remote_data_dir <- cluster[['data_dir']] - } - if (is.null(cluster[['temp_dir']])) { - is_ecflow_suite_dir_shared <- TRUE - } else { - if (!is.character(cluster[['temp_dir']])) { - stop("The component 'temp_dir' of the parameter 'cluster' must be a character string.") - } - remote_ecflow_suite_dir <- cluster[['temp_dir']] - } - if (!is.null(cluster[['lib_dir']])) { - if (!is.character(cluster[['lib_dir']])) { - stop("The component 'lib_dir', of the parameter 'cluster' must be NULL or ", - "a character string.") - } - } - if (!is.logical(cluster[['bidirectional']])) { - stop("The component 'bidirectional' of the parameter 'cluster' must be a logical value.") - } - if (cluster[['bidirectional']]) { - cluster[['init_commands']] <- c(cluster[['init_commands']], - list(paste('module load', cluster[['ecflow_module']]))) - } - if (!is.list(cluster[['init_commands']]) || - !all(sapply(cluster[['init_commands']], is.character))) { - stop("The component 'init_commands' of the parameter 'cluster' must be a list of ", - "character strings.") - } - if (!is.character(cluster[['r_module']])) { - stop("The component 'r_module' of the parameter 'cluster' must be a character string.") - } - if ((nchar(cluster[['r_module']]) < 1) || (grepl(' ', cluster[['r_module']]))) { - stop("The component 'r_module' of the parameter 'cluster' must have at least one character ", - "and contain no blank spaces.") - } - if (!is.null(cluster[['CDO_module']])) { - if (!is.character(cluster[['CDO_module']])) { - stop("The component 'CDO_module' of the parameter 'cluster' must be a character string.") - } - if (nchar(cluster[['CDO_module']]) < 1 || grepl(' ', cluster[['CDO_module']])) { - warning("The component 'CDO_module' of parameter 'cluster' must have ", - " than 1 and only the first element will be used.") - } - cluster[['r_module']] <- paste(cluster[['r_module']], cluster[['CDO_module']]) - } - if (!is.character(cluster[['ecflow_module']])) { - stop("The component 'ecflow_module' of the parameter 'cluster' must be a character string.") - } - if ((nchar(cluster[['ecflow_module']]) < 1) || - (grepl(' ', cluster[['ecflow_module']]))) { - stop("The component 'ecflow_module' of the parameter 'cluster' must have at least ", - "one character, and contain no blank spaces.") - } - if (is.null(cluster[['cores_per_job']])) { - cluster[['cores_per_job']] <- threads_compute - } - if (!is.numeric(cluster[['cores_per_job']])) { - stop("The component 'cores_per_job' of the parameter 'cluster' must be numeric.") - } - cluster[['cores_per_job']] <- round(cluster[['cores_per_job']]) - if (cluster[['cores_per_job']] > threads_compute) { - startR:::.message("WARNING: 'threads_compute' should be >= cluster[['cores_per_job']].") - } - if (!is.list(cluster[['extra_queue_params']]) || - !all(sapply(cluster[['extra_queue_params']], is.character))) { - stop("The component 'extra_queue_params' of the parameter 'cluster' must be a list of ", - "character strings.") - } - if (!(cluster[['special_setup']] %in% c('none', 'marenostrum4'))) { - stop("The value provided for the component 'special_setup' of the parameter ", - "'cluster' is not recognized.") - } - } - - # Check ecflow_suite_dir - remote_ecflow_suite_dir_suite <- '' - if (on_cluster) { - remote_ecflow_suite_dir_suite <- paste0(remote_ecflow_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') - } - - # Check ecflow_server - if (!is.null(ecflow_server) && !(is.character(ecflow_server))) { - stop("Parameter 'ecflow_server' must be a character string if specified.") - } - - # Check silent - if (!is.logical(silent)) { - stop("Parameter 'silent' must be logical.") - } - - # Check debug - if (!is.logical(debug)) { - stop("Parameter 'debug' must be logical.") - } - if (silent) { - debug <- FALSE - } - - # Check wait - if (!is.logical(wait)) { - stop("Parameter 'wait' must be logical.") - } - - # Work out chunked dimensions and target dimensions - all_dims <- lapply(cube_headers, attr, 'Dimensions') - all_dims_merged <- NULL - for (i in all_dims) { - if (is.null(all_dims_merged)) { - all_dims_merged <- i - } else { - all_dims_merged <- startR:::.MergeArrayDims(all_dims_merged, i)[[3]] - } - } - all_dimnames <- names(all_dims_merged) - - target_dims_indices <- which(all_dimnames %in% unlist(attr(step_fun, 'TargetDims'))) - target_dims <- NULL - if (length(target_dims_indices) > 0) { - target_dims <- all_dimnames[target_dims_indices] - } - - chunked_dims <- all_dimnames - if (length(target_dims_indices) > 0) { - chunked_dims <- chunked_dims[-target_dims_indices] - } - if (length(chunked_dims) < 1) { - stop("Not possible to process input by chunks. All input dimensions are ", - "target dimensions.") - } - - if (length(cube_headers) != length(attr(step_fun, 'TargetDims'))) { - stop("Number of inputs in parameter 'cube_headers' must be equal to the ", - "number of inputs expected by the function 'step_fun'.") - } - # Check all input headers have matching dimensions - cube_index <- 1 - for (cube_header in cube_headers) { - if (!all(attr(cube_header, 'Dimensions') == all_dims_merged[names(attr(cube_header, 'Dimensions'))])) { - stop("All provided 'cube_headers' must have matching dimension lengths ", - "with each other.") - } - if (!all(attr(step_fun, 'TargetDims')[[cube_index]] %in% names(attr(cube_header, 'Dimensions')))) { - stop("All provided 'cube_headers' must contain at least the target dimensions ", - "expected by 'step_fun'.") - } - cube_index <- cube_index + 1 - # work out expected result dimensions - } - - # Check chunks - default_chunks <- as.list(rep(1, length(chunked_dims))) - names(default_chunks) <- chunked_dims - if (length(chunks) == 1 && chunks == 'auto') { - chunks <- default_chunks - } - if (!is.list(chunks)) { - stop("Parameter 'chunks' must be a named list or 'auto'.") - } - if (is.null(names(chunks))) { - stop("Parameter 'chunks' must be a named list or 'auto'.") - } - if (any(!(names(chunks) %in% chunked_dims))) { - stop("All names in parameter 'chunks' must be one of the non-target dimensions ", - "present in the cubes in 'cube_headers'. The target dimensions are ", - paste(paste0("'", target_dims, "'"), collapse = ', '), ". The non-target ", - "dimensions (margins) are ", paste(paste0("'", chunked_dims, "'"), collapse = ', '), ".") - } - if (any(!(((unlist(chunks) %% 1) == 0) | (unlist(chunks) == 'all')))) { - stop("All values in parameter 'chunks' must take a numeric value or 'all'.") - } - if (any(unlist(chunks) < 1)) { - stop("All values in parameter 'chunks' must be >= 1.") - } - for (chunk_spec in 1:length(chunks)) { - if (chunks[[chunk_spec]] > all_dims_merged[names(chunks)[chunk_spec]]) { - stop("Too many chunks requested for the dimension ", names(chunks)[chunk_spec], - ". Maximum allowed is ", all_dims_merged[names(chunks)[chunk_spec]]) - } - } - default_chunks[names(chunks)] <- chunks - chunks <- default_chunks - timings[['nchunks']] <- prod(unlist(chunks)) - - # Check step_fun - if (!('startR_step_fun' %in% class(step_fun))) { - stop("Parameter 'step_fun' must be of the class 'startR_step_fun', as returned ", - "by the function Step.") - } - - # Replace 'all's - chunks_all <- which(unlist(chunks) == 'all') - if (length(chunks_all) > 0) { - chunks[chunks_all] <- all_dims[names(chunks)[chunks_all]] - } - # Mount the ecFlow suite - if (on_cluster) { - .message(paste0("ATTENTION: Dispatching chunks on a remote cluster", - ". Make sure passwordless ", - "access is properly set in both directions.")) - - # Copy load_process_save_chunk.R into shared folder - chunk_script <- file(system.file('chunking/load_process_save_chunk.R', - package = 'startR')) - chunk_script_lines <- readLines(chunk_script) - close(chunk_script) - chunk_script_lines <- gsub('^lib_dir <- *', paste0('lib_dir <- ', - paste(deparse(cluster[['lib_dir']]), collapse = '\n')), - chunk_script_lines) - chunk_script_lines <- gsub('^out_dir <- *', paste0('out_dir <- ', - paste(deparse(remote_ecflow_suite_dir_suite), collapse = '\n')), chunk_script_lines) - chunk_script_lines <- gsub('^debug <- *', paste0('debug <- ', paste(deparse(debug), collapse = '\n')), - chunk_script_lines) - deparsed_calls <- paste0('start_calls <- list(') - extra_path <- '' - if (cluster[['special_setup']] == 'marenostrum4') { - extra_path <- '/gpfs/archive/bsc32/' - } - for (cube_header in 1:length(cube_headers)) { - pattern_dim <- attr(cube_headers[[cube_header]], 'PatternDim') - bk_pattern_dim <- cube_headers[[cube_header]][[pattern_dim]] - bk_expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') - if (!is_data_dir_shared) { - cube_headers[[cube_header]][[pattern_dim]] <- paste0(remote_data_dir, '/', - extra_path, '/', cube_headers[[cube_header]][[pattern_dim]]) - for (file_n in 1:length(bk_expected_files)) { - attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n] <- paste0(remote_data_dir, '/', - extra_path, '/', attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n]) - } - } - deparsed_calls <- paste0(deparsed_calls, '\nquote(', - paste(deparse(cube_headers[[cube_header]]), collapse = '\n'), - ')') - cube_headers[[cube_header]][[pattern_dim]] <- bk_pattern_dim - attr(cube_headers[[cube_header]], 'ExpectedFiles') <- bk_expected_files - if (cube_header < length(cube_headers)) { - deparsed_calls <- paste0(deparsed_calls, ', ') - } - } - deparsed_calls <- paste0(deparsed_calls, '\n)') - chunk_script_lines <- gsub('^start_calls <- *', deparsed_calls, chunk_script_lines) - chunk_script_lines <- gsub('^start_calls_attrs <- *', paste0('start_calls_attrs <- ', paste(deparse(lapply(cube_headers, attributes)), collapse = '\n')), - chunk_script_lines) - chunk_script_lines <- gsub('^param_dimnames <- *', paste0('param_dimnames <- ', paste(deparse(chunked_dims), collapse = '\n')), - chunk_script_lines) - chunk_script_lines <- gsub('^threads_load <- *', paste0('threads_load <- ', threads_load), - chunk_script_lines) - chunk_script_lines <- gsub('^threads_compute <- *', paste0('threads_compute <- ', threads_compute), - chunk_script_lines) - chunk_script_lines <- gsub('^fun <- *', paste0('fun <- ', paste(deparse(step_fun), collapse = '\n')), - chunk_script_lines) - chunk_script_lines <- gsub('^params <- *', paste0('params <- ', paste(deparse(list(...)), collapse = '\n')), - chunk_script_lines) - writeLines(chunk_script_lines, paste0(ecflow_suite_dir_suite, '/load_process_save_chunk.R')) - - # Copy Chunk.ecf into shared folder - chunk_ecf_script <- file(system.file('chunking/Chunk.ecf', - package = 'startR')) - chunk_ecf_script_lines <- readLines(chunk_ecf_script) - close(chunk_ecf_script) - if (cluster[['queue_type']] == 'host') { - chunk_ecf_script_lines <- gsub('^include_queue_header', - '', - chunk_ecf_script_lines) - } else { - chunk_ecf_script_lines <- gsub('^include_queue_header', - paste0('%include "./', cluster[['queue_type']], '.h"'), - chunk_ecf_script_lines) - } - chunk_ecf_script_lines <- gsub('^include_init_commands', - paste0(paste0(cluster[['init_commands']], collapse = '\n'), '\n'), - chunk_ecf_script_lines) - chunk_ecf_script_lines <- gsub('^include_module_load', - paste0('module load ', cluster[['r_module']]), - chunk_ecf_script_lines) - ecf_vars <- paste0('%', as.vector(sapply(chunked_dims, - function(x) { - c(toupper(x), paste0(toupper(x), '_N')) - })), '%') -# if (!is_ecflow_suite_dir_shared && (cluster[['queue_host']] != localhost_name)) { -# #transfer_back_line <- paste0('rsync -rav %REMOTE_ECF_HOME% ', localhost_name, -# # ':%ECF_HOME%\nrm -f %ECF_HOME%/', -# # paste0('*', paste(ecf_vars[((1:(length(ecf_vars) / 2)) * 2) - 1], collapse = '*'), '*.Rds')) - result_file_id <- paste0('*', - paste(paste0('_', ecf_vars[((1:(length(ecf_vars) / 2)) * 2) - 1], '__'), - collapse = '*'), '*') -# transfer_back_line <- paste0('rsync -rav %REMOTE_ECF_HOME%/%SUITE%/ ', -# localhost_name, -# ':%ECF_HOME%/%SUITE%/\nscp %REMOTE_ECF_HOME%/', -# result_file_id, ' ', localhost_name, -# ':%ECF_HOME%\nrm -f %REMOTE_ECF_HOME%/', -# result_file_id) -# } else { -# transfer_back_line <- '' -# } - chunk_ecf_script_lines <- gsub('^Rscript load_process_save_chunk.R --args \\$task_path insert_indices', - paste0('Rscript load_process_save_chunk.R --args $task_path ', paste(ecf_vars, collapse = ' ')), - chunk_ecf_script_lines) - #chunk_ecf_script_lines <- gsub('^include_transfer_back_and_rm', transfer_back_line, chunk_ecf_script_lines) - writeLines(chunk_ecf_script_lines, paste0(ecflow_suite_dir_suite, '/Chunk.ecf')) - - # Copy merge_chunks.R into tmp folder -# merge_script <- file(system.file('chunking/merge_chunks.R', -# package = 'startR')) -# merge_script_lines <- readLines(merge_script) -# close(merge_script) -# merge_script_lines <- gsub('^shared_dir <- *', paste0('shared_dir <- ', -# paste(deparse(shared_dir_suite), collapse = '\n')), merge_script_lines) -# writeLines(merge_script_lines, paste0(shared_dir_suite, '/merge_chunks.R')) - - # Copy Merge.ecf into tmp folder - #TODO: Modify chain of parameters sent to r script when merging - #chunks progressively -# merge_ecf_script <- file(system.file('chunking/Merge.ecf', -# package = 'startR')) -# merge_ecf_script_lines <- readLines(merge_ecf_script) -# close(merge_ecf_script) -# writeLines(merge_ecf_script_lines, paste0(shared_dir_suite, '/Merge.ecf')) - - # Copy queue header into shared folder - #file.copy(system.file(paste0('chunking/', cluster[['queue_type']], '.h'), package = 'startR'), - # ecflow_suite_dir_suite) - chunk_queue_header <- file(system.file(paste0('chunking/', cluster[['queue_type']], '.h'), package = 'startR')) - chunk_queue_header_lines <- readLines(chunk_queue_header) - close(chunk_queue_header) - chunk_queue_header_lines <- gsub('^include_extra_queue_params', - paste0(paste0(cluster[['extra_queue_params']], collapse = '\n'), '\n'), - chunk_queue_header_lines) - writeLines(chunk_queue_header_lines, paste0(ecflow_suite_dir_suite, '/', cluster[['queue_type']], '.h')) - - # Copy headers - file.copy(system.file('chunking/head.h', package = 'startR'), - ecflow_suite_dir_suite) - file.copy(system.file('chunking/tail.h', package = 'startR'), - ecflow_suite_dir_suite) - } - - add_line <- function(suite, line, tabs) { - c(suite, paste0(paste(rep(' ', tabs), collapse = ''), line)) - } - suite <- NULL - tabs <- 0 - suite <- add_line(suite, paste0('suite STARTR_CHUNKING_', suite_id), tabs) - tabs <- tabs + 2 - submit_command <- '' - if (cluster[['queue_type']] == 'slurm') { - submit_command <- 'sbatch' - } else if (cluster[['queue_type']] == 'pbs') { - submit_command <- 'qsub' - } else if (cluster[['queue_type']] == 'lsf') { - submit_command <- 'bsub <' - } else if (cluster[['queue_type']] == 'host') { - submit_command <- 'bash' - } - if (on_cluster) { - suite <- add_line(suite, paste0("edit BIDIRECTIONAL '", cluster[['bidirectional']], "'"), tabs) - suite <- add_line(suite, paste0("edit QUEUE_HOST '", cluster[['queue_host']], "'"), tabs) - suite <- add_line(suite, paste0("edit ECF_HOST '", localhost_name, "'"), tabs) - suite <- add_line(suite, paste0("edit EC_HOST_FULL '", localhost_name, "'"), tabs) - suite <- add_line(suite, paste0("edit RESULT_FILE_ID '", result_file_id, "'"), tabs) - #} else { - # suite <- add_line(suite, paste0("edit ECF_JOB_CMD '", submit_command, " %ECF_JOB% > %ECF_JOBOUT% 2>&1 &'"), tabs) - } - suite <- add_line(suite, paste0("edit ECF_HOME '", ecflow_suite_dir_suite, "'"), tabs) - suite <- add_line(suite, paste0("edit REMOTE_ECF_HOME '", remote_ecflow_suite_dir_suite, "'"), tabs) - suite <- add_line(suite, paste0("edit CORES_PER_JOB ", cluster[['cores_per_job']], ""), tabs) - suite <- add_line(suite, paste0("edit JOB_WALLCLOCK '", cluster[['job_wallclock']], "'"), tabs) - suite <- add_line(suite, paste0("limit max_jobs ", cluster[['max_jobs']]), tabs) - suite <- add_line(suite, paste0("inlimit max_jobs"), tabs) - suite <- add_line(suite, "family computation", tabs) - tabs <- tabs + 2 - - if (on_cluster) { - # source $HOME/.profile ; - sync_command <- '' - if (!is_ecflow_suite_dir_shared) { - sync_command <- paste0("rsync -rav ", - "%ECF_HOME%/ ", - "%QUEUE_HOST%:%REMOTE_ECF_HOME%/ ; ") - } - suite <- add_line(suite, paste0("edit ECF_JOB_CMD '", - #"mkdir -p %REMOTE_ECF_HOME%/%SUITE%/ ; ", - sync_command, - "ssh %QUEUE_HOST% \"", - "date --rfc-3339=seconds > %REMOTE_ECF_HOME%/%ECF_NAME%.submit_time ; ", - submit_command, - " %REMOTE_ECF_HOME%/%ECF_NAME%.job%ECF_TRYNO% > ", - "%REMOTE_ECF_HOME%/%ECF_NAME%.%ECF_TRYNO% 2>&1 &\" ", - "2>&1'"), tabs) - if (is_ecflow_suite_dir_shared) { - suite <- add_line(suite, paste0("edit REPORT_BACK 'FALSE'"), tabs) - } else { - suite <- add_line(suite, paste0("edit REPORT_BACK 'TRUE'"), tabs) - } - } - - # Open nested ecFlow families - for (i in length(chunked_dims):1) { - suite <- add_line(suite, paste0('family ', chunked_dims[i], '_CHUNK_', 1), tabs) - tabs <- tabs + 2 - suite <- add_line(suite, paste0('edit ', toupper(chunked_dims[i]), ' ', 1), tabs) - suite <- add_line(suite, paste0('edit ', toupper(chunked_dims[i]), '_N ', chunks[[chunked_dims[i]]]), tabs) - } - - # Iterate through chunks - chunk_array <- array(1:prod(unlist(chunks)), dim = (unlist(chunks))) - arrays_of_results <- vector('list', length(attr(step_fun, 'OutputDims'))) - names(arrays_of_results) <- names(attr(step_fun, 'OutputDims')) - for (component in 1:length(arrays_of_results)) { - arrays_of_results[[component]] <- vector('list', prod((unlist(chunks)))) - dim(arrays_of_results[[component]]) <- (unlist(chunks)) - } - if (!on_cluster) { - t_end_bychunks_setup <- Sys.time() - timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, - t_begin_bychunks_setup, units = 'secs')) - timings[['transfer']] <- 0 - timings[['queue']] <- 0 - timings[['job_setup']] <- 0 - timings[['transfer_back']] <- 0 - if (!silent) { - startR:::.message(paste0("Processing chunks... ", - "remaining time estimate soon...")) - } - time_before_first_chunk <- Sys.time() - time_after_first_chunk <- NULL - } - previous_chunk_indices <- rep(1, length(chunks)) - found_first_result <- FALSE - for (i in 1:length(chunk_array)) { - chunk_indices <- which(chunk_array == i, arr.ind = TRUE)[1, ] - names(chunk_indices) <- names(dim(chunk_array)) - # ADD CHUNK SCRIPT TO SUITE - families_to_jump <- which(chunk_indices != previous_chunk_indices) - if (length(families_to_jump) > 0) { - families_to_jump <- max(families_to_jump) - # Close ecFlow families - for (j in 1:families_to_jump) { - tabs <- tabs - 2 - suite <- add_line(suite, paste0('endfamily'), tabs) - } - # Open ecFlow families - for (j in families_to_jump:1) { - suite <- add_line(suite, paste0('family ', (chunked_dims)[j], '_CHUNK_', chunk_indices[j]), tabs) - tabs <- tabs + 2 - suite <- add_line(suite, paste0('edit ', toupper((chunked_dims)[j]), ' ', chunk_indices[j]), tabs) - suite <- add_line(suite, paste0('edit ', toupper((chunked_dims)[j]), '_N ', chunks[[(chunked_dims)[j]]]), tabs) - } - } - suite <- add_line(suite, "task Chunk", tabs) - - if (!on_cluster) { - if (!silent) { - startR:::.message(paste("Loading chunk", i, - "out of", length(chunk_array), "...")) - } - data <- vector('list', length(cube_headers)) - t_begin_load <- Sys.time() - for (input in 1:length(data)) { - start_call <- cube_headers[[input]] - dims_to_alter <- which(names(attr(start_call, 'Dimensions')) %in% names(chunks)) - names_dims_to_alter <- names(attr(start_call, 'Dimensions'))[dims_to_alter] - # If any dimension comes from split dimensions - split_dims <- attr(start_call, 'SplitDims') - - if (length(split_dims) != 0){ - - for (k in 1:length(split_dims)) { - if (any(names(split_dims[[k]]) %in% names_dims_to_alter)) { - chunks_split_dims <- rep(1, length(split_dims[[k]])) - names(chunks_split_dims) <- names(split_dims[[k]]) - chunks_indices_split_dims <- chunks_split_dims - split_dims_to_alter <- which(names(split_dims[[k]]) %in% names_dims_to_alter) - chunks_split_dims[split_dims_to_alter] <- unlist(chunks[names(split_dims[[k]])[split_dims_to_alter]]) - chunks_indices_split_dims[split_dims_to_alter] <- chunk_indices[names(split_dims[[k]])[split_dims_to_alter]] - start_call[[names(split_dims)[k]]] <- chunk(chunks_indices_split_dims, chunks_split_dims, - eval(start_call[[names(split_dims)[k]]])) - dims_to_alter_to_remove <- which(names_dims_to_alter %in% names(split_dims[[k]])) - if (length(dims_to_alter_to_remove) > 0) { - dims_to_alter <- dims_to_alter[-dims_to_alter_to_remove] - names_dims_to_alter <- names_dims_to_alter[-dims_to_alter_to_remove] - } - } - } - } - - if (length(dims_to_alter) > 0) { - for (call_dim in names(attr(start_call, 'Dimensions'))[dims_to_alter]) { - start_call[[call_dim]] <- chunk(chunk_indices[call_dim], chunks[[call_dim]], - eval(start_call[[call_dim]])) - } - } - start_call[['silent']] <- !debug - if (!('num_procs' %in% names(start_call))) { - start_call[['num_procs']] <- threads_load - } - data[[input]] <- eval(start_call) - } - t_end_load <- Sys.time() - timings[['load']] <- c(timings[['load']], - as.numeric(difftime(t_end_load, t_begin_load, units = 'secs'))) - if (!silent) { - startR:::.message(paste("Processing...")) - } - #TODO: Find a better way to assign the names of data. When multiple steps for Compute is available, this way may fail. - names(data) <- names(cube_headers) - t_begin_compute <- Sys.time() - result <- multiApply::Apply(data, - target_dims = attr(step_fun, 'TargetDims'), - fun = step_fun, ..., - output_dims = attr(step_fun, 'OutputDims'), - use_attributes = attr(step_fun, 'UseAttributes'), - ncores = threads_compute) - if (!found_first_result) { - names(arrays_of_results) <- names(result) - found_first_result <- TRUE - } - for (component in 1:length(result)) { - arrays_of_results[[component]][[i]] <- result[[component]] - } - rm(data) - gc() - t_end_compute <- Sys.time() - timings[['compute']] <- c(timings[['compute']], - as.numeric(difftime(t_end_compute, - t_begin_compute, units = 'secs'))) - } - - # Time estimate - if (!on_cluster) { - if (is.null(time_after_first_chunk)) { - time_after_first_chunk <- Sys.time() - if (!silent) { - estimate <- (time_after_first_chunk - - time_before_first_chunk) * - (length(chunk_array) - 1) - units(estimate) <- 'mins' - startR:::.message( - paste0("Remaining time estimate (at ", format(time_after_first_chunk), ") ", - "(neglecting merge time): ", format(estimate)) - ) - } - } - } - previous_chunk_indices <- chunk_indices - } - - # Close nested ecFlow families - for (i in length(chunked_dims):1) { - tabs <- tabs - 2 - suite <- add_line(suite, paste0('endfamily'), tabs) - } - - # Close the ecFlow suite - tabs <- tabs - 2 - suite <- add_line(suite, paste0('endfamily'), tabs) -# suite <- add_line(suite, "family merge", tabs) -# tabs <- tabs + 2 -# suite <- add_line(suite, "trigger computation == complete", tabs) -# suite <- add_line(suite, "edit ECF_JOB_CMD 'bash %ECF_JOB% > %ECF_JOBOUT% 2>&1 &'", tabs) -# suite <- add_line(suite, "task Merge", tabs) -# tabs <- tabs - 2 -# suite <- add_line(suite, paste0('endfamily'), tabs) - - tabs <- tabs - 2 - suite <- add_line(suite, "endsuite", tabs) - - # Run ecFlow suite if needed - if (on_cluster) { - timings[['cores_per_job']] <- cluster[['cores_per_job']] - timings[['concurrent_chunks']] <- cluster[['max_jobs']] - suite_file <- paste0(ecflow_suite_dir_suite, '/startR_chunking.def') - suite_file_o <- file(suite_file) - writeLines(suite, suite_file_o) - close(suite_file_o) - - default_ecflow_server <- list(host = localhost_name, port = '5678') - if (is.null(ecflow_server)) { - .warning("Parameter 'ecflow_server' has not been specified but execution on ", - "cluster has been requested. An ecFlow server instance will ", - "be created on localhost:5678.") - } else { - if ('host' %in% names(ecflow_server)) { - stop("A host has been specified for the 'ecflow_server', but this option is not available yet.") - } - default_ecflow_server[names(ecflow_server)] <- ecflow_server - } - ecflow_server <- default_ecflow_server - system(paste0("ecflow_start.sh -p ", ecflow_server[['port']])) - system(paste0("ecflow_client --load=", suite_file, " --host=", - ecflow_server[['host']], " --port=", ecflow_server[['port']])) - if (!is_ecflow_suite_dir_shared) { - system(paste0('ssh ', cluster[['queue_host']], ' "mkdir -p ', - remote_ecflow_suite_dir_suite, '"')) - system(paste0('rsync -ra ', ecflow_suite_dir_suite, - ' ', cluster[['queue_host']], ':', - remote_ecflow_suite_dir_suite)) - } - t_end_bychunks_setup <- Sys.time() - timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, - t_begin_bychunks_setup, units = 'secs')) - if (!is_data_dir_shared) { - t_begin_transfer <- Sys.time() - startR:::.message("Sending involved files to the cluster file system...") - files_to_send <- NULL - #files_to_check <- NULL - for (cube_header in 1:length(cube_headers)) { - expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') - #files_to_check <- c(files_to_check, expected_files) - #if (cluster[['special_setup']] == 'marenostrum4') { - # expected_files <- paste0('/gpfs/archive/bsc32/', expected_files) - #} - files_to_send <- c(files_to_send, expected_files) - } - #which_files_exist <- sapply(files_to_check, file.exists) - which_files_exist <- sapply(files_to_send, file.exists) - files_to_send <- files_to_send[which_files_exist] - if (cluster[['special_setup']] == 'marenostrum4') { - file_spec <- paste(paste0("/gpfs/archive/bsc32/", - files_to_send), collapse = ' ') - system(paste0("ssh ", cluster[['queue_host']], " 'mkdir -p ", remote_data_dir, - ' ; module load transfer ; cd ', remote_ecflow_suite_dir_suite, - ' ; dtrsync -Rrav ', '\'', file_spec, '\' "', remote_data_dir, '/"', - " ; sleep 1 ; ", - "while [[ ! $(ls dtrsync_*.out 2>/dev/null | wc -l) -ge 1 ]] ; ", - "do sleep 2 ; done", - " ; sleep 1 ; ", - 'while [[ ! $(grep "total size is" dtrsync_*.out | ', - "wc -l) -ge 1 ]] ; ", - "do sleep 5 ; done", "'")) - } else { - file_spec <- paste(files_to_send, collapse = ' :') - system(paste0("ssh ", cluster[['queue_host']], ' "mkdir -p ', - remote_data_dir, '"')) - system(paste0("rsync -Rrav '", file_spec, "' '", - cluster[['queue_host']], ":", remote_data_dir, "/'")) - } - startR:::.message("Files sent successfully.") - t_end_transfer <- Sys.time() - timings[['transfer']] <- as.numeric(difftime(t_end_transfer, t_begin_transfer, units = 'secs')) - } else { - timings[['transfer']] <- 0 - } - if (!silent) { - startR:::.message(paste0("Processing chunks... ")) - } - time_begin_first_chunk <- Sys.time() -# time_after_first_chunk <- NULL - system(paste0("ecflow_client --begin=STARTR_CHUNKING_", suite_id, - " --host=", ecflow_server[['host']], " --port=", - ecflow_server[['port']])) - - timings[['total']] <- t_begin_total - startr_exec <- list(cluster = cluster, ecflow_server = ecflow_server, - suite_id = suite_id, chunks = chunks, - num_outputs = length(arrays_of_results), - ecflow_suite_dir = ecflow_suite_dir, - timings = timings) - class(startr_exec) <- 'startR_exec' - if (wait) { - if (!silent) { - startR:::.message(paste0("Remaining time estimate soon... ")) -# while (is.null(time_after_first_chunk)) { -# if (any(grepl('.*\\.Rds$', list.files(ecflow_suite_dir_suite)))) { -# time_after_first_chunk <- Sys.time() -# estimate <- (time_after_first_chunk - -# time_before_first_chunk) * -# ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / -# cluster[['max_jobs']]) -# units(estimate) <- 'mins' -# startR:::.message( -# paste0('Remaining time estimate (neglecting queue and ', -# 'merge time) (at ', format(time_after_first_chunk), -# '): ', format(estimate), ' (', -# format(time_after_first_chunk - -# time_before_first_chunk), ' per chunk)') -# ) -# } else if (!cluster[['bidirectional']]) { -# rsync_output <- tryCatch({ -# system(paste0("rsync -ra --ignore-missing-args ", -# cluster[['queue_host']], ":", -# remote_ecflow_suite_dir_suite, "/*.Rds ", -# ecflow_suite_dir_suite, "/"), intern = TRUE) -# }, error = function(e) { -# message("Warning: rsync from remote server to collect results failed. ", -# "Retrying soon.") -# failed <- TRUE -# }) -# Sys.sleep(cluster[['polling_period']]) -# } -# } - startr_exec[['t_begin_first_chunk']] <- time_begin_first_chunk - } - result <- Collect(startr_exec, wait = TRUE) - startR:::.message("Computation ended successfully.") - result - } else { - startr_exec - } - } else { - timings[['cores_per_job']] <- NA - timings[['concurrent_chunks']] <- 1 - t_begin_merge <- Sys.time() - for (component in 1:length(arrays_of_results)) { - arrays_of_results[[component]] <- startR:::.MergeArrayOfArrays(arrays_of_results[[component]]) - } - t_end_merge <- Sys.time() - timings[['merge']] <- as.numeric(difftime(t_end_merge, t_begin_merge, units = 'secs')) - t_end_total <- t_end_merge - timings[['total']] <- as.numeric(difftime(t_end_total, t_begin_total, units = 'secs')) - message(paste0("* Computation ended successfully.")) - message(paste0("* Number of chunks: ", - timings[['nchunks']])) - message(paste0("* Max. number of concurrent chunks (jobs): ", - timings[['concurrent_chunks']])) - message(paste0("* Requested cores per job: ", - timings[['cores_per_job']])) - message(paste0("* Load threads per chunk: ", - timings[['threads_load']])) - message(paste0("* Compute threads per chunk: ", - timings[['threads_compute']])) - message(paste0("* Total time (s): ", - timings[['total']])) - message(paste0("* Chunking setup: ", - timings[['bychunks_setup']])) - message(paste0("* Data upload to cluster: ", - timings[['transfer']])) - message(paste0("* All chunks: ", - timings[['total']] - - timings[['bychunks_setup']] - - timings[['transfer']] - - timings[['transfer_back']] - - timings[['merge']])) - message(paste0("* Transfer results from cluster: ", - timings[['transfer_back']])) - message(paste0("* Merge: ", - timings[['merge']])) - message(paste0("* Each chunk: ")) - message(paste0("* queue: ")) - message(paste0("* mean: ", - mean(timings[['queue']]))) - message(paste0("* min: ", - min(timings[['queue']]))) - message(paste0("* max: ", - max(timings[['queue']]))) - message(paste0("* job setup: ")) - message(paste0("* mean: ", - mean(timings[['job_setup']]))) - message(paste0("* min: ", - min(timings[['job_setup']]))) - message(paste0("* max: ", - max(timings[['job_setup']]))) - message(paste0("* load: ")) - message(paste0("* mean: ", - mean(timings[['load']]))) - message(paste0("* min: ", - min(timings[['load']]))) - message(paste0("* max: ", - max(timings[['load']]))) - message(paste0("* compute: ")) - message(paste0("* mean: ", - mean(timings[['compute']]))) - message(paste0("* min: ", - min(timings[['compute']]))) - message(paste0("* max: ", - max(timings[['compute']]))) - attr(arrays_of_results, 'startR_compute_profiling') <- timings - arrays_of_results - } - #TODO: check result dimensions match expected dimensions -} +ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', + threads_load = 2, threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, debug = FALSE, + wait = TRUE) { + # Build object to store profiling timings + t_begin_total <- Sys.time() + t_begin_bychunks_setup <- t_begin_total + timings <- list(nchunks = NULL, + concurrent_chunks = NULL, + cores_per_job = NULL, + threads_load = NULL, + threads_compute = NULL, + bychunks_setup = NULL, + transfer = NULL, + queue = NULL, + job_setup = NULL, + load = NULL, + compute = NULL, + transfer_back = NULL, + merge = NULL, + total = NULL) + + MergeArrays <- startR:::.MergeArrays + + # Check input headers + if ('startR_cube' %in% class(cube_headers)) { + cube_headers <- list(cube_headers) + } + if (!all(sapply(lapply(cube_headers, class), + function(x) 'startR_cube' %in% x))) { + stop("All objects passed in 'cube_headers' must be of class 'startR_cube', ", + "as returned by Start().") + } + + # Check step_fun + if (!is.function(step_fun)) { + stop("Parameter 'step_fun' must be a function.") + } + + # Check cores + if (!is.numeric(threads_load)) { + stop("Parameter 'threads_load' must be a numeric value.") + } + threads_load <- round(threads_load) + if (!is.numeric(threads_compute)) { + stop("Parameter 'threads_compute' must be a numeric value.") + } + threads_compute <- round(threads_compute) + timings[['threads_load']] <- threads_load + timings[['threads_compute']] <- threads_compute + + on_cluster <- !is.null(cluster) + + # Check ecflow_suite_dir + suite_id <- sample(10 ^ 10, 1) + ecflow_suite_dir_suite <- '' + if (on_cluster) { + if (is.null(ecflow_suite_dir)) { + stop("Parameter 'ecflow_suite_dir' must be specified when dispatching on a cluster.") + } + if (!is.character(ecflow_suite_dir)) { + stop("Parameter 'ecflow_suite_dir' must be a character string.") + } + ecflow_suite_dir_suite <- paste0(ecflow_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') + dir.create(ecflow_suite_dir_suite, recursive = TRUE) + if (!dir.exists(ecflow_suite_dir_suite)) { + stop("Could not find or create the directory in ", + "parameter 'ecflow_suite_dir'.") + } + } + + # Check cluster + default_cluster <- list(queue_host = NULL, + queue_type = 'slurm', + data_dir = NULL, + temp_dir = NULL, + lib_dir = NULL, + init_commands = list(''), + r_module = 'R', + CDO_module = NULL, + ecflow_module = 'ecFlow', + node_memory = NULL, + cores_per_job = NULL, + job_wallclock = '01:00:00', + max_jobs = 6, + extra_queue_params = list(''), + bidirectional = TRUE, + polling_period = 10, + special_setup = 'none') + if (on_cluster) { + if (!is.list(cluster)) { + stop("Parameter 'cluster' must be a named list.") + } + if (is.null(names(cluster))) { + stop("Parameter 'cluster' must be a named list.") + } + if (any(!(names(cluster) %in% c('queue_host', 'queue_type', 'data_dir', + 'temp_dir', 'lib_dir', 'init_commands', + 'r_module', 'CDO_module', + 'ecflow_module', 'node_memory', + 'cores_per_job', 'job_wallclock', 'max_jobs', + 'extra_queue_params', 'bidirectional', + 'polling_period', 'special_setup')))) { + stop("Found invalid component names in parameter 'cluster'.") + } + default_cluster[names(cluster)] <- cluster + } + localhost_name <- NULL + cluster <- default_cluster + remote_ecflow_suite_dir <- ecflow_suite_dir + is_data_dir_shared <- FALSE + is_ecflow_suite_dir_shared <- FALSE + if (on_cluster) { + #localhost_name <- Sys.info()[['nodename']] + localhost_name <- system('hostname -f', intern = TRUE) + if (Sys.which('ecflow_client') == '') { + stop("ecFlow must be installed in order to run the computation on clusters.") + } + if (is.null(cluster[['queue_host']])) { + queue_host <- localhost_name + } else if ((cluster[['queue_host']] %in% c('localhost', '127.0.0.1', localhost_name)) || + grepl(paste0('^', localhost_name), cluster[['queue_host']])) { + queue_host <- localhost_name + } + if (!(cluster[['queue_type']] %in% c('slurm', 'pbs', 'lsf', 'host'))) { + stop("The only supported 'queue_type's are 'slurm', 'pbs', 'lsf' and 'host'.") + } + if (is.null(cluster[['data_dir']])) { + is_data_dir_shared <- TRUE + } else { + if (!is.character(cluster[['data_dir']])) { + stop("The component 'data_dir' of the parameter 'cluster' must be a character string.") + } + remote_data_dir <- cluster[['data_dir']] + } + if (is.null(cluster[['temp_dir']])) { + is_ecflow_suite_dir_shared <- TRUE + } else { + if (!is.character(cluster[['temp_dir']])) { + stop("The component 'temp_dir' of the parameter 'cluster' must be a character string.") + } + remote_ecflow_suite_dir <- cluster[['temp_dir']] + } + if (!is.null(cluster[['lib_dir']])) { + if (!is.character(cluster[['lib_dir']])) { + stop("The component 'lib_dir', of the parameter 'cluster' must be NULL or ", + "a character string.") + } + } + if (!is.logical(cluster[['bidirectional']])) { + stop("The component 'bidirectional' of the parameter 'cluster' must be a logical value.") + } + if (cluster[['bidirectional']]) { + cluster[['init_commands']] <- c(cluster[['init_commands']], + list(paste('module load', cluster[['ecflow_module']]))) + } + if (!is.list(cluster[['init_commands']]) || + !all(sapply(cluster[['init_commands']], is.character))) { + stop("The component 'init_commands' of the parameter 'cluster' must be a list of ", + "character strings.") + } + if (!is.character(cluster[['r_module']])) { + stop("The component 'r_module' of the parameter 'cluster' must be a character string.") + } + if ((nchar(cluster[['r_module']]) < 1) || (grepl(' ', cluster[['r_module']]))) { + stop("The component 'r_module' of the parameter 'cluster' must have at least one character ", + "and contain no blank spaces.") + } + if (!is.null(cluster[['CDO_module']])) { + if (!is.character(cluster[['CDO_module']])) { + stop("The component 'CDO_module' of the parameter 'cluster' must be a character string.") + } + if (nchar(cluster[['CDO_module']]) < 1 || grepl(' ', cluster[['CDO_module']])) { + warning("The component 'CDO_module' of parameter 'cluster' must have ", + " than 1 and only the first element will be used.") + } + cluster[['r_module']] <- paste(cluster[['r_module']], cluster[['CDO_module']]) + } + if (!is.character(cluster[['ecflow_module']])) { + stop("The component 'ecflow_module' of the parameter 'cluster' must be a character string.") + } + if ((nchar(cluster[['ecflow_module']]) < 1) || + (grepl(' ', cluster[['ecflow_module']]))) { + stop("The component 'ecflow_module' of the parameter 'cluster' must have at least ", + "one character, and contain no blank spaces.") + } + if (is.null(cluster[['cores_per_job']])) { + cluster[['cores_per_job']] <- threads_compute + } + if (!is.numeric(cluster[['cores_per_job']])) { + stop("The component 'cores_per_job' of the parameter 'cluster' must be numeric.") + } + cluster[['cores_per_job']] <- round(cluster[['cores_per_job']]) + if (cluster[['cores_per_job']] > threads_compute) { + startR:::.message("WARNING: 'threads_compute' should be >= cluster[['cores_per_job']].") + } + if (!is.list(cluster[['extra_queue_params']]) || + !all(sapply(cluster[['extra_queue_params']], is.character))) { + stop("The component 'extra_queue_params' of the parameter 'cluster' must be a list of ", + "character strings.") + } + if (!(cluster[['special_setup']] %in% c('none', 'marenostrum4'))) { + stop("The value provided for the component 'special_setup' of the parameter ", + "'cluster' is not recognized.") + } + } + + # Check ecflow_suite_dir + remote_ecflow_suite_dir_suite <- '' + if (on_cluster) { + remote_ecflow_suite_dir_suite <- paste0(remote_ecflow_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') + } + + # Check ecflow_server + if (!is.null(ecflow_server) && !(is.character(ecflow_server))) { + stop("Parameter 'ecflow_server' must be a character string if specified.") + } + + # Check silent + if (!is.logical(silent)) { + stop("Parameter 'silent' must be logical.") + } + + # Check debug + if (!is.logical(debug)) { + stop("Parameter 'debug' must be logical.") + } + if (silent) { + debug <- FALSE + } + + # Check wait + if (!is.logical(wait)) { + stop("Parameter 'wait' must be logical.") + } + + # Work out chunked dimensions and target dimensions + all_dims <- lapply(cube_headers, attr, 'Dimensions') + all_dims_merged <- NULL + for (i in all_dims) { + if (is.null(all_dims_merged)) { + all_dims_merged <- i + } else { + all_dims_merged <- startR:::.MergeArrayDims(all_dims_merged, i)[[3]] + } + } + all_dimnames <- names(all_dims_merged) + + target_dims_indices <- which(all_dimnames %in% unlist(attr(step_fun, 'TargetDims'))) + target_dims <- NULL + if (length(target_dims_indices) > 0) { + target_dims <- all_dimnames[target_dims_indices] + } + + chunked_dims <- all_dimnames + if (length(target_dims_indices) > 0) { + chunked_dims <- chunked_dims[-target_dims_indices] + } + if (length(chunked_dims) < 1) { + stop("Not possible to process input by chunks. All input dimensions are ", + "target dimensions.") + } + + if (length(cube_headers) != length(attr(step_fun, 'TargetDims'))) { + stop("Number of inputs in parameter 'cube_headers' must be equal to the ", + "number of inputs expected by the function 'step_fun'.") + } + # Check all input headers have matching dimensions + cube_index <- 1 + for (cube_header in cube_headers) { + if (!all(attr(cube_header, 'Dimensions') == all_dims_merged[names(attr(cube_header, 'Dimensions'))])) { + stop("All provided 'cube_headers' must have matching dimension lengths ", + "with each other.") + } + if (!all(attr(step_fun, 'TargetDims')[[cube_index]] %in% names(attr(cube_header, 'Dimensions')))) { + stop("All provided 'cube_headers' must contain at least the target dimensions ", + "expected by 'step_fun'.") + } + cube_index <- cube_index + 1 + # work out expected result dimensions + } + + # Check chunks + default_chunks <- as.list(rep(1, length(chunked_dims))) + names(default_chunks) <- chunked_dims + if (length(chunks) == 1 && chunks == 'auto') { + chunks <- default_chunks + } + if (!is.list(chunks)) { + stop("Parameter 'chunks' must be a named list or 'auto'.") + } + if (is.null(names(chunks))) { + stop("Parameter 'chunks' must be a named list or 'auto'.") + } + if (any(!(names(chunks) %in% chunked_dims))) { + stop("All names in parameter 'chunks' must be one of the non-target dimensions ", + "present in the cubes in 'cube_headers'. The target dimensions are ", + paste(paste0("'", target_dims, "'"), collapse = ', '), ". The non-target ", + "dimensions (margins) are ", paste(paste0("'", chunked_dims, "'"), collapse = ', '), ".") + } + if (any(!(((unlist(chunks) %% 1) == 0) | (unlist(chunks) == 'all')))) { + stop("All values in parameter 'chunks' must take a numeric value or 'all'.") + } + if (any(unlist(chunks) < 1)) { + stop("All values in parameter 'chunks' must be >= 1.") + } + for (chunk_spec in 1:length(chunks)) { + if (chunks[[chunk_spec]] > all_dims_merged[names(chunks)[chunk_spec]]) { + stop("Too many chunks requested for the dimension ", names(chunks)[chunk_spec], + ". Maximum allowed is ", all_dims_merged[names(chunks)[chunk_spec]]) + } + } + default_chunks[names(chunks)] <- chunks + chunks <- default_chunks + timings[['nchunks']] <- prod(unlist(chunks)) + + # Check step_fun + if (!('startR_step_fun' %in% class(step_fun))) { + stop("Parameter 'step_fun' must be of the class 'startR_step_fun', as returned ", + "by the function Step.") + } + + # Replace 'all's + chunks_all <- which(unlist(chunks) == 'all') + if (length(chunks_all) > 0) { + chunks[chunks_all] <- all_dims[names(chunks)[chunks_all]] + } + # Mount the ecFlow suite + if (on_cluster) { + .message(paste0("ATTENTION: Dispatching chunks on a remote cluster", + ". Make sure passwordless ", + "access is properly set in both directions.")) + + # Copy load_process_save_chunk.R into shared folder + chunk_script <- file(system.file('chunking/load_process_save_chunk.R', + package = 'startR')) + chunk_script_lines <- readLines(chunk_script) + close(chunk_script) + chunk_script_lines <- gsub('^lib_dir <- *', paste0('lib_dir <- ', + paste(deparse(cluster[['lib_dir']]), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^out_dir <- *', paste0('out_dir <- ', + paste(deparse(remote_ecflow_suite_dir_suite), collapse = '\n')), chunk_script_lines) + chunk_script_lines <- gsub('^debug <- *', paste0('debug <- ', paste(deparse(debug), collapse = '\n')), + chunk_script_lines) + deparsed_calls <- paste0('start_calls <- list(') + extra_path <- '' + if (cluster[['special_setup']] == 'marenostrum4') { + extra_path <- '/gpfs/archive/bsc32/' + } + for (cube_header in 1:length(cube_headers)) { + pattern_dim <- attr(cube_headers[[cube_header]], 'PatternDim') + bk_pattern_dim <- cube_headers[[cube_header]][[pattern_dim]] + bk_expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') + if (!is_data_dir_shared) { + cube_headers[[cube_header]][[pattern_dim]] <- paste0(remote_data_dir, '/', + extra_path, '/', cube_headers[[cube_header]][[pattern_dim]]) + for (file_n in 1:length(bk_expected_files)) { + attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n] <- paste0(remote_data_dir, '/', + extra_path, '/', attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n]) + } + } + deparsed_calls <- paste0(deparsed_calls, '\nquote(', + paste(deparse(cube_headers[[cube_header]]), collapse = '\n'), + ')') + cube_headers[[cube_header]][[pattern_dim]] <- bk_pattern_dim + attr(cube_headers[[cube_header]], 'ExpectedFiles') <- bk_expected_files + if (cube_header < length(cube_headers)) { + deparsed_calls <- paste0(deparsed_calls, ', ') + } + } + deparsed_calls <- paste0(deparsed_calls, '\n)') + chunk_script_lines <- gsub('^start_calls <- *', deparsed_calls, chunk_script_lines) + chunk_script_lines <- gsub('^start_calls_attrs <- *', paste0('start_calls_attrs <- ', paste(deparse(lapply(cube_headers, attributes)), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^param_dimnames <- *', paste0('param_dimnames <- ', paste(deparse(chunked_dims), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^threads_load <- *', paste0('threads_load <- ', threads_load), + chunk_script_lines) + chunk_script_lines <- gsub('^threads_compute <- *', paste0('threads_compute <- ', threads_compute), + chunk_script_lines) + chunk_script_lines <- gsub('^fun <- *', paste0('fun <- ', paste(deparse(step_fun), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^params <- *', paste0('params <- ', paste(deparse(list(...)), collapse = '\n')), + chunk_script_lines) + writeLines(chunk_script_lines, paste0(ecflow_suite_dir_suite, '/load_process_save_chunk.R')) + + # Copy Chunk.ecf into shared folder + chunk_ecf_script <- file(system.file('chunking/Chunk.ecf', + package = 'startR')) + chunk_ecf_script_lines <- readLines(chunk_ecf_script) + close(chunk_ecf_script) + if (cluster[['queue_type']] == 'host') { + chunk_ecf_script_lines <- gsub('^include_queue_header', + '', + chunk_ecf_script_lines) + } else { + chunk_ecf_script_lines <- gsub('^include_queue_header', + paste0('%include "./', cluster[['queue_type']], '.h"'), + chunk_ecf_script_lines) + } + chunk_ecf_script_lines <- gsub('^include_init_commands', + paste0(paste0(cluster[['init_commands']], collapse = '\n'), '\n'), + chunk_ecf_script_lines) + chunk_ecf_script_lines <- gsub('^include_module_load', + paste0('module load ', cluster[['r_module']]), + chunk_ecf_script_lines) + ecf_vars <- paste0('%', as.vector(sapply(chunked_dims, + function(x) { + c(toupper(x), paste0(toupper(x), '_N')) + })), '%') + # if (!is_ecflow_suite_dir_shared && (cluster[['queue_host']] != localhost_name)) { + # #transfer_back_line <- paste0('rsync -rav %REMOTE_ECF_HOME% ', localhost_name, + # # ':%ECF_HOME%\nrm -f %ECF_HOME%/', + # # paste0('*', paste(ecf_vars[((1:(length(ecf_vars) / 2)) * 2) - 1], collapse = '*'), '*.Rds')) + result_file_id <- paste0('*', + paste(paste0('_', ecf_vars[((1:(length(ecf_vars) / 2)) * 2) - 1], '__'), + collapse = '*'), '*') + # transfer_back_line <- paste0('rsync -rav %REMOTE_ECF_HOME%/%SUITE%/ ', + # localhost_name, + # ':%ECF_HOME%/%SUITE%/\nscp %REMOTE_ECF_HOME%/', + # result_file_id, ' ', localhost_name, + # ':%ECF_HOME%\nrm -f %REMOTE_ECF_HOME%/', + # result_file_id) + # } else { + # transfer_back_line <- '' + # } + chunk_ecf_script_lines <- gsub('^Rscript load_process_save_chunk.R --args \\$task_path insert_indices', + paste0('Rscript load_process_save_chunk.R --args $task_path ', paste(ecf_vars, collapse = ' ')), + chunk_ecf_script_lines) + #chunk_ecf_script_lines <- gsub('^include_transfer_back_and_rm', transfer_back_line, chunk_ecf_script_lines) + writeLines(chunk_ecf_script_lines, paste0(ecflow_suite_dir_suite, '/Chunk.ecf')) + + # Copy merge_chunks.R into tmp folder + # merge_script <- file(system.file('chunking/merge_chunks.R', + # package = 'startR')) + # merge_script_lines <- readLines(merge_script) + # close(merge_script) + # merge_script_lines <- gsub('^shared_dir <- *', paste0('shared_dir <- ', + # paste(deparse(shared_dir_suite), collapse = '\n')), merge_script_lines) + # writeLines(merge_script_lines, paste0(shared_dir_suite, '/merge_chunks.R')) + + # Copy Merge.ecf into tmp folder + #TODO: Modify chain of parameters sent to r script when merging + #chunks progressively + # merge_ecf_script <- file(system.file('chunking/Merge.ecf', + # package = 'startR')) + # merge_ecf_script_lines <- readLines(merge_ecf_script) + # close(merge_ecf_script) + # writeLines(merge_ecf_script_lines, paste0(shared_dir_suite, '/Merge.ecf')) + + # Copy queue header into shared folder + #file.copy(system.file(paste0('chunking/', cluster[['queue_type']], '.h'), package = 'startR'), + # ecflow_suite_dir_suite) + chunk_queue_header <- file(system.file(paste0('chunking/', cluster[['queue_type']], '.h'), package = 'startR')) + chunk_queue_header_lines <- readLines(chunk_queue_header) + close(chunk_queue_header) + chunk_queue_header_lines <- gsub('^include_extra_queue_params', + paste0(paste0(cluster[['extra_queue_params']], collapse = '\n'), '\n'), + chunk_queue_header_lines) + writeLines(chunk_queue_header_lines, paste0(ecflow_suite_dir_suite, '/', cluster[['queue_type']], '.h')) + + # Copy headers + file.copy(system.file('chunking/head.h', package = 'startR'), + ecflow_suite_dir_suite) + file.copy(system.file('chunking/tail.h', package = 'startR'), + ecflow_suite_dir_suite) + } + + add_line <- function(suite, line, tabs) { + c(suite, paste0(paste(rep(' ', tabs), collapse = ''), line)) + } + suite <- NULL + tabs <- 0 + suite <- add_line(suite, paste0('suite STARTR_CHUNKING_', suite_id), tabs) + tabs <- tabs + 2 + submit_command <- '' + if (cluster[['queue_type']] == 'slurm') { + submit_command <- 'sbatch' + } else if (cluster[['queue_type']] == 'pbs') { + submit_command <- 'qsub' + } else if (cluster[['queue_type']] == 'lsf') { + submit_command <- 'bsub <' + } else if (cluster[['queue_type']] == 'host') { + submit_command <- 'bash' + } + if (on_cluster) { + suite <- add_line(suite, paste0("edit BIDIRECTIONAL '", cluster[['bidirectional']], "'"), tabs) + suite <- add_line(suite, paste0("edit QUEUE_HOST '", cluster[['queue_host']], "'"), tabs) + suite <- add_line(suite, paste0("edit ECF_HOST '", localhost_name, "'"), tabs) + suite <- add_line(suite, paste0("edit EC_HOST_FULL '", localhost_name, "'"), tabs) + suite <- add_line(suite, paste0("edit RESULT_FILE_ID '", result_file_id, "'"), tabs) + #} else { + # suite <- add_line(suite, paste0("edit ECF_JOB_CMD '", submit_command, " %ECF_JOB% > %ECF_JOBOUT% 2>&1 &'"), tabs) + } + suite <- add_line(suite, paste0("edit ECF_HOME '", ecflow_suite_dir_suite, "'"), tabs) + suite <- add_line(suite, paste0("edit REMOTE_ECF_HOME '", remote_ecflow_suite_dir_suite, "'"), tabs) + suite <- add_line(suite, paste0("edit CORES_PER_JOB ", cluster[['cores_per_job']], ""), tabs) + suite <- add_line(suite, paste0("edit JOB_WALLCLOCK '", cluster[['job_wallclock']], "'"), tabs) + suite <- add_line(suite, paste0("limit max_jobs ", cluster[['max_jobs']]), tabs) + suite <- add_line(suite, paste0("inlimit max_jobs"), tabs) + suite <- add_line(suite, "family computation", tabs) + tabs <- tabs + 2 + + if (on_cluster) { + # source $HOME/.profile ; + sync_command <- '' + if (!is_ecflow_suite_dir_shared) { + sync_command <- paste0("rsync -rav ", + "%ECF_HOME%/ ", + "%QUEUE_HOST%:%REMOTE_ECF_HOME%/ ; ") + } + suite <- add_line(suite, paste0("edit ECF_JOB_CMD '", + #"mkdir -p %REMOTE_ECF_HOME%/%SUITE%/ ; ", + sync_command, + "ssh %QUEUE_HOST% \"", + "date --rfc-3339=seconds > %REMOTE_ECF_HOME%/%ECF_NAME%.submit_time ; ", + submit_command, + " %REMOTE_ECF_HOME%/%ECF_NAME%.job%ECF_TRYNO% > ", + "%REMOTE_ECF_HOME%/%ECF_NAME%.%ECF_TRYNO% 2>&1 &\" ", + "2>&1'"), tabs) + if (is_ecflow_suite_dir_shared) { + suite <- add_line(suite, paste0("edit REPORT_BACK 'FALSE'"), tabs) + } else { + suite <- add_line(suite, paste0("edit REPORT_BACK 'TRUE'"), tabs) + } + } + + # Open nested ecFlow families + for (i in length(chunked_dims):1) { + suite <- add_line(suite, paste0('family ', chunked_dims[i], '_CHUNK_', 1), tabs) + tabs <- tabs + 2 + suite <- add_line(suite, paste0('edit ', toupper(chunked_dims[i]), ' ', 1), tabs) + suite <- add_line(suite, paste0('edit ', toupper(chunked_dims[i]), '_N ', chunks[[chunked_dims[i]]]), tabs) + } + + # Iterate through chunks + chunk_array <- array(1:prod(unlist(chunks)), dim = (unlist(chunks))) + arrays_of_results <- vector('list', length(attr(step_fun, 'OutputDims'))) + names(arrays_of_results) <- names(attr(step_fun, 'OutputDims')) + for (component in 1:length(arrays_of_results)) { + arrays_of_results[[component]] <- vector('list', prod((unlist(chunks)))) + dim(arrays_of_results[[component]]) <- (unlist(chunks)) + } + if (!on_cluster) { + t_end_bychunks_setup <- Sys.time() + timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, + t_begin_bychunks_setup, units = 'secs')) + timings[['transfer']] <- 0 + timings[['queue']] <- 0 + timings[['job_setup']] <- 0 + timings[['transfer_back']] <- 0 + if (!silent) { + startR:::.message(paste0("Processing chunks... ", + "remaining time estimate soon...")) + } + time_before_first_chunk <- Sys.time() + time_after_first_chunk <- NULL + } + previous_chunk_indices <- rep(1, length(chunks)) + found_first_result <- FALSE + for (i in 1:length(chunk_array)) { + chunk_indices <- which(chunk_array == i, arr.ind = TRUE)[1, ] + names(chunk_indices) <- names(dim(chunk_array)) + # ADD CHUNK SCRIPT TO SUITE + families_to_jump <- which(chunk_indices != previous_chunk_indices) + if (length(families_to_jump) > 0) { + families_to_jump <- max(families_to_jump) + # Close ecFlow families + for (j in 1:families_to_jump) { + tabs <- tabs - 2 + suite <- add_line(suite, paste0('endfamily'), tabs) + } + # Open ecFlow families + for (j in families_to_jump:1) { + suite <- add_line(suite, paste0('family ', (chunked_dims)[j], '_CHUNK_', chunk_indices[j]), tabs) + tabs <- tabs + 2 + suite <- add_line(suite, paste0('edit ', toupper((chunked_dims)[j]), ' ', chunk_indices[j]), tabs) + suite <- add_line(suite, paste0('edit ', toupper((chunked_dims)[j]), '_N ', chunks[[(chunked_dims)[j]]]), tabs) + } + } + suite <- add_line(suite, "task Chunk", tabs) + + if (!on_cluster) { + if (!silent) { + startR:::.message(paste("Loading chunk", i, + "out of", length(chunk_array), "...")) + } + data <- vector('list', length(cube_headers)) + t_begin_load <- Sys.time() + for (input in 1:length(data)) { + start_call <- cube_headers[[input]] + dims_to_alter <- which(names(attr(start_call, 'Dimensions')) %in% names(chunks)) + names_dims_to_alter <- names(attr(start_call, 'Dimensions'))[dims_to_alter] + # If any dimension comes from split dimensions + split_dims <- attr(start_call, 'SplitDims') + + if (length(split_dims) != 0){ + + for (k in 1:length(split_dims)) { + if (any(names(split_dims[[k]]) %in% names_dims_to_alter)) { + chunks_split_dims <- rep(1, length(split_dims[[k]])) + names(chunks_split_dims) <- names(split_dims[[k]]) + chunks_indices_split_dims <- chunks_split_dims + split_dims_to_alter <- which(names(split_dims[[k]]) %in% names_dims_to_alter) + chunks_split_dims[split_dims_to_alter] <- unlist(chunks[names(split_dims[[k]])[split_dims_to_alter]]) + chunks_indices_split_dims[split_dims_to_alter] <- chunk_indices[names(split_dims[[k]])[split_dims_to_alter]] + start_call[[names(split_dims)[k]]] <- chunk(chunks_indices_split_dims, chunks_split_dims, + eval(start_call[[names(split_dims)[k]]])) + dims_to_alter_to_remove <- which(names_dims_to_alter %in% names(split_dims[[k]])) + if (length(dims_to_alter_to_remove) > 0) { + dims_to_alter <- dims_to_alter[-dims_to_alter_to_remove] + names_dims_to_alter <- names_dims_to_alter[-dims_to_alter_to_remove] + } + } + } + } + + if (length(dims_to_alter) > 0) { + for (call_dim in names(attr(start_call, 'Dimensions'))[dims_to_alter]) { + start_call[[call_dim]] <- chunk(chunk_indices[call_dim], chunks[[call_dim]], + eval(start_call[[call_dim]])) + } + } + start_call[['silent']] <- !debug + if (!('num_procs' %in% names(start_call))) { + start_call[['num_procs']] <- threads_load + } + data[[input]] <- eval(start_call) + } + t_end_load <- Sys.time() + timings[['load']] <- c(timings[['load']], + as.numeric(difftime(t_end_load, t_begin_load, units = 'secs'))) + if (!silent) { + startR:::.message(paste("Processing...")) + } + #TODO: Find a better way to assign the names of data. When multiple steps for Compute is available, this way may fail. + names(data) <- names(cube_headers) + t_begin_compute <- Sys.time() + result <- multiApply::Apply(data, + target_dims = attr(step_fun, 'TargetDims'), + fun = step_fun, ..., + output_dims = attr(step_fun, 'OutputDims'), + use_attributes = attr(step_fun, 'UseAttributes'), + ncores = threads_compute) + if (!found_first_result) { + names(arrays_of_results) <- names(result) + found_first_result <- TRUE + } + for (component in 1:length(result)) { + arrays_of_results[[component]][[i]] <- result[[component]] + } + rm(data) + gc() + t_end_compute <- Sys.time() + timings[['compute']] <- c(timings[['compute']], + as.numeric(difftime(t_end_compute, + t_begin_compute, units = 'secs'))) + } + + # Time estimate + if (!on_cluster) { + if (is.null(time_after_first_chunk)) { + time_after_first_chunk <- Sys.time() + if (!silent) { + estimate <- (time_after_first_chunk - + time_before_first_chunk) * + (length(chunk_array) - 1) + units(estimate) <- 'mins' + startR:::.message( + paste0("Remaining time estimate (at ", format(time_after_first_chunk), ") ", + "(neglecting merge time): ", format(estimate)) + ) + } + } + } + previous_chunk_indices <- chunk_indices + } + + # Close nested ecFlow families + for (i in length(chunked_dims):1) { + tabs <- tabs - 2 + suite <- add_line(suite, paste0('endfamily'), tabs) + } + + # Close the ecFlow suite + tabs <- tabs - 2 + suite <- add_line(suite, paste0('endfamily'), tabs) + # suite <- add_line(suite, "family merge", tabs) + # tabs <- tabs + 2 + # suite <- add_line(suite, "trigger computation == complete", tabs) + # suite <- add_line(suite, "edit ECF_JOB_CMD 'bash %ECF_JOB% > %ECF_JOBOUT% 2>&1 &'", tabs) + # suite <- add_line(suite, "task Merge", tabs) + # tabs <- tabs - 2 + # suite <- add_line(suite, paste0('endfamily'), tabs) + + tabs <- tabs - 2 + suite <- add_line(suite, "endsuite", tabs) + + # Run ecFlow suite if needed + if (on_cluster) { + timings[['cores_per_job']] <- cluster[['cores_per_job']] + timings[['concurrent_chunks']] <- cluster[['max_jobs']] + suite_file <- paste0(ecflow_suite_dir_suite, '/startR_chunking.def') + suite_file_o <- file(suite_file) + writeLines(suite, suite_file_o) + close(suite_file_o) + + default_ecflow_server <- list(host = localhost_name, port = '5678') + if (is.null(ecflow_server)) { + .warning("Parameter 'ecflow_server' has not been specified but execution on ", + "cluster has been requested. An ecFlow server instance will ", + "be created on localhost:5678.") + } else { + if ('host' %in% names(ecflow_server)) { + stop("A host has been specified for the 'ecflow_server', but this option is not available yet.") + } + default_ecflow_server[names(ecflow_server)] <- ecflow_server + } + ecflow_server <- default_ecflow_server + system(paste0("ecflow_start.sh -p ", ecflow_server[['port']])) + system(paste0("ecflow_client --load=", suite_file, " --host=", + ecflow_server[['host']], " --port=", ecflow_server[['port']])) + if (!is_ecflow_suite_dir_shared) { + system(paste0('ssh ', cluster[['queue_host']], ' "mkdir -p ', + remote_ecflow_suite_dir_suite, '"')) + system(paste0('rsync -ra ', ecflow_suite_dir_suite, + ' ', cluster[['queue_host']], ':', + remote_ecflow_suite_dir_suite)) + } + t_end_bychunks_setup <- Sys.time() + timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, + t_begin_bychunks_setup, units = 'secs')) + if (!is_data_dir_shared) { + t_begin_transfer <- Sys.time() + startR:::.message("Sending involved files to the cluster file system...") + files_to_send <- NULL + #files_to_check <- NULL + for (cube_header in 1:length(cube_headers)) { + expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') + #files_to_check <- c(files_to_check, expected_files) + #if (cluster[['special_setup']] == 'marenostrum4') { + # expected_files <- paste0('/gpfs/archive/bsc32/', expected_files) + #} + files_to_send <- c(files_to_send, expected_files) + } + #which_files_exist <- sapply(files_to_check, file.exists) + which_files_exist <- sapply(files_to_send, file.exists) + files_to_send <- files_to_send[which_files_exist] + if (cluster[['special_setup']] == 'marenostrum4') { + file_spec <- paste(paste0("/gpfs/archive/bsc32/", + files_to_send), collapse = ' ') + system(paste0("ssh ", cluster[['queue_host']], " 'mkdir -p ", remote_data_dir, + ' ; module load transfer ; cd ', remote_ecflow_suite_dir_suite, + ' ; dtrsync -Rrav ', '\'', file_spec, '\' "', remote_data_dir, '/"', + " ; sleep 1 ; ", + "while [[ ! $(ls dtrsync_*.out 2>/dev/null | wc -l) -ge 1 ]] ; ", + "do sleep 2 ; done", + " ; sleep 1 ; ", + 'while [[ ! $(grep "total size is" dtrsync_*.out | ', + "wc -l) -ge 1 ]] ; ", + "do sleep 5 ; done", "'")) + } else { + file_spec <- paste(files_to_send, collapse = ' :') + system(paste0("ssh ", cluster[['queue_host']], ' "mkdir -p ', + remote_data_dir, '"')) + system(paste0("rsync -Rrav '", file_spec, "' '", + cluster[['queue_host']], ":", remote_data_dir, "/'")) + } + startR:::.message("Files sent successfully.") + t_end_transfer <- Sys.time() + timings[['transfer']] <- as.numeric(difftime(t_end_transfer, t_begin_transfer, units = 'secs')) + } else { + timings[['transfer']] <- 0 + } + if (!silent) { + startR:::.message(paste0("Processing chunks... ")) + } + time_begin_first_chunk <- Sys.time() + # time_after_first_chunk <- NULL + system(paste0("ecflow_client --begin=STARTR_CHUNKING_", suite_id, + " --host=", ecflow_server[['host']], " --port=", + ecflow_server[['port']])) + + timings[['total']] <- t_begin_total + startr_exec <- list(cluster = cluster, ecflow_server = ecflow_server, + suite_id = suite_id, chunks = chunks, + num_outputs = length(arrays_of_results), + ecflow_suite_dir = ecflow_suite_dir, + timings = timings) + class(startr_exec) <- 'startR_exec' + if (wait) { + if (!silent) { + startR:::.message(paste0("Remaining time estimate soon... ")) + # while (is.null(time_after_first_chunk)) { + # if (any(grepl('.*\\.Rds$', list.files(ecflow_suite_dir_suite)))) { + # time_after_first_chunk <- Sys.time() + # estimate <- (time_after_first_chunk - + # time_before_first_chunk) * + # ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / + # cluster[['max_jobs']]) + # units(estimate) <- 'mins' + # startR:::.message( + # paste0('Remaining time estimate (neglecting queue and ', + # 'merge time) (at ', format(time_after_first_chunk), + # '): ', format(estimate), ' (', + # format(time_after_first_chunk - + # time_before_first_chunk), ' per chunk)') + # ) + # } else if (!cluster[['bidirectional']]) { + # rsync_output <- tryCatch({ + # system(paste0("rsync -ra --ignore-missing-args ", + # cluster[['queue_host']], ":", + # remote_ecflow_suite_dir_suite, "/*.Rds ", + # ecflow_suite_dir_suite, "/"), intern = TRUE) + # }, error = function(e) { + # message("Warning: rsync from remote server to collect results failed. ", + # "Retrying soon.") + # failed <- TRUE + # }) + # Sys.sleep(cluster[['polling_period']]) + # } + # } + startr_exec[['t_begin_first_chunk']] <- time_begin_first_chunk + } + result <- Collect(startr_exec, wait = TRUE) + startR:::.message("Computation ended successfully.") + result + } else { + startr_exec + } + } else { + timings[['cores_per_job']] <- NA + timings[['concurrent_chunks']] <- 1 + t_begin_merge <- Sys.time() + for (component in 1:length(arrays_of_results)) { + arrays_of_results[[component]] <- startR:::.MergeArrayOfArrays(arrays_of_results[[component]]) + } + t_end_merge <- Sys.time() + timings[['merge']] <- as.numeric(difftime(t_end_merge, t_begin_merge, units = 'secs')) + t_end_total <- t_end_merge + timings[['total']] <- as.numeric(difftime(t_end_total, t_begin_total, units = 'secs')) + message(paste0("* Computation ended successfully.")) + message(paste0("* Number of chunks: ", + timings[['nchunks']])) + message(paste0("* Max. number of concurrent chunks (jobs): ", + timings[['concurrent_chunks']])) + message(paste0("* Requested cores per job: ", + timings[['cores_per_job']])) + message(paste0("* Load threads per chunk: ", + timings[['threads_load']])) + message(paste0("* Compute threads per chunk: ", + timings[['threads_compute']])) + message(paste0("* Total time (s): ", + timings[['total']])) + message(paste0("* Chunking setup: ", + timings[['bychunks_setup']])) + message(paste0("* Data upload to cluster: ", + timings[['transfer']])) + message(paste0("* All chunks: ", + timings[['total']] - + timings[['bychunks_setup']] - + timings[['transfer']] - + timings[['transfer_back']] - + timings[['merge']])) + message(paste0("* Transfer results from cluster: ", + timings[['transfer_back']])) + message(paste0("* Merge: ", + timings[['merge']])) + message(paste0("* Each chunk: ")) + message(paste0("* queue: ")) + message(paste0("* mean: ", + mean(timings[['queue']]))) + message(paste0("* min: ", + min(timings[['queue']]))) + message(paste0("* max: ", + max(timings[['queue']]))) + message(paste0("* job setup: ")) + message(paste0("* mean: ", + mean(timings[['job_setup']]))) + message(paste0("* min: ", + min(timings[['job_setup']]))) + message(paste0("* max: ", + max(timings[['job_setup']]))) + message(paste0("* load: ")) + message(paste0("* mean: ", + mean(timings[['load']]))) + message(paste0("* min: ", + min(timings[['load']]))) + message(paste0("* max: ", + max(timings[['load']]))) + message(paste0("* compute: ")) + message(paste0("* mean: ", + mean(timings[['compute']]))) + message(paste0("* min: ", + min(timings[['compute']]))) + message(paste0("* max: ", + max(timings[['compute']]))) + attr(arrays_of_results, 'startR_compute_profiling') <- timings + arrays_of_results + } + #TODO: check result dimensions match expected dimensions +} diff --git a/R/CDORemapper.R b/R/CDORemapper.R index ab352348fb30f30d70f80a6d9903bee60d9ef877..b109f548f3ec8b1faa4c243b09ef27169af9f417 100644 --- a/R/CDORemapper.R +++ b/R/CDORemapper.R @@ -1,46 +1,46 @@ -CDORemapper <- function(data_array, variables, file_selectors = NULL, ...) { - file_dims <- names(file_selectors) - known_lon_names <- s2dverification:::.KnownLonNames() - known_lat_names <- s2dverification:::.KnownLatNames() - if (!any(known_lon_names %in% names(variables)) || - !any(known_lat_names %in% names(variables))) { - stop("The longitude and latitude variables must be requested in ", - "'return_vars' and specified in 'transform_vars' for the ", - "CDORemapper to work.") - } - lon_name <- names(variables)[which(names(variables) %in% known_lon_names)[1]] - lons <- variables[[lon_name]] - if (!is.null(dim(lons))) { - dims_to_subset <- which(names(dim(lons)) %in% file_dims) - if (length(dims_to_subset) > 0) { - lons_to_use <- as.list(rep(TRUE, length(dim(lons)))) - names(lons_to_use) <- names(dim(lons)) - lons_to_use[dims_to_subset] <- as.list(rep(1, length(dims_to_subset))) - attr_bk <- attributes(lons) - lons <- do.call('[', c(list(x = lons), lons_to_use, list(drop = TRUE))) - attributes(lons) <- attr_bk - } - } - lat_name <- names(variables)[which(names(variables) %in% known_lat_names)[1]] - lats <- variables[[lat_name]] - if (!is.null(dim(lats))) { - dims_to_subset <- which(names(dim(lats)) %in% file_dims) - if (length(dims_to_subset) > 0) { - lats_to_use <- as.list(rep(TRUE, length(dim(lats)))) - names(lats_to_use) <- names(dim(lats)) - lats_to_use[dims_to_subset] <- as.list(rep(1, length(dims_to_subset))) - attr_bk <- attributes(lons) - lats <- do.call('[', c(list(x = lats), lats_to_use, list(drop = TRUE))) - attributes(lats) <- attr_bk - } - } - extra_params <- list(...) - if (!all(c('grid', 'method') %in% names(extra_params))) { - stop("Parameters 'grid' and 'method' must be specified for the ", - "CDORemapper, via the 'transform_params' argument.") - } - result <- s2dverification::CDORemap(data_array, lons, lats, ...) - return_variables <- list(result$lons, result$lats) - names(return_variables) <- c(lon_name, lat_name) - list(data_array = result$data_array, variables = return_variables) -} +CDORemapper <- function(data_array, variables, file_selectors = NULL, ...) { + file_dims <- names(file_selectors) + known_lon_names <- s2dverification:::.KnownLonNames() + known_lat_names <- s2dverification:::.KnownLatNames() + if (!any(known_lon_names %in% names(variables)) || + !any(known_lat_names %in% names(variables))) { + stop("The longitude and latitude variables must be requested in ", + "'return_vars' and specified in 'transform_vars' for the ", + "CDORemapper to work.") + } + lon_name <- names(variables)[which(names(variables) %in% known_lon_names)[1]] + lons <- variables[[lon_name]] + if (!is.null(dim(lons))) { + dims_to_subset <- which(names(dim(lons)) %in% file_dims) + if (length(dims_to_subset) > 0) { + lons_to_use <- as.list(rep(TRUE, length(dim(lons)))) + names(lons_to_use) <- names(dim(lons)) + lons_to_use[dims_to_subset] <- as.list(rep(1, length(dims_to_subset))) + attr_bk <- attributes(lons) + lons <- do.call('[', c(list(x = lons), lons_to_use, list(drop = TRUE))) + attributes(lons) <- attr_bk + } + } + lat_name <- names(variables)[which(names(variables) %in% known_lat_names)[1]] + lats <- variables[[lat_name]] + if (!is.null(dim(lats))) { + dims_to_subset <- which(names(dim(lats)) %in% file_dims) + if (length(dims_to_subset) > 0) { + lats_to_use <- as.list(rep(TRUE, length(dim(lats)))) + names(lats_to_use) <- names(dim(lats)) + lats_to_use[dims_to_subset] <- as.list(rep(1, length(dims_to_subset))) + attr_bk <- attributes(lons) + lats <- do.call('[', c(list(x = lats), lats_to_use, list(drop = TRUE))) + attributes(lats) <- attr_bk + } + } + extra_params <- list(...) + if (!all(c('grid', 'method') %in% names(extra_params))) { + stop("Parameters 'grid' and 'method' must be specified for the ", + "CDORemapper, via the 'transform_params' argument.") + } + result <- s2dverification::CDORemap(data_array, lons, lats, ...) + return_variables <- list(result$lons, result$lats) + names(return_variables) <- c(lon_name, lat_name) + list(data_array = result$data_array, variables = return_variables) +} diff --git a/R/Collect.R b/R/Collect.R index 2325d55fcd8834392d647a76c658c04630e17751..f714899e55e56939b27edb5f13782828c5131fed 100644 --- a/R/Collect.R +++ b/R/Collect.R @@ -1,275 +1,275 @@ -Collect <- function(startr_exec, wait = TRUE, remove = TRUE) { - if (!('startR_exec' %in% class(startr_exec))) { - stop("Parameter 'startr_exec' must be an object of the class ", - "'startR_exec', as returned by Collect(..., wait = FALSE).") - } - if (Sys.which('ecflow_client') == '') { - stop("ecFlow must be installed in order to collect results from a ", - "Compute() execution.") - } - cluster <- startr_exec[['cluster']] - ecflow_server <- startr_exec[['ecflow_server']] - suite_id <- startr_exec[['suite_id']] - chunks <- startr_exec[['chunks']] - num_outputs <- startr_exec[['num_outputs']] - ecflow_suite_dir <- startr_exec[['ecflow_suite_dir']] - timings <- startr_exec[['timings']] - ecflow_suite_dir_suite <- paste0(ecflow_suite_dir, '/STARTR_CHUNKING_', - suite_id, '/') - if (!is.null(cluster[['temp_dir']])) { - remote_ecflow_suite_dir_suite <- paste0(cluster[['temp_dir']], - '/STARTR_CHUNKING_', - suite_id, '/') - } - find_task_name <- function(received_file) { - file_name <- received_file - parts <- strsplit(file_name, '__')[[1]] - parts <- parts[c(2:(length(parts) - 1))] - chunk_indices <- rev(sapply(parts, function(x) { - as.numeric(strsplit(x, '_')[[1]][2]) - })) - task_pattern <- paste(paste0('*_', chunk_indices, '/'), - collapse = '') - task_glob <- paste0(ecflow_suite_dir_suite, '/*/*/', - task_pattern) - task_path <- Sys.glob(task_glob) - if (length(task_path) != 1) { - stop("Unexpected error while receiving results.") - } - task_name <- strsplit(task_path, 'computation')[[1]][2] - task_name <- paste0('/STARTR_CHUNKING_', suite_id, - '/computation', task_name) - task_name - } - done <- FALSE - attempt <- 1 - sum_received_chunks <- sum(grepl('output.*\\.Rds', - list.files(ecflow_suite_dir_suite))) - if (cluster[['bidirectional']]) { - t_transfer_back <- NA - } else { - t_transfer_back <- 0 - } - time_before_first_chunk <- startr_exec[['t_begin_first_chunk']] - first_chunk_received <- FALSE - rsync_petition_file_lines <- c('+ *.Rds', '+ *.timings', '+ *.crashed', - '+ *.running', '- *') - rsync_petition_file <- tempfile() - writeLines(rsync_petition_file_lines, rsync_petition_file) - Sys.sleep(2) - while (!done) { - failed <- FALSE - if (cluster[['bidirectional']]) { - status <- system(paste0("ecflow_client --get_state=STARTR_CHUNKING_", - suite_id, " --host=", - ecflow_server[['host']], " --port=", ecflow_server[['port']]), - intern = TRUE) - if (any(grepl(paste0("suite STARTR_CHUNKING_", suite_id, " #.* state:complete"), status))) { - done <- TRUE - } else if (!wait) { - stop("Computation in progress...") - } - if (!first_chunk_received) { - if (any(grepl('state:complete', status))) { - if (!is.null(time_before_first_chunk)) { - time_after_first_chunk <- Sys.time() - estimate <- (time_after_first_chunk - - time_before_first_chunk) * - ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / - cluster[['max_jobs']]) - units(estimate) <- 'mins' - startR:::.message( - paste0('Remaining time estimate (neglecting queue and ', - 'merge time) (at ', format(time_after_first_chunk), - '): ', format(estimate), ' (', - format(time_after_first_chunk - - time_before_first_chunk), ' per chunk)') - ) - } - first_chunk_received <- TRUE - } - } - Sys.sleep(min(sqrt(attempt), 5)) - } else { - #if (sum_received_chunks == 0) { - # # Accounting for the fist chunk received in ByChunks and - # # setting it to complete - # # ByChunks needs the first chunk to calculate remaining time - # received_files <- list.files(ecflow_suite_dir_suite) - # received_chunks <- received_files[grepl('Rds$', - # received_files)] - #} - t_begin_transfer_back <- Sys.time() - rsync_output <- tryCatch({ - system(paste0("rsync -rav --include-from=", rsync_petition_file, " '", - cluster[['queue_host']], ":", remote_ecflow_suite_dir_suite, "' ", - ecflow_suite_dir_suite, "/"), intern = TRUE) - }, error = function(e) { - message("Warning: rsync from remote server to collect results failed. ", - "Retrying soon.") - failed <- TRUE - }) - t_end_transfer_back <- Sys.time() - t_transfer_back <- t_transfer_back + as.numeric(difftime(t_end_transfer_back, - t_begin_transfer_back, units = 'secs')) - if (!failed) { - #if (sum_received_chunks == 0) { - # rsync_output <- c(rsync_output, received_chunks) - #} - received_running <- grepl('running$', rsync_output) - for (received_chunk_index in which(received_running)) { - file_name <- rsync_output[received_chunk_index] - task_name <- find_task_name(file_name) - system(paste0('ecflow_client --force=active recursive ', - task_name, - " --host=", ecflow_server[['host']], - " --port=", ecflow_server[['port']])) - } - received_crashed <- grepl('crashed$', rsync_output) - for (received_chunk_index in which(received_crashed)) { - file_name <- rsync_output[received_chunk_index] - task_name <- find_task_name(file_name) - system(paste0('ecflow_client --force=aborted recursive ', - task_name, - " --host=", ecflow_server[['host']], - " --port=", ecflow_server[['port']])) - } - received_chunks <- grepl('Rds$', rsync_output) - for (received_chunk_index in which(received_chunks)) { - file_name <- rsync_output[received_chunk_index] - task_name <- find_task_name(file_name) - system(paste0('ecflow_client --force=complete recursive ', - task_name, - " --host=", ecflow_server[['host']], - " --port=", ecflow_server[['port']])) - sum_received_chunks <- sum_received_chunks + 1 - if (!first_chunk_received) { - if (!is.null(time_before_first_chunk)) { - time_after_first_chunk <- Sys.time() - estimate <- (time_after_first_chunk - - time_before_first_chunk) * - ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / - cluster[['max_jobs']]) - units(estimate) <- 'mins' - startR:::.message( - paste0('Remaining time estimate (neglecting queue and ', - 'merge time) (at ', format(time_after_first_chunk), - '): ', format(estimate), ' (', - format(time_after_first_chunk - - time_before_first_chunk), ' per chunk)') - ) - } - first_chunk_received <- TRUE - } - } - if (sum_received_chunks / num_outputs == prod(unlist(chunks))) { - done <- TRUE - } else if (!wait) { - stop("Computation in progress...") - } - } - Sys.sleep(cluster[['polling_period']]) - } - attempt <- attempt + 1 - } - file.remove(rsync_petition_file) - timings[['transfer_back']] <- t_transfer_back - if (!is.null(cluster[['temp_dir']])) { - system(paste0('ssh ', cluster[['queue_host']], ' "rm -rf ', - remote_ecflow_suite_dir_suite, '"')) - } - if (remove) { - .warning("ATTENTION: The source chunks will be removed from the ", - "system. Store the result after Collect() ends if needed.") - } - t_begin_merge <- Sys.time() - result <- startR:::.MergeChunks(ecflow_suite_dir, suite_id, remove) - t_end_merge <- Sys.time() - timings[['merge']] <- as.numeric(difftime(t_end_merge, t_begin_merge, units = 'secs')) - received_files <- list.files(ecflow_suite_dir_suite, full.names = TRUE) - received_timings_files <- received_files[grepl('timings$', received_files)] - for (timings_file in received_timings_files) { - times <- readRDS(timings_file) - timings[['queue']] <- c(timings[['queue']], times['queue']) - timings[['job_setup']] <- c(timings[['job_setup']], times['job_setup']) - timings[['load']] <- c(timings[['load']], times['load']) - timings[['compute']] <- c(timings[['compute']], times['compute']) - } - if (remove) { - system(paste0("ecflow_client --delete=force yes /STARTR_CHUNKING_", - suite_id, " --host=", ecflow_server[['host']], - " --port=", ecflow_server[['port']])) - unlink(paste0(ecflow_suite_dir_suite), - recursive = TRUE) - } - if (attempt > 2) { - t_end_total <- Sys.time() - timings[['total']] <- as.numeric(difftime(t_end_total, timings[['total']], units = 'secs')) - } else { - # When attempt <= 2, it means all results were ready possibly from - # long ago, so is not straightfowrard to work out total time. - timings[['total']] <- NA - } - message(paste0("* Computation ended successfully.")) - message(paste0("* Number of chunks: ", - timings[['nchunks']])) - message(paste0("* Max. number of concurrent chunks (jobs): ", - timings[['concurrent_chunks']])) - message(paste0("* Requested cores per job: ", - timings[['cores_per_job']])) - message(paste0("* Load threads per chunk: ", - timings[['threads_load']])) - message(paste0("* Compute threads per chunk: ", - timings[['threads_compute']])) - message(paste0("* Total time (s): ", - timings[['total']])) - message(paste0("* Chunking setup: ", - timings[['bychunks_setup']])) - message(paste0("* Data upload to cluster: ", - timings[['transfer']])) - message(paste0("* All chunks: ", - timings[['total']] - - timings[['bychunks_setup']] - - timings[['transfer']] - - timings[['transfer_back']] - - timings[['merge']])) - message(paste0("* Transfer results from cluster: ", - timings[['transfer_back']])) - message(paste0("* Merge: ", - timings[['merge']])) - message(paste0("* Each chunk: ")) - message(paste0("* queue: ")) - message(paste0("* mean: ", - mean(timings[['queue']]))) - message(paste0("* min: ", - min(timings[['queue']]))) - message(paste0("* max: ", - max(timings[['queue']]))) - message(paste0("* job setup: ")) - message(paste0("* mean: ", - mean(timings[['job_setup']]))) - message(paste0("* min: ", - min(timings[['job_setup']]))) - message(paste0("* max: ", - max(timings[['job_setup']]))) - message(paste0("* load: ")) - message(paste0("* mean: ", - mean(timings[['load']]))) - message(paste0("* min: ", - min(timings[['load']]))) - message(paste0("* max: ", - max(timings[['load']]))) - message(paste0("* compute: ")) - message(paste0("* mean: ", - mean(timings[['compute']]))) - message(paste0("* min: ", - min(timings[['compute']]))) - message(paste0("* max: ", - max(timings[['compute']]))) - #system("ecflow_client --shutdown --port=5678") - #system("ecflow_stop.sh -p 5678") - #result <- readRDS(paste0(ecflow_output_dir, '/result.Rds')) - #file.remove(paste0(ecflow_output_dir, '/result.Rds')) - attr(result, 'startR_compute_profiling') <- timings - result -} +Collect <- function(startr_exec, wait = TRUE, remove = TRUE) { + if (!('startR_exec' %in% class(startr_exec))) { + stop("Parameter 'startr_exec' must be an object of the class ", + "'startR_exec', as returned by Collect(..., wait = FALSE).") + } + if (Sys.which('ecflow_client') == '') { + stop("ecFlow must be installed in order to collect results from a ", + "Compute() execution.") + } + cluster <- startr_exec[['cluster']] + ecflow_server <- startr_exec[['ecflow_server']] + suite_id <- startr_exec[['suite_id']] + chunks <- startr_exec[['chunks']] + num_outputs <- startr_exec[['num_outputs']] + ecflow_suite_dir <- startr_exec[['ecflow_suite_dir']] + timings <- startr_exec[['timings']] + ecflow_suite_dir_suite <- paste0(ecflow_suite_dir, '/STARTR_CHUNKING_', + suite_id, '/') + if (!is.null(cluster[['temp_dir']])) { + remote_ecflow_suite_dir_suite <- paste0(cluster[['temp_dir']], + '/STARTR_CHUNKING_', + suite_id, '/') + } + find_task_name <- function(received_file) { + file_name <- received_file + parts <- strsplit(file_name, '__')[[1]] + parts <- parts[c(2:(length(parts) - 1))] + chunk_indices <- rev(sapply(parts, function(x) { + as.numeric(strsplit(x, '_')[[1]][2]) + })) + task_pattern <- paste(paste0('*_', chunk_indices, '/'), + collapse = '') + task_glob <- paste0(ecflow_suite_dir_suite, '/*/*/', + task_pattern) + task_path <- Sys.glob(task_glob) + if (length(task_path) != 1) { + stop("Unexpected error while receiving results.") + } + task_name <- strsplit(task_path, 'computation')[[1]][2] + task_name <- paste0('/STARTR_CHUNKING_', suite_id, + '/computation', task_name) + task_name + } + done <- FALSE + attempt <- 1 + sum_received_chunks <- sum(grepl('output.*\\.Rds', + list.files(ecflow_suite_dir_suite))) + if (cluster[['bidirectional']]) { + t_transfer_back <- NA + } else { + t_transfer_back <- 0 + } + time_before_first_chunk <- startr_exec[['t_begin_first_chunk']] + first_chunk_received <- FALSE + rsync_petition_file_lines <- c('+ *.Rds', '+ *.timings', '+ *.crashed', + '+ *.running', '- *') + rsync_petition_file <- tempfile() + writeLines(rsync_petition_file_lines, rsync_petition_file) + Sys.sleep(2) + while (!done) { + failed <- FALSE + if (cluster[['bidirectional']]) { + status <- system(paste0("ecflow_client --get_state=STARTR_CHUNKING_", + suite_id, " --host=", + ecflow_server[['host']], " --port=", ecflow_server[['port']]), + intern = TRUE) + if (any(grepl(paste0("suite STARTR_CHUNKING_", suite_id, " #.* state:complete"), status))) { + done <- TRUE + } else if (!wait) { + stop("Computation in progress...") + } + if (!first_chunk_received) { + if (any(grepl('state:complete', status))) { + if (!is.null(time_before_first_chunk)) { + time_after_first_chunk <- Sys.time() + estimate <- (time_after_first_chunk - + time_before_first_chunk) * + ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / + cluster[['max_jobs']]) + units(estimate) <- 'mins' + startR:::.message( + paste0('Remaining time estimate (neglecting queue and ', + 'merge time) (at ', format(time_after_first_chunk), + '): ', format(estimate), ' (', + format(time_after_first_chunk - + time_before_first_chunk), ' per chunk)') + ) + } + first_chunk_received <- TRUE + } + } + Sys.sleep(min(sqrt(attempt), 5)) + } else { + #if (sum_received_chunks == 0) { + # # Accounting for the fist chunk received in ByChunks and + # # setting it to complete + # # ByChunks needs the first chunk to calculate remaining time + # received_files <- list.files(ecflow_suite_dir_suite) + # received_chunks <- received_files[grepl('Rds$', + # received_files)] + #} + t_begin_transfer_back <- Sys.time() + rsync_output <- tryCatch({ + system(paste0("rsync -rav --include-from=", rsync_petition_file, " '", + cluster[['queue_host']], ":", remote_ecflow_suite_dir_suite, "' ", + ecflow_suite_dir_suite, "/"), intern = TRUE) + }, error = function(e) { + message("Warning: rsync from remote server to collect results failed. ", + "Retrying soon.") + failed <- TRUE + }) + t_end_transfer_back <- Sys.time() + t_transfer_back <- t_transfer_back + as.numeric(difftime(t_end_transfer_back, + t_begin_transfer_back, units = 'secs')) + if (!failed) { + #if (sum_received_chunks == 0) { + # rsync_output <- c(rsync_output, received_chunks) + #} + received_running <- grepl('running$', rsync_output) + for (received_chunk_index in which(received_running)) { + file_name <- rsync_output[received_chunk_index] + task_name <- find_task_name(file_name) + system(paste0('ecflow_client --force=active recursive ', + task_name, + " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + } + received_crashed <- grepl('crashed$', rsync_output) + for (received_chunk_index in which(received_crashed)) { + file_name <- rsync_output[received_chunk_index] + task_name <- find_task_name(file_name) + system(paste0('ecflow_client --force=aborted recursive ', + task_name, + " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + } + received_chunks <- grepl('Rds$', rsync_output) + for (received_chunk_index in which(received_chunks)) { + file_name <- rsync_output[received_chunk_index] + task_name <- find_task_name(file_name) + system(paste0('ecflow_client --force=complete recursive ', + task_name, + " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + sum_received_chunks <- sum_received_chunks + 1 + if (!first_chunk_received) { + if (!is.null(time_before_first_chunk)) { + time_after_first_chunk <- Sys.time() + estimate <- (time_after_first_chunk - + time_before_first_chunk) * + ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / + cluster[['max_jobs']]) + units(estimate) <- 'mins' + startR:::.message( + paste0('Remaining time estimate (neglecting queue and ', + 'merge time) (at ', format(time_after_first_chunk), + '): ', format(estimate), ' (', + format(time_after_first_chunk - + time_before_first_chunk), ' per chunk)') + ) + } + first_chunk_received <- TRUE + } + } + if (sum_received_chunks / num_outputs == prod(unlist(chunks))) { + done <- TRUE + } else if (!wait) { + stop("Computation in progress...") + } + } + Sys.sleep(cluster[['polling_period']]) + } + attempt <- attempt + 1 + } + file.remove(rsync_petition_file) + timings[['transfer_back']] <- t_transfer_back + if (!is.null(cluster[['temp_dir']])) { + system(paste0('ssh ', cluster[['queue_host']], ' "rm -rf ', + remote_ecflow_suite_dir_suite, '"')) + } + if (remove) { + .warning("ATTENTION: The source chunks will be removed from the ", + "system. Store the result after Collect() ends if needed.") + } + t_begin_merge <- Sys.time() + result <- startR:::.MergeChunks(ecflow_suite_dir, suite_id, remove) + t_end_merge <- Sys.time() + timings[['merge']] <- as.numeric(difftime(t_end_merge, t_begin_merge, units = 'secs')) + received_files <- list.files(ecflow_suite_dir_suite, full.names = TRUE) + received_timings_files <- received_files[grepl('timings$', received_files)] + for (timings_file in received_timings_files) { + times <- readRDS(timings_file) + timings[['queue']] <- c(timings[['queue']], times['queue']) + timings[['job_setup']] <- c(timings[['job_setup']], times['job_setup']) + timings[['load']] <- c(timings[['load']], times['load']) + timings[['compute']] <- c(timings[['compute']], times['compute']) + } + if (remove) { + system(paste0("ecflow_client --delete=force yes /STARTR_CHUNKING_", + suite_id, " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + unlink(paste0(ecflow_suite_dir_suite), + recursive = TRUE) + } + if (attempt > 2) { + t_end_total <- Sys.time() + timings[['total']] <- as.numeric(difftime(t_end_total, timings[['total']], units = 'secs')) + } else { + # When attempt <= 2, it means all results were ready possibly from + # long ago, so is not straightfowrard to work out total time. + timings[['total']] <- NA + } + message(paste0("* Computation ended successfully.")) + message(paste0("* Number of chunks: ", + timings[['nchunks']])) + message(paste0("* Max. number of concurrent chunks (jobs): ", + timings[['concurrent_chunks']])) + message(paste0("* Requested cores per job: ", + timings[['cores_per_job']])) + message(paste0("* Load threads per chunk: ", + timings[['threads_load']])) + message(paste0("* Compute threads per chunk: ", + timings[['threads_compute']])) + message(paste0("* Total time (s): ", + timings[['total']])) + message(paste0("* Chunking setup: ", + timings[['bychunks_setup']])) + message(paste0("* Data upload to cluster: ", + timings[['transfer']])) + message(paste0("* All chunks: ", + timings[['total']] - + timings[['bychunks_setup']] - + timings[['transfer']] - + timings[['transfer_back']] - + timings[['merge']])) + message(paste0("* Transfer results from cluster: ", + timings[['transfer_back']])) + message(paste0("* Merge: ", + timings[['merge']])) + message(paste0("* Each chunk: ")) + message(paste0("* queue: ")) + message(paste0("* mean: ", + mean(timings[['queue']]))) + message(paste0("* min: ", + min(timings[['queue']]))) + message(paste0("* max: ", + max(timings[['queue']]))) + message(paste0("* job setup: ")) + message(paste0("* mean: ", + mean(timings[['job_setup']]))) + message(paste0("* min: ", + min(timings[['job_setup']]))) + message(paste0("* max: ", + max(timings[['job_setup']]))) + message(paste0("* load: ")) + message(paste0("* mean: ", + mean(timings[['load']]))) + message(paste0("* min: ", + min(timings[['load']]))) + message(paste0("* max: ", + max(timings[['load']]))) + message(paste0("* compute: ")) + message(paste0("* mean: ", + mean(timings[['compute']]))) + message(paste0("* min: ", + min(timings[['compute']]))) + message(paste0("* max: ", + max(timings[['compute']]))) + #system("ecflow_client --shutdown --port=5678") + #system("ecflow_stop.sh -p 5678") + #result <- readRDS(paste0(ecflow_output_dir, '/result.Rds')) + #file.remove(paste0(ecflow_output_dir, '/result.Rds')) + attr(result, 'startR_compute_profiling') <- timings + result +} diff --git a/R/Compute.R b/R/Compute.R index 12c2dcea6cfcfa923f009d37746f2c263b94e5fd..570b16d4d5f52a90ec8ca3a5d33307f6efd1ad68 100644 --- a/R/Compute.R +++ b/R/Compute.R @@ -1,75 +1,75 @@ -Compute <- function(workflow, chunks = 'auto', - threads_load = 1, threads_compute = 1, - cluster = NULL, ecflow_suite_dir = NULL, - ecflow_server = NULL, silent = FALSE, debug = FALSE, - wait = TRUE) { - # Check workflow - if (!any(c('startR_cube', 'startR_workflow') %in% class(workflow))) { - stop("Parameter 'workflow' must be an object of class 'startR_cube' as ", - "returned by Start or of class 'startR_workflow' as returned by ", - "AddStep.") - } - - if ('startR_cube' %in% class(workflow)) { - #machine_free_ram <- 1000000000 - #max_ram_ratio <- 0.5 - #data_size <- prod(c(attr(workflow, 'Dimensions'), 8)) - #if (data_size > (machine_free_ram * max_ram_ratio)) { - # stop("It is not possible to fit the requested data (", data_size, - # " bytes) into the maximum allowed free ram (", max_ram_ratio, - # " x ", machine_free_ram, ").") - #} - eval(workflow) - } else { - # TODO: - #explore tree of operations and identify set of operations that reduce dimensionality as much as possible - # while being able to fit in (cluster and to exploit number of available nodes) | (machine) - #combine set of operations into a single function - #Goal: to build manually a function following this pattern: - #operation <- function(input1, input2) { - # fun1 <- workflow$fun - # fun1(input1, input2, names(workflow$params)[1] = workflow$params[[1]]) - #} - op_text <- "function(" - op_text <- paste0(op_text, - paste(paste0('input', 1:length(workflow$inputs)), - collapse = ', ')) - op_text <- paste0(op_text, ") {") - op_text <- paste0(op_text, "\n fun1 <- ", paste(deparse(workflow$fun), collapse = '\n')) - op_text <- paste0(op_text, "\n res <- fun1(", - paste(paste0('input', 1:length(workflow$inputs)), - collapse = ", ")) - if (length(workflow$params) > 0) { - for (j in 1:length(workflow$params)) { - op_text <- paste0(op_text, ", ") - op_text <- paste0(op_text, names(workflow$params)[j], " = ", - paste(deparse(workflow$params[[j]]), collapse = '\n')) - } - } - op_text <- paste0(op_text, ")") - op_text <- paste0(op_text, "\n}") - operation <- eval(parse(text = op_text)) - operation <- Step(operation, - attr(workflow$fun, 'TargetDims'), - attr(workflow$fun, 'OutputDims'), - attr(workflow$fun, 'UseLibraries'), - attr(workflow$fun, 'UseAttributes')) - - if (!all(sapply(workflow$inputs, class) == 'startR_cube')) { - stop("Workflows with only one step supported by now.") - } - # Run ByChunks with the combined operation - res <- ByChunks(step_fun = operation, - cube_headers = workflow$inputs, - chunks = chunks, - threads_load = threads_load, - threads_compute = threads_compute, - cluster = cluster, - ecflow_suite_dir = ecflow_suite_dir, - ecflow_server = ecflow_server, - silent = silent, debug = debug, wait = wait) - # TODO: carry out remaining steps locally, using multiApply - # Return results - res - } -} +Compute <- function(workflow, chunks = 'auto', + threads_load = 1, threads_compute = 1, + cluster = NULL, ecflow_suite_dir = NULL, + ecflow_server = NULL, silent = FALSE, debug = FALSE, + wait = TRUE) { + # Check workflow + if (!any(c('startR_cube', 'startR_workflow') %in% class(workflow))) { + stop("Parameter 'workflow' must be an object of class 'startR_cube' as ", + "returned by Start or of class 'startR_workflow' as returned by ", + "AddStep.") + } + + if ('startR_cube' %in% class(workflow)) { + #machine_free_ram <- 1000000000 + #max_ram_ratio <- 0.5 + #data_size <- prod(c(attr(workflow, 'Dimensions'), 8)) + #if (data_size > (machine_free_ram * max_ram_ratio)) { + # stop("It is not possible to fit the requested data (", data_size, + # " bytes) into the maximum allowed free ram (", max_ram_ratio, + # " x ", machine_free_ram, ").") + #} + eval(workflow) + } else { + # TODO: + #explore tree of operations and identify set of operations that reduce dimensionality as much as possible + # while being able to fit in (cluster and to exploit number of available nodes) | (machine) + #combine set of operations into a single function + #Goal: to build manually a function following this pattern: + #operation <- function(input1, input2) { + # fun1 <- workflow$fun + # fun1(input1, input2, names(workflow$params)[1] = workflow$params[[1]]) + #} + op_text <- "function(" + op_text <- paste0(op_text, + paste(paste0('input', 1:length(workflow$inputs)), + collapse = ', ')) + op_text <- paste0(op_text, ") {") + op_text <- paste0(op_text, "\n fun1 <- ", paste(deparse(workflow$fun), collapse = '\n')) + op_text <- paste0(op_text, "\n res <- fun1(", + paste(paste0('input', 1:length(workflow$inputs)), + collapse = ", ")) + if (length(workflow$params) > 0) { + for (j in 1:length(workflow$params)) { + op_text <- paste0(op_text, ", ") + op_text <- paste0(op_text, names(workflow$params)[j], " = ", + paste(deparse(workflow$params[[j]]), collapse = '\n')) + } + } + op_text <- paste0(op_text, ")") + op_text <- paste0(op_text, "\n}") + operation <- eval(parse(text = op_text)) + operation <- Step(operation, + attr(workflow$fun, 'TargetDims'), + attr(workflow$fun, 'OutputDims'), + attr(workflow$fun, 'UseLibraries'), + attr(workflow$fun, 'UseAttributes')) + + if (!all(sapply(workflow$inputs, class) == 'startR_cube')) { + stop("Workflows with only one step supported by now.") + } + # Run ByChunks with the combined operation + res <- ByChunks(step_fun = operation, + cube_headers = workflow$inputs, + chunks = chunks, + threads_load = threads_load, + threads_compute = threads_compute, + cluster = cluster, + ecflow_suite_dir = ecflow_suite_dir, + ecflow_server = ecflow_server, + silent = silent, debug = debug, wait = wait) + # TODO: carry out remaining steps locally, using multiApply + # Return results + res + } +} diff --git a/R/SelectorChecker.R b/R/SelectorChecker.R index 14bae7edc40ef2bb921567a0b932517a13594d78..8765da287cba3277b7ac1874b056588d49252688 100644 --- a/R/SelectorChecker.R +++ b/R/SelectorChecker.R @@ -1,269 +1,269 @@ -SelectorChecker <- function(selectors, var = NULL, return_indices = TRUE, - tolerance = NULL) { - if (length(selectors) == 0) { - stop("No selectors provided in 'selectors'.") - } - if (return_indices) { - if (is.list(selectors)) { - if (length(selectors) != 2) { - stop("'selectors' provided in a wrong format.") - } - crescent_selectors <- TRUE - if (all(sapply(selectors, - function(x) { - any(c('numeric', "POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(x)) - }))) { - if (selectors[[2]] < selectors[[1]]) { - crescent_selectors <- FALSE - } - } - for (i in 1:length(selectors)) { - if (is.null(var)) { - if (!is.numeric(selectors[[i]])) { - stop("No selector values provided in 'var'.") - } else { - selectors[[i]] <- round(selectors[[i]]) - } - } else if (is.na(selectors[[i]])) { - if (i == 1) { - if (crescent_selectors) { - selectors[[i]] <- 1 - } else { - selectors[[i]] <- length(var) - } - } - else { - if (crescent_selectors) { - selectors[[i]] <- length(var) - } else { - selectors[[i]] <- 1 - } - } - } else if (is.character(selectors[[i]])) { - if (is.character(var)) { - candidate <- which(var == selectors[[i]]) - if (length(candidate) > 0) { - selectors[[i]] <- candidate[1] - } else { - stop("Selector value not found in 'var'.") - } - } else { - stop("Character selectors provided but possible values in 'var' are not character.") - } - } else if (is.numeric(selectors[[i]])) { - if (is.numeric(var)) { - - tol <- 0 - if (!is.null(tolerance)) { - if (!any(class(tolerance) %in% "numeric")) { - stop("Expected a numeric *_tolerance.") - } - tol <- tolerance - } - - val <- selectors[[i]] - - if (i == 1) { - if (crescent_selectors) { - val <- val - tol - if (var[1] < var[2]) { - selectors[[i]] <- which(var >= val)[1] - } else if (var[1] > var[2]) { - selectors[[i]] <- rev(which(var >= val))[1] - } - - } else { - val <- val + tol - if (var[1] < var[2]) { - selectors[[i]] <- rev(which(var <= val))[1] - } else if (var[1] > var[2]) { - selectors[[i]] <- which(var <= val)[1] - } - } - } - else if (i == 2) { - if (crescent_selectors) { - val <- val + tol - if (var[1] < var[2]) { - selectors[[i]] <- rev(which(var <= val))[1] - } else if (var[1] > var[2]) { - selectors[[i]] <- which(var <= val)[1] - } - - } else { - val <- val - tol - if (var[1] < var[2]) { - selectors[[i]] <- which(var >= val)[1] - } else if (var[1] > var[2]) { - selectors[[i]] <- rev(which(var >= val))[1] - } - } - } - - - } else { - stop("Numeric selectors provided but possible values in 'var' are not numeric.") - } - } else if (any(c("POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(selectors[[i]]))) { - # TODO: Here, change to as above (numeric part). - if (any(c("POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(var))) { - val <- selectors[[i]] - tol <- 0 - if (!is.null(tolerance)) { - if (!any(class(tolerance) %in% "difftime")) { - stop("Expected a difftime *_tolerance.") - } - tol <- tolerance - } - if (i == 1) { - if (crescent_selectors) { - val <- val - tol - selectors[[i]] <- which(var >= val)[1] - } else { - val <- val + tol - selectors[[i]] <- rev(which(var <= val))[1] - } - } - else { - if (crescent_selectors) { - val <- val + tol - selectors[[i]] <- rev(which(var <= val))[1] - } else { - val <- val - tol - selectors[[i]] <- which(var >= val)[1] - } - } - } else { - stop("Datetime selectors provided but possible values in 'var' are not datetime.") - } - } - } - - # The checker is returning a list of two indices. - ##selectors[[1]]:selectors[[2]] - selectors - } else if (is.numeric(selectors)) { - if (is.null(var)) { - ## TODO: Crash if negative indices? - round(selectors) - } else { - if (is.numeric(var)) { - if (!all(selectors %in% var)) { - .warning(paste0("Numeric selectors have been ", - "provided for a dimension defined along a ", - "numeric variable, but no exact match ", - "found for all the selectors. Taking the index of the ", - "nearest values.")) - } - if (!is.null(tolerance)) { - if (!any(class(tolerance) %in% 'numeric')) { - stop("Expected a numeric *_tolerance.") - } - } - sapply(selectors, function(x) { - dif <- abs(var - x) - res <- which.min(dif)[1] - if (!is.null(tolerance)) { - if (dif[res] > tolerance) { - stop("Could not find a value in 'var' close ", - "enough to one of the 'selectors', ", - "according to 'tolerance'.") - } - } - res - }) - } else { - stop("Numeric selectors provided but possible values in 'var' are not numeric.") - } - } - } else if (any(c('POSIXct', 'POSIXlt', 'POSIXt', 'Date') %in% class(selectors))) { - if (is.null(var)) { - stop("Numeric selectors have been provided for a dimension ", - "defined along a date variable, but no possible values ", - "provided in 'var'.") - } - if (!all(selectors %in% var)) { - .warning(paste0("Date selectors have been ", - "provided for a dimension defined along a ", - "date variable, but no exact match ", - "found for all the selectors. Taking the index of the ", - "nearest values.")) - } - if (!is.null(tolerance)) { - if (!any(class(tolerance) %in% 'difftime')) { - stop("Expected a difftime *_tolerance.") - } - } - sapply(selectors, function(x) { - dif <- abs(var - x) - res <- which.min(dif)[1] - if (!is.null(tolerance)) { - if (dif[res] > tolerance) { - res <- NA - #stop("Could not find a value in 'var' close ", - # "enough to one of the 'selectors', ", - # "according to 'tolerance'.") - } - } - res - }) - } else { - if (is.null(var)) { - stop("No selector values provided in 'var'.") - } else { - if ((length(selectors) == 1) && - (selectors %in% c('all', 'first', 'last'))) { - if (selectors == 'all') { - 1:length(var) - } else if (selectors == 'first') { - 1 - } else { - length(var) - } - } else { - if (!identical(class(var), class(selectors))) { - stop("Class of provided selectors does not match class of 'var'.") - } - candidates <- match(as.vector(selectors), as.vector(var)) - if (length(candidates) == 0 | any(is.na(candidates))) { - stop("Selectors do not match values in 'var'.") - } else if (length(candidates) != length(selectors)) { - stop("Some selectors do not match values in 'var'.") - } - candidates - } - } - } - } else { - if (!is.null(var)) { - if (is.list(selectors)) { - if (length(selectors) != 2) { - stop("'selectors' provided in a wrong format.") - } else { - var[selectors[[1]]:selectors[[2]]] - } - } else if (is.numeric(selectors)) { - if (length(selectors) > 0) { - var[selectors] - } else { - stop("No selectors provided.") - } - } else { - if ((length(selectors) == 1) && - (selectors %in% c('all', 'first', 'last'))) { - if (selectors == 'all') { - var - } else if (selectors == 'first') { - head(var, 1) - } else { - tail(var, 1) - } - } else { - selectors - } - } - } else { - selectors - } - } -} +SelectorChecker <- function(selectors, var = NULL, return_indices = TRUE, + tolerance = NULL) { + if (length(selectors) == 0) { + stop("No selectors provided in 'selectors'.") + } + if (return_indices) { + if (is.list(selectors)) { + if (length(selectors) != 2) { + stop("'selectors' provided in a wrong format.") + } + crescent_selectors <- TRUE + if (all(sapply(selectors, + function(x) { + any(c('numeric', "POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(x)) + }))) { + if (selectors[[2]] < selectors[[1]]) { + crescent_selectors <- FALSE + } + } + for (i in 1:length(selectors)) { + if (is.null(var)) { + if (!is.numeric(selectors[[i]])) { + stop("No selector values provided in 'var'.") + } else { + selectors[[i]] <- round(selectors[[i]]) + } + } else if (is.na(selectors[[i]])) { + if (i == 1) { + if (crescent_selectors) { + selectors[[i]] <- 1 + } else { + selectors[[i]] <- length(var) + } + } + else { + if (crescent_selectors) { + selectors[[i]] <- length(var) + } else { + selectors[[i]] <- 1 + } + } + } else if (is.character(selectors[[i]])) { + if (is.character(var)) { + candidate <- which(var == selectors[[i]]) + if (length(candidate) > 0) { + selectors[[i]] <- candidate[1] + } else { + stop("Selector value not found in 'var'.") + } + } else { + stop("Character selectors provided but possible values in 'var' are not character.") + } + } else if (is.numeric(selectors[[i]])) { + if (is.numeric(var)) { + + tol <- 0 + if (!is.null(tolerance)) { + if (!any(class(tolerance) %in% "numeric")) { + stop("Expected a numeric *_tolerance.") + } + tol <- tolerance + } + + val <- selectors[[i]] + + if (i == 1) { + if (crescent_selectors) { + val <- val - tol + if (var[1] < var[2]) { + selectors[[i]] <- which(var >= val)[1] + } else if (var[1] > var[2]) { + selectors[[i]] <- rev(which(var >= val))[1] + } + + } else { + val <- val + tol + if (var[1] < var[2]) { + selectors[[i]] <- rev(which(var <= val))[1] + } else if (var[1] > var[2]) { + selectors[[i]] <- which(var <= val)[1] + } + } + } + else if (i == 2) { + if (crescent_selectors) { + val <- val + tol + if (var[1] < var[2]) { + selectors[[i]] <- rev(which(var <= val))[1] + } else if (var[1] > var[2]) { + selectors[[i]] <- which(var <= val)[1] + } + + } else { + val <- val - tol + if (var[1] < var[2]) { + selectors[[i]] <- which(var >= val)[1] + } else if (var[1] > var[2]) { + selectors[[i]] <- rev(which(var >= val))[1] + } + } + } + + + } else { + stop("Numeric selectors provided but possible values in 'var' are not numeric.") + } + } else if (any(c("POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(selectors[[i]]))) { + # TODO: Here, change to as above (numeric part). + if (any(c("POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(var))) { + val <- selectors[[i]] + tol <- 0 + if (!is.null(tolerance)) { + if (!any(class(tolerance) %in% "difftime")) { + stop("Expected a difftime *_tolerance.") + } + tol <- tolerance + } + if (i == 1) { + if (crescent_selectors) { + val <- val - tol + selectors[[i]] <- which(var >= val)[1] + } else { + val <- val + tol + selectors[[i]] <- rev(which(var <= val))[1] + } + } + else { + if (crescent_selectors) { + val <- val + tol + selectors[[i]] <- rev(which(var <= val))[1] + } else { + val <- val - tol + selectors[[i]] <- which(var >= val)[1] + } + } + } else { + stop("Datetime selectors provided but possible values in 'var' are not datetime.") + } + } + } + + # The checker is returning a list of two indices. + ##selectors[[1]]:selectors[[2]] + selectors + } else if (is.numeric(selectors)) { + if (is.null(var)) { + ## TODO: Crash if negative indices? + round(selectors) + } else { + if (is.numeric(var)) { + if (!all(selectors %in% var)) { + .warning(paste0("Numeric selectors have been ", + "provided for a dimension defined along a ", + "numeric variable, but no exact match ", + "found for all the selectors. Taking the index of the ", + "nearest values.")) + } + if (!is.null(tolerance)) { + if (!any(class(tolerance) %in% 'numeric')) { + stop("Expected a numeric *_tolerance.") + } + } + sapply(selectors, function(x) { + dif <- abs(var - x) + res <- which.min(dif)[1] + if (!is.null(tolerance)) { + if (dif[res] > tolerance) { + stop("Could not find a value in 'var' close ", + "enough to one of the 'selectors', ", + "according to 'tolerance'.") + } + } + res + }) + } else { + stop("Numeric selectors provided but possible values in 'var' are not numeric.") + } + } + } else if (any(c('POSIXct', 'POSIXlt', 'POSIXt', 'Date') %in% class(selectors))) { + if (is.null(var)) { + stop("Numeric selectors have been provided for a dimension ", + "defined along a date variable, but no possible values ", + "provided in 'var'.") + } + if (!all(selectors %in% var)) { + .warning(paste0("Date selectors have been ", + "provided for a dimension defined along a ", + "date variable, but no exact match ", + "found for all the selectors. Taking the index of the ", + "nearest values.")) + } + if (!is.null(tolerance)) { + if (!any(class(tolerance) %in% 'difftime')) { + stop("Expected a difftime *_tolerance.") + } + } + sapply(selectors, function(x) { + dif <- abs(var - x) + res <- which.min(dif)[1] + if (!is.null(tolerance)) { + if (dif[res] > tolerance) { + res <- NA + #stop("Could not find a value in 'var' close ", + # "enough to one of the 'selectors', ", + # "according to 'tolerance'.") + } + } + res + }) + } else { + if (is.null(var)) { + stop("No selector values provided in 'var'.") + } else { + if ((length(selectors) == 1) && + (selectors %in% c('all', 'first', 'last'))) { + if (selectors == 'all') { + 1:length(var) + } else if (selectors == 'first') { + 1 + } else { + length(var) + } + } else { + if (!identical(class(var), class(selectors))) { + stop("Class of provided selectors does not match class of 'var'.") + } + candidates <- match(as.vector(selectors), as.vector(var)) + if (length(candidates) == 0 | any(is.na(candidates))) { + stop("Selectors do not match values in 'var'.") + } else if (length(candidates) != length(selectors)) { + stop("Some selectors do not match values in 'var'.") + } + candidates + } + } + } + } else { + if (!is.null(var)) { + if (is.list(selectors)) { + if (length(selectors) != 2) { + stop("'selectors' provided in a wrong format.") + } else { + var[selectors[[1]]:selectors[[2]]] + } + } else if (is.numeric(selectors)) { + if (length(selectors) > 0) { + var[selectors] + } else { + stop("No selectors provided.") + } + } else { + if ((length(selectors) == 1) && + (selectors %in% c('all', 'first', 'last'))) { + if (selectors == 'all') { + var + } else if (selectors == 'first') { + head(var, 1) + } else { + tail(var, 1) + } + } else { + selectors + } + } + } else { + selectors + } + } +} diff --git a/R/Sort.R b/R/Sort.R index bff2654a4dd396078e12315f4f5cfb27fca8eb82..825272bf860d8ec569fdf9559bc21cfff9417554 100644 --- a/R/Sort.R +++ b/R/Sort.R @@ -1,36 +1,36 @@ -Sort <- function(...) { - params <- list(...) - f <- "function(x) { - dim_bk <- dim(x) - x <- do.call(sort, c(list(x, index.return = TRUE), - PARAMS)) - dim(x$x) <- dim_bk - dim(x$ix) <- dim_bk - x - }" - f <- gsub("PARAMS", deparse(params), f) - r <- eval(parse(text = f)) - attr(r, 'circular') <- FALSE - r -} - -CircularSort <- function(start, end, ...) { - params <- list(...) - f <- "function (x) { - start <- START - end <- END - dim_bk <- dim(x) - x <- do.call(sort, c(list((x - start) %% (end - start) + start, - index.return = TRUE), - PARAMS)) - dim(x$x) <- dim_bk - dim(x$ix) <- dim_bk - x - }" - f <- gsub("START", deparse(start), f) - f <- gsub("END", deparse(end), f) - f <- gsub("PARAMS", deparse(params), f) - r <- eval(parse(text = f)) - attr(r, 'circular') <- TRUE - r -} +Sort <- function(...) { + params <- list(...) + f <- "function(x) { + dim_bk <- dim(x) + x <- do.call(sort, c(list(x, index.return = TRUE), + PARAMS)) + dim(x$x) <- dim_bk + dim(x$ix) <- dim_bk + x + }" + f <- gsub("PARAMS", deparse(params), f) + r <- eval(parse(text = f)) + attr(r, 'circular') <- FALSE + r +} + +CircularSort <- function(start, end, ...) { + params <- list(...) + f <- "function (x) { + start <- START + end <- END + dim_bk <- dim(x) + x <- do.call(sort, c(list((x - start) %% (end - start) + start, + index.return = TRUE), + PARAMS)) + dim(x$x) <- dim_bk + dim(x$ix) <- dim_bk + x + }" + f <- gsub("START", deparse(start), f) + f <- gsub("END", deparse(end), f) + f <- gsub("PARAMS", deparse(params), f) + r <- eval(parse(text = f)) + attr(r, 'circular') <- TRUE + r +} diff --git a/R/Start.R b/R/Start.R index 02d2b4c5edb27dec589d80296386b9143ec531dc..3c544a4cc58fb14bc65c777616476cf143b558ae 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1,3488 +1,3488 @@ -Start <- function(..., # dim = indices/selectors, - # dim_var = 'var', - # dim_reorder = Sort/CircularSort, - # dim_tolerance = number, - # dim_depends = 'file_dim', - # dim_across = 'file_dim', - return_vars = NULL, - synonims = NULL, - file_opener = NcOpener, - file_var_reader = NcVarReader, - file_dim_reader = NcDimReader, - file_data_reader = NcDataReader, - file_closer = NcCloser, - transform = NULL, - transform_params = NULL, - transform_vars = NULL, - transform_extra_cells = 2, - apply_indices_after_transform = FALSE, - pattern_dims = NULL, - metadata_dims = NULL, - selector_checker = SelectorChecker, - merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, - split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, - retrieve = FALSE, - num_procs = 1, - silent = FALSE, debug = FALSE) { - #, config_file = NULL - #dictionary_dim_names = , - #dictionary_var_names = - dim_params <- list(...) - - # Take *_var parameters apart - var_params_ind <- grep('_var$', names(dim_params)) - var_params <- dim_params[var_params_ind] - # Check all *_var are NULL or vectors of character strings, and - # that they all have a matching dimension param. - i <- 1 - for (var_param in var_params) { - if (!is.character(var_param)) { - stop("All '*_var' parameters must be character strings.") - } else if (!any(grepl(paste0('^', strsplit(names(var_params)[i], - '_var$')[[1]][1], '$'), - names(dim_params)))) { - stop(paste0("All '*_var' parameters must be associated to a dimension parameter. Found parameter '", - names(var_params)[i], "' but no parameter '", - strsplit(names(var_params)[i], '_var$')[[1]][1], "'.")) - } - i <- i + 1 - } - # Make the keys of 'var_params' to be the name of - # the corresponding dimension. - if (length(var_params) < 1) { - var_params <- NULL - } else { - names(var_params) <- gsub('_var$', '', names(var_params)) - } - - # Take *_reorder parameters apart - dim_reorder_params_ind <- grep('_reorder$', names(dim_params)) - dim_reorder_params <- dim_params[dim_reorder_params_ind] - # Make the keys of 'dim_reorder_params' to be the name of - # the corresponding dimension. - if (length(dim_reorder_params) < 1) { - dim_reorder_params <- NULL - } else { - names(dim_reorder_params) <- gsub('_reorder$', '', names(dim_reorder_params)) - } - - # Take *_tolerance parameters apart - tolerance_params_ind <- grep('_tolerance$', names(dim_params)) - tolerance_params <- dim_params[tolerance_params_ind] - - # Take *_depends parameters apart - depends_params_ind <- grep('_depends$', names(dim_params)) - depends_params <- dim_params[depends_params_ind] - # Check all *_depends are NULL or vectors of character strings, and - # that they all have a matching dimension param. - i <- 1 - for (depends_param in depends_params) { - if (!is.character(depends_param) || (length(depends_param) > 1)) { - stop("All '*_depends' parameters must be single character strings.") - } else if (!any(grepl(paste0('^', strsplit(names(depends_params)[i], - '_depends$')[[1]][1], '$'), - names(dim_params)))) { - stop(paste0("All '*_depends' parameters must be associated to a dimension parameter. Found parameter '", - names(depends_params)[i], "' but no parameter '", - strsplit(names(depends_params)[i], '_depends$')[[1]][1], "'.")) - } - i <- i + 1 - } - # Make the keys of 'depends_params' to be the name of - # the corresponding dimension. - if (length(depends_params) < 1) { - depends_params <- NULL - } else { - names(depends_params) <- gsub('_depends$', '', names(depends_params)) - } - # Change name to depending_file_dims - depending_file_dims <- depends_params - - # Take *_across parameters apart - across_params_ind <- grep('_across$', names(dim_params)) - across_params <- dim_params[across_params_ind] - # Check all *_across are NULL or vectors of character strings, and - # that they all have a matching dimension param. - i <- 1 - for (across_param in across_params) { - if (!is.character(across_param) || (length(across_param) > 1)) { - stop("All '*_across' parameters must be single character strings.") - } else if (!any(grepl(paste0('^', strsplit(names(across_params)[i], - '_across$')[[1]][1], '$'), - names(dim_params)))) { - stop(paste0("All '*_across' parameters must be associated to a dimension parameter. Found parameter '", - names(across_params)[i], "' but no parameter '", - strsplit(names(across_params)[i], '_across$')[[1]][1], "'.")) - } - i <- i + 1 - } - # Make the keys of 'across_params' to be the name of - # the corresponding dimension. - if (length(across_params) < 1) { - across_params <- NULL - } else { - names(across_params) <- gsub('_across$', '', names(across_params)) - } - # Change name to inner_dims_across_files - inner_dims_across_files <- across_params - - # Check merge_across_dims - if (!is.logical(merge_across_dims)) { - stop("Parameter 'merge_across_dims' must be TRUE or FALSE.") - } - - # Check merge_across_dims_narm - if (!is.logical(merge_across_dims_narm)) { - stop("Parameter 'merge_across_dims_narm' must be TRUE or FALSE.") - } - if (!merge_across_dims & merge_across_dims_narm) { - merge_across_dims_narm <- FALSE - warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ", - "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'", - " to FALSE.")) - } - - # Leave alone the dimension parameters in the variable dim_params - if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind, - depends_params_ind, across_params_ind)) > 0) { - dim_params <- dim_params[-c(var_params_ind, dim_reorder_params_ind, - tolerance_params_ind, depends_params_ind, - across_params_ind)] - # Reallocating pairs of across file and inner dimensions if they have - # to be merged. They are put one next to the other to ease merge later. - if (merge_across_dims) { - for (inner_dim_across in names(inner_dims_across_files)) { - inner_dim_pos <- which(names(dim_params) == inner_dim_across) - file_dim_pos <- which(names(dim_params) == inner_dims_across_files[[inner_dim_across]]) - new_pos <- inner_dim_pos - if (file_dim_pos < inner_dim_pos) { - new_pos <- new_pos - 1 - } - dim_params_to_move <- dim_params[c(inner_dim_pos, file_dim_pos)] - dim_params <- dim_params[-c(inner_dim_pos, file_dim_pos)] - new_dim_params <- list() - if (new_pos > 1) { - new_dim_params <- c(new_dim_params, dim_params[1:(new_pos - 1)]) - } - new_dim_params <- c(new_dim_params, dim_params_to_move) - if (length(dim_params) >= new_pos) { - new_dim_params <- c(new_dim_params, dim_params[new_pos:length(dim_params)]) - } - dim_params <- new_dim_params - } - } - } - dim_names <- names(dim_params) - if (is.null(dim_names)) { - stop("At least one pattern dim must be specified.") - } - - # Look for chunked dims - chunks <- vector('list', length(dim_names)) - names(chunks) <- dim_names - for (dim_name in dim_names) { - if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) { - chunks[[dim_name]] <- attr(dim_params[[dim_name]], 'chunk') - attributes(dim_params[[dim_name]]) <- attributes(dim_params[[dim_name]])[-which(names(attributes(dim_params[[dim_name]])) == 'chunk')] - } else { - chunks[[dim_name]] <- c(chunk = 1, n_chunks = 1) - } - } - # This is a helper function to compute the chunk indices to take once the total - # number of indices for a dimension has been discovered. - chunk_indices <- function(n_indices, chunk, n_chunks, dim_name) { - if (n_chunks > n_indices) { - stop("Requested to divide dimension '", dim_name, "' of length ", - n_indices, " in ", n_chunks, " chunks, which is not possible.") - } - chunk_sizes <- rep(floor(n_indices / n_chunks), n_chunks) - chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks - if (chunks_to_extend > 0) { - chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1 - } - chunk_size <- chunk_sizes[chunk] - offset <- 0 - if (chunk > 1) { - offset <- sum(chunk_sizes[1:(chunk - 1)]) - } - indices <- 1:chunk_sizes[chunk] + offset - array(indices, dim = setNames(length(indices), dim_name)) - } - - # Check pattern_dims - if (is.null(pattern_dims)) { - .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '", - dim_names[1], "' as 'pattern_dims'.")) - pattern_dims <- dim_names[1] - } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) { - pattern_dims <- unique(pattern_dims) - } else { - stop("Parameter 'pattern_dims' must be a vector of character strings.") - } - if (any(names(var_params) %in% pattern_dims)) { - stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.") - } - # Find the pattern dimension with the pattern specifications - found_pattern_dim <- NULL - for (pattern_dim in pattern_dims) { - # Check all specifications in pattern_dim are valid - dat <- datasets <- dim_params[[pattern_dim]] - if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) { - stop(paste0("Parameter '", pattern_dim, - "' must be a list of lists with pattern specifications or a vector of character strings.")) - } - if (!is.null(dim_reorder_params[[pattern_dim]])) { - .warning(paste0("A reorder for the selectors of '", pattern_dim, - "' has been specified, but it is a pattern dimension and the reorder will be ignored.")) - } - if (is.list(dat) || any(sapply(dat, is.list))) { - if (is.null(found_pattern_dim)) { - found_pattern_dim <- pattern_dim - } else { - stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") - } - } - } - if (is.null(found_pattern_dim)) { - .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) - found_pattern_dim <- pattern_dims[1] - } - - # Check all *_reorder are NULL or functions, and that they all have - # a matching dimension param. - i <- 1 - for (dim_reorder_param in dim_reorder_params) { - if (!is.function(dim_reorder_param)) { - stop("All '*_reorder' parameters must be functions.") - } else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i], - '_reorder$')[[1]][1], '$'), - names(dim_params)))) { - stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter. Found parameter '", - names(dim_reorder_params)[i], "' but no parameter '", - strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "'.")) - #} else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i], - # '_reorder$')[[1]][1], '$'), - # names(var_params)))) { - # stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter associated to a ", - # "variable. Found parameter '", names(dim_reorder_params)[i], "' and dimension parameter '", - # strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "' but did not find variable ", - # "parameter '", strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "_var'.")) - } - i <- i + 1 - } - - # Check all *_tolerance are NULL or vectors of character strings, and - # that they all have a matching dimension param. - i <- 1 - for (tolerance_param in tolerance_params) { - if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i], - '_tolerance$')[[1]][1], '$'), - names(dim_params)))) { - stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter. Found parameter '", - names(tolerance_params)[i], "' but no parameter '", - strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "'.")) - #} else if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i], - # '_tolerance$')[[1]][1], '$'), - # names(var_params)))) { - # stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter associated to a ", - # "variable. Found parameter '", names(tolerance_params)[i], "' and dimension parameter '", - # strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "' but did not find variable ", - # "parameter '", strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "_var'.")) - } - i <- i + 1 - } - # Make the keys of 'tolerance_params' to be the name of - # the corresponding dimension. - if (length(tolerance_params) < 1) { - tolerance_params <- NULL - } else { - names(tolerance_params) <- gsub('_tolerance$', '', names(tolerance_params)) - } - - # Check metadata_dims - if (!is.null(metadata_dims)) { - if (is.na(metadata_dims)) { - metadata_dims <- NULL - } else if (!is.character(metadata_dims) || (length(metadata_dims) < 1)) { - stop("Parameter 'metadata' dims must be a vector of at least one character string.") - } - } else { - metadata_dims <- pattern_dims - } - - # Once the pattern dimension with dataset specifications is found, - # the variable 'dat' is mounted with the information of each - # dataset. - # Take only the datasets for the requested chunk - dats_to_take <- chunk_indices(length(dim_params[[found_pattern_dim]]), - chunks[[found_pattern_dim]]['chunk'], - chunks[[found_pattern_dim]]['n_chunks'], - found_pattern_dim) - dim_params[[found_pattern_dim]] <- dim_params[[found_pattern_dim]][dats_to_take] - dat <- datasets <- dim_params[[found_pattern_dim]] - dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames') - dat_to_fetch <- c() - dat_names <- c() - if (!is.list(dat)) { - dat <- as.list(dat) - } else { - if (!any(sapply(dat, is.list))) { - dat <- list(dat) - } - } - for (i in 1:length(dat)) { - if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) { - if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) { - dat[[i]] <- list(path = dat[[i]]) - } else { - dat[[i]] <- list(name = dat[[i]]) - } - } else if (!is.list(dat[[i]])) { - stop(paste0("Parameter '", pattern_dim, - "' is incorrect. It must be a list of lists or character strings.")) - } - #if (!(all(names(dat[[i]]) %in% dat_info_names))) { - # stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.") - #} - if (!('name' %in% names(dat[[i]]))) { - dat[[i]][['name']] <- paste0('dat', i) - if (!('path' %in% names(dat[[i]]))) { - stop(paste0("Parameter '", found_pattern_dim, - "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided.")) - } - } else if (!('path' %in% names(dat[[i]]))) { - dat_to_fetch <- c(dat_to_fetch, i) - } - #if ('path' %in% names(dat[[i]])) { - # if (!('nc_var_name' %in% names(dat[[i]]))) { - # dat[[i]][['nc_var_name']] <- '$var_name$' - # } - # if (!('suffix' %in% names(dat[[i]]))) { - # dat[[i]][['suffix']] <- '' - # } - # if (!('var_min' %in% names(dat[[i]]))) { - # dat[[i]][['var_min']] <- '' - # } - # if (!('var_max' %in% names(dat[[i]]))) { - # dat[[i]][['var_max']] <- '' - # } - #} - dat_names <- c(dat_names, dat[[i]][['name']]) - } - if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) { - .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.") - } - if (length(dat_to_fetch) > 0) { - stop("Specified only the name for some data sets, but not the path ", - "pattern. This option has not been yet implemented.") - } - - # Reorder inner_dims_across_files (to make the keys be the file dimensions, - # and the values to be the inner dimensions that go across it). - if (!is.null(inner_dims_across_files)) { - # Reorder: example, convert list(ftime = 'chunk', ensemble = 'member', xx = 'chunk') - # to list(chunk = c('ftime', 'xx'), member = 'ensemble') - new_idaf <- list() - for (i in names(inner_dims_across_files)) { - if (!(inner_dims_across_files[[i]] %in% names(new_idaf))) { - new_idaf[[inner_dims_across_files[[i]]]] <- i - } else { - new_idaf[[inner_dims_across_files[[i]]]] <- c(new_idaf[[inner_dims_across_files[[i]]]], i) - } - } - inner_dims_across_files <- new_idaf - } - - # Check return_vars - if (is.null(return_vars)) { - return_vars <- list() -# if (length(var_params) > 0) { -# return_vars <- as.list(paste0(names(var_params), '_var')) -# } else { -# return_vars <- list() -# } - } - if (!is.list(return_vars)) { - stop("Parameter 'return_vars' must be a list or NULL.") - } - if (length(return_vars) > 0 && is.null(names(return_vars))) { -# names(return_vars) <- rep('', length(return_vars)) - stop("Parameter 'return_vars' must be a named list.") - } - i <- 1 - while (i <= length(return_vars)) { -# if (names(return_vars)[i] == '') { -# if (!(is.character(return_vars[[i]]) && (length(return_vars[[i]]) == 1))) { -# stop("The ", i, "th specification in 'return_vars' is malformed.") -# } -# if (!grepl('_var$', return_vars[[i]])) { -# stop("The ", i, "th specification in 'return_vars' is malformed.") -# } -# dim_name <- strsplit(return_vars[[i]], '_var$')[[1]][1] -# if (!(dim_name %in% names(var_params))) { -# stop("'", dim_name, "_var' requested in 'return_vars' but ", -# "no '", dim_name, "_var' specified in the .Load call.") -# } -# names(return_vars)[i] <- var_params[[dim_name]] -# return_vars[[i]] <- found_pattern_dim -# } else - if (length(return_vars[[i]]) > 0) { - if (!is.character(return_vars[[i]])) { - stop("The ", i, "th specification in 'return_vars' is malformed. It ", - "must be a vector of character strings of valid file dimension ", - "names.") - } - } - i <- i + 1 - } - - # Check synonims - if (!is.null(synonims)) { - error <- FALSE - if (!is.list(synonims)) { - error <- TRUE - } - for (synonim_entry in names(synonims)) { - if (!(synonim_entry %in% names(dim_params)) && - !(synonim_entry %in% names(return_vars))) { - error <- TRUE - } - if (!is.character(synonims[[synonim_entry]]) || - length(synonims[[synonim_entry]]) < 1) { - error <- TRUE - } - } - if (error) { - stop("Parameter 'synonims' must be a named list, where the names are ", - "a name of a requested dimension or variable and the values are ", - "vectors of character strings with at least one alternative name ", - " for each dimension or variable in 'synonims'.") - } - } - if (length(unique(names(synonims))) < length(names(synonims))) { - stop("There must not be repeated entries in 'synonims'.") - } - if (length(unique(unlist(synonims))) < length(unlist(synonims))) { - stop("There must not be repeated values in 'synonims'.") - } - # Make that all dims and vars have an entry in synonims, even if only dim_name = dim_name - dim_entries_to_add <- which(!(names(dim_params) %in% names(synonims))) - if (length(dim_entries_to_add) > 0) { - synonims[names(dim_params)[dim_entries_to_add]] <- as.list(names(dim_params)[dim_entries_to_add]) - } - var_entries_to_add <- which(!(names(var_params) %in% names(synonims))) - if (length(var_entries_to_add) > 0) { - synonims[names(var_params)[var_entries_to_add]] <- as.list(names(var_params)[var_entries_to_add]) - } - - # Check selector_checker - if (is.null(selector_checker) || !is.function(selector_checker)) { - stop("Parameter 'selector_checker' must be a function.") - } - - # Check file_opener - if (is.null(file_opener) || !is.function(file_opener)) { - stop("Parameter 'file_opener' must be a function.") - } - - # Check file_var_reader - if (!is.null(file_var_reader) && !is.function(file_var_reader)) { - stop("Parameter 'file_var_reader' must be a function.") - } - - # Check file_dim_reader - if (!is.null(file_dim_reader) && !is.function(file_dim_reader)) { - stop("Parameter 'file_dim_reader' must be a function.") - } - - # Check file_data_reader - if (is.null(file_data_reader) || !is.function(file_data_reader)) { - stop("Parameter 'file_data_reader' must be a function.") - } - - # Check file_closer - if (is.null(file_closer) || !is.function(file_closer)) { - stop("Parameter 'file_closer' must be a function.") - } - - # Check transform - if (!is.null(transform)) { - if (!is.function(transform)) { - stop("Parameter 'transform' must be a function.") - } - } - - # Check transform_params - if (!is.null(transform_params)) { - if (!is.list(transform_params)) { - stop("Parameter 'transform_params' must be a list.") - } - if (is.null(names(transform_params))) { - stop("Parameter 'transform_params' must be a named list.") - } - } - - # Check transform_vars - if (!is.null(transform_vars)) { - if (!is.character(transform_vars)) { - stop("Parameter 'transform_vars' must be a vector of character strings.") - } - } - if (any(!(transform_vars %in% names(return_vars)))) { - stop("All the variables specified in 'transform_vars' must also be specified in 'return_vars'.") - } - - # Check apply_indices_after_transform - if (!is.logical(apply_indices_after_transform)) { - stop("Parameter 'apply_indices_after_transform' must be either TRUE or FALSE.") - } - aiat <- apply_indices_after_transform - - # Check transform_extra_cells - if (!is.numeric(transform_extra_cells)) { - stop("Parameter 'transform_extra_cells' must be numeric.") - } - transform_extra_cells <- round(transform_extra_cells) - - # Check split_multiselected_dims - if (!is.logical(split_multiselected_dims)) { - stop("Parameter 'split_multiselected_dims' must be TRUE or FALSE.") - } - - # Check path_glob_permissive - if (!is.numeric(path_glob_permissive) && !is.logical(path_glob_permissive)) { - stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or an integer.") - } - if (length(path_glob_permissive) != 1) { - stop("Parameter 'path_glob_permissive' must be of length 1.") - } - - # Check retrieve - if (!is.logical(retrieve)) { - stop("Parameter 'retrieve' must be TRUE or FALSE.") - } - - # Check num_procs - if (!is.null(num_procs)) { - if (!is.numeric(num_procs)) { - stop("Parameter 'num_procs' must be numeric.") - } else { - num_procs <- round(num_procs) - } - } - - # Check silent - if (!is.logical(silent)) { - stop("Parameter 'silent' must be logical.") - } - - dim_params[[found_pattern_dim]] <- dat_names - - if (!silent) { - .message(paste0("Exploring files... This will take a variable amount ", - "of time depending on the issued request and the ", - "performance of the file server...")) - } - -if (!is.character(debug)) { -dims_to_check <- c('time') -} else { -dims_to_check <- debug -debug <- TRUE -} - - ############################## READING FILE DIMS ############################ - # Check that no unrecognized variables are present in the path patterns - # and also that no file dimensions are requested to THREDDs catalogs. - # And in the mean time, build all the work pieces and look for the - # first available file of each dataset. - array_of_files_to_load <- NULL - array_of_not_found_files <- NULL - indices_of_first_files_with_data <- vector('list', length(dat)) - selectors_of_first_files_with_data <- vector('list', length(dat)) - dataset_has_files <- rep(FALSE, length(dat)) - found_file_dims <- vector('list', length(dat)) - expected_inner_dims <- vector('list', length(dat)) - -#print("A") - for (i in 1:length(dat)) { -#print("B") - dat_selectors <- dim_params - dat_selectors[[found_pattern_dim]] <- dat_selectors[[found_pattern_dim]][i] - dim_vars <- paste0('$', dim_names, '$') - file_dims <- which(sapply(dim_vars, grepl, dat[[i]][['path']], fixed = TRUE)) - if (length(file_dims) > 0) { - file_dims <- dim_names[file_dims] - } - file_dims <- unique(c(pattern_dims, file_dims)) - found_file_dims[[i]] <- file_dims - expected_inner_dims[[i]] <- dim_names[which(!(dim_names %in% file_dims))] - # (Check the depending_file_dims). - if (any(c(names(depending_file_dims), unlist(depending_file_dims)) %in% - expected_inner_dims[[i]])) { - stop(paste0("The dimension dependancies specified in ", - "'depending_file_dims' can only be between file ", - "dimensions, but some inner dimensions found in ", - "dependancies for '", dat[[i]][['name']], "', which ", - "has the following file dimensions: ", - paste(paste0("'", file_dims, "'"), collapse = ', '), ".")) - } else { - a <- names(depending_file_dims) %in% file_dims - b <- unlist(depending_file_dims) %in% file_dims - ab <- a & b - if (any(!ab)) { - .warning(paste0("Detected some dependancies in 'depending_file_dims' with ", - "non-existing dimension names. These will be disregarded.")) - depending_file_dims <- depending_file_dims[-which(!ab)] - } - if (any(names(depending_file_dims) == unlist(depending_file_dims))) { - depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == unlist(depending_file_dims))] - } - } - # (Check the inner_dims_across_files). - if (any(!(names(inner_dims_across_files) %in% file_dims)) || - any(!(unlist(inner_dims_across_files) %in% expected_inner_dims[[i]]))) { - stop(paste0("All relationships specified in ", - "'_across' parameters must be between a inner ", - "dimension and a file dimension. Found wrong ", - "specification for '", dat[[i]][['name']], "', which ", - "has the following file dimensions: ", - paste(paste0("'", file_dims, "'"), collapse = ', '), - ", and the following inner dimensions: ", - paste(paste0("'", expected_inner_dims[[i]], "'"), - collapse = ', '), ".")) - } - # (Check the return_vars). - j <- 1 - while (j <= length(return_vars)) { - if (any(!(return_vars[[j]] %in% file_dims))) { - if (any(return_vars[[j]] %in% expected_inner_dims[[i]])) { - stop("Found variables in 'return_vars' requested ", - "for some inner dimensions (for dataset '", - dat[[i]][['name']], "'), but variables can only be ", - "requested for file dimensions.") - } else { - stop("Found variables in 'return_vars' requested ", - "for non-existing dimensions.") - } - } - j <- j + 1 - } - # (Check the metadata_dims). - if (!is.null(metadata_dims)) { - if (any(!(metadata_dims %in% file_dims))) { - stop("All dimensions in 'metadata_dims' must be file dimensions.") - } - } - ## Look for _var params that should be requested automatically. - for (dim_name in dim_names) { - if (!(dim_name %in% pattern_dims)) { - if (is.null(attr(dat_selectors[[dim_name]], 'values')) || - is.null(attr(dat_selectors[[dim_name]], 'indices'))) { - flag <- ((dat_selectors[[dim_name]] %in% c('all', 'first', 'last')) || - (is.numeric(unlist(dat_selectors[[dim_name]])))) - attr(dat_selectors[[dim_name]], 'values') <- !flag - attr(dat_selectors[[dim_name]], 'indices') <- flag - } - ## The following code 'rewrites' var_params for all datasets. If providing different - ## path pattern repositories with different file/inner dimensions, var_params might - ## have to be handled for each dataset separately. - if ((attr(dat_selectors[[dim_name]], 'values') || (dim_name %in% c('var', 'variable'))) && - !(dim_name %in% names(var_params)) && !(dim_name %in% file_dims)) { - if (dim_name %in% c('var', 'variable')) { - var_params <- c(var_params, setNames(list('var_names'), dim_name)) - .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", - dim_name, "_var' requested. ", '"', dim_name, "_var = '", - 'var_names', "'", '"', " has been automatically added to ", - "the Start call.")) - } else { - var_params <- c(var_params, setNames(list(dim_name), dim_name)) - .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", - dim_name, "_var' requested. ", '"', dim_name, "_var = '", - dim_name, "'", '"', " has been automatically added to ", - "the Start call.")) - } - } - } - } - ## (Check the *_var parameters). - if (any(!(unlist(var_params) %in% names(return_vars)))) { - vars_to_add <- which(!(unlist(var_params) %in% names(return_vars))) - new_return_vars <- vector('list', length(vars_to_add)) - names(new_return_vars) <- unlist(var_params)[vars_to_add] - return_vars <- c(return_vars, new_return_vars) - .warning(paste0("All '*_var' params must associate a dimension to one of the ", - "requested variables in 'return_vars'. The following variables", - " have been added to 'return_vars': ", - paste(paste0("'", unlist(var_params), "'"), collapse = ', '))) - } - - replace_values <- vector('list', length = length(file_dims)) - names(replace_values) <- file_dims - # Take the first selector for all possible file dimensions - for (file_dim in file_dims) { - if (file_dim %in% names(var_params)) { - .warning(paste0("The '", file_dim, "_var' param will be ignored since '", - file_dim, "' is a file dimension (for the dataset with pattern ", - dat[[i]][['path']], ").")) - } - if (!is.list(dat_selectors[[file_dim]]) || - (is.list(dat_selectors[[file_dim]]) && - length(dat_selectors[[file_dim]]) == 2 && - is.null(names(dat_selectors[[file_dim]])))) { - dat_selectors[[file_dim]] <- list(dat_selectors[[file_dim]]) - } - first_class <- class(dat_selectors[[file_dim]][[1]]) - first_length <- length(dat_selectors[[file_dim]][[1]]) - for (j in 1:length(dat_selectors[[file_dim]])) { - sv <- selector_vector <- dat_selectors[[file_dim]][[j]] - if (!identical(first_class, class(sv)) || - !identical(first_length, length(sv))) { - stop("All provided selectors for depending dimensions must ", - "be vectors of the same length and of the same class.") - } - if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) { - dat_selectors[[file_dim]][[j]] <- selector_checker(selectors = sv, - return_indices = FALSE) - # Take chunk if needed - dat_selectors[[file_dim]][[j]] <- dat_selectors[[file_dim]][[j]][chunk_indices(length(dat_selectors[[file_dim]][[j]]), - chunks[[file_dim]]['chunk'], - chunks[[file_dim]]['n_chunks'], - file_dim)] - } else if (!(is.numeric(sv) || - (is.character(sv) && (length(sv) == 1) && (sv %in% c('all', 'first', 'last'))) || - (is.list(sv) && (length(sv) == 2) && (all(sapply(sv, is.character)) || - all(sapply(sv, is.numeric)))))) { - stop("All explicitly provided selectors for file dimensions must be character strings.") - } - } - sv <- dat_selectors[[file_dim]][[1]] - if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) { - replace_values[[file_dim]] <- dat_selectors[[file_dim]][[1]][1] - } - } -#print("C") - # Now we know which dimensions whose selectors are provided non-explicitly. - undefined_file_dims <- file_dims[which(sapply(replace_values, is.null))] - defined_file_dims <- file_dims[which(!(file_dims %in% undefined_file_dims))] - # Quickly check if the depending dimensions are provided properly. - for (file_dim in file_dims) { - if (file_dim %in% names(depending_file_dims)) { - ## TODO: Detect multi-dependancies and forbid. - if (all(c(file_dim, depending_file_dims[[file_dim]]) %in% defined_file_dims)) { - if (length(dat_selectors[[file_dim]]) != length(dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) { - stop(paste0("If providing selectors for the depending ", - "dimension '", file_dim, "', a ", - "vector of selectors must be provided for ", - "each selector of the dimension it depends on, '", - depending_file_dims[[file_dim]], "'.")) - } else if (!all(names(dat_selectors[[file_dim]]) == dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) { - stop(paste0("If providing selectors for the depending ", - "dimension '", file_dim, "', the name of the ", - "provided vectors of selectors must match ", - "exactly the selectors of the dimension it ", - "depends on, '", depending_file_dims[[file_dim]], "'.")) - } - } - } - } - # Find the possible values for the selectors that are provided as - # indices. If the requested file is on server, impossible operation. - if (length(grep("^http", dat[[i]][['path']])) > 0) { - if (length(undefined_file_dims) > 0) { - stop(paste0("All selectors for the file dimensions must be ", - "character strings if requesting data to a remote ", - "server. Found invalid selectors for the file dimensions ", - paste(paste0("'", undefined_file_dims, "'"), collapse = ', '), ".")) - } - dataset_has_files[i] <- TRUE - } else { - dat[[i]][['path']] <- path.expand(dat[[i]][['path']]) - # Iterate over the known dimensions to find the first existing file. - # The path to the first existing file will be used to find the - # values for the non explicitly defined selectors. - first_file <- NULL - first_file_selectors <- NULL - if (length(undefined_file_dims) > 0) { - replace_values[undefined_file_dims] <- '*' - } - ## TODO: What if length of defined_file_dims is 0? code might crash (in practice it worked for an example case) - files_to_check <- sapply(dat_selectors[defined_file_dims], function(x) length(x[[1]])) - sub_array_of_files_to_check <- array(1:prod(files_to_check), dim = files_to_check) - j <- 1 -#print("D") - while (j <= prod(files_to_check) && is.null(first_file)) { - selector_indices <- which(sub_array_of_files_to_check == j, arr.ind = TRUE)[1, ] - selectors <- sapply(1:length(defined_file_dims), - function (x) { - vector_to_pick <- 1 - if (defined_file_dims[x] %in% names(depending_file_dims)) { - vector_to_pick <- selector_indices[which(defined_file_dims == depending_file_dims[[defined_file_dims[x]]])] - } - dat_selectors[defined_file_dims][[x]][[vector_to_pick]][selector_indices[x]] - }) - replace_values[defined_file_dims] <- selectors - file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) - file_path <- Sys.glob(file_path) - if (length(file_path) > 0) { - first_file <- file_path[1] - first_file_selectors <- selectors - } - j <- j + 1 - } -#print("E") - # Start looking for values for the non-explicitly defined selectors. - if (is.null(first_file)) { - .warning(paste0("No found files for the datset '", dat[[i]][['name']], - "'. Provide existing selectors for the file dimensions ", - " or check and correct its path pattern: ", dat[[i]][['path']])) - } else { - dataset_has_files[i] <- TRUE - ## TODO: Improve message here if no variable found: - if (length(undefined_file_dims) > 0) { - # Looking for the first values, parsed from first_file. - first_values <- vector('list', length = length(undefined_file_dims)) - names(first_values) <- undefined_file_dims - found_values <- 0 - stop <- FALSE - try_dim <- 1 - last_success <- 1 - while ((found_values < length(undefined_file_dims)) && !stop) { - u_file_dim <- undefined_file_dims[try_dim] - if (is.null(first_values[[u_file_dim]])) { - path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], - replace_values[-which(file_dims == u_file_dim)], - allow_undefined_key_vars = TRUE) - found_value <- .FindTagValue(path_with_globs_and_tag, - first_file, u_file_dim) - if (!is.null(found_value)) { - found_values <- found_values + 1 - last_success <- try_dim - first_values[[u_file_dim]] <- found_value - replace_values[[u_file_dim]] <- found_value - } - } - try_dim <- (try_dim %% length(undefined_file_dims)) + 1 - if (try_dim == last_success) { - stop <- TRUE - } - } - if (found_values < length(undefined_file_dims)) { - stop(paste0("Path pattern of dataset '", dat[[i]][['name']], - "' is too complex. Could not automatically ", - "detect values for all non-explicitly defined ", - "indices. Check its pattern: ", dat[[i]][['path']])) - } - ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above - ## Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical - ## cmor case, requesting all members and chunks for fixed var and sdate. Not fixing - ## sdate raised 'too complex' error. - # Replace shell globs in path pattern and keep the file_dims as tags - dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, - file_dims, dat[[i]][['name']], path_glob_permissive) - # Now time to look for the available values for the non - # explicitly defined selectors for the file dimensions. -#print("H") - # Check first the ones that do not depend on others. - ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))], - undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))]) - - for (u_file_dim in ufd) { - replace_values[undefined_file_dims] <- first_values - replace_values[[u_file_dim]] <- '*' - depended_dim <- NULL - depended_dim_values <- NA - selectors <- dat_selectors[[u_file_dim]][[1]] - if (u_file_dim %in% names(depending_file_dims)) { - depended_dim <- depending_file_dims[[u_file_dim]] - depended_dim_values <- dat_selectors[[depended_dim]][[1]] - dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values)) - names(dat_selectors[[u_file_dim]]) <- depended_dim_values - } else { - dat_selectors[[u_file_dim]] <- list() - } - if (u_file_dim %in% unlist(depending_file_dims)) { - depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))] - replace_values[depending_dims] <- rep('*', length(depending_dims)) - } - for (j in 1:length(depended_dim_values)) { - parsed_values <- c() - if (!is.null(depended_dim)) { - replace_values[[depended_dim]] <- depended_dim_values[j] - } - path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) - found_files <- Sys.glob(path_with_globs) - ## TODO: Enhance this error message, or change by warning. - ## Raises if a wrong sdate is specified, for example. - if (length(found_files) == 0) { - .warning(paste0("Could not find files for any '", u_file_dim, - "' for '", depended_dim, "' = '", - depended_dim_values[j], "'.")) - dat_selectors[[u_file_dim]][[j]] <- NA - } else { - for (found_file in found_files) { - path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], - replace_values[-which(file_dims == u_file_dim)], - allow_undefined_key_vars = TRUE) - parsed_values <- c(parsed_values, - .FindTagValue(path_with_globs_and_tag, found_file, - u_file_dim)) - } - dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors, - var = unique(parsed_values), - return_indices = FALSE) - # Take chunk if needed - dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][chunk_indices(length(dat_selectors[[u_file_dim]][[j]]), - chunks[[u_file_dim]]['chunk'], - chunks[[u_file_dim]]['n_chunks'], - u_file_dim)] - } - } - } -#print("I") - } else { - dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, - defined_file_dims, dat[[i]][['name']], path_glob_permissive) - } - } - } - # Now fetch for the first available file - if (dataset_has_files[i]) { - known_dims <- file_dims - } else { - known_dims <- defined_file_dims - } - replace_values <- vector('list', length = length(known_dims)) - names(replace_values) <- known_dims - files_to_load <- sapply(dat_selectors[known_dims], function(x) length(x[[1]])) - files_to_load[found_pattern_dim] <- 1 - sub_array_of_files_to_load <- array(1:prod(files_to_load), - dim = files_to_load) - names(dim(sub_array_of_files_to_load)) <- known_dims - sub_array_of_not_found_files <- array(!dataset_has_files[i], - dim = files_to_load) - names(dim(sub_array_of_not_found_files)) <- known_dims - j <- 1 - selector_indices_save <- vector('list', prod(files_to_load)) - while (j <= prod(files_to_load)) { - selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] - names(selector_indices) <- known_dims - selector_indices_save[[j]] <- selector_indices - selectors <- sapply(1:length(known_dims), - function (x) { - vector_to_pick <- 1 - if (known_dims[x] %in% names(depending_file_dims)) { - vector_to_pick <- selector_indices[which(known_dims == depending_file_dims[[known_dims[x]]])] - } - dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] - }) - names(selectors) <- known_dims - replace_values[known_dims] <- selectors - if (!dataset_has_files[i]) { - if (any(is.na(selectors))) { - replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))] - } - file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE) - sub_array_of_files_to_load[j] <- file_path - #sub_array_of_not_found_files[j] <- TRUE??? - } else { - if (any(is.na(selectors))) { - replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))] - file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE) - sub_array_of_files_to_load[j] <- file_path - sub_array_of_not_found_files[j] <- TRUE - } else { - file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) - if (!(length(grep("^http", file_path)) > 0)) { - if (grepl(file_path, '*', fixed = TRUE)) { - file_path_full <- Sys.glob(file_path)[1] - if (nchar(file_path_full) > 0) { - file_path <- file_path_full - } - } - } - sub_array_of_files_to_load[j] <- file_path - if (is.null(indices_of_first_files_with_data[[i]])) { - if (!(length(grep("^http", file_path)) > 0)) { - if (!file.exists(file_path)) { - file_path <- NULL - } - } - if (!is.null(file_path)) { - test_file <- NULL - ## TODO: suppress error messages - test_file <- file_opener(file_path) - if (!is.null(test_file)) { - selector_indices[which(known_dims == found_pattern_dim)] <- i - indices_of_first_files_with_data[[i]] <- selector_indices - selectors_of_first_files_with_data[[i]] <- selectors - file_closer(test_file) - } - } - } - } - } - j <- j + 1 - } - # Extend array as needed progressively - if (is.null(array_of_files_to_load)) { - array_of_files_to_load <- sub_array_of_files_to_load - array_of_not_found_files <- sub_array_of_not_found_files - } else { - array_of_files_to_load <- .MergeArrays(array_of_files_to_load, sub_array_of_files_to_load, - along = found_pattern_dim) - ## TODO: file_dims, and variables like that.. are still ok now? I don't think so - array_of_not_found_files <- .MergeArrays(array_of_not_found_files, sub_array_of_not_found_files, - along = found_pattern_dim) - } - dat[[i]][['selectors']] <- dat_selectors - } - if (all(sapply(indices_of_first_files_with_data, is.null))) { - stop("No data files found for any of the specified datasets.") - } - - ########################### READING INNER DIMS. ############################# -#print("J") - ## TODO: To be run in parallel (local multi-core) - # Now time to work out the inner file dimensions. - # First pick the requested variables. - dims_to_iterate <- NULL - for (return_var in names(return_vars)) { - dims_to_iterate <- unique(c(dims_to_iterate, return_vars[[return_var]])) - } - if (found_pattern_dim %in% dims_to_iterate) { - dims_to_iterate <- dims_to_iterate[-which(dims_to_iterate == found_pattern_dim)] - } - common_return_vars <- NULL - common_first_found_file <- NULL - common_return_vars_pos <- NULL - if (length(return_vars) > 0) { - common_return_vars_pos <- which(sapply(return_vars, function(x) !(found_pattern_dim %in% x))) - } - if (length(common_return_vars_pos) > 0) { - common_return_vars <- return_vars[common_return_vars_pos] - return_vars <- return_vars[-common_return_vars_pos] - common_first_found_file <- rep(FALSE, length(which(sapply(common_return_vars, length) == 0))) - names(common_first_found_file) <- names(common_return_vars[which(sapply(common_return_vars, length) == 0)]) - } - return_vars <- lapply(return_vars, - function(x) { - if (found_pattern_dim %in% x) { - x[-which(x == found_pattern_dim)] - } else { - x - } - }) - if (length(common_return_vars) > 0) { - picked_common_vars <- vector('list', length = length(common_return_vars)) - names(picked_common_vars) <- names(common_return_vars) - } else { - picked_common_vars <- NULL - } - picked_common_vars_ordered <- picked_common_vars - picked_common_vars_unorder_indices <- picked_common_vars - picked_vars <- vector('list', length = length(dat)) - names(picked_vars) <- dat_names - picked_vars_ordered <- picked_vars - picked_vars_unorder_indices <- picked_vars - for (i in 1:length(dat)) { - if (dataset_has_files[i]) { - # Put all selectors in a list of a single list/vector of selectors. - # The dimensions that go across files will later be extended to have - # lists of lists/vectors of selectors. - for (inner_dim in expected_inner_dims[[i]]) { - if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || - (is.list(dat[[i]][['selectors']][[inner_dim]]) && - length(dat[[i]][['selectors']][[inner_dim]]) == 2 && - is.null(names(dat[[i]][['selectors']][[inner_dim]])))) { - dat[[i]][['selectors']][[inner_dim]] <- list(dat[[i]][['selectors']][[inner_dim]]) - } - } - if (length(return_vars) > 0) { - picked_vars[[i]] <- vector('list', length = length(return_vars)) - names(picked_vars[[i]]) <- names(return_vars) - picked_vars_ordered[[i]] <- picked_vars[[i]] - picked_vars_unorder_indices[[i]] <- picked_vars[[i]] - } - indices_of_first_file <- as.list(indices_of_first_files_with_data[[i]]) - array_file_dims <- sapply(dat[[i]][['selectors']][found_file_dims[[i]]], function(x) length(x[[1]])) - names(array_file_dims) <- found_file_dims[[i]] - if (length(dims_to_iterate) > 0) { - indices_of_first_file[dims_to_iterate] <- lapply(array_file_dims[dims_to_iterate], function(x) 1:x) - } - array_of_var_files <- do.call('[', c(list(x = array_of_files_to_load), indices_of_first_file, list(drop = FALSE))) - array_of_var_indices <- array(1:length(array_of_var_files), dim = dim(array_of_var_files)) - array_of_not_found_var_files <- do.call('[', c(list(x = array_of_not_found_files), indices_of_first_file, list(drop = FALSE))) - previous_indices <- rep(-1, length(indices_of_first_file)) - names(previous_indices) <- names(indices_of_first_file) - first_found_file <- NULL - if (length(return_vars) > 0) { - first_found_file <- rep(FALSE, length(which(sapply(return_vars, length) == 0))) - names(first_found_file) <- names(return_vars[which(sapply(return_vars, length) == 0)]) - } - for (j in 1:length(array_of_var_files)) { - current_indices <- which(array_of_var_indices == j, arr.ind = TRUE)[1, ] - names(current_indices) <- names(indices_of_first_file) - if (!is.na(array_of_var_files[j]) && !array_of_not_found_var_files[j]) { - changed_dims <- which(current_indices != previous_indices) - vars_to_read <- NULL - if (length(return_vars) > 0) { - vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))] - } - if (!is.null(first_found_file)) { - if (any(!first_found_file)) { - vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)])) - } - } - if ((i == 1) && (length(common_return_vars) > 0)) { - vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))]) - } - if (!is.null(common_first_found_file)) { - if (any(!common_first_found_file)) { - vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)])) - } - } - file_object <- file_opener(array_of_var_files[j]) - if (!is.null(file_object)) { - for (var_to_read in vars_to_read) { - if (var_to_read %in% unlist(var_params)) { - associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)] - } - var_name_to_reader <- var_to_read - names(var_name_to_reader) <- 'var' - var_dims <- file_dim_reader(NULL, file_object, var_name_to_reader, NULL, - synonims) - # file_dim_reader returns dimension names as found in the file. - # Need to translate accoridng to synonims: - names(var_dims) <- sapply(names(var_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) - if (!is.null(var_dims)) { - var_file_dims <- NULL - if (var_to_read %in% names(common_return_vars)) { - var_to_check <- common_return_vars[[var_to_read]] - } else { - var_to_check <- return_vars[[var_to_read]] - } - if (any(names(dim(array_of_files_to_load)) %in% var_to_check)) { - var_file_dims <- dim(array_of_files_to_load)[which(names(dim(array_of_files_to_load)) %in% - var_to_check)] - } - if (((var_to_read %in% names(common_return_vars)) && - is.null(picked_common_vars[[var_to_read]])) || - ((var_to_read %in% names(return_vars)) && - is.null(picked_vars[[i]][[var_to_read]]))) { - if (any(names(var_file_dims) %in% names(var_dims))) { - stop("Found a requested var in 'return_var' requested for a ", - "file dimension which also appears in the dimensions of ", - "the variable inside the file.\n", array_of_var_files[j]) - } - special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, - 'Date' = as.Date) - first_sample <- file_var_reader(NULL, file_object, NULL, - var_to_read, synonims) - if (any(class(first_sample) %in% names(special_types))) { - array_size <- prod(c(var_file_dims, var_dims)) - new_array <- rep(special_types[[class(first_sample)[1]]](NA), array_size) - dim(new_array) <- c(var_file_dims, var_dims) - } else { - new_array <- array(dim = c(var_file_dims, var_dims)) - } - attr(new_array, 'variables') <- attr(first_sample, 'variables') - if (var_to_read %in% names(common_return_vars)) { - picked_common_vars[[var_to_read]] <- new_array - pick_ordered <- FALSE - if (var_to_read %in% unlist(var_params)) { - if (associated_dim_name %in% names(dim_reorder_param) && !aiat) { - picked_common_vars_ordered[[var_to_read]] <- new_array - pick_ordered <- TRUE - } - } - if (!pick_ordered) { - picked_common_vars_ordered[[var_to_read]] <- NULL - } - } else { - picked_vars[[i]][[var_to_read]] <- new_array - pick_ordered <- FALSE - if (var_to_read %in% unlist(var_params)) { - if (associated_dim_name %in% names(dim_reorder_params) && !aiat) { - picked_vars_ordered[[i]][[var_to_read]] <- new_array - pick_ordered <- TRUE - } - } - if (!pick_ordered) { - picked_vars_ordered[[i]][[var_to_read]] <- NULL - } - } - } else { - if (var_to_read %in% names(common_return_vars)) { - array_var_dims <- dim(picked_common_vars[[var_to_read]]) - } else { - array_var_dims <- dim(picked_vars[[i]][[var_to_read]]) - } - full_array_var_dims <- array_var_dims - if (any(names(array_var_dims) %in% names(var_file_dims))) { - array_var_dims <- array_var_dims[-which(names(array_var_dims) %in% names(var_file_dims))] - } - if (names(array_var_dims) != names(var_dims)) { - stop("Error while reading the variable '", var_to_read, "' from ", - "the file. Dimensions do not match.\nExpected ", - paste(paste0("'", names(array_var_dims), "'"), - collapse = ', '), " but found ", - paste(paste0("'", names(var_dims), "'"), - collapse = ', '), ".\n", array_of_var_files[j]) - } - if (any(var_dims > array_var_dims)) { - longer_dims <- which(var_dims > array_var_dims) - if (length(longer_dims) == 1) { - longer_dims_in_full_array <- longer_dims - if (any(names(full_array_var_dims) %in% names(var_file_dims))) { - candidates <- (1:length(full_array_var_dims))[-which(names(full_array_var_dims) %in% names(var_file_dims))] - longer_dims_in_full_array <- candidates[longer_dims] - } - padding_dims <- full_array_var_dims - padding_dims[longer_dims_in_full_array] <- var_dims[longer_dims] - - array_var_dims[longer_dims] - special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, - 'Date' = as.Date) - if (var_to_read %in% names(common_return_vars)) { - var_class <- class(picked_common_vars[[var_to_read]]) - } else { - var_class <- class(picked_vars[[i]][[var_to_read]]) - } - if (any(var_class %in% names(special_types))) { - padding_size <- prod(padding_dims) - padding <- rep(special_types[[var_class[1]]](NA), padding_size) - dim(padding) <- padding_dims - } else { - padding <- array(dim = padding_dims) - } - if (var_to_read %in% names(common_return_vars)) { - picked_common_vars[[var_to_read]] <- .abind2( - picked_common_vars[[var_to_read]], - padding, - names(full_array_var_dims)[longer_dims_in_full_array] - ) - } else { - picked_vars[[i]][[var_to_read]] <- .abind2( - picked_vars[[i]][[var_to_read]], - padding, - names(full_array_var_dims)[longer_dims_in_full_array] - ) - } - } else { - stop("Error while reading the variable '", var_to_read, "' from ", - "the file. Found size (", paste(var_dims, collapse = ' x '), - ") is greater than expected maximum size (", - array_var_dims, ").") - } - } - } - var_store_indices <- c(as.list(current_indices[names(var_file_dims)]), lapply(var_dims, function(x) 1:x)) - var_values <- file_var_reader(NULL, file_object, NULL, var_to_read, synonims) - if (var_to_read %in% unlist(var_params)) { - if ((associated_dim_name %in% names(dim_reorder_params)) && !aiat) { - ## Is this check really needed? - if (length(dim(var_values)) > 1) { - stop("Requested a '", associated_dim_name, "_reorder' for a dimension ", - "whose coordinate variable that has more than 1 dimension. This is ", - "not supported.") - } - ordered_var_values <- dim_reorder_params[[associated_dim_name]](var_values) - attr(ordered_var_values$x, 'variables') <- attr(var_values, 'variables') - if (!all(c('x', 'ix') %in% names(ordered_var_values))) { - stop("All the dimension reorder functions must return a list with the components 'x' and 'ix'.") - } - # Save the indices to reorder back the ordered variable values. - # This will be used to define the first round indices. - unorder <- sort(ordered_var_values$ix, index.return = TRUE)$ix - if (var_to_read %in% names(common_return_vars)) { - picked_common_vars_ordered[[var_to_read]] <- do.call('[<-', - c(list(x = picked_common_vars_ordered[[var_to_read]]), - var_store_indices, - list(value = ordered_var_values$x))) - picked_common_vars_unorder_indices[[var_to_read]] <- do.call('[<-', - c(list(x = picked_common_vars_unorder_indices[[var_to_read]]), - var_store_indices, - list(value = unorder))) - } else { - picked_vars_ordered[[i]][[var_to_read]] <- do.call('[<-', - c(list(x = picked_vars_ordered[[i]][[var_to_read]]), - var_store_indices, - list(value = ordered_var_values$x))) - picked_vars_unorder_indices[[i]][[var_to_read]] <- do.call('[<-', - c(list(x = picked_vars_unorder_indices[[i]][[var_to_read]]), - var_store_indices, - list(value = unorder))) - } - } - } - if (var_to_read %in% names(common_return_vars)) { - picked_common_vars[[var_to_read]] <- do.call('[<-', - c(list(x = picked_common_vars[[var_to_read]]), - var_store_indices, - list(value = var_values))) - } else { - picked_vars[[i]][[var_to_read]] <- do.call('[<-', - c(list(x = picked_vars[[i]][[var_to_read]]), - var_store_indices, - list(value = var_values))) - } - if (var_to_read %in% names(first_found_file)) { - first_found_file[var_to_read] <- TRUE - } - if (var_to_read %in% names(common_first_found_file)) { - common_first_found_file[var_to_read] <- TRUE - } - } else { - stop("Could not find variable '", var_to_read, - "' in the file ", array_of_var_files[j]) - } - } - file_closer(file_object) - } - } - previous_indices <- current_indices - } - } - } - # Once we have the variable values, we can work out the indices - # for the implicitly defined selectors. - # - # Trnasforms a vector of indices v expressed in a world of - # length N from 1 to N, into a world of length M, from - # 1 to M. Repeated adjacent indices are collapsed. - transform_indices <- function(v, n, m) { - #unique2 turns e.g. 1 1 2 2 2 3 3 1 1 1 into 1 2 3 1 - unique2 <- function(v) { - if (length(v) < 2) { - v - } else { - v[c(1, v[2:length(v)] - v[1:(length(v) - 1)]) != 0] - } - } - unique2(round(((v - 1) / (n - 1)) * (m - 1))) + 1 # this rounding may generate 0s. what then? - } - beta <- transform_extra_cells - dims_to_crop <- vector('list') - transformed_vars <- vector('list', length = length(dat)) - names(transformed_vars) <- dat_names - transformed_vars_ordered <- transformed_vars - transformed_vars_unorder_indices <- transformed_vars - transformed_common_vars <- NULL - transformed_common_vars_ordered <- NULL - transformed_common_vars_unorder_indices <- NULL - - for (i in 1:length(dat)) { - if (dataset_has_files[i]) { - indices <- indices_of_first_files_with_data[[i]] - if (!is.null(indices)) { - file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) - # The following 5 lines should go several lines below, but were moved - # here for better performance. - # If any of the dimensions comes without defining variable, then we read - # the data dimensions. - data_dims <- NULL - if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { - file_to_open <- file_path - data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], - lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), - synonims) - # file_dim_reader returns dimension names as found in the file. - # Need to translate accoridng to synonims: - names(data_dims) <- sapply(names(data_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) - } - # Transform the variables if needed and keep them apart. - if (!is.null(transform) && (length(transform_vars) > 0)) { - if (!all(transform_vars %in% c(names(picked_vars[[i]]), names(picked_common_vars)))) { - stop("Could not find all the required variables in 'transform_vars' ", - "for the dataset '", dat[[i]][['name']], "'.") - } - vars_to_transform <- NULL - picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars) - if (length(picked_vars_to_transform) > 0) { - picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform] - new_vars_to_transform <- picked_vars[[i]][picked_vars_to_transform] - which_are_ordered <- which(!sapply(picked_vars_ordered[[i]][picked_vars_to_transform], is.null)) - -##NOTE: The following 'if' replaces the original with reordering vector - if (length(which_are_ordered) > 0) { - tmp <- which(!is.na(match(names(picked_vars_ordered[[i]]), names(which_are_ordered)))) - new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[[i]][tmp] - - } - vars_to_transform <- c(vars_to_transform, new_vars_to_transform) - } - -##NOTE: Above is non-common vars, here is common vars (ie, return_vars = NULL). - picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) - if (length(picked_common_vars_to_transform) > 0) { - picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] - - new_vars_to_transform <- picked_common_vars[picked_common_vars_to_transform] - which_are_ordered <- which(!sapply(picked_common_vars_ordered[picked_common_vars_to_transform], is.null)) - - if (length(which_are_ordered) > 0) { - - tmp <- which(!is.na(match(names(picked_common_vars_ordered), names(which_are_ordered)))) - new_vars_to_transform[which_are_ordered] <- picked_common_vars_ordered[tmp] - } - vars_to_transform <- c(vars_to_transform, new_vars_to_transform) - } - - # Transform the variables - transformed_data <- do.call(transform, c(list(data_array = NULL, - variables = vars_to_transform, - file_selectors = selectors_of_first_files_with_data[[i]]), - transform_params)) - # Discard the common transformed variables if already transformed before - if (!is.null(transformed_common_vars)) { - common_ones <- which(names(picked_common_vars) %in% names(transformed_data$variables)) - if (length(common_ones) > 0) { - transformed_data$variables <- transformed_data$variables[-common_ones] - } - } - transformed_vars[[i]] <- list() - transformed_vars_ordered[[i]] <- list() - transformed_vars_unorder_indices[[i]] <- list() - # Order the transformed variables if needed - # 'var_to_read' should be 'transformed_var', but is kept to reuse the same code as above. - for (var_to_read in names(transformed_data$variables)) { - if (var_to_read %in% unlist(var_params)) { - associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)] - if ((associated_dim_name %in% names(dim_reorder_params)) && aiat) { - ## Is this check really needed? - if (length(dim(transformed_data$variables[[associated_dim_name]])) > 1) { - stop("Requested a '", associated_dim_name, "_reorder' for a dimension ", - "whose coordinate variable that has more than 1 dimension (after ", - "transform). This is not supported.") - } - ordered_var_values <- dim_reorder_params[[associated_dim_name]](transformed_data$variables[[associated_dim_name]]) - attr(ordered_var_values, 'variables') <- attr(transformed_data$variables[[associated_dim_name]], 'variables') - if (!all(c('x', 'ix') %in% names(ordered_var_values))) { - stop("All the dimension reorder functions must return a list with the components 'x' and 'ix'.") - } - # Save the indices to reorder back the ordered variable values. - # This will be used to define the first round indices. - unorder <- sort(ordered_var_values$ix, index.return = TRUE)$ix - if (var_to_read %in% names(picked_common_vars)) { - transformed_common_vars_ordered[[var_to_read]] <- ordered_var_values$x - transformed_common_vars_unorder_indices[[var_to_read]] <- unorder - } else { - transformed_vars_ordered[[i]][[var_to_read]] <- ordered_var_values$x - transformed_vars_unorder_indices[[i]][[var_to_read]] <- unorder - } - } - } - } - transformed_picked_vars <- which(names(picked_vars[[i]]) %in% names(transformed_data$variables)) - if (length(transformed_picked_vars) > 0) { - transformed_picked_vars <- names(picked_vars[[i]])[transformed_picked_vars] - transformed_vars[[i]][transformed_picked_vars] <- transformed_data$variables[transformed_picked_vars] - } - if (is.null(transformed_common_vars)) { - transformed_picked_common_vars <- which(names(picked_common_vars) %in% names(transformed_data$variables)) - if (length(transformed_picked_common_vars) > 0) { - transformed_picked_common_vars <- names(picked_common_vars)[transformed_picked_common_vars] - transformed_common_vars <- transformed_data$variables[transformed_picked_common_vars] - } - } - } - # Once the variables are transformed, we compute the indices to be - # taken for each inner dimension. - # In all cases, indices will have to be computed to know which data - # values to take from the original data for each dimension (if a - # variable is specified for that dimension, it will be used to - # convert the provided selectors into indices). These indices are - # referred to as 'first round of indices'. - # The taken data will then be transformed if needed, together with - # the dimension variable if specified, and, in that case, indices - # will have to be computed again to know which values to take from the - # transformed data. These are the 'second round of indices'. In the - # case there is no transformation, the second round of indices will - # be all the available indices, i.e. from 1 to the number of taken - # values with the first round of indices. - for (inner_dim in expected_inner_dims[[i]]) { -if (debug) { -print("-> DEFINING INDICES FOR INNER DIMENSION:") -print(inner_dim) -} - file_dim <- NULL - if (inner_dim %in% unlist(inner_dims_across_files)) { - file_dim <- names(inner_dims_across_files)[which(sapply(inner_dims_across_files, function(x) inner_dim %in% x))[1]] - chunk_amount <- length(dat[[i]][['selectors']][[file_dim]][[1]]) - names(chunk_amount) <- file_dim - } else { - chunk_amount <- 1 - } - # In the special case that the selectors for a dimension are 'all', 'first', ... - # and chunking (dividing in more than 1 chunk) is requested, the selectors are - # replaced for equivalent indices. - if ((dat[[i]][['selectors']][[inner_dim]][[1]] %in% c('all', 'first', 'last')) && - (chunks[[inner_dim]]['n_chunks'] != 1)) { - selectors <- dat[[i]][['selectors']][[inner_dim]][[1]] - if (selectors == 'all') { - selectors <- indices(1:(data_dims[[inner_dim]] * chunk_amount)) - } else if (selectors == 'first') { - selectors <- indices(1) - } else { - selectors <- indices(data_dims[[inner_dim]] * chunk_amount) - } - dat[[i]][['selectors']][[inner_dim]][[1]] <- selectors - } - # The selectors for the inner dimension are taken. - selector_array <- dat[[i]][['selectors']][[inner_dim]][[1]] -if (debug) { -if (inner_dim %in% dims_to_check) { -print(paste0("-> DEBUG MESSAGES FOR THE DATASET", i, " AND INNER DIMENSION '", inner_dim, "':")) -print("-> STRUCTURE OF SELECTOR ARRAY:") -print(str(selector_array)) -print("-> PICKED VARS:") -print(picked_vars) -print("-> TRANSFORMED VARS:") -print(transformed_vars) -} -} - if (is.null(dim(selector_array))) { - dim(selector_array) <- length(selector_array) - } - if (is.null(names(dim(selector_array)))) { - if (length(dim(selector_array)) == 1) { - names(dim(selector_array)) <- inner_dim - } else { - stop("Provided selector arrays must be provided with dimension ", - "names. Found an array of selectors without dimension names ", - "for the dimension '", inner_dim, "'.") - } - } - selectors_are_indices <- FALSE - if (!is.null(attr(selector_array, 'indices'))) { - if (!is.logical(attr(selector_array, 'indices'))) { - stop("The atribute 'indices' for the selectors for the dimension '", - inner_dim, "' must be TRUE or FALSE.") - } - selectors_are_indices <- attr(selector_array, 'indices') - } - taken_chunks <- rep(FALSE, chunk_amount) - selector_file_dims <- 1 - if (any(found_file_dims[[i]] %in% names(dim(selector_array)))) { - selector_file_dims <- dim(selector_array)[which(names(dim(selector_array)) %in% found_file_dims[[i]])] - } - selector_inner_dims <- dim(selector_array)[which(!(names(dim(selector_array)) %in% found_file_dims[[i]]))] - var_with_selectors <- NULL - var_with_selectors_name <- var_params[[inner_dim]] - var_ordered <- NULL - var_unorder_indices <- NULL - with_transform <- FALSE - # If the selectors come with an associated variable - if (!is.null(var_with_selectors_name)) { - if ((var_with_selectors_name %in% transform_vars) && (!is.null(transform))) { - with_transform <- TRUE - if (!is.null(file_dim)) { - stop("Requested a transformation over the dimension '", - inner_dim, "', wich goes across files. This feature ", - "is not supported. Either do the request without the ", - "transformation or request it over dimensions that do ", - "not go across files.") - } - } -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> NAME OF THE VARIABLE WITH THE SELECTOR VALUES FOR THE CURRENT INNER DIMENSION:") -print(var_with_selectors_name) -print("-> NAMES OF THE VARIABLES TO BE TRANSFORMED:") -print(transform_vars) -print("-> STRUCTURE OF THE TRANSFORMATION FUNCTION:") -print(str(transform)) -} -} - if (var_with_selectors_name %in% names(picked_vars[[i]])) { - var_with_selectors <- picked_vars[[i]][[var_with_selectors_name]] - var_ordered <- picked_vars_ordered[[i]][[var_with_selectors_name]] - var_unorder_indices <- picked_vars_unorder_indices[[i]][[var_with_selectors_name]] - } else if (var_with_selectors_name %in% names(picked_common_vars)) { - var_with_selectors <- picked_common_vars[[var_with_selectors_name]] - var_ordered <- picked_common_vars_ordered[[var_with_selectors_name]] - var_unorder_indices <- picked_common_vars_unorder_indices[[var_with_selectors_name]] - } - n <- prod(dim(var_with_selectors)) - if (is.null(var_unorder_indices)) { - var_unorder_indices <- 1:n - } - if (with_transform) { - if (var_with_selectors_name %in% names(transformed_vars[[i]])) { - m <- prod(dim(transformed_vars[[i]][[var_with_selectors_name]])) - if (aiat) { - var_with_selectors <- transformed_vars[[i]][[var_with_selectors_name]] - var_ordered <- transformed_vars_ordered[[i]][[var_with_selectors_name]] - var_unorder_indices <- transformed_vars_unorder_indices[[i]][[var_with_selectors_name]] - } - } else if (var_with_selectors_name %in% names(transformed_common_vars)) { - m <- prod(dim(transformed_common_vars[[var_with_selectors_name]])) - if (aiat) { - var_with_selectors <- transformed_common_vars[[var_with_selectors_name]] - var_ordered <- transformed_common_vars_ordered[[var_with_selectors_name]] - var_unorder_indices <- transformed_common_vars_unorder_indices[[var_with_selectors_name]] - } - } - if (is.null(var_unorder_indices)) { - var_unorder_indices <- 1:m - } - } -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> SIZE OF ORIGINAL VARIABLE:") -print(n) -print("-> SIZE OF TRANSFORMED VARIABLE:") -if (with_transform) print(m) -print("-> STRUCTURE OF ORDERED VAR:") -print(str(var_ordered)) -print("-> UNORDER INDICES:") -print(var_unorder_indices) -} -} - var_dims <- dim(var_with_selectors) - var_file_dims <- 1 - if (any(names(var_dims) %in% found_file_dims[[i]])) { - if (with_transform) { - stop("Requested transformation for inner dimension '", - inner_dim, "' but provided selectors for such dimension ", - "over one or more file dimensions. This is not ", - "supported. Either request no transformation for the ", - "dimension '", inner_dim, "' or specify the ", - "selectors for this dimension without the file dimensions.") - } - var_file_dims <- var_dims[which(names(var_dims) %in% found_file_dims[[i]])] - var_dims <- var_dims[-which(names(var_dims) %in% found_file_dims[[i]])] - } -## # Keep the selectors if they correspond to a variable that will be transformed. -## if (with_transform) { -## if (var_with_selectors_name %in% names(picked_vars[[i]])) { -## transformed_var_with_selectors <- transformed_vars[[i]][[var_with_selectors_name]] -## } else if (var_with_selectors_name %in% names(picked_common_vars)) { -## transformed_var_with_selectors <- transformed_common_vars[[var_with_selectors_name]] -## } -## transformed_var_dims <- dim(transformed_var_with_selectors) -## transformed_var_file_dims <- 1 -## if (any(names(transformed_var_dims) %in% found_file_dims[[i]])) { -## transformed_var_file_dims <- transformed_var_dims[which(names(transformed_var_dims) %in% found_file_dims[[i]])] -## transformed_var_dims <- tranasformed_var_dims[-which(names(transformed_var_dims) %in% found_file_dims[[i]])] -## } -##if (inner_dim %in% dims_to_check) { -##print("111m") -##print(str(transformed_var_dims)) -##} -## -## m <- prod(transformed_var_dims) -## } - # Work out var file dims and inner dims. - if (inner_dim %in% unlist(inner_dims_across_files)) { - #TODO: if (chunk_amount != number of chunks in selector_file_dims), crash - if (length(var_dims) > 1) { - stop("Specified a '", inner_dim, "_var' for the dimension '", - inner_dim, "', which goes across files (across '", file_dim, - "'). The specified variable, '", var_with_selectors_name, "', has more ", - "than one dimension and can not be used as selector variable. ", - "Select another variable or fix it in the files.") - } - } - ## TODO HERE:: - #- indices_of_first_files_with_data may change, because array is now extended - var_full_dims <- dim(var_with_selectors) - if (!(inner_dim %in% names(var_full_dims))) { - stop("Could not find the dimension '", inner_dim, "' in ", - "the file. Either change the dimension name in ", - "your request, adjust the parameter ", - "'dim_names_in_files' or fix the dimension name in ", - "the file.\n", file_path) - } - } else if (((is.numeric(selector_array) || is.list(selector_array)) && selectors_are_indices) || - (is.character(selector_array) && (length(selector_array) == 1) && - (selector_array %in% c('all', 'first', 'last')) && - !is.null(file_dim_reader))) { - #### TODO HERE:: - ###- indices_of_first_files_with_data may change, because array is now extended - # Lines moved above for better performance. - ##data_dims <- file_dim_reader(file_path, NULL, selectors_of_first_files_with_data[[i]], - ## lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1)) - if (!(inner_dim %in% names(data_dims))) { - stop("Could not find the dimension '", inner_dim, "' in ", - "the file. Either change the dimension name in ", - "your request, adjust the parameter ", - "'dim_names_in_files' or fix the dimension name in ", - "the file.\n", file_path) - } - } else { - stop(paste0("Can not translate the provided selectors for '", inner_dim, - "' to numeric indices. Provide numeric indices and a ", - "'file_dim_reader' function, or a '", inner_dim, - "_var' in order to calculate the indices.")) - } - # At this point, if no selector variable was provided, the variable - # data_dims has been populated. If a selector variable was provided, - # the variables var_dims, var_file_dims and var_full_dims have been - # populated instead. - fri <- first_round_indices <- NULL - sri <- second_round_indices <- NULL - # This variable will keep the indices needed to crop the transformed - # variable (the one that has been transformed without being subset - # with the first round indices). - tvi <- tranaformed_variable_indices <- NULL - ordered_fri <- NULL - ordered_sri <- NULL - if ((length(selector_array) == 1) && is.character(selector_array) && - (selector_array %in% c('all', 'first', 'last')) && - (chunks[[inner_dim]]['n_chunks'] == 1)) { - if (is.null(var_with_selectors_name)) { - fri <- vector('list', length = chunk_amount) - dim(fri) <- c(chunk_amount) - sri <- vector('list', length = chunk_amount) - dim(sri) <- c(chunk_amount) - if (selector_array == 'all') { - fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim]))) - taken_chunks <- rep(TRUE, chunk_amount) - #sri <- NULL - } else if (selector_array == 'first') { - fri[[1]] <- 1 - taken_chunks[1] <- TRUE - #sri <- NULL - } else if (selector_array == 'last') { - fri[[chunk_amount]] <- data_dims[inner_dim] - taken_chunks[length(taken_chunks)] <- TRUE - #sri <- NULL - } - } else { - if ((!is.null(file_dim)) && !(file_dim %in% names(var_file_dims))) { - stop("The variable '", var_with_selectors_name, "' must also be ", - "requested for the file dimension '", file_dim, "' in ", - "this configuration.") - } - fri <- vector('list', length = prod(var_file_dims)) - dim(fri) <- var_file_dims - ordered_fri <- fri - sri <- vector('list', length = prod(var_file_dims)) - dim(sri) <- var_file_dims - ordered_sri <- sri - if (selector_array == 'all') { -# TODO: Populate ordered_fri - ordered_fri[] <- replicate(prod(var_file_dims), list(1:n)) - fri[] <- replicate(prod(var_file_dims), list(var_unorder_indices[1:n])) - taken_chunks <- rep(TRUE, chunk_amount) - if (!with_transform) { - #fri[] <- replicate(prod(var_file_dims), list(1:n)) - #taken_chunks <- rep(TRUE, chunk_amount) - #sri <- NULL - } else { - ordered_sri[] <- replicate(prod(var_file_dims), list(1:m)) - sri[] <- replicate(prod(var_file_dims), list(1:m)) - ## var_file_dims instead?? - #if (!aiat) { - #fri[] <- replicate(prod(var_file_dims), list(1:n)) - #taken_chunks <- rep(TRUE, chunk_amount) - #sri[] <- replicate(prod(transformed_var_file_dims), list(1:m)) - #} else { - #fri[] <- replicate(prod(var_file_dims), list(1:n)) - #taken_chunks <- rep(TRUE, chunk_amount) - #sri[] <- replicate(prod(transformed_var_file_dims), list(1:m)) - #} - tvi <- 1:m - } - } else if (selector_array == 'first') { - taken_chunks[1] <- TRUE - if (!with_transform) { - ordered_fri[[1]] <- 1 - fri[[1]] <- var_unorder_indices[1] - #taken_chunks[1] <- TRUE - #sri <- NULL - } else { - if (!aiat) { - ordered_fri[[1]] <- 1 - fri[[1]] <- var_unorder_indices[1] -# TODO: TO BE IMPROVED - #taken_chunks[1] <- TRUE - ordered_sri[[1]] <- 1:ceiling(m / n) - sri[[1]] <- 1:ceiling(m / n) - tvi <- 1:ceiling(m / n) - } else { - ordered_fri[[1]] <- 1:ceiling(m / n) - fri[[1]] <- var_unorder_indices[1:ceiling(m / n)] - #taken_chunks[1] <- TRUE - ordered_sri[[1]] <- 1 - sri[[1]] <- 1 - tvi <- 1 - } - } - } else if (selector_array == 'last') { - taken_chunks[length(taken_chunks)] <- TRUE - if (!with_transform) { - ordered_fri[[prod(var_file_dims)]] <- n - fri[[prod(var_file_dims)]] <- var_unorder_indices[n] - #taken_chunks[length(taken_chunks)] <- TRUE - #sri <- NULL - } else { - if (!aiat) { - ordered_fri[[prod(var_file_dims)]] <- prod(var_dims) - fri[[prod(var_file_dims)]] <- var_unorder_indices[prod(var_dims)] - #taken_chunks[length(taken_chunks)] <- TRUE - ordered_sri[[prod(var_file_dims)]] <- 1:ceiling(m / n) - sri[[prod(var_file_dims)]] <- 1:ceiling(m / n) -# TODO: TO BE IMPROVED. THE TVI MAY BE WRONG IF THERE'S BEEN A REORDERING. - tvi <- 1:ceiling(m / n) - } else { - ordered_fri[[prod(var_file_dims)]] <- (n - ceiling(m / n) + 1):n - fri[[prod(var_file_dims)]] <- var_unorder_indices[(n - ceiling(m / n) + 1):n] - #taken_chunks[length(taken_chunks)] <- TRUE - ordered_sri[[prod(var_file_dims)]] <- 1 - sri[[prod(var_file_dims)]] <- 1 - tvi <- 1 - } - } - } - } - # If the selectors are not 'all', 'first', 'last', ... - } else { - if (!is.null(var_with_selectors_name)) { - unmatching_file_dims <- which(!(names(var_file_dims) %in% names(selector_file_dims))) - if ((length(unmatching_file_dims) > 0)) { - raise_error <- FALSE - if (is.null(file_dim)) { - raise_error <- TRUE - } else { - if (!((length(unmatching_file_dims) == 1) && - (names(var_file_dims)[unmatching_file_dims] == file_dim) && - (inner_dim %in% names(selector_inner_dims)))) { - raise_error <- TRUE - } - } - if (raise_error) { - stop("Provided selectors for the dimension '", inner_dim, "' must have as many ", - "file dimensions as the variable the dimension is defined along, '", - var_with_selectors_name, "', with the exceptions of the file pattern dimension ('", - found_pattern_dim, "') and any depended file dimension (if specified as ", - "depended dimension in parameter 'inner_dims_across_files' and the ", - "depending file dimension is present in the provided selector array).") - } - } - if (any(names(selector_file_dims) %in% names(dim(var_with_selectors)))) { - if (any(dim(var_with_selectors)[names(selector_file_dims)] != selector_file_dims)) { - stop("Size of selector file dimensions must mach size of requested ", - "variable dimensions.") - } - } - } - ## TODO: If var dimensions are not in the same order as selector dimensions, reorder - if (is.null(names(selector_file_dims))) { - if (is.null(file_dim)) { - fri_dims <- 1 - } else { - fri_dims <- chunk_amount - names(fri_dims) <- file_dim - } - } else { - fri_dim_names <- names(selector_file_dims) - if (!is.null(file_dim)) { - fri_dim_names <- c(fri_dim_names, file_dim) - } - fri_dim_names <- found_file_dims[[i]][which(found_file_dims[[i]] %in% fri_dim_names)] - fri_dims <- rep(NA, length(fri_dim_names)) - names(fri_dims) <- fri_dim_names - fri_dims[names(selector_file_dims)] <- selector_file_dims - if (!is.null(file_dim)) { - fri_dims[file_dim] <- chunk_amount - } - } - fri <- vector('list', length = prod(fri_dims)) - dim(fri) <- fri_dims - sri <- vector('list', length = prod(fri_dims)) - dim(sri) <- fri_dims - selector_file_dim_array <- array(1:prod(selector_file_dims), dim = selector_file_dims) - selector_store_position <- fri_dims - for (j in 1:prod(dim(selector_file_dim_array))) { - selector_indices_to_take <- which(selector_file_dim_array == j, arr.ind = TRUE)[1, ] - names(selector_indices_to_take) <- names(selector_file_dims) - selector_store_position[names(selector_indices_to_take)] <- selector_indices_to_take - sub_array_of_selectors <- Subset(selector_array, names(selector_indices_to_take), - as.list(selector_indices_to_take), drop = 'selected') -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> ITERATING OVER FILE DIMENSIONS OF THE SELECTORS.") -print("-> STRUCTURE OF A SUB ARRAY:") -print(str(sub_array_of_selectors)) -print("-> STRUCTURE OF THE VARIABLE WITH SELECTORS:") -print(str(var_with_selectors)) -print(dim(var_with_selectors)) -} -} - if (selectors_are_indices) { - sub_array_of_values <- NULL - #} else if (!is.null(var_ordered)) { - # sub_array_of_values <- var_ordered - } else { - if (length(var_file_dims) > 0) { - var_indices_to_take <- selector_indices_to_take[which(names(selector_indices_to_take) %in% names(var_file_dims))] - sub_array_of_values <- Subset(var_with_selectors, names(var_indices_to_take), - as.list(var_indices_to_take), drop = 'selected') - } else { - sub_array_of_values <- var_with_selectors - } - } -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> STRUCTURE OF THE SUB ARRAY FROM THE VARIABLE CORRESPONDING TO THE SUB ARRAY OF SELECTORS") -print(str(sub_array_of_values)) -print(dim(sub_array_of_values)) -print("-> NAME OF THE FILE DIMENSION THE CURRENT INNER DIMENSION EXTENDS ALONG:") -print(file_dim) -} -} - if ((!is.null(file_dim) && (file_dim %in% names(selector_file_dims))) || is.null(file_dim)) { - if (length(sub_array_of_selectors) > 0) { -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> THE INNER DIMENSION DOES NOT GO ACROSS ANY FILE DIMENSION OR IT DOES BUT IS IN THE PROVIDED SELECTOR ARRAY.") -} -} - if (selectors_are_indices) { - if (!is.null(var_with_selectors_name)) { - max_allowed <- ifelse(aiat, m, n) - } else { - max_allowed <- data_dims[inner_dim] - } - if (any(na.omit(unlist(sub_array_of_selectors)) > max_allowed) || - any(na.omit(unlist(sub_array_of_selectors)) < 1)) { - stop("Provided indices out of range for dimension '", inner_dim, "' ", - "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", - max_allowed, ").") - } - } - - # The selector_checker will return either a vector of indices or a list - # with the first and last desired indices. - goes_across_prime_meridian <- FALSE - if (!is.null(var_ordered) && !selectors_are_indices) { - if (!is.null(dim_reorder_params[[inner_dim]])) { - if (is.list(sub_array_of_selectors)) { - -## NOTE: The check of 'goes_across_prime_meridian' is moved forward to here. - is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") - if (!is.null(is_circular_dim)) { - if (is_circular_dim) { - -# NOTE: Use CircularSort() to put the values in the assigned range, and get the order. -# For example, [-10, 20] in CircularSort(0, 360) is [350, 20]. The $ix list is [2, 1]. -# 'goes_across_prime_meridian' means the selector range across the border. For example, -# CircularSort(-180, 180) with selector [170, 190] -> goes_across_prime_meridian = TRUE. - tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix - goes_across_prime_meridian <- tmp[1] > tmp[2] - } - } - - # HERE change to the same code as below (under 'else'). Not sure why originally - #it uses additional lines, which make reorder not work. - sub_array_of_selectors <- as.list(dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$x) - #sub_array_reordered <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors)) - #sub_array_unorder <- sort(sub_array_reordered$ix, index.return = TRUE)$ix - #sub_array_of_selectors <- as.list(sub_array_reordered$x[sub_array_unorder]) - -# Add warning if the boundary is out of range - if (sub_array_of_selectors[1] < range(var_ordered)[1] | sub_array_of_selectors[1] > range(var_ordered)[2]) { - .warning(paste0("The lower boundary of selector of ", - inner_dim, - " is out of range [", - min(var_ordered), ", ", max(var_ordered), "]. ", - "Check if the desired range is all included.")) - } - if (sub_array_of_selectors[2] < range(var_ordered)[1] | sub_array_of_selectors[2] > range(var_ordered)[2]) { - .warning(paste0("The upper boundary of selector of ", - inner_dim, - " is out of range [", - min(var_ordered), ", ", max(var_ordered), "]. ", - "Check if the desired range is all included.")) - } - - - } else { - sub_array_of_selectors <- dim_reorder_params[[inner_dim]](sub_array_of_selectors)$x - } - } - -# NOTE: The ideal solution for selecting indices in goes_across_prime_meridian case -# is modified SelectorCheckor.R. But now SelectorCheckor doesn't know the info of -#goes_across_prime_meridian, so I do the adjustion after calling SelectorCheckor(). - sub_array_of_indices <- selector_checker(sub_array_of_selectors, var_ordered, - tolerance = if (aiat) { - NULL - } else { - tolerance_params[[inner_dim]] - }) - - if (goes_across_prime_meridian & sub_array_of_indices[[1]] < sub_array_of_indices[[2]]) { - if (!(sub_array_of_selectors[[1]] %in% var_ordered)){ - sub_array_of_indices[[1]] <- sub_array_of_indices[[1]] - 1 - } - - if (!(sub_array_of_selectors[[2]] %in% var_ordered)){ - sub_array_of_indices[[2]] <- sub_array_of_indices[[2]] + 1 - } - } - -#NOTE: the possible case? - if (goes_across_prime_meridian & sub_array_of_indices[[1]] > sub_array_of_indices[[2]]) { - .stop("The case is goes_across_prime_meridian but no adjustion for the indices!") - } - - if (any(is.na(sub_array_of_indices))) { - - stop(paste0("The selectors of ", inner_dim, - " are out of range [", min(var_ordered), - ", ", max(var_ordered), "].")) - } - - } else { - -# Add warning if the boundary is out of range - if (is.list(sub_array_of_selectors)) { - if (sub_array_of_selectors[1] < - min(sub_array_of_values) | sub_array_of_selectors[1] > - max(sub_array_of_values)) { - .warning(paste0("The lower boundary of selector of ", - inner_dim, " is out of range [", - min(sub_array_of_values), ", ", - max(sub_array_of_values), "]. ", - "Check if the desired range is all included.")) - } - if (sub_array_of_selectors[2] < - min(sub_array_of_values) | sub_array_of_selectors[2] > - max(sub_array_of_values)) { - .warning(paste0("The upper boundary of selector of ", - inner_dim, " is out of range [", - min(sub_array_of_values), ", ", - max(sub_array_of_values), "]. ", - "Check if the desired range is all included.")) - } - } - - sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values, - tolerance = if (aiat) { - NULL - } else { - tolerance_params[[inner_dim]] - }) - - if (any(is.na(sub_array_of_indices))) { - - stop(paste0("The selectors of ", inner_dim, - " are out of range [", min(sub_array_of_values), - ", ", max(sub_array_of_values), "].")) - } - - } - ## This 'if' runs in both Start() and Compute(). In Start(), it doesn't have any effect (no chunk). - ## In Compute(), it creates the indices for each chunk. For example, if 'sub_array_of_indices' - ## is c(5:10) and chunked into 2, 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) - ## for chunk = 2. If 'sub_array_of_indices' is list(55, 62) and chunked into 2, it becomes - ## list(55, 58) for chunk = 1 and list(59, 62) for chunk = 2. - ## TODO: The list can be turned into vector here? So afterward no need to judge if it is list - ## or vector. - if (!is.list(sub_array_of_indices)) { - sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), - chunks[[inner_dim]]["chunk"], - chunks[[inner_dim]]["n_chunks"], - inner_dim)] - } else { - tmp <- chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]), - chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], - inner_dim) - vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - sub_array_of_indices[[1]] <- vect[tmp[1]] - sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] - } - # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. - -# Check if all the files have the selectors assigned (e.g., region = 'Grnland') _20191015 -if (is.character(sub_array_of_selectors)) { - array_of_var_files_check <- vector('list', length(selector_indices)) - for (k in 1:length(selector_indices)) { - asdasd <- selector_indices[[k]] - array_of_var_files_check <- do.call('[', c(list(x = array_of_files_to_load), asdasd, list(drop = FALSE)))[j] - file_object <- file_opener(array_of_var_files_check) - var_values_check <- file_var_reader(NULL, file_object, NULL, var_to_read, synonims) - if (any(as.vector(var_values_check)[sub_array_of_indices] != sub_array_of_selectors)) { - .warning('Not all the files has correponding selectors. Check the selector attributes') - } - } -} - -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> TRANSFORMATION REQUESTED?") -print(with_transform) -print("-> BETA:") -print(beta) -} -} - if (with_transform) { - # If there is a transformation and selector values are provided, these - # selectors will be processed in the same way either if aiat = TRUE or - # aiat = FALSE. -## TODO: If sub_array_of_selectors was integer and aiat then... do what's commented 50 lines below. -## otherwise, do what's coded. -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> SELECTORS REQUESTED BEFORE TRANSFORM.") -} -} - -###NOTE: Here, the transform, is different from the below part of non-transform. -# search 'if (goes_across_prime_meridian' to find the lines below. - if (goes_across_prime_meridian) { -# NOTE: before changing, the return is already correct. - -#NOTE: The fix below has the same explanation as no with_transform part below. -# Search the next next 'if (goes_across_prime_meridian) {'. - if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { - # global longitude - sub_array_of_fri <- 1:n - # Warning if transform_extra_cell != 0 - if (beta != 0) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - - } else { - # normal case, i.e., not global - first_index <- min(unlist(sub_array_of_indices)) - last_index <- max(unlist(sub_array_of_indices)) - gap_width <- last_index - first_index - 1 - sub_array_of_fri <- c(1:(min(unlist(sub_array_of_indices)) + min(gap_width, beta)), - (max(unlist(sub_array_of_indices)) - min(gap_width, beta)):n) - - if (min(gap_width, beta) != beta) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - } - - } else { - #NOTE: This if seems redundant. - if (is.list(sub_array_of_indices)) { - sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - } - first_index <- min(unlist(sub_array_of_indices)) - last_index <- max(unlist(sub_array_of_indices)) - - start_padding <- min(beta, first_index - 1) - end_padding <- min(beta, n - last_index) - - if (exists("is_circular_dim")) { - if (!is_circular_dim) { #latitude - sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) - if (start_padding != beta | end_padding != beta) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - } else { #longitude - if ((last_index - first_index + 1 + beta * 2) >= n) { - sub_array_of_fri <- 1:n - } else if (start_padding < beta) { # left side too close to border, need to go to right side - sub_array_of_fri <- c((first_index - start_padding):(last_index + end_padding), (n - (beta - start_padding - 1)):n) - } else if (end_padding < beta) { # right side too close to border, need to go to left side - sub_array_of_fri <- c(1: (beta - end_padding), (first_index - start_padding):(last_index + end_padding)) - } else { #normal - sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) - } - } - } else { # when _reorder is not used - sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) - if (start_padding != beta | end_padding != beta) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - } - - } - subset_vars_to_transform <- vars_to_transform - if (!is.null(var_ordered)) { - -##NOTE: if var_ordered is common_vars, it doesn't have attributes and it is a vector. -## Turn it into array and add dimension name. - if (!is.array(var_ordered)) { - var_ordered <- as.array(var_ordered) - names(dim(var_ordered)) <- inner_dim - } - - subset_vars_to_transform[[var_with_selectors_name]] <- Subset(var_ordered, inner_dim, sub_array_of_fri) - } else { -##NOTE: It should be redundant because without reordering the var should remain array -## But just stay same with above... - if (!is.array(sub_array_of_values)) { - sub_array_of_values <- as.array(sub_array_of_values) - names(dim(sub_array_of_values)) <- inner_dim - } - - subset_vars_to_transform[[var_with_selectors_name]] <- Subset(sub_array_of_values, inner_dim, sub_array_of_fri) - } - -# Change the order of longitude crop if no reorder + from big to small. -# cdo -sellonlatbox, the lon is west, east (while lat can be north -# to south or opposite) - -# Before changing crop, first we need to find the name of longitude. -# NOTE: The potential bug here (also the bug for CDORemapper): the lon name -# is limited (only the ones listed in .KnownLonNames() are available. - known_lon_names <- s2dverification:::.KnownLonNames() - lon_name <- names(subset_vars_to_transform)[which(names(subset_vars_to_transform) %in% known_lon_names)[1]] - -# NOTE: The cases not considered: (1) if lon reorder(decreasing = T) -# It doesn't make sense, but if someone uses it, here should -# occur error. (2) crop = TRUE/FALSE - if ('crop' %in% names(transform_params) & var_with_selectors_name == lon_name & is.null(dim_reorder_params[[inner_dim]])) { - if (is.numeric(class(transform_params$crop))) { - if (transform_params$crop[1] > transform_params$crop[2]) { - tmp <- transform_params$crop[1] - transform_params$crop[1] <- transform_params$crop[2] - transform_params$crop[2] <- tmp - } - } - } - - transformed_subset_var <- do.call(transform, c(list(data_array = NULL, - variables = subset_vars_to_transform, - file_selectors = selectors_of_first_files_with_data[[i]]), - transform_params))$variables[[var_with_selectors_name]] - # Sorting the transformed variable and working out the indices again after transform. - if (!is.null(dim_reorder_params[[inner_dim]])) { - transformed_subset_var_reorder <- dim_reorder_params[[inner_dim]](transformed_subset_var) - transformed_subset_var <- transformed_subset_var_reorder$x -#NOTE: The fix here solves the mis-ordered lon when across_meridian. - transformed_subset_var_unorder <- transformed_subset_var_reorder$ix -# transformed_subset_var_unorder <- sort(transformed_subset_var_reorder$ix, index.return = TRUE)$ix - } else { - transformed_subset_var_unorder <- 1:length(transformed_subset_var) - } - sub_array_of_sri <- selector_checker(sub_array_of_selectors, transformed_subset_var, - tolerance = if (aiat) { - tolerance_params[[inner_dim]] - } else { - NULL - }) - -# Check if selectors fall out of the range of the transform grid -# It may happen when original lon is [-180, 180] while want to regrid to -# [0, 360], and lon selector = [-20, -10]. - if (any(is.na(sub_array_of_sri))) { - stop(paste0("The selectors of ", - inner_dim, " are out of range of transform grid '", - transform_params$grid, "'. Use parameter '", - inner_dim, "_reorder' or change ", inner_dim, - " selectors.")) - } - - if (goes_across_prime_meridian) { - - if (sub_array_of_sri[[1]] == sub_array_of_sri[[2]]) { - # global longitude - sub_array_of_sri <- c(1:length(transformed_subset_var)) - } else { - # the common case, i.e., non-global - # NOTE: Because sub_array_of_sri order is exchanged due to - # previous development, here [[1]] and [[2]] should exchange - sub_array_of_sri <- c(1:sub_array_of_sri[[1]], - sub_array_of_sri[[2]]:length(transformed_subset_var)) - } - - } else if (is.list(sub_array_of_sri)) { - sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]] - } - ordered_sri <- sub_array_of_sri - sub_array_of_sri <- transformed_subset_var_unorder[sub_array_of_sri] - # In this case, the tvi are not defined and the 'transformed_subset_var' - # will be taken instead of the var transformed before in the code. -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> FIRST INDEX:") -print(first_index) -print("-> LAST INDEX:") -print(last_index) -print("-> STRUCTURE OF FIRST ROUND INDICES:") -print(str(sub_array_of_fri)) -print("-> STRUCTURE OF SECOND ROUND INDICES:") -print(str(sub_array_of_sri)) -print("-> STRUCTURE OF TRANSFORMED VARIABLE INDICES:") -print(str(tvi)) -} -} -### # If the selectors are expressed after transformation -### } else { -###if (debug) { -###if (inner_dim %in% dims_to_check) { -###print("-> SELECTORS REQUESTED AFTER TRANSFORM.") -###} -###} -### if (goes_across_prime_meridian) { -### sub_array_of_indices <- c(sub_array_of_indices[[1]]:m, -### 1:sub_array_of_indices[[2]]) -### } -### first_index <- min(unlist(sub_array_of_indices)) -### last_index <- max(unlist(sub_array_of_indices)) -### first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1) -### last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n) -### sub_array_of_fri <- first_index_before_transform:last_index_before_transform -### n_of_extra_cells <- round(beta / n * m) -### if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) { -### sub_array_of_sri <- 1:(last_index - first_index + 1) -### if (is.null(tvi)) { -### tvi <- sub_array_of_sri + first_index - 1 -### } -### } else { -### sub_array_of_sri <- sub_array_of_indices - first_index + 1 -### if (is.null(tvi)) { -### tvi <- sub_array_of_indices -### } -### } -### sub_array_of_sri <- sub_array_of_sri + n_of_extra_cells - sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), - list(value = sub_array_of_sri))) - } else { - if (goes_across_prime_meridian) { -#NOTE: The potential problem here is, if it is global longitude, -# and the indices overlap (e.g., lon = [0, 359.723] and -# CircularSort(-180, 180), then sub_array_of_indices = list(649, 649)). -# Therefore, sub_array_of_fri will be c(1:649, 649:1296). We'll -# get two 649. -# The fix below may not be the best solution, but it works for -# the example above. - - if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { - # global longitude - sub_array_of_fri <- c(1:n) - } else { - # the common case, i.e., non-global - sub_array_of_fri <- c(1:min(unlist(sub_array_of_indices)), - max(unlist(sub_array_of_indices)):n) - } - - } else if (is.list(sub_array_of_indices)) { - sub_array_of_fri <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - } else { - sub_array_of_fri <- sub_array_of_indices - } - } - if (!is.null(var_unorder_indices)) { - if (is.null(ordered_fri)) { - ordered_fri <- sub_array_of_fri - } - sub_array_of_fri <- var_unorder_indices[sub_array_of_fri] - } - fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position), - list(value = sub_array_of_fri))) - if (!is.null(file_dim)) { - taken_chunks[selector_store_position[[file_dim]]] <- TRUE - } else { - taken_chunks <- TRUE - } - } - } else { -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.") -} -} - if (inner_dim %in% names(dim(sub_array_of_selectors))) { - if (is.null(var_with_selectors_name)) { - if (any(na.omit(unlist(sub_array_of_selectors)) < 1) || - any(na.omit(unlist(sub_array_of_selectors)) > data_dims[inner_dim] * chunk_amount)) { - stop("Provided indices out of range for dimension '", inner_dim, "' ", - "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", - data_dims[inner_dim] * chunk_amount, ").") - } - } else { - if (inner_dim %in% names(dim(sub_array_of_values))) { -# NOTE: Put across-inner-dim at the 1st position. -# POSSIBLE PROB!! Only organize inner dim, the rest dims may not in the same order as sub_array_of_selectors below. - inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_values)) == inner_dim) - if (inner_dim_pos_in_sub_array != 1) { - new_sub_array_order <- (1:length(dim(sub_array_of_values)))[-inner_dim_pos_in_sub_array] - new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order) - sub_array_of_values <- .aperm2(sub_array_of_values, new_sub_array_order) - } - } - } - -# NOTE: Put across-inner-dim at the 1st position. -# POSSIBLE PROB!! Only organize inner dim, the rest dims may not in the same order as sub_array_of_values above. - inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_selectors)) == inner_dim) - if (inner_dim_pos_in_sub_array != 1) { - new_sub_array_order <- (1:length(dim(sub_array_of_selectors)))[-inner_dim_pos_in_sub_array] - new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order) - sub_array_of_selectors <- .aperm2(sub_array_of_selectors, new_sub_array_order) - } - sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values, - tolerance = tolerance_params[[inner_dim]]) - # It is needed to expand the indices here, otherwise for - # values(list(date1, date2)) only 2 values are picked. - if (is.list(sub_array_of_indices)) { - sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - } - sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), - chunks[[inner_dim]]['chunk'], - chunks[[inner_dim]]['n_chunks'], - inner_dim)] - sub_array_is_list <- FALSE - if (is.list(sub_array_of_indices)) { - sub_array_is_list <- TRUE - sub_array_of_indices <- unlist(sub_array_of_indices) - } - if (is.null(var_with_selectors_name)) { - indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1 - transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1 - } else { - indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1 - transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1 - } - if (sub_array_is_list) { - sub_array_of_indices <- as.list(sub_array_of_indices) - } -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> GOING TO ITERATE ALONG CHUNKS.") -} -} - for (chunk in 1:chunk_amount) { - if (!is.null(names(selector_store_position))) { - selector_store_position[file_dim] <- chunk - } else { - selector_store_position <- chunk - } - chunk_selectors <- transformed_indices[which(indices_chunk == chunk)] - sub_array_of_indices <- chunk_selectors - if (with_transform) { - # If the provided selectors are expressed in the world - # before transformation - if (!aiat) { - first_index <- min(unlist(sub_array_of_indices)) - last_index <- max(unlist(sub_array_of_indices)) - sub_array_of_fri <- max(c(first_index - beta, 1)):min(c(last_index + beta, n)) - sub_array_of_sri <- transform_indices(unlist(sub_array_of_indices) - first_index + 1, n, m) - if (is.list(sub_array_of_indices)) { - if (length(sub_array_of_sri) > 1) { - sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]] - } - } -##TODO: TRANSFORM SUBSET VARIABLE AS ABOVE, TO COMPUTE SRI - # If the selectors are expressed after transformation - } else { - first_index <- min(unlist(sub_array_of_indices)) - last_index <- max(unlist(sub_array_of_indices)) - first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1) - last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n) - sub_array_of_fri <- first_index_before_transform:last_index_before_transform - if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) { - sub_array_of_sri <- 1:(last_index - first_index + 1) + - round(beta / n * m) - } else { - sub_array_of_sri <- sub_array_of_indices - first_index + 1 + - round(beta / n * m) - } -##TODO: FILL IN TVI - } - sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), - list(value = sub_array_of_sri))) - if (length(sub_array_of_sri) > 0) { - taken_chunks[chunk] <- TRUE - } - } else { - sub_array_of_fri <- sub_array_of_indices - if (length(sub_array_of_fri) > 0) { - taken_chunks[chunk] <- TRUE - } - } - if (!is.null(var_unorder_indices)) { - ordered_fri <- sub_array_of_fri - sub_array_of_fri <- var_unorder_indices[sub_array_of_fri] - } - fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position), - list(value = sub_array_of_fri))) - } -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> FINISHED ITERATING ALONG CHUNKS") -} -} - } else { - stop("Provided array of indices for dimension '", inner_dim, "', ", - "which goes across the file dimension '", file_dim, "', but ", - "the provided array does not have the dimension '", inner_dim, - "', which is mandatory.") - } - } - } - } -if (debug) { -if (inner_dim %in% dims_to_check) { -print("-> PROCEEDING TO CROP VARIABLES") -} -} - #if ((length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last'))) { - #if (!is.null(var_with_selectors_name) || (is.null(var_with_selectors_name) && is.character(selector_array) && - # (length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last')))) { - empty_chunks <- which(!taken_chunks) - if (length(empty_chunks) >= length(taken_chunks)) { - stop("Selectors do not match any of the possible values for the dimension '", inner_dim, "'.") - } - if (length(empty_chunks) > 0) { -# # Get the first group of chunks to remove, and remove them. -# # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 1 and 2 -# dist <- abs(rev(empty_chunks) - c(rev(empty_chunks)[1] - 1, head(rev(empty_chunks), length(rev(empty_chunks)) - 1))) -# if (all(dist == 1)) { -# start_chunks_to_remove <- NULL -# } else { -# first_chunk_to_remove <- tail(which(dist > 1), 1) -# start_chunks_to_remove <- rev(rev(empty_chunks)[first_chunk_to_remove:length(empty_chunks)]) -# } -# # Get the last group of chunks to remove, and remove them. -# # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 8 and 9 -# dist <- abs(empty_chunks - c(empty_chunks[1] - 1, head(empty_chunks, length(empty_chunks) - 1))) -# if (all(dist == 1)) { -# first_chunk_to_remove <- 1 -# } else { -# first_chunk_to_remove <- tail(which(dist > 1), 1) -# } -# end_chunks_to_remove <- empty_chunks[first_chunk_to_remove:length(empty_chunks)] -# chunks_to_keep <- which(!((1:length(taken_chunks)) %in% c(start_chunks_to_remove, end_chunks_to_remove))) - chunks_to_keep <- which(taken_chunks) - dims_to_crop[[file_dim]] <- c(dims_to_crop[[file_dim]], list(chunks_to_keep)) -# found_indices <- Subset(found_indices, file_dim, chunks_to_keep) -# # Crop dataset variables file dims. -# for (picked_var in names(picked_vars[[i]])) { -# if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) { -# picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, chunks_to_keep) -# } -# } - } - #} - dat[[i]][['selectors']][[inner_dim]] <- list(fri = fri, sri = sri) - # Crop dataset variables inner dims. - # Crop common variables inner dims. - types_of_var_to_crop <- 'picked' - if (with_transform) { - types_of_var_to_crop <- c(types_of_var_to_crop, 'transformed') - } - if (!is.null(dim_reorder_params[[inner_dim]])) { - types_of_var_to_crop <- c(types_of_var_to_crop, 'reordered') - } - for (type_of_var_to_crop in types_of_var_to_crop) { - if (type_of_var_to_crop == 'transformed') { - if (is.null(tvi)) { - if (!is.null(dim_reorder_params[[inner_dim]])) { - crop_indices <- unique(unlist(ordered_sri)) - } else { - crop_indices <- unique(unlist(sri)) - } - } else { - crop_indices <- unique(unlist(tvi)) - } - vars_to_crop <- transformed_vars[[i]] - common_vars_to_crop <- transformed_common_vars - } else if (type_of_var_to_crop == 'reordered') { - crop_indices <- unique(unlist(ordered_fri)) - vars_to_crop <- picked_vars_ordered[[i]] - common_vars_to_crop <- picked_common_vars_ordered - } else { - crop_indices <- unique(unlist(fri)) - vars_to_crop <- picked_vars[[i]] - common_vars_to_crop <- picked_common_vars - } - for (var_to_crop in names(vars_to_crop)) { - if (inner_dim %in% names(dim(vars_to_crop[[var_to_crop]]))) { - if (!is.null(crop_indices)) { - if (type_of_var_to_crop == 'transformed') { - if (!aiat) { - vars_to_crop[[var_to_crop]] <- Subset(transformed_subset_var, inner_dim, crop_indices) - } else { - vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices) - } - } else { - vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices) - } - } - } - } - if (i == length(dat)) { - for (common_var_to_crop in names(common_vars_to_crop)) { - if (inner_dim %in% names(dim(common_vars_to_crop[[common_var_to_crop]]))) { - common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) - } - } - } - if (type_of_var_to_crop == 'transformed') { - if (!is.null(vars_to_crop)) { - transformed_vars[[i]] <- vars_to_crop - } - if (i == length(dat)) { - transformed_common_vars <- common_vars_to_crop - } - } else if (type_of_var_to_crop == 'reordered') { - if (!is.null(vars_to_crop)) { - picked_vars_ordered[[i]] <- vars_to_crop - } - if (i == length(dat)) { - picked_common_vars_ordered <- common_vars_to_crop - } - } else { - if (!is.null(vars_to_crop)) { - picked_vars[[i]] <- vars_to_crop - } - if (i == length(dat)) { - picked_common_vars <- common_vars_to_crop - } - } - } - #} - } - # After the selectors have been picked (using the original variables), - # the variables are transformed. At that point, the original selectors - # for the transformed variables are also kept in the variable original_selectors. -#print("L") - } - } - } -# if (!is.null(transformed_common_vars)) { -# picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars -# } - # Remove the trailing chunks, if any. - for (file_dim in names(dims_to_crop)) { -# indices_to_keep <- min(sapply(dims_to_crop[[file_dim]], min)):max(sapply(dims_to_crop[[file_dim]], max)) - ## TODO: Merge indices in dims_to_crop with some advanced mechanism? - indices_to_keep <- unique(unlist(dims_to_crop[[file_dim]])) - array_of_files_to_load <- Subset(array_of_files_to_load, file_dim, indices_to_keep) - array_of_not_found_files <- Subset(array_of_not_found_files, file_dim, indices_to_keep) - for (i in 1:length(dat)) { - # Crop selectors - for (selector_dim in names(dat[[i]][['selectors']])) { - if (selector_dim == file_dim) { - for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['fri']])) { - dat[[i]][['selectors']][[selector_dim]][['fri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['fri']][[j]][indices_to_keep] - } - for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['sri']])) { - dat[[i]][['selectors']][[selector_dim]][['sri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['sri']][[j]][indices_to_keep] - } - } - if (file_dim %in% names(dim(dat[[i]][['selectors']][[selector_dim]][['fri']]))) { - dat[[i]][['selectors']][[selector_dim]][['fri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['fri']], file_dim, indices_to_keep) - dat[[i]][['selectors']][[selector_dim]][['sri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['sri']], file_dim, indices_to_keep) - } - } - # Crop dataset variables file dims. - for (picked_var in names(picked_vars[[i]])) { - if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) { - picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, indices_to_keep) - } - } - for (transformed_var in names(transformed_vars[[i]])) { - if (file_dim %in% names(dim(transformed_vars[[i]][[transformed_var]]))) { - transformed_vars[[i]][[transformed_var]] <- Subset(transformed_vars[[i]][[transformed_var]], file_dim, indices_to_keep) - } - } - } - # Crop common variables file dims. - for (picked_common_var in names(picked_common_vars)) { - if (file_dim %in% names(dim(picked_common_vars[[picked_common_var]]))) { - picked_common_vars[[picked_common_var]] <- Subset(picked_common_vars[[picked_common_var]], file_dim, indices_to_keep) - } - } - for (transformed_common_var in names(transformed_common_vars)) { - if (file_dim %in% names(dim(transformed_common_vars[[transformed_common_var]]))) { - transformed_common_vars[[transformed_common_var]] <- Subset(transformed_common_vars[[transformed_common_var]], file_dim, indices_to_keep) - } - } - } - # Calculate the size of the final array. - total_inner_dims <- NULL - for (i in 1:length(dat)) { - if (dataset_has_files[i]) { - inner_dims <- expected_inner_dims[[i]] - inner_dims <- sapply(inner_dims, - function(x) { - if (!all(sapply(dat[[i]][['selectors']][[x]][['sri']], is.null))) { - max(sapply(dat[[i]][['selectors']][[x]][['sri']], length)) - } else { - if (length(var_params[[x]]) > 0) { - if (var_params[[x]] %in% names(transformed_vars[[i]])) { - length(transformed_vars[[i]][[var_params[[x]]]]) - } else if (var_params[[x]] %in% names(transformed_common_vars)) { - length(transformed_common_vars[[var_params[[x]]]]) - } else { - max(sapply(dat[[i]][['selectors']][[x]][['fri']], length)) - } - } else { - max(sapply(dat[[i]][['selectors']][[x]][['fri']], length)) - } - } - }) - names(inner_dims) <- expected_inner_dims[[i]] - if (is.null(total_inner_dims)) { - total_inner_dims <- inner_dims - } else { - new_dims <- .MergeArrayDims(total_inner_dims, inner_dims) - total_inner_dims <- new_dims[[3]] - } - } - } - new_dims <- .MergeArrayDims(dim(array_of_files_to_load), total_inner_dims) - final_dims <- new_dims[[3]][dim_names] - # final_dims_fake is the vector of final dimensions after having merged the - # 'across' file dimensions with the respective 'across' inner dimensions, and - # after having broken into multiple dimensions those dimensions for which - # multidimensional selectors have been provided. - # final_dims will be used for collocation of data, whereas final_dims_fake - # will be used for shaping the final array to be returned to the user. - final_dims_fake <- final_dims - if (merge_across_dims) { - if (!is.null(inner_dims_across_files)) { - for (file_dim_across in names(inner_dims_across_files)) { - inner_dim_pos <- which(names(final_dims_fake) == inner_dims_across_files[[file_dim_across]]) - new_dims <- c() - if (inner_dim_pos > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(inner_dim_pos - 1)]) - } - new_dims <- c(new_dims, setNames(prod(final_dims_fake[c(inner_dim_pos, inner_dim_pos + 1)]), - inner_dims_across_files[[file_dim_across]])) - if (inner_dim_pos + 1 < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(inner_dim_pos + 2):length(final_dims_fake)]) - } - final_dims_fake <- new_dims - } - } - } - all_split_dims <- NULL - if (split_multiselected_dims) { - for (dim_param in 1:length(dim_params)) { - if (!is.null(dim(dim_params[[dim_param]]))) { - if (length(dim(dim_params[[dim_param]])) > 1) { - split_dims <- dim(dim_params[[dim_param]]) - all_split_dims <- c(all_split_dims, setNames(list(split_dims), - names(dim_params)[dim_param])) - if (is.null(names(split_dims))) { - names(split_dims) <- paste0(names(dim_params)[dim_param], - 1:length(split_dims)) - } - old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param]) - -# NOTE: Three steps to create new dims. -# 1st: Put in the dims before split_dim. -# 2nd: Replace the old_dim with split_dims. -# 3rd: Put in the dims after split_dim. - new_dims <- c() - if (old_dim_pos > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)]) - } - new_dims <- c(new_dims, split_dims) - if (old_dim_pos < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)]) - } - final_dims_fake <- new_dims - } - } - } - } - if (merge_across_dims_narm) { - # only merge_across_dims -> the 'time' dim length needs to be adjusted - across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? - across_file_dim <- names(inner_dims_across_files) #TODO: more than one? - # Get the length of each inner_dim ('time') along each file_dim ('file_date') - length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) - - if (!split_multiselected_dims) { - final_dims_fake_name <- names(final_dims_fake) - pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) - new_length_inner_dim <- sum(unlist(length_inner_across_dim)) - if (pos_across_inner_dim != length(final_dims_fake)) { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim, - final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) - } else { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim) - } - names(final_dims_fake) <- final_dims_fake_name - } - } - - if (!silent) { - .message("Detected dimension sizes:") - longest_dim_len <- max(sapply(names(final_dims_fake), nchar)) - longest_size_len <- max(sapply(paste0(final_dims_fake, ''), nchar)) - sapply(names(final_dims_fake), - function(x) { - message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), - x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_fake[x], ''))), collapse = ''), - final_dims_fake[x])) - }) - bytes <- prod(c(final_dims_fake, 8)) - dim_sizes <- paste(final_dims_fake, collapse = ' x ') - if (retrieve) { - .message(paste("Total size of requested data:")) - } else { - .message(paste("Total size of involved data:")) - } - .message(paste(dim_sizes, " x 8 bytes =", - format(structure(bytes, class = "object_size"), units = "auto")), - indent = 2) - } - -# NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed. -# The inner_dim needs to be the first dim among split dims. -# Cannot control the rest dims are in the same order or not... -# Suppose users put the same order of across inner and file dims. - if (split_multiselected_dims & merge_across_dims) { - # TODO: More than one split? - inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files) - # if inner_dim is not the first, change! - if (inner_dim_pos_in_split_dims != 1) { - split_dims <- c(split_dims[inner_dim_pos_in_split_dims], - split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims]) - split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(split_dims)))) - # Save the current final_dims_fake for later reorder back - final_dims_fake_output <- final_dims_fake - new_dims <- c() - if (split_dims_pos[1] != 1) { - new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) - } - new_dims <- c(new_dims, split_dims) - if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) - } - final_dims_fake <- new_dims - } - } - - # The following several lines will only be run if retrieve = TRUE - if (retrieve) { - - ########## CREATING THE SHARED MATRIX AND DISPATCHING WORK PIECES ########### - # TODO: try performance of storing all in cols instead of rows - # Create the shared memory array, and a pointer to it, to be sent - # to the work pieces. - data_array <- big.matrix(nrow = prod(final_dims), ncol = 1) - shared_matrix_pointer <- describe(data_array) - if (is.null(num_procs)) { - num_procs <- availableCores() - } - # Creating a shared tmp folder to store metadata from each chunk - array_of_metadata_flags <- array(FALSE, dim = dim(array_of_files_to_load)) - if (!is.null(metadata_dims)) { - metadata_indices_to_load <- as.list(rep(1, length(dim(array_of_files_to_load)))) - names(metadata_indices_to_load) <- names(dim(array_of_files_to_load)) - metadata_indices_to_load[metadata_dims] <- as.list(rep(TRUE, length(metadata_dims))) - array_of_metadata_flags <- do.call('[<-', c(list(array_of_metadata_flags), metadata_indices_to_load, - list(value = rep(TRUE, prod(dim(array_of_files_to_load)[metadata_dims]))))) - } - metadata_file_counter <- 0 - metadata_folder <- tempfile('metadata') - dir.create(metadata_folder) - # Build the work pieces, each with: - # - file path - # - total size (dims) of store array - # - start position in store array - # - file selectors (to provide extra info. useful e.g. to select variable) - # - indices to take from file - work_pieces <- list() - for (i in 1:length(dat)) { - if (dataset_has_files[i]) { - selectors <- dat[[i]][['selectors']] - file_dims <- found_file_dims[[i]] - inner_dims <- expected_inner_dims[[i]] - sub_array_dims <- final_dims[file_dims] - sub_array_dims[found_pattern_dim] <- 1 - sub_array_of_files_to_load <- array(1:prod(sub_array_dims), - dim = sub_array_dims) - names(dim(sub_array_of_files_to_load)) <- names(sub_array_dims) - # Detect which of the dimensions of the dataset go across files. - file_dim_across_files <- lapply(inner_dims, - function(x) { - dim_across <- sapply(inner_dims_across_files, function(y) x %in% y) - if (any(dim_across)) { - names(inner_dims_across_files)[which(dim_across)[1]] - } else { - NULL - } - }) - names(file_dim_across_files) <- inner_dims - j <- 1 - while (j <= prod(sub_array_dims)) { - # Work out file path. - file_to_load_sub_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] - names(file_to_load_sub_indices) <- names(sub_array_dims) - file_to_load_sub_indices[found_pattern_dim] <- i - big_dims <- rep(1, length(dim(array_of_files_to_load))) - names(big_dims) <- names(dim(array_of_files_to_load)) - file_to_load_indices <- .MergeArrayDims(file_to_load_sub_indices, big_dims)[[1]] - file_to_load <- do.call('[[', c(list(array_of_files_to_load), - as.list(file_to_load_indices))) - not_found_file <- do.call('[[', c(list(array_of_not_found_files), - as.list(file_to_load_indices))) - load_file_metadata <- do.call('[', c(list(array_of_metadata_flags), - as.list(file_to_load_indices))) - if (load_file_metadata) { - metadata_file_counter <- metadata_file_counter + 1 - } - if (!is.na(file_to_load) && !not_found_file) { - # Work out indices to take - first_round_indices <- lapply(inner_dims, - function (x) { - if (is.null(file_dim_across_files[[x]])) { - selectors[[x]][['fri']][[1]] - } else { - which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] - selectors[[x]][['fri']][[which_chunk]] - } - }) - names(first_round_indices) <- inner_dims - second_round_indices <- lapply(inner_dims, - function (x) { - if (is.null(file_dim_across_files[[x]])) { - selectors[[x]][['sri']][[1]] - } else { - which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] - selectors[[x]][['sri']][[which_chunk]] - } - }) -if (debug) { -print("-> BUILDING A WORK PIECE") -#print(str(selectors)) -} - names(second_round_indices) <- inner_dims - if (!any(sapply(first_round_indices, length) == 0)) { - work_piece <- list() - work_piece[['first_round_indices']] <- first_round_indices - work_piece[['second_round_indices']] <- second_round_indices - work_piece[['file_indices_in_array_of_files']] <- file_to_load_indices - work_piece[['file_path']] <- file_to_load - work_piece[['store_dims']] <- final_dims - # Work out store position - store_position <- final_dims - store_position[names(file_to_load_indices)] <- file_to_load_indices - store_position[inner_dims] <- rep(1, length(inner_dims)) - work_piece[['store_position']] <- store_position - # Work out file selectors - file_selectors <- sapply(file_dims, - function (x) { - vector_to_pick <- 1 - if (x %in% names(depending_file_dims)) { - vector_to_pick <- file_to_load_indices[depending_file_dims[[x]]] - } - selectors[file_dims][[x]][[vector_to_pick]][file_to_load_indices[x]] - }) - names(file_selectors) <- file_dims - work_piece[['file_selectors']] <- file_selectors - # Send variables for transformation - if (!is.null(transform) && (length(transform_vars) > 0)) { - vars_to_transform <- NULL - picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars) - if (length(picked_vars_to_transform) > 0) { - picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform] - vars_to_transform <- c(vars_to_transform, picked_vars[[i]][picked_vars_to_transform]) - if (any(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))) { - picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))] - vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[[i]][picked_vars_ordered_to_transform] - } - } - picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) - if (length(picked_common_vars_to_transform) > 0) { - picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] - vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform]) - if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) { - picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))] - vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform] - } - } - work_piece[['vars_to_transform']] <- vars_to_transform - } - # Send flag to load metadata - if (load_file_metadata) { - work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter) - } - work_pieces <- c(work_pieces, list(work_piece)) - } - } - j <- j + 1 - } - } - } -#print("N") -if (debug) { -print("-> WORK PIECES BUILT") -} - - # Calculate the progress %s that will be displayed and assign them to - # the appropriate work pieces. - if (length(work_pieces) / num_procs >= 2 && !silent) { - if (length(work_pieces) / num_procs < 10) { - amount <- 100 / ceiling(length(work_pieces) / num_procs) - reps <- ceiling(length(work_pieces) / num_procs) - } else { - amount <- 10 - reps <- 10 - } - progress_steps <- rep(amount, reps) - if (length(work_pieces) < (reps + 1)) { - selected_pieces <- length(work_pieces) - progress_steps <- c(sum(head(progress_steps, reps)), - tail(progress_steps, reps)) - } else { - selected_pieces <- round(seq(1, length(work_pieces), - length.out = reps + 1))[-1] - } - progress_steps <- paste0(' + ', round(progress_steps, 2), '%') - progress_message <- 'Progress: 0%' - } else { - progress_message <- '' - selected_pieces <- NULL - } - piece_counter <- 1 - step_counter <- 1 - work_pieces <- lapply(work_pieces, - function (x) { - if (piece_counter %in% selected_pieces) { - wp <- c(x, list(progress_amount = progress_steps[step_counter])) - step_counter <<- step_counter + 1 - } else { - wp <- x - } - piece_counter <<- piece_counter + 1 - wp - }) - if (!silent) { - .message("If the size of the requested data is close to or above the free shared RAM memory, R may crash.") - .message("If the size of the requested data is close to or above the half of the free RAM memory, R may crash.") - .message(paste0("Will now proceed to read and process ", length(work_pieces), " data files:")) - if (length(work_pieces) < 30) { - lapply(work_pieces, function (x) .message(x[['file_path']], indent = 2)) - } else { - .message("The list of files is long. You can check it after Start() finishes in the output '$Files'.", indent = 2, exdent = 5) - } - } - - # Build the cluster of processes that will do the work and dispatch work pieces. - # The function .LoadDataFile is applied to each work piece. This function will - # open the data file, regrid if needed, subset, apply the mask, - # compute and apply the weights if needed, - # disable extreme values and store in the shared memory matrix. -#print("O") - if (!silent) { - .message("Loading... This may take several minutes...") - if (progress_message != '') { - .message(progress_message, appendLF = FALSE) - } - } - if (num_procs == 1) { - found_files <- lapply(work_pieces, .LoadDataFile, - shared_matrix_pointer = shared_matrix_pointer, - file_data_reader = file_data_reader, - synonims = synonims, - transform = transform, - transform_params = transform_params, - silent = silent, debug = debug) - } else { - cluster <- makeCluster(num_procs, outfile = "") - # Send the heavy work to the workers - work_errors <- try({ - found_files <- clusterApplyLB(cluster, work_pieces, .LoadDataFile, - shared_matrix_pointer = shared_matrix_pointer, - file_data_reader = file_data_reader, - synonims = synonims, - transform = transform, - transform_params = transform_params, - silent = silent, debug = debug) - }) - stopCluster(cluster) - } - - if (!silent) { - if (progress_message != '') { - .message("\n", tag = '') - } - } -#print("P") - -# NOTE: If merge_across_dims = TRUE, there might be additional NAs due to -# unequal inner_dim ('time') length across file_dim ('file_date'). -# If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. -# TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. - if (merge_across_dims_narm) { - - # Get the length of these two dimensions in final_dims - length_inner_across_store_dims <- final_dims[across_inner_dim] - length_file_across_store_dims <- final_dims[across_file_dim] - - # Create a logical array for merge_across_dims - logi_array <- array(rep(FALSE, - length_file_across_store_dims * length_inner_across_store_dims), - dim = c(length_inner_across_store_dims, length_file_across_store_dims)) - for (i in 1:length_file_across_store_dims) { #1:4 - logi_array[1:length_inner_across_dim[[i]], i] <- TRUE - } - - # First, get the data array with final_dims dimension - data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) - - # Change the NA derived from additional spaces to -9999, then remove these -9999 - func_remove_blank <- function(data_array, logi_array) { - # dim(data_array) = [time, file_date] - # dim(logi_array) = [time, file_date] - # Change the blank spaces from NA to -9999 - data_array[which(!logi_array)] <- -9999 - return(data_array) - } - data_array_final_dims <- multiApply::Apply(data_array_final_dims, - target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - ## reorder back to the correct dim - tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- .aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - - data_array <- array(data_array_tmp, dim = final_dims_fake) - - } else { # merge_across_dims_narm = F (old version) - data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) - } - -# NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above. -# To get the user-required dim order, we need to reorder the array again. - if (split_multiselected_dims & merge_across_dims) { - if (inner_dim_pos_in_split_dims != 1) { - correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) - data_array <- .aperm2(data_array, correct_order) - } - } - - gc() - - # Load metadata and remove the metadata folder - if (!is.null(metadata_dims)) { - loaded_metadata_files <- list.files(metadata_folder) - loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS) - unlink(metadata_folder, recursive = TRUE) - return_metadata <- vector('list', length = prod(dim(array_of_metadata_flags)[metadata_dims])) - return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata - dim(return_metadata) <- dim(array_of_metadata_flags[metadata_dims]) - attr(data_array, 'Variables') <- return_metadata - # TODO: Try to infer data type from loaded_metadata - # as.integer(data_array) - } - - failed_pieces <- work_pieces[which(unlist(found_files))] - for (failed_piece in failed_pieces) { - array_of_not_found_files <- do.call('[<-', - c(list(array_of_not_found_files), - as.list(failed_piece[['file_indices_in_array_of_files']]), - list(value = TRUE))) - } - if (any(array_of_not_found_files)) { - for (i in 1:prod(dim(array_of_files_to_load))) { - if (is.na(array_of_not_found_files[i])) { - array_of_files_to_load[i] <- NA - } else { - if (array_of_not_found_files[i]) { - array_of_not_found_files[i] <- array_of_files_to_load[i] - array_of_files_to_load[i] <- NA - } else { - array_of_not_found_files[i] <- NA - } - } - } - } else { - array_of_not_found_files <- NULL - } - - } # End if (retrieve) - - # Change final_dims_fake back because retrieve = FALSE will use it for attributes later - if (exists("final_dims_fake_output")) { - final_dims_fake <- final_dims_fake_output - } - # Replace the vars and common vars by the transformed vars and common vars - for (i in 1:length(dat)) { - if (length(names(transformed_vars[[i]])) > 0) { - picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]] - } else if (length(names(picked_vars_ordered[[i]])) > 0) { - picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]] - } - } - if (length(names(transformed_common_vars)) > 0) { - picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars - } else if (length(names(picked_common_vars_ordered)) > 0) { - picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered - } -if (debug) { -print("-> THE TRANSFORMED VARS:") -print(str(transformed_vars)) -print("-> THE PICKED VARS:") -print(str(picked_vars)) -} - - file_selectors <- NULL - for (i in 1:length(dat)) { - file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])] - } - if (retrieve) { - if (!silent) { - .message("Successfully retrieved data.") - } - var_backup <- attr(data_array, 'Variables')[[1]] - attr(data_array, 'Variables') <- NULL - attributes(data_array) <- c(attributes(data_array), - list(Variables = c(list(common = c(picked_common_vars, var_backup)), - picked_vars), - Files = array_of_files_to_load, - NotFoundFiles = array_of_not_found_files, - FileSelectors = file_selectors, - PatternDim = found_pattern_dim) - ) - attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class')) - data_array - } else { - if (!silent) { - .message("Successfully discovered data dimensions.") - } - start_call <- match.call() - for (i in 2:length(start_call)) { - if (class(start_call[[i]]) %in% c('name', 'call')) { - start_call[[i]] <- eval.parent(start_call[[i]]) - } - } - start_call[['retrieve']] <- TRUE - attributes(start_call) <- c(attributes(start_call), - list(Dimensions = final_dims_fake, - Variables = c(list(common = picked_common_vars), picked_vars), - ExpectedFiles = array_of_files_to_load, - FileSelectors = file_selectors, - PatternDim = found_pattern_dim, - MergedDims = if (merge_across_dims) { - inner_dims_across_files - } else { - NULL - }, - SplitDims = if (split_multiselected_dims) { - all_split_dims - } else { - NULL - }) - ) - attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class')) - start_call - } -} - -# This function is the responsible for loading the data of each work -# piece. -.LoadDataFile <- function(work_piece, shared_matrix_pointer, - file_data_reader, synonims, - transform, transform_params, - silent = FALSE, debug = FALSE) { -# suppressPackageStartupMessages({library(bigmemory)}) -### TODO: Specify dependencies as parameter -# suppressPackageStartupMessages({library(ncdf4)}) - -#print("1") - store_indices <- as.list(work_piece[['store_position']]) - first_round_indices <- work_piece[['first_round_indices']] - second_round_indices <- work_piece[['second_round_indices']] -#print("2") - file_to_open <- work_piece[['file_path']] - sub_array <- file_data_reader(file_to_open, NULL, - work_piece[['file_selectors']], - first_round_indices, synonims) -if (debug) { -if (all(unlist(store_indices[1:6]) == 1)) { -print("-> LOADING A WORK PIECE") -print("-> STRUCTURE OF READ UNTRANSFORMED DATA:") -print(str(sub_array)) -print("-> STRUCTURE OF VARIABLES TO TRANSFORM:") -print(str(work_piece[['vars_to_transform']])) -print("-> COMMON ARRAY DIMENSIONS:") -print(str(work_piece[['store_dims']])) -} -} - if (!is.null(sub_array)) { - # Apply data transformation once we have the data arrays. - if (!is.null(transform)) { -if (debug) { -if (all(unlist(store_indices[1:6]) == 1)) { -print("-> PROCEEDING TO TRANSFORM ARRAY") -print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:") -print(dim(sub_array)) -} -} - sub_array <- do.call(transform, c(list(data_array = sub_array, - variables = work_piece[['vars_to_transform']], - file_selectors = work_piece[['file_selectors']]), - transform_params)) -if (debug) { -if (all(unlist(store_indices[1:6]) == 1)) { -print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:") -print(str(sub_array)) -print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:") -print(dim(sub_array$data_array)) -} -} - sub_array <- sub_array$data_array - # Subset with second round of indices - dims_to_crop <- which(!sapply(second_round_indices, is.null)) - if (length(dims_to_crop) > 0) { - dimnames_to_crop <- names(second_round_indices)[dims_to_crop] - sub_array <- Subset(sub_array, dimnames_to_crop, - second_round_indices[dimnames_to_crop]) - } -if (debug) { -if (all(unlist(store_indices[1:6]) == 1)) { -print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:") -print(str(sub_array)) -} -} - } - - metadata <- attr(sub_array, 'variables') - - names_bk <- names(store_indices) - store_indices <- lapply(names(store_indices), - function (x) { - if (!(x %in% names(first_round_indices))) { - store_indices[[x]] - } else if (is.null(second_round_indices[[x]])) { - 1:dim(sub_array)[x] - } else { - if (is.numeric(second_round_indices[[x]])) { - ## TODO: Review carefully this line. Inner indices are all - ## aligned to the left-most positions. If dataset A has longitudes - ## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then - ## they will be stored as follows: - ## 1, 2, 3, 4 - ## 3, 4, NA, NA - ##x - min(x) + 1 - 1:length(second_round_indices[[x]]) - } else { - 1:length(second_round_indices[[x]]) - } - } - }) - names(store_indices) <- names_bk -if (debug) { -if (all(unlist(store_indices) == 1)) { -print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:") -print(str(first_round_indices)) -print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:") -print(str(second_round_indices)) -print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:") -print(str(store_indices)) -} -} - - store_indices <- lapply(store_indices, as.integer) - store_dims <- work_piece[['store_dims']] - - # split the storage work of the loaded subset in parts - largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))] - max_parts <- length(store_indices[[largest_dim_name]]) - - # Indexing a data file of N MB with expand.grid takes 30*N MB - # The peak ram of Start is, minimum, 2 * total data to load from all files - # due to inefficiencies in other regions of the code - # The more parts we split the indexing done below in, the lower - # the memory footprint of the indexing and the fast. - # But more than 10 indexing iterations (parts) for each MB processed - # makes the iteration slower (tested empirically on BSC workstations). - subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024 - best_n_parts <- ceiling(subset_size_in_mb * 10) - # We want to set n_parts to a greater value than the one that would - # result in a memory footprint (of the subset indexing code below) equal - # to 2 * total data to load from all files. - # s = subset size in MB - # p = number of parts to break it in - # T = total size of data to load - # then, s / p * 30 = 2 * T - # then, p = s * 15 / T - min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims)) - # Make sure we pick n_parts much greater than the minimum calculated - n_parts <- min_n_parts * 10 - if (n_parts > best_n_parts) { - n_parts <- best_n_parts - } - # Boundary checks - if (n_parts < 1) { - n_parts <- 1 - } - if (n_parts > max_parts) { - n_parts <- max_parts - } - - if (n_parts > 1) { - make_parts <- function(length, n) { - clusters <- cut(1:length, n, labels = FALSE) - lapply(1:n, function(y) which(clusters == y)) - } - part_indices <- make_parts(max_parts, n_parts) - parts <- lapply(part_indices, - function(x) { - store_indices[[largest_dim_name]][x] - }) - } else { - part_indices <- list(1:max_parts) - parts <- store_indices[largest_dim_name] - } - - # do the storage work - weights <- sapply(1:length(store_dims), - function(i) prod(c(1, store_dims)[1:i])) - part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array)))) - names(part_indices_in_sub_array) <- names(dim(sub_array)) - data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) - for (i in 1:n_parts) { - store_indices[[largest_dim_name]] <- parts[[i]] - # Converting array indices to vector indices - matrix_indices <- do.call("expand.grid", store_indices) - # Given a matrix where each row is a set of array indices of an element - # the vector indices are computed - matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights) - part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]] - data_array[matrix_indices] <- as.vector(do.call('[', - c(list(x = sub_array), - part_indices_in_sub_array))) - } - rm(data_array) - gc() - - if (!is.null(work_piece[['save_metadata_in']])) { - saveRDS(metadata, file = work_piece[['save_metadata_in']]) - } - } - if (!is.null(work_piece[['progress_amount']]) && !silent) { - message(work_piece[['progress_amount']], appendLF = FALSE) - } - is.null(sub_array) -} - +Start <- function(..., # dim = indices/selectors, + # dim_var = 'var', + # dim_reorder = Sort/CircularSort, + # dim_tolerance = number, + # dim_depends = 'file_dim', + # dim_across = 'file_dim', + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = FALSE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + retrieve = FALSE, + num_procs = 1, + silent = FALSE, debug = FALSE) { + #, config_file = NULL + #dictionary_dim_names = , + #dictionary_var_names = + dim_params <- list(...) + + # Take *_var parameters apart + var_params_ind <- grep('_var$', names(dim_params)) + var_params <- dim_params[var_params_ind] + # Check all *_var are NULL or vectors of character strings, and + # that they all have a matching dimension param. + i <- 1 + for (var_param in var_params) { + if (!is.character(var_param)) { + stop("All '*_var' parameters must be character strings.") + } else if (!any(grepl(paste0('^', strsplit(names(var_params)[i], + '_var$')[[1]][1], '$'), + names(dim_params)))) { + stop(paste0("All '*_var' parameters must be associated to a dimension parameter. Found parameter '", + names(var_params)[i], "' but no parameter '", + strsplit(names(var_params)[i], '_var$')[[1]][1], "'.")) + } + i <- i + 1 + } + # Make the keys of 'var_params' to be the name of + # the corresponding dimension. + if (length(var_params) < 1) { + var_params <- NULL + } else { + names(var_params) <- gsub('_var$', '', names(var_params)) + } + + # Take *_reorder parameters apart + dim_reorder_params_ind <- grep('_reorder$', names(dim_params)) + dim_reorder_params <- dim_params[dim_reorder_params_ind] + # Make the keys of 'dim_reorder_params' to be the name of + # the corresponding dimension. + if (length(dim_reorder_params) < 1) { + dim_reorder_params <- NULL + } else { + names(dim_reorder_params) <- gsub('_reorder$', '', names(dim_reorder_params)) + } + + # Take *_tolerance parameters apart + tolerance_params_ind <- grep('_tolerance$', names(dim_params)) + tolerance_params <- dim_params[tolerance_params_ind] + + # Take *_depends parameters apart + depends_params_ind <- grep('_depends$', names(dim_params)) + depends_params <- dim_params[depends_params_ind] + # Check all *_depends are NULL or vectors of character strings, and + # that they all have a matching dimension param. + i <- 1 + for (depends_param in depends_params) { + if (!is.character(depends_param) || (length(depends_param) > 1)) { + stop("All '*_depends' parameters must be single character strings.") + } else if (!any(grepl(paste0('^', strsplit(names(depends_params)[i], + '_depends$')[[1]][1], '$'), + names(dim_params)))) { + stop(paste0("All '*_depends' parameters must be associated to a dimension parameter. Found parameter '", + names(depends_params)[i], "' but no parameter '", + strsplit(names(depends_params)[i], '_depends$')[[1]][1], "'.")) + } + i <- i + 1 + } + # Make the keys of 'depends_params' to be the name of + # the corresponding dimension. + if (length(depends_params) < 1) { + depends_params <- NULL + } else { + names(depends_params) <- gsub('_depends$', '', names(depends_params)) + } + # Change name to depending_file_dims + depending_file_dims <- depends_params + + # Take *_across parameters apart + across_params_ind <- grep('_across$', names(dim_params)) + across_params <- dim_params[across_params_ind] + # Check all *_across are NULL or vectors of character strings, and + # that they all have a matching dimension param. + i <- 1 + for (across_param in across_params) { + if (!is.character(across_param) || (length(across_param) > 1)) { + stop("All '*_across' parameters must be single character strings.") + } else if (!any(grepl(paste0('^', strsplit(names(across_params)[i], + '_across$')[[1]][1], '$'), + names(dim_params)))) { + stop(paste0("All '*_across' parameters must be associated to a dimension parameter. Found parameter '", + names(across_params)[i], "' but no parameter '", + strsplit(names(across_params)[i], '_across$')[[1]][1], "'.")) + } + i <- i + 1 + } + # Make the keys of 'across_params' to be the name of + # the corresponding dimension. + if (length(across_params) < 1) { + across_params <- NULL + } else { + names(across_params) <- gsub('_across$', '', names(across_params)) + } + # Change name to inner_dims_across_files + inner_dims_across_files <- across_params + + # Check merge_across_dims + if (!is.logical(merge_across_dims)) { + stop("Parameter 'merge_across_dims' must be TRUE or FALSE.") + } + + # Check merge_across_dims_narm + if (!is.logical(merge_across_dims_narm)) { + stop("Parameter 'merge_across_dims_narm' must be TRUE or FALSE.") + } + if (!merge_across_dims & merge_across_dims_narm) { + merge_across_dims_narm <- FALSE + warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ", + "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'", + " to FALSE.")) + } + + # Leave alone the dimension parameters in the variable dim_params + if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind, + depends_params_ind, across_params_ind)) > 0) { + dim_params <- dim_params[-c(var_params_ind, dim_reorder_params_ind, + tolerance_params_ind, depends_params_ind, + across_params_ind)] + # Reallocating pairs of across file and inner dimensions if they have + # to be merged. They are put one next to the other to ease merge later. + if (merge_across_dims) { + for (inner_dim_across in names(inner_dims_across_files)) { + inner_dim_pos <- which(names(dim_params) == inner_dim_across) + file_dim_pos <- which(names(dim_params) == inner_dims_across_files[[inner_dim_across]]) + new_pos <- inner_dim_pos + if (file_dim_pos < inner_dim_pos) { + new_pos <- new_pos - 1 + } + dim_params_to_move <- dim_params[c(inner_dim_pos, file_dim_pos)] + dim_params <- dim_params[-c(inner_dim_pos, file_dim_pos)] + new_dim_params <- list() + if (new_pos > 1) { + new_dim_params <- c(new_dim_params, dim_params[1:(new_pos - 1)]) + } + new_dim_params <- c(new_dim_params, dim_params_to_move) + if (length(dim_params) >= new_pos) { + new_dim_params <- c(new_dim_params, dim_params[new_pos:length(dim_params)]) + } + dim_params <- new_dim_params + } + } + } + dim_names <- names(dim_params) + if (is.null(dim_names)) { + stop("At least one pattern dim must be specified.") + } + + # Look for chunked dims + chunks <- vector('list', length(dim_names)) + names(chunks) <- dim_names + for (dim_name in dim_names) { + if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) { + chunks[[dim_name]] <- attr(dim_params[[dim_name]], 'chunk') + attributes(dim_params[[dim_name]]) <- attributes(dim_params[[dim_name]])[-which(names(attributes(dim_params[[dim_name]])) == 'chunk')] + } else { + chunks[[dim_name]] <- c(chunk = 1, n_chunks = 1) + } + } + # This is a helper function to compute the chunk indices to take once the total + # number of indices for a dimension has been discovered. + chunk_indices <- function(n_indices, chunk, n_chunks, dim_name) { + if (n_chunks > n_indices) { + stop("Requested to divide dimension '", dim_name, "' of length ", + n_indices, " in ", n_chunks, " chunks, which is not possible.") + } + chunk_sizes <- rep(floor(n_indices / n_chunks), n_chunks) + chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks + if (chunks_to_extend > 0) { + chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1 + } + chunk_size <- chunk_sizes[chunk] + offset <- 0 + if (chunk > 1) { + offset <- sum(chunk_sizes[1:(chunk - 1)]) + } + indices <- 1:chunk_sizes[chunk] + offset + array(indices, dim = setNames(length(indices), dim_name)) + } + + # Check pattern_dims + if (is.null(pattern_dims)) { + .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '", + dim_names[1], "' as 'pattern_dims'.")) + pattern_dims <- dim_names[1] + } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) { + pattern_dims <- unique(pattern_dims) + } else { + stop("Parameter 'pattern_dims' must be a vector of character strings.") + } + if (any(names(var_params) %in% pattern_dims)) { + stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.") + } + # Find the pattern dimension with the pattern specifications + found_pattern_dim <- NULL + for (pattern_dim in pattern_dims) { + # Check all specifications in pattern_dim are valid + dat <- datasets <- dim_params[[pattern_dim]] + if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) { + stop(paste0("Parameter '", pattern_dim, + "' must be a list of lists with pattern specifications or a vector of character strings.")) + } + if (!is.null(dim_reorder_params[[pattern_dim]])) { + .warning(paste0("A reorder for the selectors of '", pattern_dim, + "' has been specified, but it is a pattern dimension and the reorder will be ignored.")) + } + if (is.list(dat) || any(sapply(dat, is.list))) { + if (is.null(found_pattern_dim)) { + found_pattern_dim <- pattern_dim + } else { + stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") + } + } + } + if (is.null(found_pattern_dim)) { + .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) + found_pattern_dim <- pattern_dims[1] + } + + # Check all *_reorder are NULL or functions, and that they all have + # a matching dimension param. + i <- 1 + for (dim_reorder_param in dim_reorder_params) { + if (!is.function(dim_reorder_param)) { + stop("All '*_reorder' parameters must be functions.") + } else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i], + '_reorder$')[[1]][1], '$'), + names(dim_params)))) { + stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter. Found parameter '", + names(dim_reorder_params)[i], "' but no parameter '", + strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "'.")) + #} else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i], + # '_reorder$')[[1]][1], '$'), + # names(var_params)))) { + # stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter associated to a ", + # "variable. Found parameter '", names(dim_reorder_params)[i], "' and dimension parameter '", + # strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "' but did not find variable ", + # "parameter '", strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "_var'.")) + } + i <- i + 1 + } + + # Check all *_tolerance are NULL or vectors of character strings, and + # that they all have a matching dimension param. + i <- 1 + for (tolerance_param in tolerance_params) { + if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i], + '_tolerance$')[[1]][1], '$'), + names(dim_params)))) { + stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter. Found parameter '", + names(tolerance_params)[i], "' but no parameter '", + strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "'.")) + #} else if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i], + # '_tolerance$')[[1]][1], '$'), + # names(var_params)))) { + # stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter associated to a ", + # "variable. Found parameter '", names(tolerance_params)[i], "' and dimension parameter '", + # strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "' but did not find variable ", + # "parameter '", strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "_var'.")) + } + i <- i + 1 + } + # Make the keys of 'tolerance_params' to be the name of + # the corresponding dimension. + if (length(tolerance_params) < 1) { + tolerance_params <- NULL + } else { + names(tolerance_params) <- gsub('_tolerance$', '', names(tolerance_params)) + } + + # Check metadata_dims + if (!is.null(metadata_dims)) { + if (is.na(metadata_dims)) { + metadata_dims <- NULL + } else if (!is.character(metadata_dims) || (length(metadata_dims) < 1)) { + stop("Parameter 'metadata' dims must be a vector of at least one character string.") + } + } else { + metadata_dims <- pattern_dims + } + + # Once the pattern dimension with dataset specifications is found, + # the variable 'dat' is mounted with the information of each + # dataset. + # Take only the datasets for the requested chunk + dats_to_take <- chunk_indices(length(dim_params[[found_pattern_dim]]), + chunks[[found_pattern_dim]]['chunk'], + chunks[[found_pattern_dim]]['n_chunks'], + found_pattern_dim) + dim_params[[found_pattern_dim]] <- dim_params[[found_pattern_dim]][dats_to_take] + dat <- datasets <- dim_params[[found_pattern_dim]] + dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames') + dat_to_fetch <- c() + dat_names <- c() + if (!is.list(dat)) { + dat <- as.list(dat) + } else { + if (!any(sapply(dat, is.list))) { + dat <- list(dat) + } + } + for (i in 1:length(dat)) { + if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) { + if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) { + dat[[i]] <- list(path = dat[[i]]) + } else { + dat[[i]] <- list(name = dat[[i]]) + } + } else if (!is.list(dat[[i]])) { + stop(paste0("Parameter '", pattern_dim, + "' is incorrect. It must be a list of lists or character strings.")) + } + #if (!(all(names(dat[[i]]) %in% dat_info_names))) { + # stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.") + #} + if (!('name' %in% names(dat[[i]]))) { + dat[[i]][['name']] <- paste0('dat', i) + if (!('path' %in% names(dat[[i]]))) { + stop(paste0("Parameter '", found_pattern_dim, + "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided.")) + } + } else if (!('path' %in% names(dat[[i]]))) { + dat_to_fetch <- c(dat_to_fetch, i) + } + #if ('path' %in% names(dat[[i]])) { + # if (!('nc_var_name' %in% names(dat[[i]]))) { + # dat[[i]][['nc_var_name']] <- '$var_name$' + # } + # if (!('suffix' %in% names(dat[[i]]))) { + # dat[[i]][['suffix']] <- '' + # } + # if (!('var_min' %in% names(dat[[i]]))) { + # dat[[i]][['var_min']] <- '' + # } + # if (!('var_max' %in% names(dat[[i]]))) { + # dat[[i]][['var_max']] <- '' + # } + #} + dat_names <- c(dat_names, dat[[i]][['name']]) + } + if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) { + .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.") + } + if (length(dat_to_fetch) > 0) { + stop("Specified only the name for some data sets, but not the path ", + "pattern. This option has not been yet implemented.") + } + + # Reorder inner_dims_across_files (to make the keys be the file dimensions, + # and the values to be the inner dimensions that go across it). + if (!is.null(inner_dims_across_files)) { + # Reorder: example, convert list(ftime = 'chunk', ensemble = 'member', xx = 'chunk') + # to list(chunk = c('ftime', 'xx'), member = 'ensemble') + new_idaf <- list() + for (i in names(inner_dims_across_files)) { + if (!(inner_dims_across_files[[i]] %in% names(new_idaf))) { + new_idaf[[inner_dims_across_files[[i]]]] <- i + } else { + new_idaf[[inner_dims_across_files[[i]]]] <- c(new_idaf[[inner_dims_across_files[[i]]]], i) + } + } + inner_dims_across_files <- new_idaf + } + + # Check return_vars + if (is.null(return_vars)) { + return_vars <- list() + # if (length(var_params) > 0) { + # return_vars <- as.list(paste0(names(var_params), '_var')) + # } else { + # return_vars <- list() + # } + } + if (!is.list(return_vars)) { + stop("Parameter 'return_vars' must be a list or NULL.") + } + if (length(return_vars) > 0 && is.null(names(return_vars))) { + # names(return_vars) <- rep('', length(return_vars)) + stop("Parameter 'return_vars' must be a named list.") + } + i <- 1 + while (i <= length(return_vars)) { + # if (names(return_vars)[i] == '') { + # if (!(is.character(return_vars[[i]]) && (length(return_vars[[i]]) == 1))) { + # stop("The ", i, "th specification in 'return_vars' is malformed.") + # } + # if (!grepl('_var$', return_vars[[i]])) { + # stop("The ", i, "th specification in 'return_vars' is malformed.") + # } + # dim_name <- strsplit(return_vars[[i]], '_var$')[[1]][1] + # if (!(dim_name %in% names(var_params))) { + # stop("'", dim_name, "_var' requested in 'return_vars' but ", + # "no '", dim_name, "_var' specified in the .Load call.") + # } + # names(return_vars)[i] <- var_params[[dim_name]] + # return_vars[[i]] <- found_pattern_dim + # } else + if (length(return_vars[[i]]) > 0) { + if (!is.character(return_vars[[i]])) { + stop("The ", i, "th specification in 'return_vars' is malformed. It ", + "must be a vector of character strings of valid file dimension ", + "names.") + } + } + i <- i + 1 + } + + # Check synonims + if (!is.null(synonims)) { + error <- FALSE + if (!is.list(synonims)) { + error <- TRUE + } + for (synonim_entry in names(synonims)) { + if (!(synonim_entry %in% names(dim_params)) && + !(synonim_entry %in% names(return_vars))) { + error <- TRUE + } + if (!is.character(synonims[[synonim_entry]]) || + length(synonims[[synonim_entry]]) < 1) { + error <- TRUE + } + } + if (error) { + stop("Parameter 'synonims' must be a named list, where the names are ", + "a name of a requested dimension or variable and the values are ", + "vectors of character strings with at least one alternative name ", + " for each dimension or variable in 'synonims'.") + } + } + if (length(unique(names(synonims))) < length(names(synonims))) { + stop("There must not be repeated entries in 'synonims'.") + } + if (length(unique(unlist(synonims))) < length(unlist(synonims))) { + stop("There must not be repeated values in 'synonims'.") + } + # Make that all dims and vars have an entry in synonims, even if only dim_name = dim_name + dim_entries_to_add <- which(!(names(dim_params) %in% names(synonims))) + if (length(dim_entries_to_add) > 0) { + synonims[names(dim_params)[dim_entries_to_add]] <- as.list(names(dim_params)[dim_entries_to_add]) + } + var_entries_to_add <- which(!(names(var_params) %in% names(synonims))) + if (length(var_entries_to_add) > 0) { + synonims[names(var_params)[var_entries_to_add]] <- as.list(names(var_params)[var_entries_to_add]) + } + + # Check selector_checker + if (is.null(selector_checker) || !is.function(selector_checker)) { + stop("Parameter 'selector_checker' must be a function.") + } + + # Check file_opener + if (is.null(file_opener) || !is.function(file_opener)) { + stop("Parameter 'file_opener' must be a function.") + } + + # Check file_var_reader + if (!is.null(file_var_reader) && !is.function(file_var_reader)) { + stop("Parameter 'file_var_reader' must be a function.") + } + + # Check file_dim_reader + if (!is.null(file_dim_reader) && !is.function(file_dim_reader)) { + stop("Parameter 'file_dim_reader' must be a function.") + } + + # Check file_data_reader + if (is.null(file_data_reader) || !is.function(file_data_reader)) { + stop("Parameter 'file_data_reader' must be a function.") + } + + # Check file_closer + if (is.null(file_closer) || !is.function(file_closer)) { + stop("Parameter 'file_closer' must be a function.") + } + + # Check transform + if (!is.null(transform)) { + if (!is.function(transform)) { + stop("Parameter 'transform' must be a function.") + } + } + + # Check transform_params + if (!is.null(transform_params)) { + if (!is.list(transform_params)) { + stop("Parameter 'transform_params' must be a list.") + } + if (is.null(names(transform_params))) { + stop("Parameter 'transform_params' must be a named list.") + } + } + + # Check transform_vars + if (!is.null(transform_vars)) { + if (!is.character(transform_vars)) { + stop("Parameter 'transform_vars' must be a vector of character strings.") + } + } + if (any(!(transform_vars %in% names(return_vars)))) { + stop("All the variables specified in 'transform_vars' must also be specified in 'return_vars'.") + } + + # Check apply_indices_after_transform + if (!is.logical(apply_indices_after_transform)) { + stop("Parameter 'apply_indices_after_transform' must be either TRUE or FALSE.") + } + aiat <- apply_indices_after_transform + + # Check transform_extra_cells + if (!is.numeric(transform_extra_cells)) { + stop("Parameter 'transform_extra_cells' must be numeric.") + } + transform_extra_cells <- round(transform_extra_cells) + + # Check split_multiselected_dims + if (!is.logical(split_multiselected_dims)) { + stop("Parameter 'split_multiselected_dims' must be TRUE or FALSE.") + } + + # Check path_glob_permissive + if (!is.numeric(path_glob_permissive) && !is.logical(path_glob_permissive)) { + stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or an integer.") + } + if (length(path_glob_permissive) != 1) { + stop("Parameter 'path_glob_permissive' must be of length 1.") + } + + # Check retrieve + if (!is.logical(retrieve)) { + stop("Parameter 'retrieve' must be TRUE or FALSE.") + } + + # Check num_procs + if (!is.null(num_procs)) { + if (!is.numeric(num_procs)) { + stop("Parameter 'num_procs' must be numeric.") + } else { + num_procs <- round(num_procs) + } + } + + # Check silent + if (!is.logical(silent)) { + stop("Parameter 'silent' must be logical.") + } + + dim_params[[found_pattern_dim]] <- dat_names + + if (!silent) { + .message(paste0("Exploring files... This will take a variable amount ", + "of time depending on the issued request and the ", + "performance of the file server...")) + } + + if (!is.character(debug)) { + dims_to_check <- c('time') + } else { + dims_to_check <- debug + debug <- TRUE + } + + ############################## READING FILE DIMS ############################ + # Check that no unrecognized variables are present in the path patterns + # and also that no file dimensions are requested to THREDDs catalogs. + # And in the mean time, build all the work pieces and look for the + # first available file of each dataset. + array_of_files_to_load <- NULL + array_of_not_found_files <- NULL + indices_of_first_files_with_data <- vector('list', length(dat)) + selectors_of_first_files_with_data <- vector('list', length(dat)) + dataset_has_files <- rep(FALSE, length(dat)) + found_file_dims <- vector('list', length(dat)) + expected_inner_dims <- vector('list', length(dat)) + + #print("A") + for (i in 1:length(dat)) { + #print("B") + dat_selectors <- dim_params + dat_selectors[[found_pattern_dim]] <- dat_selectors[[found_pattern_dim]][i] + dim_vars <- paste0('$', dim_names, '$') + file_dims <- which(sapply(dim_vars, grepl, dat[[i]][['path']], fixed = TRUE)) + if (length(file_dims) > 0) { + file_dims <- dim_names[file_dims] + } + file_dims <- unique(c(pattern_dims, file_dims)) + found_file_dims[[i]] <- file_dims + expected_inner_dims[[i]] <- dim_names[which(!(dim_names %in% file_dims))] + # (Check the depending_file_dims). + if (any(c(names(depending_file_dims), unlist(depending_file_dims)) %in% + expected_inner_dims[[i]])) { + stop(paste0("The dimension dependancies specified in ", + "'depending_file_dims' can only be between file ", + "dimensions, but some inner dimensions found in ", + "dependancies for '", dat[[i]][['name']], "', which ", + "has the following file dimensions: ", + paste(paste0("'", file_dims, "'"), collapse = ', '), ".")) + } else { + a <- names(depending_file_dims) %in% file_dims + b <- unlist(depending_file_dims) %in% file_dims + ab <- a & b + if (any(!ab)) { + .warning(paste0("Detected some dependancies in 'depending_file_dims' with ", + "non-existing dimension names. These will be disregarded.")) + depending_file_dims <- depending_file_dims[-which(!ab)] + } + if (any(names(depending_file_dims) == unlist(depending_file_dims))) { + depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == unlist(depending_file_dims))] + } + } + # (Check the inner_dims_across_files). + if (any(!(names(inner_dims_across_files) %in% file_dims)) || + any(!(unlist(inner_dims_across_files) %in% expected_inner_dims[[i]]))) { + stop(paste0("All relationships specified in ", + "'_across' parameters must be between a inner ", + "dimension and a file dimension. Found wrong ", + "specification for '", dat[[i]][['name']], "', which ", + "has the following file dimensions: ", + paste(paste0("'", file_dims, "'"), collapse = ', '), + ", and the following inner dimensions: ", + paste(paste0("'", expected_inner_dims[[i]], "'"), + collapse = ', '), ".")) + } + # (Check the return_vars). + j <- 1 + while (j <= length(return_vars)) { + if (any(!(return_vars[[j]] %in% file_dims))) { + if (any(return_vars[[j]] %in% expected_inner_dims[[i]])) { + stop("Found variables in 'return_vars' requested ", + "for some inner dimensions (for dataset '", + dat[[i]][['name']], "'), but variables can only be ", + "requested for file dimensions.") + } else { + stop("Found variables in 'return_vars' requested ", + "for non-existing dimensions.") + } + } + j <- j + 1 + } + # (Check the metadata_dims). + if (!is.null(metadata_dims)) { + if (any(!(metadata_dims %in% file_dims))) { + stop("All dimensions in 'metadata_dims' must be file dimensions.") + } + } + ## Look for _var params that should be requested automatically. + for (dim_name in dim_names) { + if (!(dim_name %in% pattern_dims)) { + if (is.null(attr(dat_selectors[[dim_name]], 'values')) || + is.null(attr(dat_selectors[[dim_name]], 'indices'))) { + flag <- ((dat_selectors[[dim_name]] %in% c('all', 'first', 'last')) || + (is.numeric(unlist(dat_selectors[[dim_name]])))) + attr(dat_selectors[[dim_name]], 'values') <- !flag + attr(dat_selectors[[dim_name]], 'indices') <- flag + } + ## The following code 'rewrites' var_params for all datasets. If providing different + ## path pattern repositories with different file/inner dimensions, var_params might + ## have to be handled for each dataset separately. + if ((attr(dat_selectors[[dim_name]], 'values') || (dim_name %in% c('var', 'variable'))) && + !(dim_name %in% names(var_params)) && !(dim_name %in% file_dims)) { + if (dim_name %in% c('var', 'variable')) { + var_params <- c(var_params, setNames(list('var_names'), dim_name)) + .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", + dim_name, "_var' requested. ", '"', dim_name, "_var = '", + 'var_names', "'", '"', " has been automatically added to ", + "the Start call.")) + } else { + var_params <- c(var_params, setNames(list(dim_name), dim_name)) + .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", + dim_name, "_var' requested. ", '"', dim_name, "_var = '", + dim_name, "'", '"', " has been automatically added to ", + "the Start call.")) + } + } + } + } + ## (Check the *_var parameters). + if (any(!(unlist(var_params) %in% names(return_vars)))) { + vars_to_add <- which(!(unlist(var_params) %in% names(return_vars))) + new_return_vars <- vector('list', length(vars_to_add)) + names(new_return_vars) <- unlist(var_params)[vars_to_add] + return_vars <- c(return_vars, new_return_vars) + .warning(paste0("All '*_var' params must associate a dimension to one of the ", + "requested variables in 'return_vars'. The following variables", + " have been added to 'return_vars': ", + paste(paste0("'", unlist(var_params), "'"), collapse = ', '))) + } + + replace_values <- vector('list', length = length(file_dims)) + names(replace_values) <- file_dims + # Take the first selector for all possible file dimensions + for (file_dim in file_dims) { + if (file_dim %in% names(var_params)) { + .warning(paste0("The '", file_dim, "_var' param will be ignored since '", + file_dim, "' is a file dimension (for the dataset with pattern ", + dat[[i]][['path']], ").")) + } + if (!is.list(dat_selectors[[file_dim]]) || + (is.list(dat_selectors[[file_dim]]) && + length(dat_selectors[[file_dim]]) == 2 && + is.null(names(dat_selectors[[file_dim]])))) { + dat_selectors[[file_dim]] <- list(dat_selectors[[file_dim]]) + } + first_class <- class(dat_selectors[[file_dim]][[1]]) + first_length <- length(dat_selectors[[file_dim]][[1]]) + for (j in 1:length(dat_selectors[[file_dim]])) { + sv <- selector_vector <- dat_selectors[[file_dim]][[j]] + if (!identical(first_class, class(sv)) || + !identical(first_length, length(sv))) { + stop("All provided selectors for depending dimensions must ", + "be vectors of the same length and of the same class.") + } + if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) { + dat_selectors[[file_dim]][[j]] <- selector_checker(selectors = sv, + return_indices = FALSE) + # Take chunk if needed + dat_selectors[[file_dim]][[j]] <- dat_selectors[[file_dim]][[j]][chunk_indices(length(dat_selectors[[file_dim]][[j]]), + chunks[[file_dim]]['chunk'], + chunks[[file_dim]]['n_chunks'], + file_dim)] + } else if (!(is.numeric(sv) || + (is.character(sv) && (length(sv) == 1) && (sv %in% c('all', 'first', 'last'))) || + (is.list(sv) && (length(sv) == 2) && (all(sapply(sv, is.character)) || + all(sapply(sv, is.numeric)))))) { + stop("All explicitly provided selectors for file dimensions must be character strings.") + } + } + sv <- dat_selectors[[file_dim]][[1]] + if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) { + replace_values[[file_dim]] <- dat_selectors[[file_dim]][[1]][1] + } + } + #print("C") + # Now we know which dimensions whose selectors are provided non-explicitly. + undefined_file_dims <- file_dims[which(sapply(replace_values, is.null))] + defined_file_dims <- file_dims[which(!(file_dims %in% undefined_file_dims))] + # Quickly check if the depending dimensions are provided properly. + for (file_dim in file_dims) { + if (file_dim %in% names(depending_file_dims)) { + ## TODO: Detect multi-dependancies and forbid. + if (all(c(file_dim, depending_file_dims[[file_dim]]) %in% defined_file_dims)) { + if (length(dat_selectors[[file_dim]]) != length(dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) { + stop(paste0("If providing selectors for the depending ", + "dimension '", file_dim, "', a ", + "vector of selectors must be provided for ", + "each selector of the dimension it depends on, '", + depending_file_dims[[file_dim]], "'.")) + } else if (!all(names(dat_selectors[[file_dim]]) == dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) { + stop(paste0("If providing selectors for the depending ", + "dimension '", file_dim, "', the name of the ", + "provided vectors of selectors must match ", + "exactly the selectors of the dimension it ", + "depends on, '", depending_file_dims[[file_dim]], "'.")) + } + } + } + } + # Find the possible values for the selectors that are provided as + # indices. If the requested file is on server, impossible operation. + if (length(grep("^http", dat[[i]][['path']])) > 0) { + if (length(undefined_file_dims) > 0) { + stop(paste0("All selectors for the file dimensions must be ", + "character strings if requesting data to a remote ", + "server. Found invalid selectors for the file dimensions ", + paste(paste0("'", undefined_file_dims, "'"), collapse = ', '), ".")) + } + dataset_has_files[i] <- TRUE + } else { + dat[[i]][['path']] <- path.expand(dat[[i]][['path']]) + # Iterate over the known dimensions to find the first existing file. + # The path to the first existing file will be used to find the + # values for the non explicitly defined selectors. + first_file <- NULL + first_file_selectors <- NULL + if (length(undefined_file_dims) > 0) { + replace_values[undefined_file_dims] <- '*' + } + ## TODO: What if length of defined_file_dims is 0? code might crash (in practice it worked for an example case) + files_to_check <- sapply(dat_selectors[defined_file_dims], function(x) length(x[[1]])) + sub_array_of_files_to_check <- array(1:prod(files_to_check), dim = files_to_check) + j <- 1 + #print("D") + while (j <= prod(files_to_check) && is.null(first_file)) { + selector_indices <- which(sub_array_of_files_to_check == j, arr.ind = TRUE)[1, ] + selectors <- sapply(1:length(defined_file_dims), + function (x) { + vector_to_pick <- 1 + if (defined_file_dims[x] %in% names(depending_file_dims)) { + vector_to_pick <- selector_indices[which(defined_file_dims == depending_file_dims[[defined_file_dims[x]]])] + } + dat_selectors[defined_file_dims][[x]][[vector_to_pick]][selector_indices[x]] + }) + replace_values[defined_file_dims] <- selectors + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + file_path <- Sys.glob(file_path) + if (length(file_path) > 0) { + first_file <- file_path[1] + first_file_selectors <- selectors + } + j <- j + 1 + } + #print("E") + # Start looking for values for the non-explicitly defined selectors. + if (is.null(first_file)) { + .warning(paste0("No found files for the datset '", dat[[i]][['name']], + "'. Provide existing selectors for the file dimensions ", + " or check and correct its path pattern: ", dat[[i]][['path']])) + } else { + dataset_has_files[i] <- TRUE + ## TODO: Improve message here if no variable found: + if (length(undefined_file_dims) > 0) { + # Looking for the first values, parsed from first_file. + first_values <- vector('list', length = length(undefined_file_dims)) + names(first_values) <- undefined_file_dims + found_values <- 0 + stop <- FALSE + try_dim <- 1 + last_success <- 1 + while ((found_values < length(undefined_file_dims)) && !stop) { + u_file_dim <- undefined_file_dims[try_dim] + if (is.null(first_values[[u_file_dim]])) { + path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], + replace_values[-which(file_dims == u_file_dim)], + allow_undefined_key_vars = TRUE) + found_value <- .FindTagValue(path_with_globs_and_tag, + first_file, u_file_dim) + if (!is.null(found_value)) { + found_values <- found_values + 1 + last_success <- try_dim + first_values[[u_file_dim]] <- found_value + replace_values[[u_file_dim]] <- found_value + } + } + try_dim <- (try_dim %% length(undefined_file_dims)) + 1 + if (try_dim == last_success) { + stop <- TRUE + } + } + if (found_values < length(undefined_file_dims)) { + stop(paste0("Path pattern of dataset '", dat[[i]][['name']], + "' is too complex. Could not automatically ", + "detect values for all non-explicitly defined ", + "indices. Check its pattern: ", dat[[i]][['path']])) + } + ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above + ## Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical + ## cmor case, requesting all members and chunks for fixed var and sdate. Not fixing + ## sdate raised 'too complex' error. + # Replace shell globs in path pattern and keep the file_dims as tags + dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, + file_dims, dat[[i]][['name']], path_glob_permissive) + # Now time to look for the available values for the non + # explicitly defined selectors for the file dimensions. + #print("H") + # Check first the ones that do not depend on others. + ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))], + undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))]) + + for (u_file_dim in ufd) { + replace_values[undefined_file_dims] <- first_values + replace_values[[u_file_dim]] <- '*' + depended_dim <- NULL + depended_dim_values <- NA + selectors <- dat_selectors[[u_file_dim]][[1]] + if (u_file_dim %in% names(depending_file_dims)) { + depended_dim <- depending_file_dims[[u_file_dim]] + depended_dim_values <- dat_selectors[[depended_dim]][[1]] + dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values)) + names(dat_selectors[[u_file_dim]]) <- depended_dim_values + } else { + dat_selectors[[u_file_dim]] <- list() + } + if (u_file_dim %in% unlist(depending_file_dims)) { + depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))] + replace_values[depending_dims] <- rep('*', length(depending_dims)) + } + for (j in 1:length(depended_dim_values)) { + parsed_values <- c() + if (!is.null(depended_dim)) { + replace_values[[depended_dim]] <- depended_dim_values[j] + } + path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + found_files <- Sys.glob(path_with_globs) + ## TODO: Enhance this error message, or change by warning. + ## Raises if a wrong sdate is specified, for example. + if (length(found_files) == 0) { + .warning(paste0("Could not find files for any '", u_file_dim, + "' for '", depended_dim, "' = '", + depended_dim_values[j], "'.")) + dat_selectors[[u_file_dim]][[j]] <- NA + } else { + for (found_file in found_files) { + path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], + replace_values[-which(file_dims == u_file_dim)], + allow_undefined_key_vars = TRUE) + parsed_values <- c(parsed_values, + .FindTagValue(path_with_globs_and_tag, found_file, + u_file_dim)) + } + dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors, + var = unique(parsed_values), + return_indices = FALSE) + # Take chunk if needed + dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][chunk_indices(length(dat_selectors[[u_file_dim]][[j]]), + chunks[[u_file_dim]]['chunk'], + chunks[[u_file_dim]]['n_chunks'], + u_file_dim)] + } + } + } + #print("I") + } else { + dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, + defined_file_dims, dat[[i]][['name']], path_glob_permissive) + } + } + } + # Now fetch for the first available file + if (dataset_has_files[i]) { + known_dims <- file_dims + } else { + known_dims <- defined_file_dims + } + replace_values <- vector('list', length = length(known_dims)) + names(replace_values) <- known_dims + files_to_load <- sapply(dat_selectors[known_dims], function(x) length(x[[1]])) + files_to_load[found_pattern_dim] <- 1 + sub_array_of_files_to_load <- array(1:prod(files_to_load), + dim = files_to_load) + names(dim(sub_array_of_files_to_load)) <- known_dims + sub_array_of_not_found_files <- array(!dataset_has_files[i], + dim = files_to_load) + names(dim(sub_array_of_not_found_files)) <- known_dims + j <- 1 + selector_indices_save <- vector('list', prod(files_to_load)) + while (j <= prod(files_to_load)) { + selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] + names(selector_indices) <- known_dims + selector_indices_save[[j]] <- selector_indices + selectors <- sapply(1:length(known_dims), + function (x) { + vector_to_pick <- 1 + if (known_dims[x] %in% names(depending_file_dims)) { + vector_to_pick <- selector_indices[which(known_dims == depending_file_dims[[known_dims[x]]])] + } + dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] + }) + names(selectors) <- known_dims + replace_values[known_dims] <- selectors + if (!dataset_has_files[i]) { + if (any(is.na(selectors))) { + replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))] + } + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE) + sub_array_of_files_to_load[j] <- file_path + #sub_array_of_not_found_files[j] <- TRUE??? + } else { + if (any(is.na(selectors))) { + replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))] + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE) + sub_array_of_files_to_load[j] <- file_path + sub_array_of_not_found_files[j] <- TRUE + } else { + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + if (!(length(grep("^http", file_path)) > 0)) { + if (grepl(file_path, '*', fixed = TRUE)) { + file_path_full <- Sys.glob(file_path)[1] + if (nchar(file_path_full) > 0) { + file_path <- file_path_full + } + } + } + sub_array_of_files_to_load[j] <- file_path + if (is.null(indices_of_first_files_with_data[[i]])) { + if (!(length(grep("^http", file_path)) > 0)) { + if (!file.exists(file_path)) { + file_path <- NULL + } + } + if (!is.null(file_path)) { + test_file <- NULL + ## TODO: suppress error messages + test_file <- file_opener(file_path) + if (!is.null(test_file)) { + selector_indices[which(known_dims == found_pattern_dim)] <- i + indices_of_first_files_with_data[[i]] <- selector_indices + selectors_of_first_files_with_data[[i]] <- selectors + file_closer(test_file) + } + } + } + } + } + j <- j + 1 + } + # Extend array as needed progressively + if (is.null(array_of_files_to_load)) { + array_of_files_to_load <- sub_array_of_files_to_load + array_of_not_found_files <- sub_array_of_not_found_files + } else { + array_of_files_to_load <- .MergeArrays(array_of_files_to_load, sub_array_of_files_to_load, + along = found_pattern_dim) + ## TODO: file_dims, and variables like that.. are still ok now? I don't think so + array_of_not_found_files <- .MergeArrays(array_of_not_found_files, sub_array_of_not_found_files, + along = found_pattern_dim) + } + dat[[i]][['selectors']] <- dat_selectors + } + if (all(sapply(indices_of_first_files_with_data, is.null))) { + stop("No data files found for any of the specified datasets.") + } + + ########################### READING INNER DIMS. ############################# + #print("J") + ## TODO: To be run in parallel (local multi-core) + # Now time to work out the inner file dimensions. + # First pick the requested variables. + dims_to_iterate <- NULL + for (return_var in names(return_vars)) { + dims_to_iterate <- unique(c(dims_to_iterate, return_vars[[return_var]])) + } + if (found_pattern_dim %in% dims_to_iterate) { + dims_to_iterate <- dims_to_iterate[-which(dims_to_iterate == found_pattern_dim)] + } + common_return_vars <- NULL + common_first_found_file <- NULL + common_return_vars_pos <- NULL + if (length(return_vars) > 0) { + common_return_vars_pos <- which(sapply(return_vars, function(x) !(found_pattern_dim %in% x))) + } + if (length(common_return_vars_pos) > 0) { + common_return_vars <- return_vars[common_return_vars_pos] + return_vars <- return_vars[-common_return_vars_pos] + common_first_found_file <- rep(FALSE, length(which(sapply(common_return_vars, length) == 0))) + names(common_first_found_file) <- names(common_return_vars[which(sapply(common_return_vars, length) == 0)]) + } + return_vars <- lapply(return_vars, + function(x) { + if (found_pattern_dim %in% x) { + x[-which(x == found_pattern_dim)] + } else { + x + } + }) + if (length(common_return_vars) > 0) { + picked_common_vars <- vector('list', length = length(common_return_vars)) + names(picked_common_vars) <- names(common_return_vars) + } else { + picked_common_vars <- NULL + } + picked_common_vars_ordered <- picked_common_vars + picked_common_vars_unorder_indices <- picked_common_vars + picked_vars <- vector('list', length = length(dat)) + names(picked_vars) <- dat_names + picked_vars_ordered <- picked_vars + picked_vars_unorder_indices <- picked_vars + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + # Put all selectors in a list of a single list/vector of selectors. + # The dimensions that go across files will later be extended to have + # lists of lists/vectors of selectors. + for (inner_dim in expected_inner_dims[[i]]) { + if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || + (is.list(dat[[i]][['selectors']][[inner_dim]]) && + length(dat[[i]][['selectors']][[inner_dim]]) == 2 && + is.null(names(dat[[i]][['selectors']][[inner_dim]])))) { + dat[[i]][['selectors']][[inner_dim]] <- list(dat[[i]][['selectors']][[inner_dim]]) + } + } + if (length(return_vars) > 0) { + picked_vars[[i]] <- vector('list', length = length(return_vars)) + names(picked_vars[[i]]) <- names(return_vars) + picked_vars_ordered[[i]] <- picked_vars[[i]] + picked_vars_unorder_indices[[i]] <- picked_vars[[i]] + } + indices_of_first_file <- as.list(indices_of_first_files_with_data[[i]]) + array_file_dims <- sapply(dat[[i]][['selectors']][found_file_dims[[i]]], function(x) length(x[[1]])) + names(array_file_dims) <- found_file_dims[[i]] + if (length(dims_to_iterate) > 0) { + indices_of_first_file[dims_to_iterate] <- lapply(array_file_dims[dims_to_iterate], function(x) 1:x) + } + array_of_var_files <- do.call('[', c(list(x = array_of_files_to_load), indices_of_first_file, list(drop = FALSE))) + array_of_var_indices <- array(1:length(array_of_var_files), dim = dim(array_of_var_files)) + array_of_not_found_var_files <- do.call('[', c(list(x = array_of_not_found_files), indices_of_first_file, list(drop = FALSE))) + previous_indices <- rep(-1, length(indices_of_first_file)) + names(previous_indices) <- names(indices_of_first_file) + first_found_file <- NULL + if (length(return_vars) > 0) { + first_found_file <- rep(FALSE, length(which(sapply(return_vars, length) == 0))) + names(first_found_file) <- names(return_vars[which(sapply(return_vars, length) == 0)]) + } + for (j in 1:length(array_of_var_files)) { + current_indices <- which(array_of_var_indices == j, arr.ind = TRUE)[1, ] + names(current_indices) <- names(indices_of_first_file) + if (!is.na(array_of_var_files[j]) && !array_of_not_found_var_files[j]) { + changed_dims <- which(current_indices != previous_indices) + vars_to_read <- NULL + if (length(return_vars) > 0) { + vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))] + } + if (!is.null(first_found_file)) { + if (any(!first_found_file)) { + vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)])) + } + } + if ((i == 1) && (length(common_return_vars) > 0)) { + vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))]) + } + if (!is.null(common_first_found_file)) { + if (any(!common_first_found_file)) { + vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)])) + } + } + file_object <- file_opener(array_of_var_files[j]) + if (!is.null(file_object)) { + for (var_to_read in vars_to_read) { + if (var_to_read %in% unlist(var_params)) { + associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)] + } + var_name_to_reader <- var_to_read + names(var_name_to_reader) <- 'var' + var_dims <- file_dim_reader(NULL, file_object, var_name_to_reader, NULL, + synonims) + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(var_dims) <- sapply(names(var_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + if (!is.null(var_dims)) { + var_file_dims <- NULL + if (var_to_read %in% names(common_return_vars)) { + var_to_check <- common_return_vars[[var_to_read]] + } else { + var_to_check <- return_vars[[var_to_read]] + } + if (any(names(dim(array_of_files_to_load)) %in% var_to_check)) { + var_file_dims <- dim(array_of_files_to_load)[which(names(dim(array_of_files_to_load)) %in% + var_to_check)] + } + if (((var_to_read %in% names(common_return_vars)) && + is.null(picked_common_vars[[var_to_read]])) || + ((var_to_read %in% names(return_vars)) && + is.null(picked_vars[[i]][[var_to_read]]))) { + if (any(names(var_file_dims) %in% names(var_dims))) { + stop("Found a requested var in 'return_var' requested for a ", + "file dimension which also appears in the dimensions of ", + "the variable inside the file.\n", array_of_var_files[j]) + } + special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, + 'Date' = as.Date) + first_sample <- file_var_reader(NULL, file_object, NULL, + var_to_read, synonims) + if (any(class(first_sample) %in% names(special_types))) { + array_size <- prod(c(var_file_dims, var_dims)) + new_array <- rep(special_types[[class(first_sample)[1]]](NA), array_size) + dim(new_array) <- c(var_file_dims, var_dims) + } else { + new_array <- array(dim = c(var_file_dims, var_dims)) + } + attr(new_array, 'variables') <- attr(first_sample, 'variables') + if (var_to_read %in% names(common_return_vars)) { + picked_common_vars[[var_to_read]] <- new_array + pick_ordered <- FALSE + if (var_to_read %in% unlist(var_params)) { + if (associated_dim_name %in% names(dim_reorder_param) && !aiat) { + picked_common_vars_ordered[[var_to_read]] <- new_array + pick_ordered <- TRUE + } + } + if (!pick_ordered) { + picked_common_vars_ordered[[var_to_read]] <- NULL + } + } else { + picked_vars[[i]][[var_to_read]] <- new_array + pick_ordered <- FALSE + if (var_to_read %in% unlist(var_params)) { + if (associated_dim_name %in% names(dim_reorder_params) && !aiat) { + picked_vars_ordered[[i]][[var_to_read]] <- new_array + pick_ordered <- TRUE + } + } + if (!pick_ordered) { + picked_vars_ordered[[i]][[var_to_read]] <- NULL + } + } + } else { + if (var_to_read %in% names(common_return_vars)) { + array_var_dims <- dim(picked_common_vars[[var_to_read]]) + } else { + array_var_dims <- dim(picked_vars[[i]][[var_to_read]]) + } + full_array_var_dims <- array_var_dims + if (any(names(array_var_dims) %in% names(var_file_dims))) { + array_var_dims <- array_var_dims[-which(names(array_var_dims) %in% names(var_file_dims))] + } + if (names(array_var_dims) != names(var_dims)) { + stop("Error while reading the variable '", var_to_read, "' from ", + "the file. Dimensions do not match.\nExpected ", + paste(paste0("'", names(array_var_dims), "'"), + collapse = ', '), " but found ", + paste(paste0("'", names(var_dims), "'"), + collapse = ', '), ".\n", array_of_var_files[j]) + } + if (any(var_dims > array_var_dims)) { + longer_dims <- which(var_dims > array_var_dims) + if (length(longer_dims) == 1) { + longer_dims_in_full_array <- longer_dims + if (any(names(full_array_var_dims) %in% names(var_file_dims))) { + candidates <- (1:length(full_array_var_dims))[-which(names(full_array_var_dims) %in% names(var_file_dims))] + longer_dims_in_full_array <- candidates[longer_dims] + } + padding_dims <- full_array_var_dims + padding_dims[longer_dims_in_full_array] <- var_dims[longer_dims] - + array_var_dims[longer_dims] + special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, + 'Date' = as.Date) + if (var_to_read %in% names(common_return_vars)) { + var_class <- class(picked_common_vars[[var_to_read]]) + } else { + var_class <- class(picked_vars[[i]][[var_to_read]]) + } + if (any(var_class %in% names(special_types))) { + padding_size <- prod(padding_dims) + padding <- rep(special_types[[var_class[1]]](NA), padding_size) + dim(padding) <- padding_dims + } else { + padding <- array(dim = padding_dims) + } + if (var_to_read %in% names(common_return_vars)) { + picked_common_vars[[var_to_read]] <- .abind2( + picked_common_vars[[var_to_read]], + padding, + names(full_array_var_dims)[longer_dims_in_full_array] + ) + } else { + picked_vars[[i]][[var_to_read]] <- .abind2( + picked_vars[[i]][[var_to_read]], + padding, + names(full_array_var_dims)[longer_dims_in_full_array] + ) + } + } else { + stop("Error while reading the variable '", var_to_read, "' from ", + "the file. Found size (", paste(var_dims, collapse = ' x '), + ") is greater than expected maximum size (", + array_var_dims, ").") + } + } + } + var_store_indices <- c(as.list(current_indices[names(var_file_dims)]), lapply(var_dims, function(x) 1:x)) + var_values <- file_var_reader(NULL, file_object, NULL, var_to_read, synonims) + if (var_to_read %in% unlist(var_params)) { + if ((associated_dim_name %in% names(dim_reorder_params)) && !aiat) { + ## Is this check really needed? + if (length(dim(var_values)) > 1) { + stop("Requested a '", associated_dim_name, "_reorder' for a dimension ", + "whose coordinate variable that has more than 1 dimension. This is ", + "not supported.") + } + ordered_var_values <- dim_reorder_params[[associated_dim_name]](var_values) + attr(ordered_var_values$x, 'variables') <- attr(var_values, 'variables') + if (!all(c('x', 'ix') %in% names(ordered_var_values))) { + stop("All the dimension reorder functions must return a list with the components 'x' and 'ix'.") + } + # Save the indices to reorder back the ordered variable values. + # This will be used to define the first round indices. + unorder <- sort(ordered_var_values$ix, index.return = TRUE)$ix + if (var_to_read %in% names(common_return_vars)) { + picked_common_vars_ordered[[var_to_read]] <- do.call('[<-', + c(list(x = picked_common_vars_ordered[[var_to_read]]), + var_store_indices, + list(value = ordered_var_values$x))) + picked_common_vars_unorder_indices[[var_to_read]] <- do.call('[<-', + c(list(x = picked_common_vars_unorder_indices[[var_to_read]]), + var_store_indices, + list(value = unorder))) + } else { + picked_vars_ordered[[i]][[var_to_read]] <- do.call('[<-', + c(list(x = picked_vars_ordered[[i]][[var_to_read]]), + var_store_indices, + list(value = ordered_var_values$x))) + picked_vars_unorder_indices[[i]][[var_to_read]] <- do.call('[<-', + c(list(x = picked_vars_unorder_indices[[i]][[var_to_read]]), + var_store_indices, + list(value = unorder))) + } + } + } + if (var_to_read %in% names(common_return_vars)) { + picked_common_vars[[var_to_read]] <- do.call('[<-', + c(list(x = picked_common_vars[[var_to_read]]), + var_store_indices, + list(value = var_values))) + } else { + picked_vars[[i]][[var_to_read]] <- do.call('[<-', + c(list(x = picked_vars[[i]][[var_to_read]]), + var_store_indices, + list(value = var_values))) + } + if (var_to_read %in% names(first_found_file)) { + first_found_file[var_to_read] <- TRUE + } + if (var_to_read %in% names(common_first_found_file)) { + common_first_found_file[var_to_read] <- TRUE + } + } else { + stop("Could not find variable '", var_to_read, + "' in the file ", array_of_var_files[j]) + } + } + file_closer(file_object) + } + } + previous_indices <- current_indices + } + } + } + # Once we have the variable values, we can work out the indices + # for the implicitly defined selectors. + # + # Trnasforms a vector of indices v expressed in a world of + # length N from 1 to N, into a world of length M, from + # 1 to M. Repeated adjacent indices are collapsed. + transform_indices <- function(v, n, m) { + #unique2 turns e.g. 1 1 2 2 2 3 3 1 1 1 into 1 2 3 1 + unique2 <- function(v) { + if (length(v) < 2) { + v + } else { + v[c(1, v[2:length(v)] - v[1:(length(v) - 1)]) != 0] + } + } + unique2(round(((v - 1) / (n - 1)) * (m - 1))) + 1 # this rounding may generate 0s. what then? + } + beta <- transform_extra_cells + dims_to_crop <- vector('list') + transformed_vars <- vector('list', length = length(dat)) + names(transformed_vars) <- dat_names + transformed_vars_ordered <- transformed_vars + transformed_vars_unorder_indices <- transformed_vars + transformed_common_vars <- NULL + transformed_common_vars_ordered <- NULL + transformed_common_vars_unorder_indices <- NULL + + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + indices <- indices_of_first_files_with_data[[i]] + if (!is.null(indices)) { + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) + # The following 5 lines should go several lines below, but were moved + # here for better performance. + # If any of the dimensions comes without defining variable, then we read + # the data dimensions. + data_dims <- NULL + if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { + file_to_open <- file_path + data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims) + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(data_dims) <- sapply(names(data_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + } + # Transform the variables if needed and keep them apart. + if (!is.null(transform) && (length(transform_vars) > 0)) { + if (!all(transform_vars %in% c(names(picked_vars[[i]]), names(picked_common_vars)))) { + stop("Could not find all the required variables in 'transform_vars' ", + "for the dataset '", dat[[i]][['name']], "'.") + } + vars_to_transform <- NULL + picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars) + if (length(picked_vars_to_transform) > 0) { + picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform] + new_vars_to_transform <- picked_vars[[i]][picked_vars_to_transform] + which_are_ordered <- which(!sapply(picked_vars_ordered[[i]][picked_vars_to_transform], is.null)) + + ##NOTE: The following 'if' replaces the original with reordering vector + if (length(which_are_ordered) > 0) { + tmp <- which(!is.na(match(names(picked_vars_ordered[[i]]), names(which_are_ordered)))) + new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[[i]][tmp] + + } + vars_to_transform <- c(vars_to_transform, new_vars_to_transform) + } + + ##NOTE: Above is non-common vars, here is common vars (ie, return_vars = NULL). + picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) + if (length(picked_common_vars_to_transform) > 0) { + picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] + + new_vars_to_transform <- picked_common_vars[picked_common_vars_to_transform] + which_are_ordered <- which(!sapply(picked_common_vars_ordered[picked_common_vars_to_transform], is.null)) + + if (length(which_are_ordered) > 0) { + + tmp <- which(!is.na(match(names(picked_common_vars_ordered), names(which_are_ordered)))) + new_vars_to_transform[which_are_ordered] <- picked_common_vars_ordered[tmp] + } + vars_to_transform <- c(vars_to_transform, new_vars_to_transform) + } + + # Transform the variables + transformed_data <- do.call(transform, c(list(data_array = NULL, + variables = vars_to_transform, + file_selectors = selectors_of_first_files_with_data[[i]]), + transform_params)) + # Discard the common transformed variables if already transformed before + if (!is.null(transformed_common_vars)) { + common_ones <- which(names(picked_common_vars) %in% names(transformed_data$variables)) + if (length(common_ones) > 0) { + transformed_data$variables <- transformed_data$variables[-common_ones] + } + } + transformed_vars[[i]] <- list() + transformed_vars_ordered[[i]] <- list() + transformed_vars_unorder_indices[[i]] <- list() + # Order the transformed variables if needed + # 'var_to_read' should be 'transformed_var', but is kept to reuse the same code as above. + for (var_to_read in names(transformed_data$variables)) { + if (var_to_read %in% unlist(var_params)) { + associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)] + if ((associated_dim_name %in% names(dim_reorder_params)) && aiat) { + ## Is this check really needed? + if (length(dim(transformed_data$variables[[associated_dim_name]])) > 1) { + stop("Requested a '", associated_dim_name, "_reorder' for a dimension ", + "whose coordinate variable that has more than 1 dimension (after ", + "transform). This is not supported.") + } + ordered_var_values <- dim_reorder_params[[associated_dim_name]](transformed_data$variables[[associated_dim_name]]) + attr(ordered_var_values, 'variables') <- attr(transformed_data$variables[[associated_dim_name]], 'variables') + if (!all(c('x', 'ix') %in% names(ordered_var_values))) { + stop("All the dimension reorder functions must return a list with the components 'x' and 'ix'.") + } + # Save the indices to reorder back the ordered variable values. + # This will be used to define the first round indices. + unorder <- sort(ordered_var_values$ix, index.return = TRUE)$ix + if (var_to_read %in% names(picked_common_vars)) { + transformed_common_vars_ordered[[var_to_read]] <- ordered_var_values$x + transformed_common_vars_unorder_indices[[var_to_read]] <- unorder + } else { + transformed_vars_ordered[[i]][[var_to_read]] <- ordered_var_values$x + transformed_vars_unorder_indices[[i]][[var_to_read]] <- unorder + } + } + } + } + transformed_picked_vars <- which(names(picked_vars[[i]]) %in% names(transformed_data$variables)) + if (length(transformed_picked_vars) > 0) { + transformed_picked_vars <- names(picked_vars[[i]])[transformed_picked_vars] + transformed_vars[[i]][transformed_picked_vars] <- transformed_data$variables[transformed_picked_vars] + } + if (is.null(transformed_common_vars)) { + transformed_picked_common_vars <- which(names(picked_common_vars) %in% names(transformed_data$variables)) + if (length(transformed_picked_common_vars) > 0) { + transformed_picked_common_vars <- names(picked_common_vars)[transformed_picked_common_vars] + transformed_common_vars <- transformed_data$variables[transformed_picked_common_vars] + } + } + } + # Once the variables are transformed, we compute the indices to be + # taken for each inner dimension. + # In all cases, indices will have to be computed to know which data + # values to take from the original data for each dimension (if a + # variable is specified for that dimension, it will be used to + # convert the provided selectors into indices). These indices are + # referred to as 'first round of indices'. + # The taken data will then be transformed if needed, together with + # the dimension variable if specified, and, in that case, indices + # will have to be computed again to know which values to take from the + # transformed data. These are the 'second round of indices'. In the + # case there is no transformation, the second round of indices will + # be all the available indices, i.e. from 1 to the number of taken + # values with the first round of indices. + for (inner_dim in expected_inner_dims[[i]]) { + if (debug) { + print("-> DEFINING INDICES FOR INNER DIMENSION:") + print(inner_dim) + } + file_dim <- NULL + if (inner_dim %in% unlist(inner_dims_across_files)) { + file_dim <- names(inner_dims_across_files)[which(sapply(inner_dims_across_files, function(x) inner_dim %in% x))[1]] + chunk_amount <- length(dat[[i]][['selectors']][[file_dim]][[1]]) + names(chunk_amount) <- file_dim + } else { + chunk_amount <- 1 + } + # In the special case that the selectors for a dimension are 'all', 'first', ... + # and chunking (dividing in more than 1 chunk) is requested, the selectors are + # replaced for equivalent indices. + if ((dat[[i]][['selectors']][[inner_dim]][[1]] %in% c('all', 'first', 'last')) && + (chunks[[inner_dim]]['n_chunks'] != 1)) { + selectors <- dat[[i]][['selectors']][[inner_dim]][[1]] + if (selectors == 'all') { + selectors <- indices(1:(data_dims[[inner_dim]] * chunk_amount)) + } else if (selectors == 'first') { + selectors <- indices(1) + } else { + selectors <- indices(data_dims[[inner_dim]] * chunk_amount) + } + dat[[i]][['selectors']][[inner_dim]][[1]] <- selectors + } + # The selectors for the inner dimension are taken. + selector_array <- dat[[i]][['selectors']][[inner_dim]][[1]] + if (debug) { + if (inner_dim %in% dims_to_check) { + print(paste0("-> DEBUG MESSAGES FOR THE DATASET", i, " AND INNER DIMENSION '", inner_dim, "':")) + print("-> STRUCTURE OF SELECTOR ARRAY:") + print(str(selector_array)) + print("-> PICKED VARS:") + print(picked_vars) + print("-> TRANSFORMED VARS:") + print(transformed_vars) + } + } + if (is.null(dim(selector_array))) { + dim(selector_array) <- length(selector_array) + } + if (is.null(names(dim(selector_array)))) { + if (length(dim(selector_array)) == 1) { + names(dim(selector_array)) <- inner_dim + } else { + stop("Provided selector arrays must be provided with dimension ", + "names. Found an array of selectors without dimension names ", + "for the dimension '", inner_dim, "'.") + } + } + selectors_are_indices <- FALSE + if (!is.null(attr(selector_array, 'indices'))) { + if (!is.logical(attr(selector_array, 'indices'))) { + stop("The atribute 'indices' for the selectors for the dimension '", + inner_dim, "' must be TRUE or FALSE.") + } + selectors_are_indices <- attr(selector_array, 'indices') + } + taken_chunks <- rep(FALSE, chunk_amount) + selector_file_dims <- 1 + if (any(found_file_dims[[i]] %in% names(dim(selector_array)))) { + selector_file_dims <- dim(selector_array)[which(names(dim(selector_array)) %in% found_file_dims[[i]])] + } + selector_inner_dims <- dim(selector_array)[which(!(names(dim(selector_array)) %in% found_file_dims[[i]]))] + var_with_selectors <- NULL + var_with_selectors_name <- var_params[[inner_dim]] + var_ordered <- NULL + var_unorder_indices <- NULL + with_transform <- FALSE + # If the selectors come with an associated variable + if (!is.null(var_with_selectors_name)) { + if ((var_with_selectors_name %in% transform_vars) && (!is.null(transform))) { + with_transform <- TRUE + if (!is.null(file_dim)) { + stop("Requested a transformation over the dimension '", + inner_dim, "', wich goes across files. This feature ", + "is not supported. Either do the request without the ", + "transformation or request it over dimensions that do ", + "not go across files.") + } + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> NAME OF THE VARIABLE WITH THE SELECTOR VALUES FOR THE CURRENT INNER DIMENSION:") + print(var_with_selectors_name) + print("-> NAMES OF THE VARIABLES TO BE TRANSFORMED:") + print(transform_vars) + print("-> STRUCTURE OF THE TRANSFORMATION FUNCTION:") + print(str(transform)) + } + } + if (var_with_selectors_name %in% names(picked_vars[[i]])) { + var_with_selectors <- picked_vars[[i]][[var_with_selectors_name]] + var_ordered <- picked_vars_ordered[[i]][[var_with_selectors_name]] + var_unorder_indices <- picked_vars_unorder_indices[[i]][[var_with_selectors_name]] + } else if (var_with_selectors_name %in% names(picked_common_vars)) { + var_with_selectors <- picked_common_vars[[var_with_selectors_name]] + var_ordered <- picked_common_vars_ordered[[var_with_selectors_name]] + var_unorder_indices <- picked_common_vars_unorder_indices[[var_with_selectors_name]] + } + n <- prod(dim(var_with_selectors)) + if (is.null(var_unorder_indices)) { + var_unorder_indices <- 1:n + } + if (with_transform) { + if (var_with_selectors_name %in% names(transformed_vars[[i]])) { + m <- prod(dim(transformed_vars[[i]][[var_with_selectors_name]])) + if (aiat) { + var_with_selectors <- transformed_vars[[i]][[var_with_selectors_name]] + var_ordered <- transformed_vars_ordered[[i]][[var_with_selectors_name]] + var_unorder_indices <- transformed_vars_unorder_indices[[i]][[var_with_selectors_name]] + } + } else if (var_with_selectors_name %in% names(transformed_common_vars)) { + m <- prod(dim(transformed_common_vars[[var_with_selectors_name]])) + if (aiat) { + var_with_selectors <- transformed_common_vars[[var_with_selectors_name]] + var_ordered <- transformed_common_vars_ordered[[var_with_selectors_name]] + var_unorder_indices <- transformed_common_vars_unorder_indices[[var_with_selectors_name]] + } + } + if (is.null(var_unorder_indices)) { + var_unorder_indices <- 1:m + } + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> SIZE OF ORIGINAL VARIABLE:") + print(n) + print("-> SIZE OF TRANSFORMED VARIABLE:") + if (with_transform) print(m) + print("-> STRUCTURE OF ORDERED VAR:") + print(str(var_ordered)) + print("-> UNORDER INDICES:") + print(var_unorder_indices) + } + } + var_dims <- dim(var_with_selectors) + var_file_dims <- 1 + if (any(names(var_dims) %in% found_file_dims[[i]])) { + if (with_transform) { + stop("Requested transformation for inner dimension '", + inner_dim, "' but provided selectors for such dimension ", + "over one or more file dimensions. This is not ", + "supported. Either request no transformation for the ", + "dimension '", inner_dim, "' or specify the ", + "selectors for this dimension without the file dimensions.") + } + var_file_dims <- var_dims[which(names(var_dims) %in% found_file_dims[[i]])] + var_dims <- var_dims[-which(names(var_dims) %in% found_file_dims[[i]])] + } + ## # Keep the selectors if they correspond to a variable that will be transformed. + ## if (with_transform) { + ## if (var_with_selectors_name %in% names(picked_vars[[i]])) { + ## transformed_var_with_selectors <- transformed_vars[[i]][[var_with_selectors_name]] + ## } else if (var_with_selectors_name %in% names(picked_common_vars)) { + ## transformed_var_with_selectors <- transformed_common_vars[[var_with_selectors_name]] + ## } + ## transformed_var_dims <- dim(transformed_var_with_selectors) + ## transformed_var_file_dims <- 1 + ## if (any(names(transformed_var_dims) %in% found_file_dims[[i]])) { + ## transformed_var_file_dims <- transformed_var_dims[which(names(transformed_var_dims) %in% found_file_dims[[i]])] + ## transformed_var_dims <- tranasformed_var_dims[-which(names(transformed_var_dims) %in% found_file_dims[[i]])] + ## } + ##if (inner_dim %in% dims_to_check) { + ##print("111m") + ##print(str(transformed_var_dims)) + ##} + ## + ## m <- prod(transformed_var_dims) + ## } + # Work out var file dims and inner dims. + if (inner_dim %in% unlist(inner_dims_across_files)) { + #TODO: if (chunk_amount != number of chunks in selector_file_dims), crash + if (length(var_dims) > 1) { + stop("Specified a '", inner_dim, "_var' for the dimension '", + inner_dim, "', which goes across files (across '", file_dim, + "'). The specified variable, '", var_with_selectors_name, "', has more ", + "than one dimension and can not be used as selector variable. ", + "Select another variable or fix it in the files.") + } + } + ## TODO HERE:: + #- indices_of_first_files_with_data may change, because array is now extended + var_full_dims <- dim(var_with_selectors) + if (!(inner_dim %in% names(var_full_dims))) { + stop("Could not find the dimension '", inner_dim, "' in ", + "the file. Either change the dimension name in ", + "your request, adjust the parameter ", + "'dim_names_in_files' or fix the dimension name in ", + "the file.\n", file_path) + } + } else if (((is.numeric(selector_array) || is.list(selector_array)) && selectors_are_indices) || + (is.character(selector_array) && (length(selector_array) == 1) && + (selector_array %in% c('all', 'first', 'last')) && + !is.null(file_dim_reader))) { + #### TODO HERE:: + ###- indices_of_first_files_with_data may change, because array is now extended + # Lines moved above for better performance. + ##data_dims <- file_dim_reader(file_path, NULL, selectors_of_first_files_with_data[[i]], + ## lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1)) + if (!(inner_dim %in% names(data_dims))) { + stop("Could not find the dimension '", inner_dim, "' in ", + "the file. Either change the dimension name in ", + "your request, adjust the parameter ", + "'dim_names_in_files' or fix the dimension name in ", + "the file.\n", file_path) + } + } else { + stop(paste0("Can not translate the provided selectors for '", inner_dim, + "' to numeric indices. Provide numeric indices and a ", + "'file_dim_reader' function, or a '", inner_dim, + "_var' in order to calculate the indices.")) + } + # At this point, if no selector variable was provided, the variable + # data_dims has been populated. If a selector variable was provided, + # the variables var_dims, var_file_dims and var_full_dims have been + # populated instead. + fri <- first_round_indices <- NULL + sri <- second_round_indices <- NULL + # This variable will keep the indices needed to crop the transformed + # variable (the one that has been transformed without being subset + # with the first round indices). + tvi <- tranaformed_variable_indices <- NULL + ordered_fri <- NULL + ordered_sri <- NULL + if ((length(selector_array) == 1) && is.character(selector_array) && + (selector_array %in% c('all', 'first', 'last')) && + (chunks[[inner_dim]]['n_chunks'] == 1)) { + if (is.null(var_with_selectors_name)) { + fri <- vector('list', length = chunk_amount) + dim(fri) <- c(chunk_amount) + sri <- vector('list', length = chunk_amount) + dim(sri) <- c(chunk_amount) + if (selector_array == 'all') { + fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim]))) + taken_chunks <- rep(TRUE, chunk_amount) + #sri <- NULL + } else if (selector_array == 'first') { + fri[[1]] <- 1 + taken_chunks[1] <- TRUE + #sri <- NULL + } else if (selector_array == 'last') { + fri[[chunk_amount]] <- data_dims[inner_dim] + taken_chunks[length(taken_chunks)] <- TRUE + #sri <- NULL + } + } else { + if ((!is.null(file_dim)) && !(file_dim %in% names(var_file_dims))) { + stop("The variable '", var_with_selectors_name, "' must also be ", + "requested for the file dimension '", file_dim, "' in ", + "this configuration.") + } + fri <- vector('list', length = prod(var_file_dims)) + dim(fri) <- var_file_dims + ordered_fri <- fri + sri <- vector('list', length = prod(var_file_dims)) + dim(sri) <- var_file_dims + ordered_sri <- sri + if (selector_array == 'all') { + # TODO: Populate ordered_fri + ordered_fri[] <- replicate(prod(var_file_dims), list(1:n)) + fri[] <- replicate(prod(var_file_dims), list(var_unorder_indices[1:n])) + taken_chunks <- rep(TRUE, chunk_amount) + if (!with_transform) { + #fri[] <- replicate(prod(var_file_dims), list(1:n)) + #taken_chunks <- rep(TRUE, chunk_amount) + #sri <- NULL + } else { + ordered_sri[] <- replicate(prod(var_file_dims), list(1:m)) + sri[] <- replicate(prod(var_file_dims), list(1:m)) + ## var_file_dims instead?? + #if (!aiat) { + #fri[] <- replicate(prod(var_file_dims), list(1:n)) + #taken_chunks <- rep(TRUE, chunk_amount) + #sri[] <- replicate(prod(transformed_var_file_dims), list(1:m)) + #} else { + #fri[] <- replicate(prod(var_file_dims), list(1:n)) + #taken_chunks <- rep(TRUE, chunk_amount) + #sri[] <- replicate(prod(transformed_var_file_dims), list(1:m)) + #} + tvi <- 1:m + } + } else if (selector_array == 'first') { + taken_chunks[1] <- TRUE + if (!with_transform) { + ordered_fri[[1]] <- 1 + fri[[1]] <- var_unorder_indices[1] + #taken_chunks[1] <- TRUE + #sri <- NULL + } else { + if (!aiat) { + ordered_fri[[1]] <- 1 + fri[[1]] <- var_unorder_indices[1] + # TODO: TO BE IMPROVED + #taken_chunks[1] <- TRUE + ordered_sri[[1]] <- 1:ceiling(m / n) + sri[[1]] <- 1:ceiling(m / n) + tvi <- 1:ceiling(m / n) + } else { + ordered_fri[[1]] <- 1:ceiling(m / n) + fri[[1]] <- var_unorder_indices[1:ceiling(m / n)] + #taken_chunks[1] <- TRUE + ordered_sri[[1]] <- 1 + sri[[1]] <- 1 + tvi <- 1 + } + } + } else if (selector_array == 'last') { + taken_chunks[length(taken_chunks)] <- TRUE + if (!with_transform) { + ordered_fri[[prod(var_file_dims)]] <- n + fri[[prod(var_file_dims)]] <- var_unorder_indices[n] + #taken_chunks[length(taken_chunks)] <- TRUE + #sri <- NULL + } else { + if (!aiat) { + ordered_fri[[prod(var_file_dims)]] <- prod(var_dims) + fri[[prod(var_file_dims)]] <- var_unorder_indices[prod(var_dims)] + #taken_chunks[length(taken_chunks)] <- TRUE + ordered_sri[[prod(var_file_dims)]] <- 1:ceiling(m / n) + sri[[prod(var_file_dims)]] <- 1:ceiling(m / n) + # TODO: TO BE IMPROVED. THE TVI MAY BE WRONG IF THERE'S BEEN A REORDERING. + tvi <- 1:ceiling(m / n) + } else { + ordered_fri[[prod(var_file_dims)]] <- (n - ceiling(m / n) + 1):n + fri[[prod(var_file_dims)]] <- var_unorder_indices[(n - ceiling(m / n) + 1):n] + #taken_chunks[length(taken_chunks)] <- TRUE + ordered_sri[[prod(var_file_dims)]] <- 1 + sri[[prod(var_file_dims)]] <- 1 + tvi <- 1 + } + } + } + } + # If the selectors are not 'all', 'first', 'last', ... + } else { + if (!is.null(var_with_selectors_name)) { + unmatching_file_dims <- which(!(names(var_file_dims) %in% names(selector_file_dims))) + if ((length(unmatching_file_dims) > 0)) { + raise_error <- FALSE + if (is.null(file_dim)) { + raise_error <- TRUE + } else { + if (!((length(unmatching_file_dims) == 1) && + (names(var_file_dims)[unmatching_file_dims] == file_dim) && + (inner_dim %in% names(selector_inner_dims)))) { + raise_error <- TRUE + } + } + if (raise_error) { + stop("Provided selectors for the dimension '", inner_dim, "' must have as many ", + "file dimensions as the variable the dimension is defined along, '", + var_with_selectors_name, "', with the exceptions of the file pattern dimension ('", + found_pattern_dim, "') and any depended file dimension (if specified as ", + "depended dimension in parameter 'inner_dims_across_files' and the ", + "depending file dimension is present in the provided selector array).") + } + } + if (any(names(selector_file_dims) %in% names(dim(var_with_selectors)))) { + if (any(dim(var_with_selectors)[names(selector_file_dims)] != selector_file_dims)) { + stop("Size of selector file dimensions must mach size of requested ", + "variable dimensions.") + } + } + } + ## TODO: If var dimensions are not in the same order as selector dimensions, reorder + if (is.null(names(selector_file_dims))) { + if (is.null(file_dim)) { + fri_dims <- 1 + } else { + fri_dims <- chunk_amount + names(fri_dims) <- file_dim + } + } else { + fri_dim_names <- names(selector_file_dims) + if (!is.null(file_dim)) { + fri_dim_names <- c(fri_dim_names, file_dim) + } + fri_dim_names <- found_file_dims[[i]][which(found_file_dims[[i]] %in% fri_dim_names)] + fri_dims <- rep(NA, length(fri_dim_names)) + names(fri_dims) <- fri_dim_names + fri_dims[names(selector_file_dims)] <- selector_file_dims + if (!is.null(file_dim)) { + fri_dims[file_dim] <- chunk_amount + } + } + fri <- vector('list', length = prod(fri_dims)) + dim(fri) <- fri_dims + sri <- vector('list', length = prod(fri_dims)) + dim(sri) <- fri_dims + selector_file_dim_array <- array(1:prod(selector_file_dims), dim = selector_file_dims) + selector_store_position <- fri_dims + for (j in 1:prod(dim(selector_file_dim_array))) { + selector_indices_to_take <- which(selector_file_dim_array == j, arr.ind = TRUE)[1, ] + names(selector_indices_to_take) <- names(selector_file_dims) + selector_store_position[names(selector_indices_to_take)] <- selector_indices_to_take + sub_array_of_selectors <- Subset(selector_array, names(selector_indices_to_take), + as.list(selector_indices_to_take), drop = 'selected') + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> ITERATING OVER FILE DIMENSIONS OF THE SELECTORS.") + print("-> STRUCTURE OF A SUB ARRAY:") + print(str(sub_array_of_selectors)) + print("-> STRUCTURE OF THE VARIABLE WITH SELECTORS:") + print(str(var_with_selectors)) + print(dim(var_with_selectors)) + } + } + if (selectors_are_indices) { + sub_array_of_values <- NULL + #} else if (!is.null(var_ordered)) { + # sub_array_of_values <- var_ordered + } else { + if (length(var_file_dims) > 0) { + var_indices_to_take <- selector_indices_to_take[which(names(selector_indices_to_take) %in% names(var_file_dims))] + sub_array_of_values <- Subset(var_with_selectors, names(var_indices_to_take), + as.list(var_indices_to_take), drop = 'selected') + } else { + sub_array_of_values <- var_with_selectors + } + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> STRUCTURE OF THE SUB ARRAY FROM THE VARIABLE CORRESPONDING TO THE SUB ARRAY OF SELECTORS") + print(str(sub_array_of_values)) + print(dim(sub_array_of_values)) + print("-> NAME OF THE FILE DIMENSION THE CURRENT INNER DIMENSION EXTENDS ALONG:") + print(file_dim) + } + } + if ((!is.null(file_dim) && (file_dim %in% names(selector_file_dims))) || is.null(file_dim)) { + if (length(sub_array_of_selectors) > 0) { + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> THE INNER DIMENSION DOES NOT GO ACROSS ANY FILE DIMENSION OR IT DOES BUT IS IN THE PROVIDED SELECTOR ARRAY.") + } + } + if (selectors_are_indices) { + if (!is.null(var_with_selectors_name)) { + max_allowed <- ifelse(aiat, m, n) + } else { + max_allowed <- data_dims[inner_dim] + } + if (any(na.omit(unlist(sub_array_of_selectors)) > max_allowed) || + any(na.omit(unlist(sub_array_of_selectors)) < 1)) { + stop("Provided indices out of range for dimension '", inner_dim, "' ", + "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", + max_allowed, ").") + } + } + + # The selector_checker will return either a vector of indices or a list + # with the first and last desired indices. + goes_across_prime_meridian <- FALSE + if (!is.null(var_ordered) && !selectors_are_indices) { + if (!is.null(dim_reorder_params[[inner_dim]])) { + if (is.list(sub_array_of_selectors)) { + + ## NOTE: The check of 'goes_across_prime_meridian' is moved forward to here. + is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") + if (!is.null(is_circular_dim)) { + if (is_circular_dim) { + + # NOTE: Use CircularSort() to put the values in the assigned range, and get the order. + # For example, [-10, 20] in CircularSort(0, 360) is [350, 20]. The $ix list is [2, 1]. + # 'goes_across_prime_meridian' means the selector range across the border. For example, + # CircularSort(-180, 180) with selector [170, 190] -> goes_across_prime_meridian = TRUE. + tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix + goes_across_prime_meridian <- tmp[1] > tmp[2] + } + } + + # HERE change to the same code as below (under 'else'). Not sure why originally + #it uses additional lines, which make reorder not work. + sub_array_of_selectors <- as.list(dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$x) + #sub_array_reordered <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors)) + #sub_array_unorder <- sort(sub_array_reordered$ix, index.return = TRUE)$ix + #sub_array_of_selectors <- as.list(sub_array_reordered$x[sub_array_unorder]) + + # Add warning if the boundary is out of range + if (sub_array_of_selectors[1] < range(var_ordered)[1] | sub_array_of_selectors[1] > range(var_ordered)[2]) { + .warning(paste0("The lower boundary of selector of ", + inner_dim, + " is out of range [", + min(var_ordered), ", ", max(var_ordered), "]. ", + "Check if the desired range is all included.")) + } + if (sub_array_of_selectors[2] < range(var_ordered)[1] | sub_array_of_selectors[2] > range(var_ordered)[2]) { + .warning(paste0("The upper boundary of selector of ", + inner_dim, + " is out of range [", + min(var_ordered), ", ", max(var_ordered), "]. ", + "Check if the desired range is all included.")) + } + + + } else { + sub_array_of_selectors <- dim_reorder_params[[inner_dim]](sub_array_of_selectors)$x + } + } + + # NOTE: The ideal solution for selecting indices in goes_across_prime_meridian case + # is modified SelectorCheckor.R. But now SelectorCheckor doesn't know the info of + #goes_across_prime_meridian, so I do the adjustion after calling SelectorCheckor(). + sub_array_of_indices <- selector_checker(sub_array_of_selectors, var_ordered, + tolerance = if (aiat) { + NULL + } else { + tolerance_params[[inner_dim]] + }) + + if (goes_across_prime_meridian & sub_array_of_indices[[1]] < sub_array_of_indices[[2]]) { + if (!(sub_array_of_selectors[[1]] %in% var_ordered)){ + sub_array_of_indices[[1]] <- sub_array_of_indices[[1]] - 1 + } + + if (!(sub_array_of_selectors[[2]] %in% var_ordered)){ + sub_array_of_indices[[2]] <- sub_array_of_indices[[2]] + 1 + } + } + + #NOTE: the possible case? + if (goes_across_prime_meridian & sub_array_of_indices[[1]] > sub_array_of_indices[[2]]) { + .stop("The case is goes_across_prime_meridian but no adjustion for the indices!") + } + + if (any(is.na(sub_array_of_indices))) { + + stop(paste0("The selectors of ", inner_dim, + " are out of range [", min(var_ordered), + ", ", max(var_ordered), "].")) + } + + } else { + + # Add warning if the boundary is out of range + if (is.list(sub_array_of_selectors)) { + if (sub_array_of_selectors[1] < + min(sub_array_of_values) | sub_array_of_selectors[1] > + max(sub_array_of_values)) { + .warning(paste0("The lower boundary of selector of ", + inner_dim, " is out of range [", + min(sub_array_of_values), ", ", + max(sub_array_of_values), "]. ", + "Check if the desired range is all included.")) + } + if (sub_array_of_selectors[2] < + min(sub_array_of_values) | sub_array_of_selectors[2] > + max(sub_array_of_values)) { + .warning(paste0("The upper boundary of selector of ", + inner_dim, " is out of range [", + min(sub_array_of_values), ", ", + max(sub_array_of_values), "]. ", + "Check if the desired range is all included.")) + } + } + + sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values, + tolerance = if (aiat) { + NULL + } else { + tolerance_params[[inner_dim]] + }) + + if (any(is.na(sub_array_of_indices))) { + + stop(paste0("The selectors of ", inner_dim, + " are out of range [", min(sub_array_of_values), + ", ", max(sub_array_of_values), "].")) + } + + } + ## This 'if' runs in both Start() and Compute(). In Start(), it doesn't have any effect (no chunk). + ## In Compute(), it creates the indices for each chunk. For example, if 'sub_array_of_indices' + ## is c(5:10) and chunked into 2, 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) + ## for chunk = 2. If 'sub_array_of_indices' is list(55, 62) and chunked into 2, it becomes + ## list(55, 58) for chunk = 1 and list(59, 62) for chunk = 2. + ## TODO: The list can be turned into vector here? So afterward no need to judge if it is list + ## or vector. + if (!is.list(sub_array_of_indices)) { + sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), + chunks[[inner_dim]]["chunk"], + chunks[[inner_dim]]["n_chunks"], + inner_dim)] + } else { + tmp <- chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]), + chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], + inner_dim) + vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + sub_array_of_indices[[1]] <- vect[tmp[1]] + sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] + } + # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. + + # Check if all the files have the selectors assigned (e.g., region = 'Grnland') _20191015 + if (is.character(sub_array_of_selectors)) { + array_of_var_files_check <- vector('list', length(selector_indices)) + for (k in 1:length(selector_indices)) { + asdasd <- selector_indices[[k]] + array_of_var_files_check <- do.call('[', c(list(x = array_of_files_to_load), asdasd, list(drop = FALSE)))[j] + file_object <- file_opener(array_of_var_files_check) + var_values_check <- file_var_reader(NULL, file_object, NULL, var_to_read, synonims) + if (any(as.vector(var_values_check)[sub_array_of_indices] != sub_array_of_selectors)) { + .warning('Not all the files has correponding selectors. Check the selector attributes') + } + } + } + + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> TRANSFORMATION REQUESTED?") + print(with_transform) + print("-> BETA:") + print(beta) + } + } + if (with_transform) { + # If there is a transformation and selector values are provided, these + # selectors will be processed in the same way either if aiat = TRUE or + # aiat = FALSE. + ## TODO: If sub_array_of_selectors was integer and aiat then... do what's commented 50 lines below. + ## otherwise, do what's coded. + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> SELECTORS REQUESTED BEFORE TRANSFORM.") + } + } + + ###NOTE: Here, the transform, is different from the below part of non-transform. + # search 'if (goes_across_prime_meridian' to find the lines below. + if (goes_across_prime_meridian) { + # NOTE: before changing, the return is already correct. + + #NOTE: The fix below has the same explanation as no with_transform part below. + # Search the next next 'if (goes_across_prime_meridian) {'. + if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { + # global longitude + sub_array_of_fri <- 1:n + # Warning if transform_extra_cell != 0 + if (beta != 0) { + .warning(paste0("Adding parameter transform_extra_cells = ", + transform_extra_cells, " to the transformed index excesses ", + "the border. The border index is used for transformation.")) + } + + } else { + # normal case, i.e., not global + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + gap_width <- last_index - first_index - 1 + sub_array_of_fri <- c(1:(min(unlist(sub_array_of_indices)) + min(gap_width, beta)), + (max(unlist(sub_array_of_indices)) - min(gap_width, beta)):n) + + if (min(gap_width, beta) != beta) { + .warning(paste0("Adding parameter transform_extra_cells = ", + transform_extra_cells, " to the transformed index excesses ", + "the border. The border index is used for transformation.")) + } + } + + } else { + #NOTE: This if seems redundant. + if (is.list(sub_array_of_indices)) { + sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + } + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + + start_padding <- min(beta, first_index - 1) + end_padding <- min(beta, n - last_index) + + if (exists("is_circular_dim")) { + if (!is_circular_dim) { #latitude + sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) + if (start_padding != beta | end_padding != beta) { + .warning(paste0("Adding parameter transform_extra_cells = ", + transform_extra_cells, " to the transformed index excesses ", + "the border. The border index is used for transformation.")) + } + } else { #longitude + if ((last_index - first_index + 1 + beta * 2) >= n) { + sub_array_of_fri <- 1:n + } else if (start_padding < beta) { # left side too close to border, need to go to right side + sub_array_of_fri <- c((first_index - start_padding):(last_index + end_padding), (n - (beta - start_padding - 1)):n) + } else if (end_padding < beta) { # right side too close to border, need to go to left side + sub_array_of_fri <- c(1: (beta - end_padding), (first_index - start_padding):(last_index + end_padding)) + } else { #normal + sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) + } + } + } else { # when _reorder is not used + sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) + if (start_padding != beta | end_padding != beta) { + .warning(paste0("Adding parameter transform_extra_cells = ", + transform_extra_cells, " to the transformed index excesses ", + "the border. The border index is used for transformation.")) + } + } + + } + subset_vars_to_transform <- vars_to_transform + if (!is.null(var_ordered)) { + + ##NOTE: if var_ordered is common_vars, it doesn't have attributes and it is a vector. + ## Turn it into array and add dimension name. + if (!is.array(var_ordered)) { + var_ordered <- as.array(var_ordered) + names(dim(var_ordered)) <- inner_dim + } + + subset_vars_to_transform[[var_with_selectors_name]] <- Subset(var_ordered, inner_dim, sub_array_of_fri) + } else { + ##NOTE: It should be redundant because without reordering the var should remain array + ## But just stay same with above... + if (!is.array(sub_array_of_values)) { + sub_array_of_values <- as.array(sub_array_of_values) + names(dim(sub_array_of_values)) <- inner_dim + } + + subset_vars_to_transform[[var_with_selectors_name]] <- Subset(sub_array_of_values, inner_dim, sub_array_of_fri) + } + + # Change the order of longitude crop if no reorder + from big to small. + # cdo -sellonlatbox, the lon is west, east (while lat can be north + # to south or opposite) + + # Before changing crop, first we need to find the name of longitude. + # NOTE: The potential bug here (also the bug for CDORemapper): the lon name + # is limited (only the ones listed in .KnownLonNames() are available. + known_lon_names <- s2dverification:::.KnownLonNames() + lon_name <- names(subset_vars_to_transform)[which(names(subset_vars_to_transform) %in% known_lon_names)[1]] + + # NOTE: The cases not considered: (1) if lon reorder(decreasing = T) + # It doesn't make sense, but if someone uses it, here should + # occur error. (2) crop = TRUE/FALSE + if ('crop' %in% names(transform_params) & var_with_selectors_name == lon_name & is.null(dim_reorder_params[[inner_dim]])) { + if (is.numeric(class(transform_params$crop))) { + if (transform_params$crop[1] > transform_params$crop[2]) { + tmp <- transform_params$crop[1] + transform_params$crop[1] <- transform_params$crop[2] + transform_params$crop[2] <- tmp + } + } + } + + transformed_subset_var <- do.call(transform, c(list(data_array = NULL, + variables = subset_vars_to_transform, + file_selectors = selectors_of_first_files_with_data[[i]]), + transform_params))$variables[[var_with_selectors_name]] + # Sorting the transformed variable and working out the indices again after transform. + if (!is.null(dim_reorder_params[[inner_dim]])) { + transformed_subset_var_reorder <- dim_reorder_params[[inner_dim]](transformed_subset_var) + transformed_subset_var <- transformed_subset_var_reorder$x + #NOTE: The fix here solves the mis-ordered lon when across_meridian. + transformed_subset_var_unorder <- transformed_subset_var_reorder$ix + # transformed_subset_var_unorder <- sort(transformed_subset_var_reorder$ix, index.return = TRUE)$ix + } else { + transformed_subset_var_unorder <- 1:length(transformed_subset_var) + } + sub_array_of_sri <- selector_checker(sub_array_of_selectors, transformed_subset_var, + tolerance = if (aiat) { + tolerance_params[[inner_dim]] + } else { + NULL + }) + + # Check if selectors fall out of the range of the transform grid + # It may happen when original lon is [-180, 180] while want to regrid to + # [0, 360], and lon selector = [-20, -10]. + if (any(is.na(sub_array_of_sri))) { + stop(paste0("The selectors of ", + inner_dim, " are out of range of transform grid '", + transform_params$grid, "'. Use parameter '", + inner_dim, "_reorder' or change ", inner_dim, + " selectors.")) + } + + if (goes_across_prime_meridian) { + + if (sub_array_of_sri[[1]] == sub_array_of_sri[[2]]) { + # global longitude + sub_array_of_sri <- c(1:length(transformed_subset_var)) + } else { + # the common case, i.e., non-global + # NOTE: Because sub_array_of_sri order is exchanged due to + # previous development, here [[1]] and [[2]] should exchange + sub_array_of_sri <- c(1:sub_array_of_sri[[1]], + sub_array_of_sri[[2]]:length(transformed_subset_var)) + } + + } else if (is.list(sub_array_of_sri)) { + sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]] + } + ordered_sri <- sub_array_of_sri + sub_array_of_sri <- transformed_subset_var_unorder[sub_array_of_sri] + # In this case, the tvi are not defined and the 'transformed_subset_var' + # will be taken instead of the var transformed before in the code. + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> FIRST INDEX:") + print(first_index) + print("-> LAST INDEX:") + print(last_index) + print("-> STRUCTURE OF FIRST ROUND INDICES:") + print(str(sub_array_of_fri)) + print("-> STRUCTURE OF SECOND ROUND INDICES:") + print(str(sub_array_of_sri)) + print("-> STRUCTURE OF TRANSFORMED VARIABLE INDICES:") + print(str(tvi)) + } + } + ### # If the selectors are expressed after transformation + ### } else { + ###if (debug) { + ###if (inner_dim %in% dims_to_check) { + ###print("-> SELECTORS REQUESTED AFTER TRANSFORM.") + ###} + ###} + ### if (goes_across_prime_meridian) { + ### sub_array_of_indices <- c(sub_array_of_indices[[1]]:m, + ### 1:sub_array_of_indices[[2]]) + ### } + ### first_index <- min(unlist(sub_array_of_indices)) + ### last_index <- max(unlist(sub_array_of_indices)) + ### first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1) + ### last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n) + ### sub_array_of_fri <- first_index_before_transform:last_index_before_transform + ### n_of_extra_cells <- round(beta / n * m) + ### if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) { + ### sub_array_of_sri <- 1:(last_index - first_index + 1) + ### if (is.null(tvi)) { + ### tvi <- sub_array_of_sri + first_index - 1 + ### } + ### } else { + ### sub_array_of_sri <- sub_array_of_indices - first_index + 1 + ### if (is.null(tvi)) { + ### tvi <- sub_array_of_indices + ### } + ### } + ### sub_array_of_sri <- sub_array_of_sri + n_of_extra_cells + sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), + list(value = sub_array_of_sri))) + } else { + if (goes_across_prime_meridian) { + #NOTE: The potential problem here is, if it is global longitude, + # and the indices overlap (e.g., lon = [0, 359.723] and + # CircularSort(-180, 180), then sub_array_of_indices = list(649, 649)). + # Therefore, sub_array_of_fri will be c(1:649, 649:1296). We'll + # get two 649. + # The fix below may not be the best solution, but it works for + # the example above. + + if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { + # global longitude + sub_array_of_fri <- c(1:n) + } else { + # the common case, i.e., non-global + sub_array_of_fri <- c(1:min(unlist(sub_array_of_indices)), + max(unlist(sub_array_of_indices)):n) + } + + } else if (is.list(sub_array_of_indices)) { + sub_array_of_fri <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + } else { + sub_array_of_fri <- sub_array_of_indices + } + } + if (!is.null(var_unorder_indices)) { + if (is.null(ordered_fri)) { + ordered_fri <- sub_array_of_fri + } + sub_array_of_fri <- var_unorder_indices[sub_array_of_fri] + } + fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position), + list(value = sub_array_of_fri))) + if (!is.null(file_dim)) { + taken_chunks[selector_store_position[[file_dim]]] <- TRUE + } else { + taken_chunks <- TRUE + } + } + } else { + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.") + } + } + if (inner_dim %in% names(dim(sub_array_of_selectors))) { + if (is.null(var_with_selectors_name)) { + if (any(na.omit(unlist(sub_array_of_selectors)) < 1) || + any(na.omit(unlist(sub_array_of_selectors)) > data_dims[inner_dim] * chunk_amount)) { + stop("Provided indices out of range for dimension '", inner_dim, "' ", + "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", + data_dims[inner_dim] * chunk_amount, ").") + } + } else { + if (inner_dim %in% names(dim(sub_array_of_values))) { + # NOTE: Put across-inner-dim at the 1st position. + # POSSIBLE PROB!! Only organize inner dim, the rest dims may not in the same order as sub_array_of_selectors below. + inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_values)) == inner_dim) + if (inner_dim_pos_in_sub_array != 1) { + new_sub_array_order <- (1:length(dim(sub_array_of_values)))[-inner_dim_pos_in_sub_array] + new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order) + sub_array_of_values <- .aperm2(sub_array_of_values, new_sub_array_order) + } + } + } + + # NOTE: Put across-inner-dim at the 1st position. + # POSSIBLE PROB!! Only organize inner dim, the rest dims may not in the same order as sub_array_of_values above. + inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_selectors)) == inner_dim) + if (inner_dim_pos_in_sub_array != 1) { + new_sub_array_order <- (1:length(dim(sub_array_of_selectors)))[-inner_dim_pos_in_sub_array] + new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order) + sub_array_of_selectors <- .aperm2(sub_array_of_selectors, new_sub_array_order) + } + sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values, + tolerance = tolerance_params[[inner_dim]]) + # It is needed to expand the indices here, otherwise for + # values(list(date1, date2)) only 2 values are picked. + if (is.list(sub_array_of_indices)) { + sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + } + sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), + chunks[[inner_dim]]['chunk'], + chunks[[inner_dim]]['n_chunks'], + inner_dim)] + sub_array_is_list <- FALSE + if (is.list(sub_array_of_indices)) { + sub_array_is_list <- TRUE + sub_array_of_indices <- unlist(sub_array_of_indices) + } + if (is.null(var_with_selectors_name)) { + indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1 + transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1 + } else { + indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1 + transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1 + } + if (sub_array_is_list) { + sub_array_of_indices <- as.list(sub_array_of_indices) + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> GOING TO ITERATE ALONG CHUNKS.") + } + } + for (chunk in 1:chunk_amount) { + if (!is.null(names(selector_store_position))) { + selector_store_position[file_dim] <- chunk + } else { + selector_store_position <- chunk + } + chunk_selectors <- transformed_indices[which(indices_chunk == chunk)] + sub_array_of_indices <- chunk_selectors + if (with_transform) { + # If the provided selectors are expressed in the world + # before transformation + if (!aiat) { + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + sub_array_of_fri <- max(c(first_index - beta, 1)):min(c(last_index + beta, n)) + sub_array_of_sri <- transform_indices(unlist(sub_array_of_indices) - first_index + 1, n, m) + if (is.list(sub_array_of_indices)) { + if (length(sub_array_of_sri) > 1) { + sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]] + } + } + ##TODO: TRANSFORM SUBSET VARIABLE AS ABOVE, TO COMPUTE SRI + # If the selectors are expressed after transformation + } else { + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1) + last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n) + sub_array_of_fri <- first_index_before_transform:last_index_before_transform + if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) { + sub_array_of_sri <- 1:(last_index - first_index + 1) + + round(beta / n * m) + } else { + sub_array_of_sri <- sub_array_of_indices - first_index + 1 + + round(beta / n * m) + } + ##TODO: FILL IN TVI + } + sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), + list(value = sub_array_of_sri))) + if (length(sub_array_of_sri) > 0) { + taken_chunks[chunk] <- TRUE + } + } else { + sub_array_of_fri <- sub_array_of_indices + if (length(sub_array_of_fri) > 0) { + taken_chunks[chunk] <- TRUE + } + } + if (!is.null(var_unorder_indices)) { + ordered_fri <- sub_array_of_fri + sub_array_of_fri <- var_unorder_indices[sub_array_of_fri] + } + fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position), + list(value = sub_array_of_fri))) + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> FINISHED ITERATING ALONG CHUNKS") + } + } + } else { + stop("Provided array of indices for dimension '", inner_dim, "', ", + "which goes across the file dimension '", file_dim, "', but ", + "the provided array does not have the dimension '", inner_dim, + "', which is mandatory.") + } + } + } + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> PROCEEDING TO CROP VARIABLES") + } + } + #if ((length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last'))) { + #if (!is.null(var_with_selectors_name) || (is.null(var_with_selectors_name) && is.character(selector_array) && + # (length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last')))) { + empty_chunks <- which(!taken_chunks) + if (length(empty_chunks) >= length(taken_chunks)) { + stop("Selectors do not match any of the possible values for the dimension '", inner_dim, "'.") + } + if (length(empty_chunks) > 0) { + # # Get the first group of chunks to remove, and remove them. + # # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 1 and 2 + # dist <- abs(rev(empty_chunks) - c(rev(empty_chunks)[1] - 1, head(rev(empty_chunks), length(rev(empty_chunks)) - 1))) + # if (all(dist == 1)) { + # start_chunks_to_remove <- NULL + # } else { + # first_chunk_to_remove <- tail(which(dist > 1), 1) + # start_chunks_to_remove <- rev(rev(empty_chunks)[first_chunk_to_remove:length(empty_chunks)]) + # } + # # Get the last group of chunks to remove, and remove them. + # # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 8 and 9 + # dist <- abs(empty_chunks - c(empty_chunks[1] - 1, head(empty_chunks, length(empty_chunks) - 1))) + # if (all(dist == 1)) { + # first_chunk_to_remove <- 1 + # } else { + # first_chunk_to_remove <- tail(which(dist > 1), 1) + # } + # end_chunks_to_remove <- empty_chunks[first_chunk_to_remove:length(empty_chunks)] + # chunks_to_keep <- which(!((1:length(taken_chunks)) %in% c(start_chunks_to_remove, end_chunks_to_remove))) + chunks_to_keep <- which(taken_chunks) + dims_to_crop[[file_dim]] <- c(dims_to_crop[[file_dim]], list(chunks_to_keep)) + # found_indices <- Subset(found_indices, file_dim, chunks_to_keep) + # # Crop dataset variables file dims. + # for (picked_var in names(picked_vars[[i]])) { + # if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) { + # picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, chunks_to_keep) + # } + # } + } + #} + dat[[i]][['selectors']][[inner_dim]] <- list(fri = fri, sri = sri) + # Crop dataset variables inner dims. + # Crop common variables inner dims. + types_of_var_to_crop <- 'picked' + if (with_transform) { + types_of_var_to_crop <- c(types_of_var_to_crop, 'transformed') + } + if (!is.null(dim_reorder_params[[inner_dim]])) { + types_of_var_to_crop <- c(types_of_var_to_crop, 'reordered') + } + for (type_of_var_to_crop in types_of_var_to_crop) { + if (type_of_var_to_crop == 'transformed') { + if (is.null(tvi)) { + if (!is.null(dim_reorder_params[[inner_dim]])) { + crop_indices <- unique(unlist(ordered_sri)) + } else { + crop_indices <- unique(unlist(sri)) + } + } else { + crop_indices <- unique(unlist(tvi)) + } + vars_to_crop <- transformed_vars[[i]] + common_vars_to_crop <- transformed_common_vars + } else if (type_of_var_to_crop == 'reordered') { + crop_indices <- unique(unlist(ordered_fri)) + vars_to_crop <- picked_vars_ordered[[i]] + common_vars_to_crop <- picked_common_vars_ordered + } else { + crop_indices <- unique(unlist(fri)) + vars_to_crop <- picked_vars[[i]] + common_vars_to_crop <- picked_common_vars + } + for (var_to_crop in names(vars_to_crop)) { + if (inner_dim %in% names(dim(vars_to_crop[[var_to_crop]]))) { + if (!is.null(crop_indices)) { + if (type_of_var_to_crop == 'transformed') { + if (!aiat) { + vars_to_crop[[var_to_crop]] <- Subset(transformed_subset_var, inner_dim, crop_indices) + } else { + vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices) + } + } else { + vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices) + } + } + } + } + if (i == length(dat)) { + for (common_var_to_crop in names(common_vars_to_crop)) { + if (inner_dim %in% names(dim(common_vars_to_crop[[common_var_to_crop]]))) { + common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) + } + } + } + if (type_of_var_to_crop == 'transformed') { + if (!is.null(vars_to_crop)) { + transformed_vars[[i]] <- vars_to_crop + } + if (i == length(dat)) { + transformed_common_vars <- common_vars_to_crop + } + } else if (type_of_var_to_crop == 'reordered') { + if (!is.null(vars_to_crop)) { + picked_vars_ordered[[i]] <- vars_to_crop + } + if (i == length(dat)) { + picked_common_vars_ordered <- common_vars_to_crop + } + } else { + if (!is.null(vars_to_crop)) { + picked_vars[[i]] <- vars_to_crop + } + if (i == length(dat)) { + picked_common_vars <- common_vars_to_crop + } + } + } + #} + } + # After the selectors have been picked (using the original variables), + # the variables are transformed. At that point, the original selectors + # for the transformed variables are also kept in the variable original_selectors. + #print("L") + } + } + } + # if (!is.null(transformed_common_vars)) { + # picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars + # } + # Remove the trailing chunks, if any. + for (file_dim in names(dims_to_crop)) { + # indices_to_keep <- min(sapply(dims_to_crop[[file_dim]], min)):max(sapply(dims_to_crop[[file_dim]], max)) + ## TODO: Merge indices in dims_to_crop with some advanced mechanism? + indices_to_keep <- unique(unlist(dims_to_crop[[file_dim]])) + array_of_files_to_load <- Subset(array_of_files_to_load, file_dim, indices_to_keep) + array_of_not_found_files <- Subset(array_of_not_found_files, file_dim, indices_to_keep) + for (i in 1:length(dat)) { + # Crop selectors + for (selector_dim in names(dat[[i]][['selectors']])) { + if (selector_dim == file_dim) { + for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['fri']])) { + dat[[i]][['selectors']][[selector_dim]][['fri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['fri']][[j]][indices_to_keep] + } + for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['sri']])) { + dat[[i]][['selectors']][[selector_dim]][['sri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['sri']][[j]][indices_to_keep] + } + } + if (file_dim %in% names(dim(dat[[i]][['selectors']][[selector_dim]][['fri']]))) { + dat[[i]][['selectors']][[selector_dim]][['fri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['fri']], file_dim, indices_to_keep) + dat[[i]][['selectors']][[selector_dim]][['sri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['sri']], file_dim, indices_to_keep) + } + } + # Crop dataset variables file dims. + for (picked_var in names(picked_vars[[i]])) { + if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) { + picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, indices_to_keep) + } + } + for (transformed_var in names(transformed_vars[[i]])) { + if (file_dim %in% names(dim(transformed_vars[[i]][[transformed_var]]))) { + transformed_vars[[i]][[transformed_var]] <- Subset(transformed_vars[[i]][[transformed_var]], file_dim, indices_to_keep) + } + } + } + # Crop common variables file dims. + for (picked_common_var in names(picked_common_vars)) { + if (file_dim %in% names(dim(picked_common_vars[[picked_common_var]]))) { + picked_common_vars[[picked_common_var]] <- Subset(picked_common_vars[[picked_common_var]], file_dim, indices_to_keep) + } + } + for (transformed_common_var in names(transformed_common_vars)) { + if (file_dim %in% names(dim(transformed_common_vars[[transformed_common_var]]))) { + transformed_common_vars[[transformed_common_var]] <- Subset(transformed_common_vars[[transformed_common_var]], file_dim, indices_to_keep) + } + } + } + # Calculate the size of the final array. + total_inner_dims <- NULL + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + inner_dims <- expected_inner_dims[[i]] + inner_dims <- sapply(inner_dims, + function(x) { + if (!all(sapply(dat[[i]][['selectors']][[x]][['sri']], is.null))) { + max(sapply(dat[[i]][['selectors']][[x]][['sri']], length)) + } else { + if (length(var_params[[x]]) > 0) { + if (var_params[[x]] %in% names(transformed_vars[[i]])) { + length(transformed_vars[[i]][[var_params[[x]]]]) + } else if (var_params[[x]] %in% names(transformed_common_vars)) { + length(transformed_common_vars[[var_params[[x]]]]) + } else { + max(sapply(dat[[i]][['selectors']][[x]][['fri']], length)) + } + } else { + max(sapply(dat[[i]][['selectors']][[x]][['fri']], length)) + } + } + }) + names(inner_dims) <- expected_inner_dims[[i]] + if (is.null(total_inner_dims)) { + total_inner_dims <- inner_dims + } else { + new_dims <- .MergeArrayDims(total_inner_dims, inner_dims) + total_inner_dims <- new_dims[[3]] + } + } + } + new_dims <- .MergeArrayDims(dim(array_of_files_to_load), total_inner_dims) + final_dims <- new_dims[[3]][dim_names] + # final_dims_fake is the vector of final dimensions after having merged the + # 'across' file dimensions with the respective 'across' inner dimensions, and + # after having broken into multiple dimensions those dimensions for which + # multidimensional selectors have been provided. + # final_dims will be used for collocation of data, whereas final_dims_fake + # will be used for shaping the final array to be returned to the user. + final_dims_fake <- final_dims + if (merge_across_dims) { + if (!is.null(inner_dims_across_files)) { + for (file_dim_across in names(inner_dims_across_files)) { + inner_dim_pos <- which(names(final_dims_fake) == inner_dims_across_files[[file_dim_across]]) + new_dims <- c() + if (inner_dim_pos > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(inner_dim_pos - 1)]) + } + new_dims <- c(new_dims, setNames(prod(final_dims_fake[c(inner_dim_pos, inner_dim_pos + 1)]), + inner_dims_across_files[[file_dim_across]])) + if (inner_dim_pos + 1 < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(inner_dim_pos + 2):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + } + } + } + all_split_dims <- NULL + if (split_multiselected_dims) { + for (dim_param in 1:length(dim_params)) { + if (!is.null(dim(dim_params[[dim_param]]))) { + if (length(dim(dim_params[[dim_param]])) > 1) { + split_dims <- dim(dim_params[[dim_param]]) + all_split_dims <- c(all_split_dims, setNames(list(split_dims), + names(dim_params)[dim_param])) + if (is.null(names(split_dims))) { + names(split_dims) <- paste0(names(dim_params)[dim_param], + 1:length(split_dims)) + } + old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param]) + + # NOTE: Three steps to create new dims. + # 1st: Put in the dims before split_dim. + # 2nd: Replace the old_dim with split_dims. + # 3rd: Put in the dims after split_dim. + new_dims <- c() + if (old_dim_pos > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)]) + } + new_dims <- c(new_dims, split_dims) + if (old_dim_pos < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + } + } + } + } + if (merge_across_dims_narm) { + # only merge_across_dims -> the 'time' dim length needs to be adjusted + across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + # Get the length of each inner_dim ('time') along each file_dim ('file_date') + length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) + + if (!split_multiselected_dims) { + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name + } + } + + if (!silent) { + .message("Detected dimension sizes:") + longest_dim_len <- max(sapply(names(final_dims_fake), nchar)) + longest_size_len <- max(sapply(paste0(final_dims_fake, ''), nchar)) + sapply(names(final_dims_fake), + function(x) { + message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), + x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_fake[x], ''))), collapse = ''), + final_dims_fake[x])) + }) + bytes <- prod(c(final_dims_fake, 8)) + dim_sizes <- paste(final_dims_fake, collapse = ' x ') + if (retrieve) { + .message(paste("Total size of requested data:")) + } else { + .message(paste("Total size of involved data:")) + } + .message(paste(dim_sizes, " x 8 bytes =", + format(structure(bytes, class = "object_size"), units = "auto")), + indent = 2) + } + + # NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed. + # The inner_dim needs to be the first dim among split dims. + # Cannot control the rest dims are in the same order or not... + # Suppose users put the same order of across inner and file dims. + if (split_multiselected_dims & merge_across_dims) { + # TODO: More than one split? + inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files) + # if inner_dim is not the first, change! + if (inner_dim_pos_in_split_dims != 1) { + split_dims <- c(split_dims[inner_dim_pos_in_split_dims], + split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims]) + split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(split_dims)))) + # Save the current final_dims_fake for later reorder back + final_dims_fake_output <- final_dims_fake + new_dims <- c() + if (split_dims_pos[1] != 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, split_dims) + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + } + } + + # The following several lines will only be run if retrieve = TRUE + if (retrieve) { + + ########## CREATING THE SHARED MATRIX AND DISPATCHING WORK PIECES ########### + # TODO: try performance of storing all in cols instead of rows + # Create the shared memory array, and a pointer to it, to be sent + # to the work pieces. + data_array <- big.matrix(nrow = prod(final_dims), ncol = 1) + shared_matrix_pointer <- describe(data_array) + if (is.null(num_procs)) { + num_procs <- availableCores() + } + # Creating a shared tmp folder to store metadata from each chunk + array_of_metadata_flags <- array(FALSE, dim = dim(array_of_files_to_load)) + if (!is.null(metadata_dims)) { + metadata_indices_to_load <- as.list(rep(1, length(dim(array_of_files_to_load)))) + names(metadata_indices_to_load) <- names(dim(array_of_files_to_load)) + metadata_indices_to_load[metadata_dims] <- as.list(rep(TRUE, length(metadata_dims))) + array_of_metadata_flags <- do.call('[<-', c(list(array_of_metadata_flags), metadata_indices_to_load, + list(value = rep(TRUE, prod(dim(array_of_files_to_load)[metadata_dims]))))) + } + metadata_file_counter <- 0 + metadata_folder <- tempfile('metadata') + dir.create(metadata_folder) + # Build the work pieces, each with: + # - file path + # - total size (dims) of store array + # - start position in store array + # - file selectors (to provide extra info. useful e.g. to select variable) + # - indices to take from file + work_pieces <- list() + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + selectors <- dat[[i]][['selectors']] + file_dims <- found_file_dims[[i]] + inner_dims <- expected_inner_dims[[i]] + sub_array_dims <- final_dims[file_dims] + sub_array_dims[found_pattern_dim] <- 1 + sub_array_of_files_to_load <- array(1:prod(sub_array_dims), + dim = sub_array_dims) + names(dim(sub_array_of_files_to_load)) <- names(sub_array_dims) + # Detect which of the dimensions of the dataset go across files. + file_dim_across_files <- lapply(inner_dims, + function(x) { + dim_across <- sapply(inner_dims_across_files, function(y) x %in% y) + if (any(dim_across)) { + names(inner_dims_across_files)[which(dim_across)[1]] + } else { + NULL + } + }) + names(file_dim_across_files) <- inner_dims + j <- 1 + while (j <= prod(sub_array_dims)) { + # Work out file path. + file_to_load_sub_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] + names(file_to_load_sub_indices) <- names(sub_array_dims) + file_to_load_sub_indices[found_pattern_dim] <- i + big_dims <- rep(1, length(dim(array_of_files_to_load))) + names(big_dims) <- names(dim(array_of_files_to_load)) + file_to_load_indices <- .MergeArrayDims(file_to_load_sub_indices, big_dims)[[1]] + file_to_load <- do.call('[[', c(list(array_of_files_to_load), + as.list(file_to_load_indices))) + not_found_file <- do.call('[[', c(list(array_of_not_found_files), + as.list(file_to_load_indices))) + load_file_metadata <- do.call('[', c(list(array_of_metadata_flags), + as.list(file_to_load_indices))) + if (load_file_metadata) { + metadata_file_counter <- metadata_file_counter + 1 + } + if (!is.na(file_to_load) && !not_found_file) { + # Work out indices to take + first_round_indices <- lapply(inner_dims, + function (x) { + if (is.null(file_dim_across_files[[x]])) { + selectors[[x]][['fri']][[1]] + } else { + which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] + selectors[[x]][['fri']][[which_chunk]] + } + }) + names(first_round_indices) <- inner_dims + second_round_indices <- lapply(inner_dims, + function (x) { + if (is.null(file_dim_across_files[[x]])) { + selectors[[x]][['sri']][[1]] + } else { + which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] + selectors[[x]][['sri']][[which_chunk]] + } + }) + if (debug) { + print("-> BUILDING A WORK PIECE") + #print(str(selectors)) + } + names(second_round_indices) <- inner_dims + if (!any(sapply(first_round_indices, length) == 0)) { + work_piece <- list() + work_piece[['first_round_indices']] <- first_round_indices + work_piece[['second_round_indices']] <- second_round_indices + work_piece[['file_indices_in_array_of_files']] <- file_to_load_indices + work_piece[['file_path']] <- file_to_load + work_piece[['store_dims']] <- final_dims + # Work out store position + store_position <- final_dims + store_position[names(file_to_load_indices)] <- file_to_load_indices + store_position[inner_dims] <- rep(1, length(inner_dims)) + work_piece[['store_position']] <- store_position + # Work out file selectors + file_selectors <- sapply(file_dims, + function (x) { + vector_to_pick <- 1 + if (x %in% names(depending_file_dims)) { + vector_to_pick <- file_to_load_indices[depending_file_dims[[x]]] + } + selectors[file_dims][[x]][[vector_to_pick]][file_to_load_indices[x]] + }) + names(file_selectors) <- file_dims + work_piece[['file_selectors']] <- file_selectors + # Send variables for transformation + if (!is.null(transform) && (length(transform_vars) > 0)) { + vars_to_transform <- NULL + picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars) + if (length(picked_vars_to_transform) > 0) { + picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform] + vars_to_transform <- c(vars_to_transform, picked_vars[[i]][picked_vars_to_transform]) + if (any(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))) { + picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))] + vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[[i]][picked_vars_ordered_to_transform] + } + } + picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) + if (length(picked_common_vars_to_transform) > 0) { + picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] + vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform]) + if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) { + picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))] + vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform] + } + } + work_piece[['vars_to_transform']] <- vars_to_transform + } + # Send flag to load metadata + if (load_file_metadata) { + work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter) + } + work_pieces <- c(work_pieces, list(work_piece)) + } + } + j <- j + 1 + } + } + } + #print("N") + if (debug) { + print("-> WORK PIECES BUILT") + } + + # Calculate the progress %s that will be displayed and assign them to + # the appropriate work pieces. + if (length(work_pieces) / num_procs >= 2 && !silent) { + if (length(work_pieces) / num_procs < 10) { + amount <- 100 / ceiling(length(work_pieces) / num_procs) + reps <- ceiling(length(work_pieces) / num_procs) + } else { + amount <- 10 + reps <- 10 + } + progress_steps <- rep(amount, reps) + if (length(work_pieces) < (reps + 1)) { + selected_pieces <- length(work_pieces) + progress_steps <- c(sum(head(progress_steps, reps)), + tail(progress_steps, reps)) + } else { + selected_pieces <- round(seq(1, length(work_pieces), + length.out = reps + 1))[-1] + } + progress_steps <- paste0(' + ', round(progress_steps, 2), '%') + progress_message <- 'Progress: 0%' + } else { + progress_message <- '' + selected_pieces <- NULL + } + piece_counter <- 1 + step_counter <- 1 + work_pieces <- lapply(work_pieces, + function (x) { + if (piece_counter %in% selected_pieces) { + wp <- c(x, list(progress_amount = progress_steps[step_counter])) + step_counter <<- step_counter + 1 + } else { + wp <- x + } + piece_counter <<- piece_counter + 1 + wp + }) + if (!silent) { + .message("If the size of the requested data is close to or above the free shared RAM memory, R may crash.") + .message("If the size of the requested data is close to or above the half of the free RAM memory, R may crash.") + .message(paste0("Will now proceed to read and process ", length(work_pieces), " data files:")) + if (length(work_pieces) < 30) { + lapply(work_pieces, function (x) .message(x[['file_path']], indent = 2)) + } else { + .message("The list of files is long. You can check it after Start() finishes in the output '$Files'.", indent = 2, exdent = 5) + } + } + + # Build the cluster of processes that will do the work and dispatch work pieces. + # The function .LoadDataFile is applied to each work piece. This function will + # open the data file, regrid if needed, subset, apply the mask, + # compute and apply the weights if needed, + # disable extreme values and store in the shared memory matrix. + #print("O") + if (!silent) { + .message("Loading... This may take several minutes...") + if (progress_message != '') { + .message(progress_message, appendLF = FALSE) + } + } + if (num_procs == 1) { + found_files <- lapply(work_pieces, .LoadDataFile, + shared_matrix_pointer = shared_matrix_pointer, + file_data_reader = file_data_reader, + synonims = synonims, + transform = transform, + transform_params = transform_params, + silent = silent, debug = debug) + } else { + cluster <- makeCluster(num_procs, outfile = "") + # Send the heavy work to the workers + work_errors <- try({ + found_files <- clusterApplyLB(cluster, work_pieces, .LoadDataFile, + shared_matrix_pointer = shared_matrix_pointer, + file_data_reader = file_data_reader, + synonims = synonims, + transform = transform, + transform_params = transform_params, + silent = silent, debug = debug) + }) + stopCluster(cluster) + } + + if (!silent) { + if (progress_message != '') { + .message("\n", tag = '') + } + } + #print("P") + + # NOTE: If merge_across_dims = TRUE, there might be additional NAs due to + # unequal inner_dim ('time') length across file_dim ('file_date'). + # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. + # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. + if (merge_across_dims_narm) { + + # Get the length of these two dimensions in final_dims + length_inner_across_store_dims <- final_dims[across_inner_dim] + length_file_across_store_dims <- final_dims[across_file_dim] + + # Create a logical array for merge_across_dims + logi_array <- array(rep(FALSE, + length_file_across_store_dims * length_inner_across_store_dims), + dim = c(length_inner_across_store_dims, length_file_across_store_dims)) + for (i in 1:length_file_across_store_dims) { #1:4 + logi_array[1:length_inner_across_dim[[i]], i] <- TRUE + } + + # First, get the data array with final_dims dimension + data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) + + # Change the NA derived from additional spaces to -9999, then remove these -9999 + func_remove_blank <- function(data_array, logi_array) { + # dim(data_array) = [time, file_date] + # dim(logi_array) = [time, file_date] + # Change the blank spaces from NA to -9999 + data_array[which(!logi_array)] <- -9999 + return(data_array) + } + data_array_final_dims <- multiApply::Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector + + data_array <- array(data_array_tmp, dim = final_dims_fake) + + } else { # merge_across_dims_narm = F (old version) + data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) + } + + # NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above. + # To get the user-required dim order, we need to reorder the array again. + if (split_multiselected_dims & merge_across_dims) { + if (inner_dim_pos_in_split_dims != 1) { + correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) + data_array <- .aperm2(data_array, correct_order) + } + } + + gc() + + # Load metadata and remove the metadata folder + if (!is.null(metadata_dims)) { + loaded_metadata_files <- list.files(metadata_folder) + loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS) + unlink(metadata_folder, recursive = TRUE) + return_metadata <- vector('list', length = prod(dim(array_of_metadata_flags)[metadata_dims])) + return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata + dim(return_metadata) <- dim(array_of_metadata_flags[metadata_dims]) + attr(data_array, 'Variables') <- return_metadata + # TODO: Try to infer data type from loaded_metadata + # as.integer(data_array) + } + + failed_pieces <- work_pieces[which(unlist(found_files))] + for (failed_piece in failed_pieces) { + array_of_not_found_files <- do.call('[<-', + c(list(array_of_not_found_files), + as.list(failed_piece[['file_indices_in_array_of_files']]), + list(value = TRUE))) + } + if (any(array_of_not_found_files)) { + for (i in 1:prod(dim(array_of_files_to_load))) { + if (is.na(array_of_not_found_files[i])) { + array_of_files_to_load[i] <- NA + } else { + if (array_of_not_found_files[i]) { + array_of_not_found_files[i] <- array_of_files_to_load[i] + array_of_files_to_load[i] <- NA + } else { + array_of_not_found_files[i] <- NA + } + } + } + } else { + array_of_not_found_files <- NULL + } + + } # End if (retrieve) + + # Change final_dims_fake back because retrieve = FALSE will use it for attributes later + if (exists("final_dims_fake_output")) { + final_dims_fake <- final_dims_fake_output + } + # Replace the vars and common vars by the transformed vars and common vars + for (i in 1:length(dat)) { + if (length(names(transformed_vars[[i]])) > 0) { + picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]] + } else if (length(names(picked_vars_ordered[[i]])) > 0) { + picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]] + } + } + if (length(names(transformed_common_vars)) > 0) { + picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars + } else if (length(names(picked_common_vars_ordered)) > 0) { + picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered + } + if (debug) { + print("-> THE TRANSFORMED VARS:") + print(str(transformed_vars)) + print("-> THE PICKED VARS:") + print(str(picked_vars)) + } + + file_selectors <- NULL + for (i in 1:length(dat)) { + file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])] + } + if (retrieve) { + if (!silent) { + .message("Successfully retrieved data.") + } + var_backup <- attr(data_array, 'Variables')[[1]] + attr(data_array, 'Variables') <- NULL + attributes(data_array) <- c(attributes(data_array), + list(Variables = c(list(common = c(picked_common_vars, var_backup)), + picked_vars), + Files = array_of_files_to_load, + NotFoundFiles = array_of_not_found_files, + FileSelectors = file_selectors, + PatternDim = found_pattern_dim) + ) + attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class')) + data_array + } else { + if (!silent) { + .message("Successfully discovered data dimensions.") + } + start_call <- match.call() + for (i in 2:length(start_call)) { + if (class(start_call[[i]]) %in% c('name', 'call')) { + start_call[[i]] <- eval.parent(start_call[[i]]) + } + } + start_call[['retrieve']] <- TRUE + attributes(start_call) <- c(attributes(start_call), + list(Dimensions = final_dims_fake, + Variables = c(list(common = picked_common_vars), picked_vars), + ExpectedFiles = array_of_files_to_load, + FileSelectors = file_selectors, + PatternDim = found_pattern_dim, + MergedDims = if (merge_across_dims) { + inner_dims_across_files + } else { + NULL + }, + SplitDims = if (split_multiselected_dims) { + all_split_dims + } else { + NULL + }) + ) + attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class')) + start_call + } +} + +# This function is the responsible for loading the data of each work +# piece. +.LoadDataFile <- function(work_piece, shared_matrix_pointer, + file_data_reader, synonims, + transform, transform_params, + silent = FALSE, debug = FALSE) { + # suppressPackageStartupMessages({library(bigmemory)}) + ### TODO: Specify dependencies as parameter + # suppressPackageStartupMessages({library(ncdf4)}) + + #print("1") + store_indices <- as.list(work_piece[['store_position']]) + first_round_indices <- work_piece[['first_round_indices']] + second_round_indices <- work_piece[['second_round_indices']] + #print("2") + file_to_open <- work_piece[['file_path']] + sub_array <- file_data_reader(file_to_open, NULL, + work_piece[['file_selectors']], + first_round_indices, synonims) + if (debug) { + if (all(unlist(store_indices[1:6]) == 1)) { + print("-> LOADING A WORK PIECE") + print("-> STRUCTURE OF READ UNTRANSFORMED DATA:") + print(str(sub_array)) + print("-> STRUCTURE OF VARIABLES TO TRANSFORM:") + print(str(work_piece[['vars_to_transform']])) + print("-> COMMON ARRAY DIMENSIONS:") + print(str(work_piece[['store_dims']])) + } + } + if (!is.null(sub_array)) { + # Apply data transformation once we have the data arrays. + if (!is.null(transform)) { + if (debug) { + if (all(unlist(store_indices[1:6]) == 1)) { + print("-> PROCEEDING TO TRANSFORM ARRAY") + print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:") + print(dim(sub_array)) + } + } + sub_array <- do.call(transform, c(list(data_array = sub_array, + variables = work_piece[['vars_to_transform']], + file_selectors = work_piece[['file_selectors']]), + transform_params)) + if (debug) { + if (all(unlist(store_indices[1:6]) == 1)) { + print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:") + print(str(sub_array)) + print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:") + print(dim(sub_array$data_array)) + } + } + sub_array <- sub_array$data_array + # Subset with second round of indices + dims_to_crop <- which(!sapply(second_round_indices, is.null)) + if (length(dims_to_crop) > 0) { + dimnames_to_crop <- names(second_round_indices)[dims_to_crop] + sub_array <- Subset(sub_array, dimnames_to_crop, + second_round_indices[dimnames_to_crop]) + } + if (debug) { + if (all(unlist(store_indices[1:6]) == 1)) { + print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:") + print(str(sub_array)) + } + } + } + + metadata <- attr(sub_array, 'variables') + + names_bk <- names(store_indices) + store_indices <- lapply(names(store_indices), + function (x) { + if (!(x %in% names(first_round_indices))) { + store_indices[[x]] + } else if (is.null(second_round_indices[[x]])) { + 1:dim(sub_array)[x] + } else { + if (is.numeric(second_round_indices[[x]])) { + ## TODO: Review carefully this line. Inner indices are all + ## aligned to the left-most positions. If dataset A has longitudes + ## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then + ## they will be stored as follows: + ## 1, 2, 3, 4 + ## 3, 4, NA, NA + ##x - min(x) + 1 + 1:length(second_round_indices[[x]]) + } else { + 1:length(second_round_indices[[x]]) + } + } + }) + names(store_indices) <- names_bk + if (debug) { + if (all(unlist(store_indices) == 1)) { + print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:") + print(str(first_round_indices)) + print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:") + print(str(second_round_indices)) + print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:") + print(str(store_indices)) + } + } + + store_indices <- lapply(store_indices, as.integer) + store_dims <- work_piece[['store_dims']] + + # split the storage work of the loaded subset in parts + largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))] + max_parts <- length(store_indices[[largest_dim_name]]) + + # Indexing a data file of N MB with expand.grid takes 30*N MB + # The peak ram of Start is, minimum, 2 * total data to load from all files + # due to inefficiencies in other regions of the code + # The more parts we split the indexing done below in, the lower + # the memory footprint of the indexing and the fast. + # But more than 10 indexing iterations (parts) for each MB processed + # makes the iteration slower (tested empirically on BSC workstations). + subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024 + best_n_parts <- ceiling(subset_size_in_mb * 10) + # We want to set n_parts to a greater value than the one that would + # result in a memory footprint (of the subset indexing code below) equal + # to 2 * total data to load from all files. + # s = subset size in MB + # p = number of parts to break it in + # T = total size of data to load + # then, s / p * 30 = 2 * T + # then, p = s * 15 / T + min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims)) + # Make sure we pick n_parts much greater than the minimum calculated + n_parts <- min_n_parts * 10 + if (n_parts > best_n_parts) { + n_parts <- best_n_parts + } + # Boundary checks + if (n_parts < 1) { + n_parts <- 1 + } + if (n_parts > max_parts) { + n_parts <- max_parts + } + + if (n_parts > 1) { + make_parts <- function(length, n) { + clusters <- cut(1:length, n, labels = FALSE) + lapply(1:n, function(y) which(clusters == y)) + } + part_indices <- make_parts(max_parts, n_parts) + parts <- lapply(part_indices, + function(x) { + store_indices[[largest_dim_name]][x] + }) + } else { + part_indices <- list(1:max_parts) + parts <- store_indices[largest_dim_name] + } + + # do the storage work + weights <- sapply(1:length(store_dims), + function(i) prod(c(1, store_dims)[1:i])) + part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array)))) + names(part_indices_in_sub_array) <- names(dim(sub_array)) + data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) + for (i in 1:n_parts) { + store_indices[[largest_dim_name]] <- parts[[i]] + # Converting array indices to vector indices + matrix_indices <- do.call("expand.grid", store_indices) + # Given a matrix where each row is a set of array indices of an element + # the vector indices are computed + matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights) + part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]] + data_array[matrix_indices] <- as.vector(do.call('[', + c(list(x = sub_array), + part_indices_in_sub_array))) + } + rm(data_array) + gc() + + if (!is.null(work_piece[['save_metadata_in']])) { + saveRDS(metadata, file = work_piece[['save_metadata_in']]) + } + } + if (!is.null(work_piece[['progress_amount']]) && !silent) { + message(work_piece[['progress_amount']], appendLF = FALSE) + } + is.null(sub_array) +} + diff --git a/R/Step.R b/R/Step.R index 1f54d0838a217f646bd5201ead72636e4c1349df..32c92738010a0ea564071ba3d679962e79f7e1df 100644 --- a/R/Step.R +++ b/R/Step.R @@ -1,83 +1,83 @@ -Step <- function(fun, target_dims, output_dims, - use_libraries = NULL, use_attributes = NULL) { - # Check fun - if (!is.function(fun)) { - stop("Parameter 'fun' must be a function.") - } - - # Check target_dims - if (is.character(target_dims)) { - target_dims <- list(target_dims) - names(target_dims) <- 'input1' - } - if (is.list(target_dims)) { - sapply(target_dims, - function(x) { - if (!(is.character(x) && (length(x) > 0))) { - stop("Parameter 'target_dims' must be one or a list of vectors ", - "of target dimension names for each data array input in ", - "the function 'fun'.") - } - }) - if (is.null(names(target_dims))) { - names(target_dims) <- paste0('input', 1:length(target_dims)) - } - } - - # Check output_dims - if (is.character(output_dims) || is.null(output_dims)) { - output_dims <- list(output_dims) - names(output_dims) <- 'output1' - } - if (is.list(output_dims)) { - sapply(output_dims, - function(x) { - if (!(is.character(x) || is.null(x))) { - stop("Parameter 'output_dims' must be one or a list of vectors ", - "of target dimension names for each data array input in ", - "the function 'fun'.") - } - }) - if (is.null(names(output_dims))) { - names(output_dims) <- paste0('output', 1:length(output_dims)) - } - } - - # Check use_libraries - if (!is.null(use_libraries)) { - if (!is.character(use_libraries)) { - stop("Parameter 'use_libraries' must be a vector of character ", - "strings.") - } - } - - # Check use_attributes - if (!is.null(use_attributes)) { - raise_error <- FALSE - if (!is.list(use_attributes)) { - raise_error <- TRUE - } - if (!all(sapply(use_attributes, - function(x) { - is.character(x) || - (is.list(x) && all(sapply(x, is.character))) - }))) { - raise_error <- TRUE - } - if (raise_error) { - stop("Parameter 'use_attributes' must be a list of vectors of ", - "character strings or of lists of vectors of character ", - "strings.") - } - } - - attr(fun, 'TargetDims') <- target_dims - attr(fun, 'OutputDims') <- output_dims - attr(fun, 'UseLibraries') <- use_libraries - attr(fun, 'UseAttributes') <- use_attributes - - # TODO: Add provenance info - class(fun) <- 'startR_step_fun' - - fun -} +Step <- function(fun, target_dims, output_dims, + use_libraries = NULL, use_attributes = NULL) { + # Check fun + if (!is.function(fun)) { + stop("Parameter 'fun' must be a function.") + } + + # Check target_dims + if (is.character(target_dims)) { + target_dims <- list(target_dims) + names(target_dims) <- 'input1' + } + if (is.list(target_dims)) { + sapply(target_dims, + function(x) { + if (!(is.character(x) && (length(x) > 0))) { + stop("Parameter 'target_dims' must be one or a list of vectors ", + "of target dimension names for each data array input in ", + "the function 'fun'.") + } + }) + if (is.null(names(target_dims))) { + names(target_dims) <- paste0('input', 1:length(target_dims)) + } + } + + # Check output_dims + if (is.character(output_dims) || is.null(output_dims)) { + output_dims <- list(output_dims) + names(output_dims) <- 'output1' + } + if (is.list(output_dims)) { + sapply(output_dims, + function(x) { + if (!(is.character(x) || is.null(x))) { + stop("Parameter 'output_dims' must be one or a list of vectors ", + "of target dimension names for each data array input in ", + "the function 'fun'.") + } + }) + if (is.null(names(output_dims))) { + names(output_dims) <- paste0('output', 1:length(output_dims)) + } + } + + # Check use_libraries + if (!is.null(use_libraries)) { + if (!is.character(use_libraries)) { + stop("Parameter 'use_libraries' must be a vector of character ", + "strings.") + } + } + + # Check use_attributes + if (!is.null(use_attributes)) { + raise_error <- FALSE + if (!is.list(use_attributes)) { + raise_error <- TRUE + } + if (!all(sapply(use_attributes, + function(x) { + is.character(x) || + (is.list(x) && all(sapply(x, is.character))) + }))) { + raise_error <- TRUE + } + if (raise_error) { + stop("Parameter 'use_attributes' must be a list of vectors of ", + "character strings or of lists of vectors of character ", + "strings.") + } + } + + attr(fun, 'TargetDims') <- target_dims + attr(fun, 'OutputDims') <- output_dims + attr(fun, 'UseLibraries') <- use_libraries + attr(fun, 'UseAttributes') <- use_attributes + + # TODO: Add provenance info + class(fun) <- 'startR_step_fun' + + fun +} diff --git a/R/Subset.R b/R/Subset.R index 0e6b52a5c73717014853fb90ccfd17b9c4113745..da35b5b8a0f35066c50ae3f3e7cc0996b0ecadba 100644 --- a/R/Subset.R +++ b/R/Subset.R @@ -1,97 +1,97 @@ -Subset <- function(x, along, indices, drop = FALSE) { - # Check x - if (!is.array(x)) { - stop("Input array 'x' must be a numeric array.") - } - - # Take the input array dimension names - dim_names <- attr(x, 'dimensions') - if (!is.character(dim_names)) { - dim_names <- names(dim(x)) - } - if (!is.character(dim_names)) { - if (any(sapply(along, is.character))) { - stop("The input array 'x' doesn't have labels for the dimensions but the parameter 'along' contains dimension names.") - } - } - - # Check along - if (any(sapply(along, function(x) !is.numeric(x) && !is.character(x)))) { - stop("All provided dimension indices in 'along' must be integers or character strings.") - } - if (any(sapply(along, is.character))) { - req_dimnames <- along[which(sapply(along, is.character))] - if (length(unique(req_dimnames)) < length(req_dimnames)) { - stop("The parameter 'along' must not contain repeated dimension names.") - } - along[which(sapply(along, is.character))] <- match(req_dimnames, dim_names) - if (any(is.na(along))) { - stop("Could not match all dimension names in 'indices' with dimension names in input array 'x'.") - } - along <- as.numeric(along) - } - - # Check indices - if (!is.list(indices)) { - indices <- list(indices) - } - - # Check parameter drop - dims_to_drop <- c() - if (is.character(drop)) { - if (drop == 'all') { - drop <- TRUE - } else if (any(drop %in% c('selected', 'non-selected', 'none'))) { - if (drop == 'selected') { - dims_to_drop <- along[which(sapply(indices, length) == 1)] - } else if (drop == 'non-selected') { - dims_to_drop <- dim(x) == 1 - dims_to_drop[along] <- FALSE - dims_to_drop <- which(dims_to_drop) - } - drop <- FALSE - } else { - stop("Parameter 'drop' must be one of TRUE, FALSE, 'all', 'selected', 'non-selected', 'none'.") - } - } else if (!is.logical(drop)) { - stop("Parameter 'drop' must be one of TRUE, FALSE, 'all', 'selected', 'non-selected', 'none'.") - } - - # Take the subset - nd <- length(dim(x)) - index <- as.list(rep(TRUE, nd)) - index[along] <- indices - subset <- eval(as.call(c(as.name("["), as.name("x"), index, drop = drop))) - # If dropped all dimensions, need to drop dimnames too - if (is.character(dim_names) && drop == TRUE) { - dim_names_to_remove <- unique(c(along[which(sapply(indices, length) == 1)], - which(dim(x) == 1))) - if (length(dim_names_to_remove) > 0) { - dim_names <- dim_names[-dim_names_to_remove] - } - } - - # Amend the final dimensions and put dimnames and attributes - metadata <- attributes(x) - metadata[['dim']] <- dim(subset) - if (length(dims_to_drop) > 0) { - metadata[['dim']] <- metadata[['dim']][-dims_to_drop] - if (is.character(dim_names)) { - names(metadata[['dim']]) <- dim_names[-dims_to_drop] - if ('dimensions' %in% names(attributes(x))) { - metadata[['dimensions']] <- dim_names[-dims_to_drop] - } - } - if (length(metadata[['dim']]) == 0) { - metadata['dim'] <- list(NULL) - metadata['dimensions'] <- list(NULL) - } - } else if (is.character(dim_names)) { - names(metadata[['dim']]) <- dim_names - if ('dimensions' %in% names(attributes(x))) { - metadata[['dimensions']] <- dim_names - } - } - attributes(subset) <- metadata - subset -} +Subset <- function(x, along, indices, drop = FALSE) { + # Check x + if (!is.array(x)) { + stop("Input array 'x' must be a numeric array.") + } + + # Take the input array dimension names + dim_names <- attr(x, 'dimensions') + if (!is.character(dim_names)) { + dim_names <- names(dim(x)) + } + if (!is.character(dim_names)) { + if (any(sapply(along, is.character))) { + stop("The input array 'x' doesn't have labels for the dimensions but the parameter 'along' contains dimension names.") + } + } + + # Check along + if (any(sapply(along, function(x) !is.numeric(x) && !is.character(x)))) { + stop("All provided dimension indices in 'along' must be integers or character strings.") + } + if (any(sapply(along, is.character))) { + req_dimnames <- along[which(sapply(along, is.character))] + if (length(unique(req_dimnames)) < length(req_dimnames)) { + stop("The parameter 'along' must not contain repeated dimension names.") + } + along[which(sapply(along, is.character))] <- match(req_dimnames, dim_names) + if (any(is.na(along))) { + stop("Could not match all dimension names in 'indices' with dimension names in input array 'x'.") + } + along <- as.numeric(along) + } + + # Check indices + if (!is.list(indices)) { + indices <- list(indices) + } + + # Check parameter drop + dims_to_drop <- c() + if (is.character(drop)) { + if (drop == 'all') { + drop <- TRUE + } else if (any(drop %in% c('selected', 'non-selected', 'none'))) { + if (drop == 'selected') { + dims_to_drop <- along[which(sapply(indices, length) == 1)] + } else if (drop == 'non-selected') { + dims_to_drop <- dim(x) == 1 + dims_to_drop[along] <- FALSE + dims_to_drop <- which(dims_to_drop) + } + drop <- FALSE + } else { + stop("Parameter 'drop' must be one of TRUE, FALSE, 'all', 'selected', 'non-selected', 'none'.") + } + } else if (!is.logical(drop)) { + stop("Parameter 'drop' must be one of TRUE, FALSE, 'all', 'selected', 'non-selected', 'none'.") + } + + # Take the subset + nd <- length(dim(x)) + index <- as.list(rep(TRUE, nd)) + index[along] <- indices + subset <- eval(as.call(c(as.name("["), as.name("x"), index, drop = drop))) + # If dropped all dimensions, need to drop dimnames too + if (is.character(dim_names) && drop == TRUE) { + dim_names_to_remove <- unique(c(along[which(sapply(indices, length) == 1)], + which(dim(x) == 1))) + if (length(dim_names_to_remove) > 0) { + dim_names <- dim_names[-dim_names_to_remove] + } + } + + # Amend the final dimensions and put dimnames and attributes + metadata <- attributes(x) + metadata[['dim']] <- dim(subset) + if (length(dims_to_drop) > 0) { + metadata[['dim']] <- metadata[['dim']][-dims_to_drop] + if (is.character(dim_names)) { + names(metadata[['dim']]) <- dim_names[-dims_to_drop] + if ('dimensions' %in% names(attributes(x))) { + metadata[['dimensions']] <- dim_names[-dims_to_drop] + } + } + if (length(metadata[['dim']]) == 0) { + metadata['dim'] <- list(NULL) + metadata['dimensions'] <- list(NULL) + } + } else if (is.character(dim_names)) { + names(metadata[['dim']]) <- dim_names + if ('dimensions' %in% names(attributes(x))) { + metadata[['dimensions']] <- dim_names + } + } + attributes(subset) <- metadata + subset +} diff --git a/R/Utils.R b/R/Utils.R index bdf8fda10a1d150113636b867a1de66d4627413c..8d06343b3e689a96d3d81081e676409d6b949de4 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -1,843 +1,843 @@ -indices <- function(x) { - attr(x, 'indices') <- TRUE - attr(x, 'values') <- FALSE - attr(x, 'chunk') <- c(chunk = 1, n_chunks = 1) - x -} - -values <- function(x) { - attr(x, 'indices') <- FALSE - attr(x, 'values') <- TRUE - attr(x, 'chunk') <- c(chunk = 1, n_chunks = 1) - x -} - -chunk <- function(chunk, n_chunks, selectors) { - if (any(chunk > n_chunks)) { - stop("Requested chunk index out of bounds.") - } - if (length(chunk) == 1 && length(n_chunks) == 1) { - if (!is.null(attr(selectors, 'chunk'))) { - attr(selectors, 'chunk') <- c((attr(selectors, 'chunk')['chunk'] - 1) * n_chunks + - chunk, - attr(selectors, 'chunk')['n_chunks'] * n_chunks) - } else { - attr(selectors, 'chunk') <- c(chunk = unname(chunk), n_chunks = unname(n_chunks)) - } - } else { - # Chunking arrays of multidimensional selectors. - # This should be done in Start.R but implies modifications. - if (length(chunk) != length(n_chunks)) { - stop("Wrong chunk specification.") - } -#NOTE: 1. It should be for above? not nultidimensional selector -# 2. it was !is.null before, but it should be is.null (?) -# if (!is.null(attr(selectors, 'values'))) { -# stop("Multidimensional chunking only available when selector ", -# "values provided.") -# } - if (is.null(dim(selectors))) { - stop("Multidimensional chunking only available when multidimensional ", - "selector values provided.") - } - if (length(dim(selectors)) != length(chunk)) { - stop("As many chunk indices and chunk lengths as dimensions in the ", - "multidimensional selector array must be specified.") - } - old_indices <- attr(selectors, 'indices') - old_values <- attr(selectors, 'values') - selectors <- Subset(selectors, 1:length(chunk), - lapply(1:length(chunk), - function(x) { - n_indices <- dim(selectors)[x] - chunk_sizes <- rep(floor(n_indices / n_chunks[x]), n_chunks[x]) - chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks[x] - if (chunks_to_extend > 0) { - chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1 - } - chunk_size <- chunk_sizes[chunk[x]] - offset <- 0 - if (chunk[x] > 1) { - offset <- sum(chunk_sizes[1:(chunk[x] - 1)]) - } - 1:chunk_sizes[chunk[x]] + offset - })) - attr(selectors, 'indices') <- old_indices - attr(selectors, 'values') <- old_values - } - selectors -} - -.ReplaceVariablesInString <- function(string, replace_values, allow_undefined_key_vars = FALSE) { - # This function replaces all the occurrences of a variable in a string by - # their corresponding string stored in the replace_values. - if (length(strsplit(string, "\\$")[[1]]) > 1) { - parts <- strsplit(string, "\\$")[[1]] - output <- "" - i <- 0 - for (part in parts) { - if (i %% 2 == 0) { - output <- paste(output, part, sep = "") - } else { - if (part %in% names(replace_values)) { - output <- paste(output, .ReplaceVariablesInString(replace_values[[part]], replace_values, allow_undefined_key_vars), sep = "") - } else if (allow_undefined_key_vars) { - output <- paste0(output, "$", part, "$") - } else { - stop(paste('Error: The variable $', part, '$ was not defined in the configuration file.', sep = '')) - } - } - i <- i + 1 - } - output - } else { - string - } -} - -.ReplaceGlobExpressions <- function(path_with_globs, actual_path, - replace_values, tags_to_keep, - dataset_name, permissive) { - # The goal of this function is to replace the shell globbing expressions in - # a path pattern (that may contain shell globbing expressions and Load() - # tags) by the corresponding part of the real existing path. - # What is done actually is to replace all the values of the tags in the - # actual path by the corresponding $TAG$ - # - # It takes mainly two inputs. The path with expressions and tags, e.g.: - # /data/experiments/*/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_*$START_DATE$*.nc - # and a complete known path to one of the matching files, e.g.: - # /data/experiments/ecearth/i00k/tos/tos_fc0-1_19901101_199011-199110.nc - # and it returns the path pattern but without shell globbing expressions: - # /data/experiments/ecearth/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_fc0-1_$START_DATE$_199011-199110.nc - # - # To do that, it needs also as inputs the list of replace values (the - # association of each tag to their value). - # - # All the tags not present in the parameter tags_to_keep will be repalced. - # - # Not all cases can be resolved with the implemented algorithm. In an - # unsolvable case a warning is given and one possible guess is returned. - # - # In some cases it is interesting to replace only the expressions in the - # path to the file, but not the ones in the file name itself. To keep the - # expressions in the file name, the parameter permissive can be set to - # TRUE. To replace all the expressions it can be set to FALSE. - - # Tests - #a <- "/esarchive/exp/ecearth/a13c/3hourly/$var$_*/$var$_*-LR_historical_r1i1p1f1_gr_$chunk$.nc" - #b <- "/esarchive/exp/ecearth/a13c/3hourly/psl_f6h/psl_E3hrPt_EC-Earth3-LR_historical_r1i1p1f1_gr_195001010000-195001312100.nc" - #c <- list(dat = 'dat1', var = 'psl', chunk = '195001010000-195001312100') - #d <- c('dat', 'var', 'chunk') - #e <- 'dat1' - #f <- FALSE #TRUE/0/1/2/3 - #r <- .ReplaceGlobExpressions(a, b, c, d, e, f) - - clean <- function(x) { - if (nchar(x) > 0) { - x <- gsub('\\\\', '', x) - x <- gsub('\\^', '', x) - x <- gsub('\\$', '', x) - x <- unname(sapply(strsplit(x, '[',fixed = TRUE)[[1]], function(y) gsub('.*]', '.', y))) - do.call(paste0, as.list(x)) - } else { - x - } - } - - strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse = "") - - if (permissive == 0) { - permissive <- FALSE - } else { - if (permissive == TRUE) { - permissive_levels <- 1 - } else { - permissive_levels <- round(permissive[1]) - permissive <- TRUE - } - } - - if (permissive) { - actual_path_chunks <- strsplit(actual_path, '/')[[1]] - if (permissive_levels >= length(actual_path_chunks)) { - stop("Error: Provided levels out of scope in parameter 'permissive'.") - } - permissive_levels <- 1:permissive_levels - permissive_levels <- length(actual_path_chunks) - (rev(permissive_levels) - 1) - actual_path <- paste(actual_path_chunks[-permissive_levels], collapse = '/') - file_name <- paste(actual_path_chunks[permissive_levels], collapse = '/') - if (length(actual_path_chunks) > 1) { - file_name <- paste0('/', file_name) - } - path_with_globs_chunks <- strsplit(path_with_globs, '/')[[1]] - path_with_globs <- paste(path_with_globs_chunks[-permissive_levels], - collapse = '/') - path_with_globs_no_tags <- .ReplaceVariablesInString(path_with_globs, replace_values) - file_name_with_globs <- paste(path_with_globs_chunks[permissive_levels], collapse = '/') - if (length(path_with_globs_chunks) > 1) { - file_name_with_globs <- paste0('/', file_name_with_globs) - } - right_known <- head(strsplit(file_name_with_globs, '*', fixed = TRUE)[[1]], 1) - right_known_no_tags <- .ReplaceVariablesInString(right_known, replace_values) - path_with_globs_no_tags_rx <- utils::glob2rx(paste0(path_with_globs_no_tags, right_known_no_tags)) - match <- regexpr(gsub('$', '', path_with_globs_no_tags_rx, fixed = TRUE), paste0(actual_path, file_name)) - if (match != 1) { - stop("Incorrect parameters to replace glob expressions. The path with expressions does not match the actual path.") - } - #if (attr(match, 'match.length') - nchar(right_known_no_tags) < nchar(actual_path)) { - # path_with_globs_no_tags <- paste0(path_with_globs_no_tags, right_known_no_tags, '*') - # file_name_with_globs <- sub(right_known, '/*', file_name_with_globs) - #} - } - path_with_globs_rx <- utils::glob2rx(path_with_globs) - values_to_replace <- c() - tags_to_replace_starts <- c() - tags_to_replace_ends <- c() - give_warning <- FALSE - for (tag in tags_to_keep) { - matches <- gregexpr(paste0('$', tag, '$'), path_with_globs_rx, fixed = TRUE)[[1]] - lengths <- attr(matches, 'match.length') - if (!(length(matches) == 1 && matches[1] == -1)) { - for (i in 1:length(matches)) { - left <- NULL - if (matches[i] > 1) { - left <- .ReplaceVariablesInString(substr(path_with_globs_rx, 1, matches[i] - 1), replace_values) - left_known <- strReverse(head(strsplit(strReverse(left), strReverse('.*'), fixed = TRUE)[[1]], 1)) - } - right <- NULL - if ((matches[i] + lengths[i] - 1) < nchar(path_with_globs_rx)) { - right <- .ReplaceVariablesInString(substr(path_with_globs_rx, matches[i] + lengths[i], nchar(path_with_globs_rx)), replace_values) - right_known <- head(strsplit(right, '.*', fixed = TRUE)[[1]], 1) - } - final_match <- NULL - match_limits <- NULL - if (!is.null(left)) { - left_match <- regexpr(paste0(left, replace_values[[tag]], right_known), actual_path) - match_len <- attr(left_match, 'match.length') - left_match_limits <- c(left_match + match_len - 1 - nchar(clean(right_known)) - nchar(replace_values[[tag]]) + 1, - left_match + match_len - 1 - nchar(clean(right_known))) - if (!(left_match < 1)) { - match_limits <- left_match_limits - } - } - right_match <- NULL - if (!is.null(right)) { - right_match <- regexpr(paste0(left_known, replace_values[[tag]], right), actual_path) - match_len <- attr(right_match, 'match.length') - right_match_limits <- c(right_match + nchar(clean(left_known)), - right_match + nchar(clean(left_known)) + nchar(replace_values[[tag]]) - 1) - if (is.null(match_limits) && !(right_match < 1)) { - match_limits <- right_match_limits - } - } - if (!is.null(right_match) && !is.null(left_match)) { - if (!identical(right_match_limits, left_match_limits)) { - give_warning <- TRUE - } - } - if (is.null(match_limits)) { - stop("Too complex path pattern specified for ", dataset_name, - ". Specify a simpler path pattern for this dataset.") - } - values_to_replace <- c(values_to_replace, tag) - tags_to_replace_starts <- c(tags_to_replace_starts, match_limits[1]) - tags_to_replace_ends <- c(tags_to_replace_ends, match_limits[2]) - } - } - } - - actual_path_with_tags <- actual_path - if (length(tags_to_replace_starts) > 0) { - reorder <- sort(tags_to_replace_starts, index.return = TRUE) - tags_to_replace_starts <- reorder$x - values_to_replace <- values_to_replace[reorder$ix] - tags_to_replace_ends <- tags_to_replace_ends[reorder$ix] - while (length(values_to_replace) > 0) { - actual_path_with_tags <- paste0(substr(actual_path_with_tags, 1, head(tags_to_replace_starts, 1) - 1), - '$', head(values_to_replace, 1), '$', - substr(actual_path_with_tags, head(tags_to_replace_ends, 1) + 1, nchar(actual_path_with_tags))) - extra_chars <- nchar(head(values_to_replace, 1)) + 2 - (head(tags_to_replace_ends, 1) - head(tags_to_replace_starts, 1) + 1) - values_to_replace <- values_to_replace[-1] - tags_to_replace_starts <- tags_to_replace_starts[-1] - tags_to_replace_ends <- tags_to_replace_ends[-1] - tags_to_replace_starts <- tags_to_replace_starts + extra_chars - tags_to_replace_ends <- tags_to_replace_ends + extra_chars - } - } - - if (give_warning) { - .warning(paste0("Too complex path pattern specified for ", dataset_name, - ". Double check carefully the '$Files' fetched for this dataset or specify a simpler path pattern.")) - } - - if (permissive) { - paste0(actual_path_with_tags, file_name_with_globs) - } else { - actual_path_with_tags - } -} - -.FindTagValue <- function(path_with_globs_and_tag, actual_path, tag) { - - addition_warning = FALSE - - if (!all(sapply(c(path_with_globs_and_tag, actual_path, tag), is.character))) { - stop("All 'path_with_globs_and_tag', 'actual_path' and 'tag' must be character strings.") - } - - if (grepl('$', tag, fixed = TRUE)) { - stop("The provided 'tag' must not contain '$' symbols.") - } - full_tag <- paste0('$', tag, '$') - - if (!grepl(full_tag, path_with_globs_and_tag, fixed = TRUE)) { - stop("The provided 'path_with_globs_and_tag' must contain the tag in 'tag' surrounded by '$' symbols.") - } - - parts <- strsplit(path_with_globs_and_tag, full_tag, fixed = TRUE)[[1]] - if (length(parts) == 1) { - parts <- c(parts, '') - } - parts[1] <- paste0('^', parts[1]) - parts[length(parts)] <- paste0(parts[length(parts)], '$') - - # Group the parts in 2 groups, in a way that both groups have a number - # of characters as similar as possible. - part_lengths <- sapply(parts, nchar) - group_len_diffs <- sapply(1:(length(parts) - 1), - function(x) { - sum(part_lengths[(x + 1):length(parts)]) - sum(part_lengths[1:x]) - } - ) - clp <- chosen_left_part <- which.min(group_len_diffs)[1] - - left_expr <- paste(parts[1:clp], collapse = full_tag) - - #because ? means sth, use . (any char) to substitute ? - left_expr <- gsub('?', '.', left_expr, fixed = TRUE) - test_left_expr <- left_expr - - # because * means zero or more char, use . to substitute *. - # And the * behind . means zero or more char. '?' for lazy evaluation. - left_expr <- gsub('*', '.*?', left_expr, fixed = TRUE) - left_expr <- gsub(full_tag, '.*?', left_expr, fixed = TRUE) - - # To test if the pattern matches only one... dont use lazy evaluation - test_left_expr <- gsub('*', '.*', test_left_expr, fixed = TRUE) - test_left_expr <- gsub(full_tag, '.*', test_left_expr, fixed = TRUE) - - # Find the match chars from left - left_match <- regexec(left_expr, actual_path)[[1]] - test_left_match <- regexec(test_left_expr, actual_path)[[1]] - - if (left_match < 0) { - stop("Unexpected error in .FindTagValue.") - } - - if (attr(test_left_match, "match.length") != attr(left_match, "match.length")) { - addition_warning = TRUE - warning("Detect more than one possibility derived from the global expression of path.") - } - - #Cut down the left match part - actual_path_sub <- substr(actual_path, - attr(left_match, 'match.length') + 1, - nchar(actual_path)) - - #----------Search match chars from right - right_expr <- paste(parts[(clp + 1):(length(parts))], collapse = full_tag) - right_expr <- gsub('?', '.', right_expr, fixed = TRUE) - - test_right_expr <- right_expr - # For lazy evaulation to work, pattern and string have to be reversed. - right_expr <- gsub('*', '.*?', right_expr, fixed = TRUE) - right_expr <- gsub(full_tag, '.*?', right_expr, fixed = TRUE) - right_expr <- gsub('$', '^', right_expr, fixed = TRUE) - - # To test if the pattern matches only one... dont use lazy evaluation - test_right_expr <- gsub('*', '.*', test_right_expr, fixed = TRUE) - test_right_expr <- gsub(full_tag, '.*', test_right_expr, fixed = TRUE) - test_right_expr <- gsub('$', '^', test_right_expr, fixed = TRUE) - - rev_str <- function(s) { - paste(rev(strsplit(s, NULL)[[1]]), collapse = '') - } - - right_expr <- rev_str(right_expr) - test_right_expr <- rev_str(test_right_expr) - - right_expr <- gsub('?*.', '.*?', right_expr, fixed = TRUE) - right_match <- regexec(right_expr, rev_str(actual_path))[[1]] - - test_right_expr <- gsub('*.', '.*', test_right_expr, fixed = TRUE) - test_right_match <- regexec(test_right_expr, rev_str(actual_path_sub))[[1]] - - if (right_match < 0) { - stop("Unexpected error in .FindTagValue.") - } - - if (attr(test_right_match, "match.length") != attr(right_match, "match.length")) { - addition_warning = TRUE - warning(paste0("Detect more than one possibility derived from the global ", - "expression of path.")) - } - - #-------------get tag value - right_match[] <- nchar(actual_path) - - (right_match[] + attr(right_match, 'match.length') - 1) + 1 - - if (addition_warning) { - warning(paste0("The extracted parameter ", full_tag, " is ", - substr(actual_path, left_match + attr(left_match, 'match.length'), - right_match - 1), - ". Check if all the desired files were read in. ", - "If not, specify parameter '", tag, - "' by values instead of indices, or set parameter ", - "'path_glob_permissive' as TRUE")) - } - - if ((left_match + attr(left_match, 'match.length')) > - (right_match - 1)) { - NULL - } else { - substr(actual_path, left_match + attr(left_match, 'match.length'), - right_match - 1) - } -} - -.message <- function(...) { - # Function to use the 'message' R function with our custom settings - # Default: new line at end of message, indent to 0, exdent to 3, - # collapse to \n* - args <- list(...) - - ## In case we need to specify message arguments - if (!is.null(args[["appendLF"]])) { - appendLF <- args[["appendLF"]] - } else { - ## Default value in message function - appendLF <- TRUE - } - if (!is.null(args[["domain"]])) { - domain <- args[["domain"]] - } else { - ## Default value in message function - domain <- NULL - } - args[["appendLF"]] <- NULL - args[["domain"]] <- NULL - - ## To modify strwrap indent and exdent arguments - if (!is.null(args[["indent"]])) { - indent <- args[["indent"]] - } else { - indent <- 0 - } - if (!is.null(args[["exdent"]])) { - exdent <- args[["exdent"]] - } else { - exdent <- 3 - } - args[["indent"]] <- NULL - args[["exdent"]] <- NULL - - ## To modify paste collapse argument - if (!is.null(args[["collapse"]])) { - collapse <- args[["collapse"]] - } else { - collapse <- "\n*" - } - args[["collapse"]] <- NULL - - ## Message tag - if (!is.null(args[["tag"]])) { - tag <- args[["tag"]] - } else { - tag <- "* " - } - args[["tag"]] <- NULL - - message(paste0(tag, paste(strwrap( - args, indent = indent, exdent = exdent - ), collapse = collapse)), appendLF = appendLF, domain = domain) -} - -.warning <- function(...) { - # Function to use the 'warning' R function with our custom settings - # Default: no call information, indent to 0, exdent to 3, - # collapse to \n - args <- list(...) - - ## In case we need to specify warning arguments - if (!is.null(args[["call."]])) { - call <- args[["call."]] - } else { - ## Default: don't show info about the call where the warning came up - call <- FALSE - } - if (!is.null(args[["immediate."]])) { - immediate <- args[["immediate."]] - } else { - ## Default value in warning function - immediate <- FALSE - } - if (!is.null(args[["noBreaks."]])) { - noBreaks <- args[["noBreaks."]] - } else { - ## Default value warning function - noBreaks <- FALSE - } - if (!is.null(args[["domain"]])) { - domain <- args[["domain"]] - } else { - ## Default value warning function - domain <- NULL - } - args[["call."]] <- NULL - args[["immediate."]] <- NULL - args[["noBreaks."]] <- NULL - args[["domain"]] <- NULL - - ## To modify strwrap indent and exdent arguments - if (!is.null(args[["indent"]])) { - indent <- args[["indent"]] - } else { - indent <- 0 - } - if (!is.null(args[["exdent"]])) { - exdent <- args[["exdent"]] - } else { - exdent <- 3 - } - args[["indent"]] <- NULL - args[["exdent"]] <- NULL - - ## To modify paste collapse argument - if (!is.null(args[["collapse"]])) { - collapse <- args[["collapse"]] - } else { - collapse <- "\n!" - } - args[["collapse"]] <- NULL - - ## Warning tag - if (!is.null(args[["tag"]])) { - tag <- args[["tag"]] - } else { - tag <- "! Warning: " - } - args[["tag"]] <- NULL - - warning(paste0(tag, paste(strwrap( - args, indent = indent, exdent = exdent - ), collapse = collapse)), call. = call, immediate. = immediate, - noBreaks. = noBreaks, domain = domain) -} - -# Function to permute arrays of non-atomic elements (e.g. POSIXct) -# Function to permute arrays of non-atomic elements (e.g. POSIXct) -.aperm2 <- function(x, new_order) { - old_dims <- dim(x) - attr_bk <- attributes(x) - if ('dim' %in% names(attr_bk)) { - attr_bk[['dim']] <- NULL - } - if (is.numeric(x)) { - x <- aperm(x, new_order) - } else { - y <- array(1:length(x), dim = dim(x)) - y <- aperm(y, new_order) - x <- x[as.vector(y)] - } - dim(x) <- old_dims[new_order] - attributes(x) <- c(attributes(x), attr_bk) - x -} - -# Function to bind arrays of non-atomic elements (e.g. POSIXct) -# 'x' and 'y' must have dimension names -# parameter 'along' must be a dimension name -.abind2 <- function(x, y, along) { - x_along <- which(names(dim(x)) == along) - if (x_along != length(dim(x))) { - tmp_order_x <- c((1:length(dim(x)))[-x_along], x_along) - x <- .aperm2(x, tmp_order_x) - } - y_along <- which(names(dim(y)) == along) - if (y_along != length(dim(y))) { - tmp_order_y <- c((1:length(dim(y)))[-y_along], y_along) - y <- .aperm2(y, tmp_order_y) - } - r <- c(x, y) - new_dims <- dim(x) - new_dims[length(new_dims)] <- dim(x)[length(dim(x))] + dim(y)[length(dim(y))] - dim(r) <- new_dims - if (x_along != length(dim(x))) { - final_order <- NULL - if (x_along > 1) { - final_order <- c(final_order, (1:length(dim(r)))[1:(x_along - 1)]) - } - final_order <- c(final_order, length(dim(r))) - final_order <- c(final_order, (1:length(dim(r)))[x_along:(length(dim(r)) - 1)]) - r <- .aperm2(r, final_order) - } - r -} - -# This function is a helper for the function .MergeArrays. -# It expects as inputs two named numeric vectors, and it extends them -# with dimensions of length 1 until an ordered common dimension -# format is reached. -# The first output is dims1 extended with 1s. -# The second output is dims2 extended with 1s. -# The third output is a merged dimension vector. If dimensions with -# the same name are found in the two inputs, and they have a different -# length, the maximum is taken. -.MergeArrayDims <- function(dims1, dims2) { - new_dims1 <- c() - new_dims2 <- c() - while (length(dims1) > 0) { - if (names(dims1)[1] %in% names(dims2)) { - pos <- which(names(dims2) == names(dims1)[1]) - dims_to_add <- rep(1, pos - 1) - if (length(dims_to_add) > 0) { - names(dims_to_add) <- names(dims2[1:(pos - 1)]) - } - new_dims1 <- c(new_dims1, dims_to_add, dims1[1]) - new_dims2 <- c(new_dims2, dims2[1:pos]) - dims1 <- dims1[-1] - dims2 <- dims2[-c(1:pos)] - } else { - new_dims1 <- c(new_dims1, dims1[1]) - new_dims2 <- c(new_dims2, 1) - names(new_dims2)[length(new_dims2)] <- names(dims1)[1] - dims1 <- dims1[-1] - } - } - if (length(dims2) > 0) { - dims_to_add <- rep(1, length(dims2)) - names(dims_to_add) <- names(dims2) - new_dims1 <- c(new_dims1, dims_to_add) - new_dims2 <- c(new_dims2, dims2) - } - list(new_dims1, new_dims2, pmax(new_dims1, new_dims2)) -} - -# This function takes two named arrays and merges them, filling with -# NA where needed. -# dim(array1) -# 'b' 'c' 'e' 'f' -# 1 3 7 9 -# dim(array2) -# 'a' 'b' 'd' 'f' 'g' -# 2 3 5 9 11 -# dim(.MergeArrays(array1, array2, 'b')) -# 'a' 'b' 'c' 'e' 'd' 'f' 'g' -# 2 4 3 7 5 9 11 -.MergeArrays <- function(array1, array2, along) { - if (!(is.null(array1) || is.null(array2))) { - if (!(identical(names(dim(array1)), names(dim(array2))) && - identical(dim(array1)[-which(names(dim(array1)) == along)], - dim(array2)[-which(names(dim(array2)) == along)]))) { - new_dims <- .MergeArrayDims(dim(array1), dim(array2)) - dim(array1) <- new_dims[[1]] - dim(array2) <- new_dims[[2]] - for (j in 1:length(dim(array1))) { - if (names(dim(array1))[j] != along) { - if (dim(array1)[j] != dim(array2)[j]) { - if (which.max(c(dim(array1)[j], dim(array2)[j])) == 1) { - na_array_dims <- dim(array2) - na_array_dims[j] <- dim(array1)[j] - dim(array2)[j] - na_array <- array(dim = na_array_dims) - array2 <- abind(array2, na_array, along = j) - names(dim(array2)) <- names(na_array_dims) - } else { - na_array_dims <- dim(array1) - na_array_dims[j] <- dim(array2)[j] - dim(array1)[j] - na_array <- array(dim = na_array_dims) - array1 <- abind(array1, na_array, along = j) - names(dim(array1)) <- names(na_array_dims) - } - } - } - } - } - if (!(along %in% names(dim(array2)))) { - stop("The dimension specified in 'along' is not present in the ", - "provided arrays.") - } - array1 <- abind(array1, array2, along = which(names(dim(array1)) == along)) - names(dim(array1)) <- names(dim(array2)) - } else if (is.null(array1)) { - array1 <- array2 - } - array1 -} - -# Takes as input a list of arrays. The list must have named dimensions. -.MergeArrayOfArrays <- function(array_of_arrays) { - MergeArrays <- .MergeArrays - array_dims <- (dim(array_of_arrays)) - dim_names <- names(array_dims) - - # Merge the chunks. - for (dim_index in 1:length(dim_names)) { - dim_sub_array_of_chunks <- dim_sub_array_of_chunk_indices <- NULL - if (dim_index < length(dim_names)) { - dim_sub_array_of_chunks <- array_dims[(dim_index + 1):length(dim_names)] - names(dim_sub_array_of_chunks) <- dim_names[(dim_index + 1):length(dim_names)] - dim_sub_array_of_chunk_indices <- dim_sub_array_of_chunks - sub_array_of_chunk_indices <- array(1:prod(dim_sub_array_of_chunk_indices), - dim_sub_array_of_chunk_indices) - } else { - sub_array_of_chunk_indices <- NULL - } - sub_array_of_chunks <- vector('list', prod(dim_sub_array_of_chunks)) - dim(sub_array_of_chunks) <- dim_sub_array_of_chunks - for (i in 1:prod(dim_sub_array_of_chunks)) { - if (!is.null(sub_array_of_chunk_indices)) { - chunk_sub_indices <- which(sub_array_of_chunk_indices == i, arr.ind = TRUE)[1, ] - } else { - chunk_sub_indices <- NULL - } - for (j in 1:(array_dims[dim_index])) { - new_chunk <- do.call('[[', c(list(x = array_of_arrays), - as.list(c(j, chunk_sub_indices)))) - if (is.null(new_chunk)) { - stop("Chunks missing.") - } - if (is.null(sub_array_of_chunks[[i]])) { - sub_array_of_chunks[[i]] <- new_chunk - } else { - sub_array_of_chunks[[i]] <- MergeArrays(sub_array_of_chunks[[i]], - new_chunk, - dim_names[dim_index]) - } - } - } - array_of_arrays <- sub_array_of_chunks - rm(sub_array_of_chunks) - gc() - } - - array_of_arrays[[1]] -} - -.MergeChunks <- function(shared_dir, suite_id, remove) { - MergeArrays <- startR:::.MergeArrays - - args <- NULL - shared_dir <- paste0(shared_dir, '/STARTR_CHUNKING_', suite_id) - - all_chunk_files_original <- list.files(paste0(shared_dir, '/'), '.*\\.Rds$') - all_chunk_files <- gsub('\\.Rds$', '', all_chunk_files_original) - chunk_filename_parts_all_components <- strsplit(all_chunk_files, '__') - all_components <- sapply(chunk_filename_parts_all_components, '[[', 1) - components <- unique(all_components) - result <- vector('list', length(components)) - names(result) <- components - for (component in components) { - chunk_files_original <- all_chunk_files_original[which(all_components == component)] - chunk_filename_parts <- chunk_filename_parts_all_components[which(all_components == component)] - chunk_filename_parts <- lapply(chunk_filename_parts, '[', -1) - if (length(unique(sapply(chunk_filename_parts, length))) != 1) { - stop("Detected chunks with more dimensions than others.") - } - dim_names <- sapply(chunk_filename_parts[[1]], - # TODO: strsplit by the last '_' match, not the first. - function(x) strsplit(x, '_')[[1]][1]) - # TODO check all files have exactly the same dimnames - found_chunk_indices <- sapply(chunk_filename_parts, - function(x) as.numeric(sapply(strsplit(x, '_'), '[[', 2))) - found_chunk_indices <- array(found_chunk_indices, - dim = c(length(dim_names), - length(found_chunk_indices) / length(dim_names)) - ) - found_chunks_str <- apply(found_chunk_indices, 2, paste, collapse = '_') - - if (length(args) > 0) { - if ((length(args) %% 2) != 0) { - stop("Wrong number of parameters.") - } - expected_dim_names <- args[(1:(length(args) / 2) - 1) * 2 + 1] - if (any(!is.character(expected_dim_names))) { - stop("Expected dimension names in parameters at odd positions.") - } - dim_indices <- args[(1:(length(args) / 2) - 1) * 2 + 2] - if (!any(dim_indices == 'all')) { - stop("Expected one dimension index to be 'all'.") - } - dim_to_merge <- which(dim_indices == 'all') - if (length(dim_indices) > 1) { - if (!all(is.numeric(dim_indices[-dim_to_merge]))) { - stop("Expected all dimension index but one to be numeric.") - } - } - # Check expected dim names match dim names - ## TODO - # Merge indices that vary along dim_to_merge whereas other fixed by dim_indices - # REMOVE FILES - ## TODO - stop("Feature not implemented.") - } else { - chunks_indices <- 1:length(dim_names) - chunks_indices <- lapply(chunks_indices, function(x) sort(unique(found_chunk_indices[x, ]))) - names(chunks_indices) <- dim_names - - # Load all found chunks into the array 'array_of_chuks'. - array_dims <- sapply(chunks_indices, length) - names(array_dims) <- dim_names - array_of_chunks <- vector('list', prod(array_dims)) - dim(array_of_chunks) <- array_dims - array_of_chunks_indices <- array(1:prod(array_dims), array_dims) - for (i in 1:prod(array_dims)) { - chunk_indices <- which(array_of_chunks_indices == i, arr.ind = TRUE)[1, ] - j <- 1 - chunk_indices_on_file <- sapply(chunk_indices, - function(x) { - r <- chunks_indices[[j]][x] - j <<- j + 1 - r - }) - found_chunk <- which(found_chunks_str == paste(chunk_indices_on_file, - collapse = '_'))[1] - if (length(found_chunk) > 0) { - num_tries <- 5 - found <- FALSE - try_num <- 1 - while ((try_num <= num_tries) && !found) { - array_of_chunks[[i]] <- try({ - readRDS(paste0(shared_dir, '/', - chunk_files_original[found_chunk])) - }) - if (('try-error' %in% class(array_of_chunks[[i]]))) { - message("Waiting for an incomplete file transfer...") - Sys.sleep(5) - } else { - found <- TRUE - } - try_num <- try_num + 1 - } - if (!found) { - stop("Could not open one of the chunks. Might be a large chunk ", - "in transfer. Merge aborted, files have been preserved.") - } - } - } - - result[[component]] <- startR:::.MergeArrayOfArrays(array_of_chunks) - rm(array_of_chunks) - gc() - } - } - - if (remove) { - sapply(all_chunk_files_original, - function(x) { - file.remove(paste0(shared_dir, '/', x)) - }) - } - - result -} +indices <- function(x) { + attr(x, 'indices') <- TRUE + attr(x, 'values') <- FALSE + attr(x, 'chunk') <- c(chunk = 1, n_chunks = 1) + x +} + +values <- function(x) { + attr(x, 'indices') <- FALSE + attr(x, 'values') <- TRUE + attr(x, 'chunk') <- c(chunk = 1, n_chunks = 1) + x +} + +chunk <- function(chunk, n_chunks, selectors) { + if (any(chunk > n_chunks)) { + stop("Requested chunk index out of bounds.") + } + if (length(chunk) == 1 && length(n_chunks) == 1) { + if (!is.null(attr(selectors, 'chunk'))) { + attr(selectors, 'chunk') <- c((attr(selectors, 'chunk')['chunk'] - 1) * n_chunks + + chunk, + attr(selectors, 'chunk')['n_chunks'] * n_chunks) + } else { + attr(selectors, 'chunk') <- c(chunk = unname(chunk), n_chunks = unname(n_chunks)) + } + } else { + # Chunking arrays of multidimensional selectors. + # This should be done in Start.R but implies modifications. + if (length(chunk) != length(n_chunks)) { + stop("Wrong chunk specification.") + } + #NOTE: 1. It should be for above? not nultidimensional selector + # 2. it was !is.null before, but it should be is.null (?) + # if (!is.null(attr(selectors, 'values'))) { + # stop("Multidimensional chunking only available when selector ", + # "values provided.") + # } + if (is.null(dim(selectors))) { + stop("Multidimensional chunking only available when multidimensional ", + "selector values provided.") + } + if (length(dim(selectors)) != length(chunk)) { + stop("As many chunk indices and chunk lengths as dimensions in the ", + "multidimensional selector array must be specified.") + } + old_indices <- attr(selectors, 'indices') + old_values <- attr(selectors, 'values') + selectors <- Subset(selectors, 1:length(chunk), + lapply(1:length(chunk), + function(x) { + n_indices <- dim(selectors)[x] + chunk_sizes <- rep(floor(n_indices / n_chunks[x]), n_chunks[x]) + chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks[x] + if (chunks_to_extend > 0) { + chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1 + } + chunk_size <- chunk_sizes[chunk[x]] + offset <- 0 + if (chunk[x] > 1) { + offset <- sum(chunk_sizes[1:(chunk[x] - 1)]) + } + 1:chunk_sizes[chunk[x]] + offset + })) + attr(selectors, 'indices') <- old_indices + attr(selectors, 'values') <- old_values + } + selectors +} + +.ReplaceVariablesInString <- function(string, replace_values, allow_undefined_key_vars = FALSE) { + # This function replaces all the occurrences of a variable in a string by + # their corresponding string stored in the replace_values. + if (length(strsplit(string, "\\$")[[1]]) > 1) { + parts <- strsplit(string, "\\$")[[1]] + output <- "" + i <- 0 + for (part in parts) { + if (i %% 2 == 0) { + output <- paste(output, part, sep = "") + } else { + if (part %in% names(replace_values)) { + output <- paste(output, .ReplaceVariablesInString(replace_values[[part]], replace_values, allow_undefined_key_vars), sep = "") + } else if (allow_undefined_key_vars) { + output <- paste0(output, "$", part, "$") + } else { + stop(paste('Error: The variable $', part, '$ was not defined in the configuration file.', sep = '')) + } + } + i <- i + 1 + } + output + } else { + string + } +} + +.ReplaceGlobExpressions <- function(path_with_globs, actual_path, + replace_values, tags_to_keep, + dataset_name, permissive) { + # The goal of this function is to replace the shell globbing expressions in + # a path pattern (that may contain shell globbing expressions and Load() + # tags) by the corresponding part of the real existing path. + # What is done actually is to replace all the values of the tags in the + # actual path by the corresponding $TAG$ + # + # It takes mainly two inputs. The path with expressions and tags, e.g.: + # /data/experiments/*/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_*$START_DATE$*.nc + # and a complete known path to one of the matching files, e.g.: + # /data/experiments/ecearth/i00k/tos/tos_fc0-1_19901101_199011-199110.nc + # and it returns the path pattern but without shell globbing expressions: + # /data/experiments/ecearth/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_fc0-1_$START_DATE$_199011-199110.nc + # + # To do that, it needs also as inputs the list of replace values (the + # association of each tag to their value). + # + # All the tags not present in the parameter tags_to_keep will be repalced. + # + # Not all cases can be resolved with the implemented algorithm. In an + # unsolvable case a warning is given and one possible guess is returned. + # + # In some cases it is interesting to replace only the expressions in the + # path to the file, but not the ones in the file name itself. To keep the + # expressions in the file name, the parameter permissive can be set to + # TRUE. To replace all the expressions it can be set to FALSE. + + # Tests + #a <- "/esarchive/exp/ecearth/a13c/3hourly/$var$_*/$var$_*-LR_historical_r1i1p1f1_gr_$chunk$.nc" + #b <- "/esarchive/exp/ecearth/a13c/3hourly/psl_f6h/psl_E3hrPt_EC-Earth3-LR_historical_r1i1p1f1_gr_195001010000-195001312100.nc" + #c <- list(dat = 'dat1', var = 'psl', chunk = '195001010000-195001312100') + #d <- c('dat', 'var', 'chunk') + #e <- 'dat1' + #f <- FALSE #TRUE/0/1/2/3 + #r <- .ReplaceGlobExpressions(a, b, c, d, e, f) + + clean <- function(x) { + if (nchar(x) > 0) { + x <- gsub('\\\\', '', x) + x <- gsub('\\^', '', x) + x <- gsub('\\$', '', x) + x <- unname(sapply(strsplit(x, '[',fixed = TRUE)[[1]], function(y) gsub('.*]', '.', y))) + do.call(paste0, as.list(x)) + } else { + x + } + } + + strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse = "") + + if (permissive == 0) { + permissive <- FALSE + } else { + if (permissive == TRUE) { + permissive_levels <- 1 + } else { + permissive_levels <- round(permissive[1]) + permissive <- TRUE + } + } + + if (permissive) { + actual_path_chunks <- strsplit(actual_path, '/')[[1]] + if (permissive_levels >= length(actual_path_chunks)) { + stop("Error: Provided levels out of scope in parameter 'permissive'.") + } + permissive_levels <- 1:permissive_levels + permissive_levels <- length(actual_path_chunks) - (rev(permissive_levels) - 1) + actual_path <- paste(actual_path_chunks[-permissive_levels], collapse = '/') + file_name <- paste(actual_path_chunks[permissive_levels], collapse = '/') + if (length(actual_path_chunks) > 1) { + file_name <- paste0('/', file_name) + } + path_with_globs_chunks <- strsplit(path_with_globs, '/')[[1]] + path_with_globs <- paste(path_with_globs_chunks[-permissive_levels], + collapse = '/') + path_with_globs_no_tags <- .ReplaceVariablesInString(path_with_globs, replace_values) + file_name_with_globs <- paste(path_with_globs_chunks[permissive_levels], collapse = '/') + if (length(path_with_globs_chunks) > 1) { + file_name_with_globs <- paste0('/', file_name_with_globs) + } + right_known <- head(strsplit(file_name_with_globs, '*', fixed = TRUE)[[1]], 1) + right_known_no_tags <- .ReplaceVariablesInString(right_known, replace_values) + path_with_globs_no_tags_rx <- utils::glob2rx(paste0(path_with_globs_no_tags, right_known_no_tags)) + match <- regexpr(gsub('$', '', path_with_globs_no_tags_rx, fixed = TRUE), paste0(actual_path, file_name)) + if (match != 1) { + stop("Incorrect parameters to replace glob expressions. The path with expressions does not match the actual path.") + } + #if (attr(match, 'match.length') - nchar(right_known_no_tags) < nchar(actual_path)) { + # path_with_globs_no_tags <- paste0(path_with_globs_no_tags, right_known_no_tags, '*') + # file_name_with_globs <- sub(right_known, '/*', file_name_with_globs) + #} + } + path_with_globs_rx <- utils::glob2rx(path_with_globs) + values_to_replace <- c() + tags_to_replace_starts <- c() + tags_to_replace_ends <- c() + give_warning <- FALSE + for (tag in tags_to_keep) { + matches <- gregexpr(paste0('$', tag, '$'), path_with_globs_rx, fixed = TRUE)[[1]] + lengths <- attr(matches, 'match.length') + if (!(length(matches) == 1 && matches[1] == -1)) { + for (i in 1:length(matches)) { + left <- NULL + if (matches[i] > 1) { + left <- .ReplaceVariablesInString(substr(path_with_globs_rx, 1, matches[i] - 1), replace_values) + left_known <- strReverse(head(strsplit(strReverse(left), strReverse('.*'), fixed = TRUE)[[1]], 1)) + } + right <- NULL + if ((matches[i] + lengths[i] - 1) < nchar(path_with_globs_rx)) { + right <- .ReplaceVariablesInString(substr(path_with_globs_rx, matches[i] + lengths[i], nchar(path_with_globs_rx)), replace_values) + right_known <- head(strsplit(right, '.*', fixed = TRUE)[[1]], 1) + } + final_match <- NULL + match_limits <- NULL + if (!is.null(left)) { + left_match <- regexpr(paste0(left, replace_values[[tag]], right_known), actual_path) + match_len <- attr(left_match, 'match.length') + left_match_limits <- c(left_match + match_len - 1 - nchar(clean(right_known)) - nchar(replace_values[[tag]]) + 1, + left_match + match_len - 1 - nchar(clean(right_known))) + if (!(left_match < 1)) { + match_limits <- left_match_limits + } + } + right_match <- NULL + if (!is.null(right)) { + right_match <- regexpr(paste0(left_known, replace_values[[tag]], right), actual_path) + match_len <- attr(right_match, 'match.length') + right_match_limits <- c(right_match + nchar(clean(left_known)), + right_match + nchar(clean(left_known)) + nchar(replace_values[[tag]]) - 1) + if (is.null(match_limits) && !(right_match < 1)) { + match_limits <- right_match_limits + } + } + if (!is.null(right_match) && !is.null(left_match)) { + if (!identical(right_match_limits, left_match_limits)) { + give_warning <- TRUE + } + } + if (is.null(match_limits)) { + stop("Too complex path pattern specified for ", dataset_name, + ". Specify a simpler path pattern for this dataset.") + } + values_to_replace <- c(values_to_replace, tag) + tags_to_replace_starts <- c(tags_to_replace_starts, match_limits[1]) + tags_to_replace_ends <- c(tags_to_replace_ends, match_limits[2]) + } + } + } + + actual_path_with_tags <- actual_path + if (length(tags_to_replace_starts) > 0) { + reorder <- sort(tags_to_replace_starts, index.return = TRUE) + tags_to_replace_starts <- reorder$x + values_to_replace <- values_to_replace[reorder$ix] + tags_to_replace_ends <- tags_to_replace_ends[reorder$ix] + while (length(values_to_replace) > 0) { + actual_path_with_tags <- paste0(substr(actual_path_with_tags, 1, head(tags_to_replace_starts, 1) - 1), + '$', head(values_to_replace, 1), '$', + substr(actual_path_with_tags, head(tags_to_replace_ends, 1) + 1, nchar(actual_path_with_tags))) + extra_chars <- nchar(head(values_to_replace, 1)) + 2 - (head(tags_to_replace_ends, 1) - head(tags_to_replace_starts, 1) + 1) + values_to_replace <- values_to_replace[-1] + tags_to_replace_starts <- tags_to_replace_starts[-1] + tags_to_replace_ends <- tags_to_replace_ends[-1] + tags_to_replace_starts <- tags_to_replace_starts + extra_chars + tags_to_replace_ends <- tags_to_replace_ends + extra_chars + } + } + + if (give_warning) { + .warning(paste0("Too complex path pattern specified for ", dataset_name, + ". Double check carefully the '$Files' fetched for this dataset or specify a simpler path pattern.")) + } + + if (permissive) { + paste0(actual_path_with_tags, file_name_with_globs) + } else { + actual_path_with_tags + } +} + +.FindTagValue <- function(path_with_globs_and_tag, actual_path, tag) { + + addition_warning = FALSE + + if (!all(sapply(c(path_with_globs_and_tag, actual_path, tag), is.character))) { + stop("All 'path_with_globs_and_tag', 'actual_path' and 'tag' must be character strings.") + } + + if (grepl('$', tag, fixed = TRUE)) { + stop("The provided 'tag' must not contain '$' symbols.") + } + full_tag <- paste0('$', tag, '$') + + if (!grepl(full_tag, path_with_globs_and_tag, fixed = TRUE)) { + stop("The provided 'path_with_globs_and_tag' must contain the tag in 'tag' surrounded by '$' symbols.") + } + + parts <- strsplit(path_with_globs_and_tag, full_tag, fixed = TRUE)[[1]] + if (length(parts) == 1) { + parts <- c(parts, '') + } + parts[1] <- paste0('^', parts[1]) + parts[length(parts)] <- paste0(parts[length(parts)], '$') + + # Group the parts in 2 groups, in a way that both groups have a number + # of characters as similar as possible. + part_lengths <- sapply(parts, nchar) + group_len_diffs <- sapply(1:(length(parts) - 1), + function(x) { + sum(part_lengths[(x + 1):length(parts)]) - sum(part_lengths[1:x]) + } + ) + clp <- chosen_left_part <- which.min(group_len_diffs)[1] + + left_expr <- paste(parts[1:clp], collapse = full_tag) + + #because ? means sth, use . (any char) to substitute ? + left_expr <- gsub('?', '.', left_expr, fixed = TRUE) + test_left_expr <- left_expr + + # because * means zero or more char, use . to substitute *. + # And the * behind . means zero or more char. '?' for lazy evaluation. + left_expr <- gsub('*', '.*?', left_expr, fixed = TRUE) + left_expr <- gsub(full_tag, '.*?', left_expr, fixed = TRUE) + + # To test if the pattern matches only one... dont use lazy evaluation + test_left_expr <- gsub('*', '.*', test_left_expr, fixed = TRUE) + test_left_expr <- gsub(full_tag, '.*', test_left_expr, fixed = TRUE) + + # Find the match chars from left + left_match <- regexec(left_expr, actual_path)[[1]] + test_left_match <- regexec(test_left_expr, actual_path)[[1]] + + if (left_match < 0) { + stop("Unexpected error in .FindTagValue.") + } + + if (attr(test_left_match, "match.length") != attr(left_match, "match.length")) { + addition_warning = TRUE + warning("Detect more than one possibility derived from the global expression of path.") + } + + #Cut down the left match part + actual_path_sub <- substr(actual_path, + attr(left_match, 'match.length') + 1, + nchar(actual_path)) + + #----------Search match chars from right + right_expr <- paste(parts[(clp + 1):(length(parts))], collapse = full_tag) + right_expr <- gsub('?', '.', right_expr, fixed = TRUE) + + test_right_expr <- right_expr + # For lazy evaulation to work, pattern and string have to be reversed. + right_expr <- gsub('*', '.*?', right_expr, fixed = TRUE) + right_expr <- gsub(full_tag, '.*?', right_expr, fixed = TRUE) + right_expr <- gsub('$', '^', right_expr, fixed = TRUE) + + # To test if the pattern matches only one... dont use lazy evaluation + test_right_expr <- gsub('*', '.*', test_right_expr, fixed = TRUE) + test_right_expr <- gsub(full_tag, '.*', test_right_expr, fixed = TRUE) + test_right_expr <- gsub('$', '^', test_right_expr, fixed = TRUE) + + rev_str <- function(s) { + paste(rev(strsplit(s, NULL)[[1]]), collapse = '') + } + + right_expr <- rev_str(right_expr) + test_right_expr <- rev_str(test_right_expr) + + right_expr <- gsub('?*.', '.*?', right_expr, fixed = TRUE) + right_match <- regexec(right_expr, rev_str(actual_path))[[1]] + + test_right_expr <- gsub('*.', '.*', test_right_expr, fixed = TRUE) + test_right_match <- regexec(test_right_expr, rev_str(actual_path_sub))[[1]] + + if (right_match < 0) { + stop("Unexpected error in .FindTagValue.") + } + + if (attr(test_right_match, "match.length") != attr(right_match, "match.length")) { + addition_warning = TRUE + warning(paste0("Detect more than one possibility derived from the global ", + "expression of path.")) + } + + #-------------get tag value + right_match[] <- nchar(actual_path) - + (right_match[] + attr(right_match, 'match.length') - 1) + 1 + + if (addition_warning) { + warning(paste0("The extracted parameter ", full_tag, " is ", + substr(actual_path, left_match + attr(left_match, 'match.length'), + right_match - 1), + ". Check if all the desired files were read in. ", + "If not, specify parameter '", tag, + "' by values instead of indices, or set parameter ", + "'path_glob_permissive' as TRUE")) + } + + if ((left_match + attr(left_match, 'match.length')) > + (right_match - 1)) { + NULL + } else { + substr(actual_path, left_match + attr(left_match, 'match.length'), + right_match - 1) + } +} + +.message <- function(...) { + # Function to use the 'message' R function with our custom settings + # Default: new line at end of message, indent to 0, exdent to 3, + # collapse to \n* + args <- list(...) + + ## In case we need to specify message arguments + if (!is.null(args[["appendLF"]])) { + appendLF <- args[["appendLF"]] + } else { + ## Default value in message function + appendLF <- TRUE + } + if (!is.null(args[["domain"]])) { + domain <- args[["domain"]] + } else { + ## Default value in message function + domain <- NULL + } + args[["appendLF"]] <- NULL + args[["domain"]] <- NULL + + ## To modify strwrap indent and exdent arguments + if (!is.null(args[["indent"]])) { + indent <- args[["indent"]] + } else { + indent <- 0 + } + if (!is.null(args[["exdent"]])) { + exdent <- args[["exdent"]] + } else { + exdent <- 3 + } + args[["indent"]] <- NULL + args[["exdent"]] <- NULL + + ## To modify paste collapse argument + if (!is.null(args[["collapse"]])) { + collapse <- args[["collapse"]] + } else { + collapse <- "\n*" + } + args[["collapse"]] <- NULL + + ## Message tag + if (!is.null(args[["tag"]])) { + tag <- args[["tag"]] + } else { + tag <- "* " + } + args[["tag"]] <- NULL + + message(paste0(tag, paste(strwrap( + args, indent = indent, exdent = exdent + ), collapse = collapse)), appendLF = appendLF, domain = domain) +} + +.warning <- function(...) { + # Function to use the 'warning' R function with our custom settings + # Default: no call information, indent to 0, exdent to 3, + # collapse to \n + args <- list(...) + + ## In case we need to specify warning arguments + if (!is.null(args[["call."]])) { + call <- args[["call."]] + } else { + ## Default: don't show info about the call where the warning came up + call <- FALSE + } + if (!is.null(args[["immediate."]])) { + immediate <- args[["immediate."]] + } else { + ## Default value in warning function + immediate <- FALSE + } + if (!is.null(args[["noBreaks."]])) { + noBreaks <- args[["noBreaks."]] + } else { + ## Default value warning function + noBreaks <- FALSE + } + if (!is.null(args[["domain"]])) { + domain <- args[["domain"]] + } else { + ## Default value warning function + domain <- NULL + } + args[["call."]] <- NULL + args[["immediate."]] <- NULL + args[["noBreaks."]] <- NULL + args[["domain"]] <- NULL + + ## To modify strwrap indent and exdent arguments + if (!is.null(args[["indent"]])) { + indent <- args[["indent"]] + } else { + indent <- 0 + } + if (!is.null(args[["exdent"]])) { + exdent <- args[["exdent"]] + } else { + exdent <- 3 + } + args[["indent"]] <- NULL + args[["exdent"]] <- NULL + + ## To modify paste collapse argument + if (!is.null(args[["collapse"]])) { + collapse <- args[["collapse"]] + } else { + collapse <- "\n!" + } + args[["collapse"]] <- NULL + + ## Warning tag + if (!is.null(args[["tag"]])) { + tag <- args[["tag"]] + } else { + tag <- "! Warning: " + } + args[["tag"]] <- NULL + + warning(paste0(tag, paste(strwrap( + args, indent = indent, exdent = exdent + ), collapse = collapse)), call. = call, immediate. = immediate, + noBreaks. = noBreaks, domain = domain) +} + +# Function to permute arrays of non-atomic elements (e.g. POSIXct) +# Function to permute arrays of non-atomic elements (e.g. POSIXct) +.aperm2 <- function(x, new_order) { + old_dims <- dim(x) + attr_bk <- attributes(x) + if ('dim' %in% names(attr_bk)) { + attr_bk[['dim']] <- NULL + } + if (is.numeric(x)) { + x <- aperm(x, new_order) + } else { + y <- array(1:length(x), dim = dim(x)) + y <- aperm(y, new_order) + x <- x[as.vector(y)] + } + dim(x) <- old_dims[new_order] + attributes(x) <- c(attributes(x), attr_bk) + x +} + +# Function to bind arrays of non-atomic elements (e.g. POSIXct) +# 'x' and 'y' must have dimension names +# parameter 'along' must be a dimension name +.abind2 <- function(x, y, along) { + x_along <- which(names(dim(x)) == along) + if (x_along != length(dim(x))) { + tmp_order_x <- c((1:length(dim(x)))[-x_along], x_along) + x <- .aperm2(x, tmp_order_x) + } + y_along <- which(names(dim(y)) == along) + if (y_along != length(dim(y))) { + tmp_order_y <- c((1:length(dim(y)))[-y_along], y_along) + y <- .aperm2(y, tmp_order_y) + } + r <- c(x, y) + new_dims <- dim(x) + new_dims[length(new_dims)] <- dim(x)[length(dim(x))] + dim(y)[length(dim(y))] + dim(r) <- new_dims + if (x_along != length(dim(x))) { + final_order <- NULL + if (x_along > 1) { + final_order <- c(final_order, (1:length(dim(r)))[1:(x_along - 1)]) + } + final_order <- c(final_order, length(dim(r))) + final_order <- c(final_order, (1:length(dim(r)))[x_along:(length(dim(r)) - 1)]) + r <- .aperm2(r, final_order) + } + r +} + +# This function is a helper for the function .MergeArrays. +# It expects as inputs two named numeric vectors, and it extends them +# with dimensions of length 1 until an ordered common dimension +# format is reached. +# The first output is dims1 extended with 1s. +# The second output is dims2 extended with 1s. +# The third output is a merged dimension vector. If dimensions with +# the same name are found in the two inputs, and they have a different +# length, the maximum is taken. +.MergeArrayDims <- function(dims1, dims2) { + new_dims1 <- c() + new_dims2 <- c() + while (length(dims1) > 0) { + if (names(dims1)[1] %in% names(dims2)) { + pos <- which(names(dims2) == names(dims1)[1]) + dims_to_add <- rep(1, pos - 1) + if (length(dims_to_add) > 0) { + names(dims_to_add) <- names(dims2[1:(pos - 1)]) + } + new_dims1 <- c(new_dims1, dims_to_add, dims1[1]) + new_dims2 <- c(new_dims2, dims2[1:pos]) + dims1 <- dims1[-1] + dims2 <- dims2[-c(1:pos)] + } else { + new_dims1 <- c(new_dims1, dims1[1]) + new_dims2 <- c(new_dims2, 1) + names(new_dims2)[length(new_dims2)] <- names(dims1)[1] + dims1 <- dims1[-1] + } + } + if (length(dims2) > 0) { + dims_to_add <- rep(1, length(dims2)) + names(dims_to_add) <- names(dims2) + new_dims1 <- c(new_dims1, dims_to_add) + new_dims2 <- c(new_dims2, dims2) + } + list(new_dims1, new_dims2, pmax(new_dims1, new_dims2)) +} + +# This function takes two named arrays and merges them, filling with +# NA where needed. +# dim(array1) +# 'b' 'c' 'e' 'f' +# 1 3 7 9 +# dim(array2) +# 'a' 'b' 'd' 'f' 'g' +# 2 3 5 9 11 +# dim(.MergeArrays(array1, array2, 'b')) +# 'a' 'b' 'c' 'e' 'd' 'f' 'g' +# 2 4 3 7 5 9 11 +.MergeArrays <- function(array1, array2, along) { + if (!(is.null(array1) || is.null(array2))) { + if (!(identical(names(dim(array1)), names(dim(array2))) && + identical(dim(array1)[-which(names(dim(array1)) == along)], + dim(array2)[-which(names(dim(array2)) == along)]))) { + new_dims <- .MergeArrayDims(dim(array1), dim(array2)) + dim(array1) <- new_dims[[1]] + dim(array2) <- new_dims[[2]] + for (j in 1:length(dim(array1))) { + if (names(dim(array1))[j] != along) { + if (dim(array1)[j] != dim(array2)[j]) { + if (which.max(c(dim(array1)[j], dim(array2)[j])) == 1) { + na_array_dims <- dim(array2) + na_array_dims[j] <- dim(array1)[j] - dim(array2)[j] + na_array <- array(dim = na_array_dims) + array2 <- abind(array2, na_array, along = j) + names(dim(array2)) <- names(na_array_dims) + } else { + na_array_dims <- dim(array1) + na_array_dims[j] <- dim(array2)[j] - dim(array1)[j] + na_array <- array(dim = na_array_dims) + array1 <- abind(array1, na_array, along = j) + names(dim(array1)) <- names(na_array_dims) + } + } + } + } + } + if (!(along %in% names(dim(array2)))) { + stop("The dimension specified in 'along' is not present in the ", + "provided arrays.") + } + array1 <- abind(array1, array2, along = which(names(dim(array1)) == along)) + names(dim(array1)) <- names(dim(array2)) + } else if (is.null(array1)) { + array1 <- array2 + } + array1 +} + +# Takes as input a list of arrays. The list must have named dimensions. +.MergeArrayOfArrays <- function(array_of_arrays) { + MergeArrays <- .MergeArrays + array_dims <- (dim(array_of_arrays)) + dim_names <- names(array_dims) + + # Merge the chunks. + for (dim_index in 1:length(dim_names)) { + dim_sub_array_of_chunks <- dim_sub_array_of_chunk_indices <- NULL + if (dim_index < length(dim_names)) { + dim_sub_array_of_chunks <- array_dims[(dim_index + 1):length(dim_names)] + names(dim_sub_array_of_chunks) <- dim_names[(dim_index + 1):length(dim_names)] + dim_sub_array_of_chunk_indices <- dim_sub_array_of_chunks + sub_array_of_chunk_indices <- array(1:prod(dim_sub_array_of_chunk_indices), + dim_sub_array_of_chunk_indices) + } else { + sub_array_of_chunk_indices <- NULL + } + sub_array_of_chunks <- vector('list', prod(dim_sub_array_of_chunks)) + dim(sub_array_of_chunks) <- dim_sub_array_of_chunks + for (i in 1:prod(dim_sub_array_of_chunks)) { + if (!is.null(sub_array_of_chunk_indices)) { + chunk_sub_indices <- which(sub_array_of_chunk_indices == i, arr.ind = TRUE)[1, ] + } else { + chunk_sub_indices <- NULL + } + for (j in 1:(array_dims[dim_index])) { + new_chunk <- do.call('[[', c(list(x = array_of_arrays), + as.list(c(j, chunk_sub_indices)))) + if (is.null(new_chunk)) { + stop("Chunks missing.") + } + if (is.null(sub_array_of_chunks[[i]])) { + sub_array_of_chunks[[i]] <- new_chunk + } else { + sub_array_of_chunks[[i]] <- MergeArrays(sub_array_of_chunks[[i]], + new_chunk, + dim_names[dim_index]) + } + } + } + array_of_arrays <- sub_array_of_chunks + rm(sub_array_of_chunks) + gc() + } + + array_of_arrays[[1]] +} + +.MergeChunks <- function(shared_dir, suite_id, remove) { + MergeArrays <- startR:::.MergeArrays + + args <- NULL + shared_dir <- paste0(shared_dir, '/STARTR_CHUNKING_', suite_id) + + all_chunk_files_original <- list.files(paste0(shared_dir, '/'), '.*\\.Rds$') + all_chunk_files <- gsub('\\.Rds$', '', all_chunk_files_original) + chunk_filename_parts_all_components <- strsplit(all_chunk_files, '__') + all_components <- sapply(chunk_filename_parts_all_components, '[[', 1) + components <- unique(all_components) + result <- vector('list', length(components)) + names(result) <- components + for (component in components) { + chunk_files_original <- all_chunk_files_original[which(all_components == component)] + chunk_filename_parts <- chunk_filename_parts_all_components[which(all_components == component)] + chunk_filename_parts <- lapply(chunk_filename_parts, '[', -1) + if (length(unique(sapply(chunk_filename_parts, length))) != 1) { + stop("Detected chunks with more dimensions than others.") + } + dim_names <- sapply(chunk_filename_parts[[1]], + # TODO: strsplit by the last '_' match, not the first. + function(x) strsplit(x, '_')[[1]][1]) + # TODO check all files have exactly the same dimnames + found_chunk_indices <- sapply(chunk_filename_parts, + function(x) as.numeric(sapply(strsplit(x, '_'), '[[', 2))) + found_chunk_indices <- array(found_chunk_indices, + dim = c(length(dim_names), + length(found_chunk_indices) / length(dim_names)) + ) + found_chunks_str <- apply(found_chunk_indices, 2, paste, collapse = '_') + + if (length(args) > 0) { + if ((length(args) %% 2) != 0) { + stop("Wrong number of parameters.") + } + expected_dim_names <- args[(1:(length(args) / 2) - 1) * 2 + 1] + if (any(!is.character(expected_dim_names))) { + stop("Expected dimension names in parameters at odd positions.") + } + dim_indices <- args[(1:(length(args) / 2) - 1) * 2 + 2] + if (!any(dim_indices == 'all')) { + stop("Expected one dimension index to be 'all'.") + } + dim_to_merge <- which(dim_indices == 'all') + if (length(dim_indices) > 1) { + if (!all(is.numeric(dim_indices[-dim_to_merge]))) { + stop("Expected all dimension index but one to be numeric.") + } + } + # Check expected dim names match dim names + ## TODO + # Merge indices that vary along dim_to_merge whereas other fixed by dim_indices + # REMOVE FILES + ## TODO + stop("Feature not implemented.") + } else { + chunks_indices <- 1:length(dim_names) + chunks_indices <- lapply(chunks_indices, function(x) sort(unique(found_chunk_indices[x, ]))) + names(chunks_indices) <- dim_names + + # Load all found chunks into the array 'array_of_chuks'. + array_dims <- sapply(chunks_indices, length) + names(array_dims) <- dim_names + array_of_chunks <- vector('list', prod(array_dims)) + dim(array_of_chunks) <- array_dims + array_of_chunks_indices <- array(1:prod(array_dims), array_dims) + for (i in 1:prod(array_dims)) { + chunk_indices <- which(array_of_chunks_indices == i, arr.ind = TRUE)[1, ] + j <- 1 + chunk_indices_on_file <- sapply(chunk_indices, + function(x) { + r <- chunks_indices[[j]][x] + j <<- j + 1 + r + }) + found_chunk <- which(found_chunks_str == paste(chunk_indices_on_file, + collapse = '_'))[1] + if (length(found_chunk) > 0) { + num_tries <- 5 + found <- FALSE + try_num <- 1 + while ((try_num <= num_tries) && !found) { + array_of_chunks[[i]] <- try({ + readRDS(paste0(shared_dir, '/', + chunk_files_original[found_chunk])) + }) + if (('try-error' %in% class(array_of_chunks[[i]]))) { + message("Waiting for an incomplete file transfer...") + Sys.sleep(5) + } else { + found <- TRUE + } + try_num <- try_num + 1 + } + if (!found) { + stop("Could not open one of the chunks. Might be a large chunk ", + "in transfer. Merge aborted, files have been preserved.") + } + } + } + + result[[component]] <- startR:::.MergeArrayOfArrays(array_of_chunks) + rm(array_of_chunks) + gc() + } + } + + if (remove) { + sapply(all_chunk_files_original, + function(x) { + file.remove(paste0(shared_dir, '/', x)) + }) + } + + result +}