From 94d069dd426d13fc654578fc265c70bb5e873195 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 5 Mar 2021 13:03:54 +0100 Subject: [PATCH 01/14] The wrapper works with retrieve = T & F. Only compute locally. Submit jobs to HPCs haven't finished. --- R/ByChunks.R | 64 +++++++++++ R/Start.R | 89 +++++++++++---- R/Utils.R | 21 +++- R/multiStart.R | 288 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 442 insertions(+), 20 deletions(-) create mode 100644 R/multiStart.R diff --git a/R/ByChunks.R b/R/ByChunks.R index dd10112..32f44bb 100644 --- a/R/ByChunks.R +++ b/R/ByChunks.R @@ -708,6 +708,20 @@ ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', for (call_dim in names(attr(start_call, 'Dimensions'))[dims_to_alter]) { start_call[[call_dim]] <- .chunk(chunk_indices[call_dim], chunks[[call_dim]], eval(start_call[[call_dim]])) + #========================================= + # multiStart is used + # If n_chunks > 1 & call_dim has NA, extra process in Start() when chunking + if (!is.null(attr(start_call, 'na_pos_record'))) { + if (chunks[[call_dim]] > 1 & + attr(start_call, 'na_pos_record')[[1]][[call_dim]] != 0) { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 1) + # put na_pos_record in attributes + attr(start_call[[call_dim]], "na_pos_record") <- attributes(start_call)$na_pos_record[[1]][[call_dim]] + } else { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 0) + } + } + #========================================== } } start_call[['silent']] <- !debug @@ -715,6 +729,56 @@ ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', start_call[['num_procs']] <- threads_load } data[[input]] <- eval(start_call) + + #====================================================== + # If multiStart() is used, deal with NA insertion here + if (!is.null(attr(start_call, 'na_pos_record'))) { + na_pos_record_list <- attr(start_call, 'na_pos_record')[[1]] + if (any(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any)))) { + have_na_dims <- which(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any))) + for (have_na_dim_name in names(have_na_dims)) { + + if (!have_na_dim_name %in% names(chunks)) { + not_chunked <- TRUE + } else if (chunks[[have_na_dim_name]] == 1) { + not_chunked <- TRUE + } else { + not_chunked <- FALSE + } + # If the dim is not chunked, insert NAs directly according to the list + if (not_chunked) { + data[[input]] <- Apply(data[[input]], + target_dims = have_na_dim_name, + output_dims = have_na_dim_name, + fun = .insertNAdim, + na_pos = na_pos_record_list[[have_na_dim_name]] + )$output1 + + } else { # chunk along this dim. Need to examine piece by piece. 
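+            # A worked sketch of the steps below, with illustrative numbers (not
+            # from any real call): suppose the full 'time' length including NAs
+            # is 10, na_pos_record = c(3, 7), and this job holds chunk 1 of 2. Then
+            #   sub_chunk_indices <- chunk_indices(10, chunk = 1, n_chunks = 2, 'time')  # 1:5
+            #   sub_na_pos <- which(1:5 %in% c(3, 7))                                    # 3
+            # The loaded chunk has length 4 (one NA was removed before loading),
+            # so a single NA is inserted back at position 3 before processing.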
+ dim_na_pos_record <- attr(start_call[[have_na_dim_name]], "na_pos_record") + chunk_vector <- attr(start_call[[have_na_dim_name]], "chunk") + sub_chunk_indices <- chunk_indices(length(start_call[[have_na_dim_name]]) + length(dim_na_pos_record), + chunk = chunk_vector[1], + n_chunks = chunk_vector[2], + have_na_dim_name) + without_NA_dim_length <- dim(data[[input]])[have_na_dim_name] + if (without_NA_dim_length != length(sub_chunk_indices)) { # need to insert NA + sub_na_pos <- which(sub_chunk_indices %in% dim_na_pos_record) + data[[input]] <- Apply(data[[input]], + target_dims = have_na_dim_name, + output_dims = have_na_dim_name, + fun = .insertNAdim, + na_pos = sub_na_pos + )$output1 + + } + } + } + } + + } + #========================================================= + } t_end_load <- Sys.time() timings[['load']] <- c(timings[['load']], diff --git a/R/Start.R b/R/Start.R index 80e1c83..2b8dca4 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2970,28 +2970,79 @@ Start <- function(..., # dim = indices/selectors, } } - ## This 'if' runs in both Start() and Compute(). In Start(), it doesn't have any effect (no chunk). - ## In Compute(), it creates the indices for each chunk. For example, if 'sub_array_of_indices' - ## is c(5:10) and chunked into 2, 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) - ## for chunk = 2. If 'sub_array_of_indices' is list(55, 62) and chunked into 2, it becomes - ## list(55, 58) for chunk = 1 and list(59, 62) for chunk = 2. - ## TODO: The list can be turned into vector here? So afterward no need to judge if it is list - ## or vector. - if (!is.list(sub_array_of_indices)) { - sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), + #============================================================ + ## This part runs in both Start() and Compute(). + ## In Start(), it doesn't have any effect (no chunk). + ## In Compute(), it creates the indices for each chunk. For example, + ## if 'sub_array_of_indices' is c(5:10) and chunked into 2, + ## 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) + ## for chunk = 2. If 'sub_array_of_indices' is list(55, 62) + ## and chunked into 2, it becomes list(55, 58) for chunk = 1 + ## and list(59, 62) for chunk = 2. + + # Add one "if" to reduce redundant work. 
If n_chunks = 1, no need to chunk + if (chunks[[inner_dim]]["n_chunks"] > 1) { + # multiStart: If inner_dim needs to be chunked & has NAs, extra process + if (!(is.na(chunks[[inner_dim]]["chunk_with_NA"]) | + chunks[[inner_dim]]["chunk_with_NA"] == 0)) { + na_pos_record <- attr(dim_params[[inner_dim]], 'na_pos_record') # a vector + if (is.list(sub_array_of_indices)) { + # Turn list into vector + sub_array_of_indices <- c(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]) + sub_array_of_indices_is_list <- TRUE + } else { + sub_array_of_indices_is_list <- FALSE + } + + # Consider the real whole length (with NA) and chunk + indices_include_NA <- chunk_indices(length(sub_array_of_indices) + length(na_pos_record), + chunks[[inner_dim]]["chunk"], + chunks[[inner_dim]]["n_chunks"], + inner_dim) + # If the chunked piece has NA actually, remove and record it + if (any(indices_include_NA %in% na_pos_record)) { + sub_na_pos_record <- which(indices_include_NA %in% na_pos_record) + for (pos in 1:(length(indices_include_NA) - 1)) { + if (pos %in% sub_na_pos_record) { + indices_include_NA[c((pos + 1):length(indices_include_NA))] <- indices_include_NA[c((pos + 1):length(indices_include_NA))] - 1 + } + } + indices_exclude_NA <- indices_include_NA[-sub_na_pos_record] + } else { + indices_exclude_NA <- indices_include_NA + } + # Deduct if NAs exist in the previous chunks + if (any(na_pos_record < indices_include_NA[1])) { + indices_exclude_NA <- indices_exclude_NA - sum(na_pos_record < indices_include_NA[1]) + } + sub_array_of_indices <- sub_array_of_indices[indices_exclude_NA] + if (sub_array_of_indices_is_list) { + sub_array_of_indices <- list(sub_array_of_indices[1], sub_array_of_indices[length(sub_array_of_indices)]) + } + + } # extra process end + + else { # original code + if (!is.list(sub_array_of_indices)) { + sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], inner_dim)] - } else { - tmp <- chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]), - chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], - inner_dim) - vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - sub_array_of_indices[[1]] <- vect[tmp[1]] - sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] - } - # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. - + } else { + ## TODO: The list can be turned into vector here? So afterward no need to judge if it is list + ## or vector. BUT saving as list is memory efficient. + tmp <- chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]), + chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], + inner_dim) + vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + sub_array_of_indices[[1]] <- vect[tmp[1]] + sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] + } + # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. 
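+      # Worked sketch of the NA-aware branch above, with illustrative numbers:
+      # full 'time' length including NAs = 10, na_pos_record = c(2, 7), and
+      # this is chunk 2 of 2. Then indices_include_NA = 6:10; position 7 is an
+      # NA, so sub_na_pos_record = 2 and, after the shift-and-drop step,
+      # indices_exclude_NA = c(6, 7, 8, 9). One NA (position 2) falls in an
+      # earlier chunk, so the final indices are c(5, 6, 7, 8) of the NA-free
+      # 'sub_array_of_indices'.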
+ } + } # if n_chunks > 1 to avoid redundant work + #============================================================ + if (debug) { if (inner_dim %in% dims_to_check) { print("-> TRANSFORMATION REQUESTED?") diff --git a/R/Utils.R b/R/Utils.R index 3a6f6ea..d7fcdde 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -675,7 +675,17 @@ MergeArrays <- .MergeArrays array_dims <- (dim(array_of_arrays)) dim_names <- names(array_dims) - + + # Reorder if the arrays are not at the same dim order + array_of_arrays_dimnames <- lapply(lapply(array_of_arrays, dim), names) + same_dim_order <- sapply(array_of_arrays_dimnames, identical, array_of_arrays_dimnames[[1]]) + if (any(!same_dim_order)) { + arrays_to_correct <- which(!same_dim_order) + for (i in arrays_to_correct) { + array_of_arrays[[i]] <- s2dv::Reorder(array_of_arrays[[i]], array_of_arrays_dimnames[[1]]) + } + } + # Merge the chunks. for (dim_index in 1:length(dim_names)) { dim_sub_array_of_chunks <- dim_sub_array_of_chunk_indices <- NULL @@ -844,3 +854,12 @@ .KnownLatNames <- function() { known_lat_names <- c('lat', 'latitude', 'y', 'j', 'nav_lat') } + +.insertNAdim <- function(data, na_pos) { + # data: [time] + output <- rep(NA, length(data) + length(na_pos)) + output[-na_pos] <- data + + return(output) +} + diff --git a/R/multiStart.R b/R/multiStart.R new file mode 100644 index 0000000..6042b9b --- /dev/null +++ b/R/multiStart.R @@ -0,0 +1,288 @@ +#' How to use: +#' 1. Allow to read multiple datasets with different selectors +#' 2. A list of list is used when different selectors are required, just like 'dat'. +#' The items in each list must be 'name' and ''. 'name' is the +#' dataset name, same as the one in 'dat'. +#' 3. Only indices() is accepted. (If for values(), use selectorchecker to transform?) +#' 4. Allow to have NA in the selectors. Those NAs will be removed when being fed in +#' Start(), and be inserted back after Start(). +#' +#'NOTE: +#' 1. No reshape (merge_across_dim yes) +#' 2. The input check is not comprehensive +#' +#'@import abind multiApply +#'@export +multiStart <- function(..., # dim = indices/selectors, + # dim_var = 'var', + # dim_reorder = Sort/CircularSort, + # dim_tolerance = number, + # dim_depends = 'file_dim', + # dim_across = 'file_dim', + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = FALSE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, debug = FALSE) { + + dim_params <- list(...) + dim_params_list <- NULL + dim_names <- names(dim_params) + +#NOTE: The check can be removed. When entering Start(), the check will run as well. +#===============line 1020 (down)========================= + # Check pattern_dims + if (is.null(pattern_dims)) { + .warning(paste0("Parameter 'pattern_dims' not specified. 
Taking the first dimension, '", + dim_names[1], "' as 'pattern_dims'.")) + pattern_dims <- dim_names[1] + } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) { + pattern_dims <- unique(pattern_dims) + } else { + stop("Parameter 'pattern_dims' must be a vector of character strings.") + } + + # Find the pattern dimension with the pattern specifications + found_pattern_dim <- NULL + for (pattern_dim in pattern_dims) { + # Check all specifications in pattern_dim are valid + dat <- datasets <- dim_params[[pattern_dim]] + if (is.list(dat) || any(sapply(dat, is.list))) { + if (is.null(found_pattern_dim)) { + found_pattern_dim <- pattern_dim + } else { + stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") + } + } + } + if (is.null(found_pattern_dim)) { + .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) + found_pattern_dim <- pattern_dims[1] + } +#===============line 1056 (up)========================= + ndat <- length(dim_params[[found_pattern_dim]]) # 2 + dat_names <- sapply(dat, '[[', 'name') #c(hadgem3, mpi_esm) + dim_params_list <- vector('list', ndat) + dim_params_list <- lapply(dim_params_list, '[<-', vector('list', length(dim_params))) + names(dim_params_list) <- dat_names + dim_names <- names(dim_params) #c(dat, var, sdate,...) + + # Create an array for NA record + na_pos_record <- vector('list', ndat) + na_pos_record <- lapply(na_pos_record, '[<-', vector('list', length(dim_params))) + names(na_pos_record) <- dat_names #c(hadgem3, mpi_esm) + for (n_dat in 1:ndat) { + na_pos_record[[n_dat]] <- as.list(rep(0, length(dim_params))) + names(na_pos_record[[n_dat]]) <- dim_names + } + + for (param_ind in 1:length(dim_params)) { + dim_param <- dim_params[[param_ind]] # $ dat :List of 2; $ var : chr "tas" + if (is.list(dim_param) & all(lapply(lapply(dim_param, names), '[', 1) == 'name') & + (names(dim_params)[param_ind] != found_pattern_dim)) { + for (dat_ind in 1:ndat) { + # If NA exists, take it out and record in an array + if (any(is.na(dim_param[[dat_ind]][[2]]))) { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param[[dat_ind]][[2]][!is.na(dim_param[[dat_ind]][[2]])] + attributes(dim_params_list[[dat_ind]][[param_ind]]) <- attributes(dim_param[[dat_ind]][[2]]) + na_pos_record[[dat_ind]][[param_ind]] <- which(is.na(dim_param[[dat_ind]][[2]])) + } else { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param[[dat_ind]][[2]] # 2nd item is value, 1st item is dat name + } + } + } else { + if (names(dim_params)[param_ind] == found_pattern_dim) { + for (dat_ind in 1:ndat) { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param[[dat_ind]] # vector + } + } else { + for (dat_ind in 1:ndat) { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param # vector + } + } + } + } + # Put dim name back + for (dat_ind in 1:ndat) { + names(dim_params_list[[dat_ind]]) <- dim_names + } + + # Combine ... 
with other params + dim_params_list<- lapply(dim_params_list, c, + list(return_vars = return_vars, + synonims = synonims, + file_opener = file_opener, + file_var_reader = file_var_reader, + file_dim_reader = file_dim_reader, + file_data_reader = file_data_reader, + file_closer = file_closer, + transform = transform, + transform_params = transform_params, + transform_vars = transform_vars, + transform_extra_cells = transform_extra_cells, + apply_indices_after_transform = apply_indices_after_transform, + pattern_dims = pattern_dims, + metadata_dims = metadata_dims, + selector_checker = selector_checker, + merge_across_dims = merge_across_dims, + merge_across_dims_narm = merge_across_dims_narm, + split_multiselected_dims = split_multiselected_dims, + path_glob_permissive = path_glob_permissive, + largest_dims_length = largest_dims_length, + retrieve = retrieve, + num_procs = num_procs, + ObjectBigmemory = ObjectBigmemory, + silent = silent, + debug = debug)) + + # Put in Start() + res <- vector('list', ndat) # A list saving datasets separately + for (n_dat in 1:ndat) { + res[[n_dat]] <- do.call(Start, dim_params_list[[n_dat]]) + } + names(res) <- dat_names #c(hadgem3, mpi_esm) + + + if (retrieve) { + have_na <- as.logical(lapply(lapply(na_pos_record, unlist), sum)) # a logical vector + + for (n_dat in 1:ndat) { + if (have_na[n_dat]) { #need to insert NA + + # Save metadata for later use + saved_metadata <- attributes(res[[n_dat]]) #attr(res[[n_dat]], 'Variables') + + no_na_dim_name <- names(unlist(lapply(lapply(na_pos_record[[n_dat]], '==', 0), which))) + na_dim_name_ind <- which(!names(na_pos_record[[n_dat]]) %in% no_na_dim_name) + na_dims <- na_pos_record[[n_dat]][na_dim_name_ind] # list + correct_dim_order <- names(dim(res[[n_dat]])) # if reshape, here is diff from dim_names + for (na_dim in 1:length(na_dims)) { + na_dim_name <- names(na_dims)[na_dim] # 'time' + res[[n_dat]] <- Apply(res[[n_dat]], + target_dims = na_dim_name, + output_dims = na_dim_name, + fun = .insertNAdim, + na_pos = na_dims[[na_dim]] # a vector. b = c(2, 4) + )$output1 + res[[n_dat]] <- s2dv::Reorder(res[[n_dat]], correct_dim_order) + } + # Put metadata back + ## res[[n_dat]] already has 'dim' attributes. The rest is lost + lost_list_ind <- which(!names(saved_metadata) %in% names(attributes(res[[n_dat]]))) + attributes(res[[n_dat]]) <- c(attributes(res[[n_dat]]), + saved_metadata[lost_list_ind]) + } + + # Add na_pos_record list to metadata + attr(res[[n_dat]], 'na_pos_record')[[dat_names[n_dat]]] <- na_pos_record[[n_dat]] + } + + # If > 1 dataset, combine datasets. + if (length(res) > 1) { + bind_along <- which(dim_names == found_pattern_dim) + # Check if the dimensions are identical + array_dims <- lapply(res, dim) + names(array_dims) <- NULL + + if (!do.call(identical, array_dims)) { + stop("The expected dimensions of datasets are not identical. Cannot combine them together.") + } + abind_input_list <- c(res, along = bind_along) + combine_array <- do.call(abind::abind, abind_input_list) + names(dim(combine_array)) <- dim_names + + #---- restore metadata ---- + ## combine_array already has 'dim' attributes. The rest is lost + lost_list_ind <- which(!names(attributes(res[[1]])) %in% names(attributes(combine_array))) + attributes(combine_array) <- c(attributes(combine_array), + attributes(res[[1]])[lost_list_ind]) + + #---- Combine metadata which is not common ---- + #NOTE: $dim, $PatternDim, $class are common attributes. 
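+    # Illustrative sketch (assuming two datasets): if attributes(res[[1]])$Files
+    # is an array with dims c(dat = 1, var = 1, sdate = 2), binding the two
+    # datasets with abind() along the pattern dim below gives
+    # c(dat = 2, var = 1, sdate = 2); the dimension names are re-assigned
+    # afterwards because abind() drops them.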
+ attr_list <- vector('list', length = ndat) + for (n_dat in 1:ndat) { + attr_list[[n_dat]] <- attributes(res[[n_dat]]) + } + + # For the value is array + ## $Files + sublist <- lapply(attr_list, '[[', 'Files') # list(dat1 = files1, dat2 = files2) + bind_along <- which(names(dim(attr(res[[1]], 'Files'))) == found_pattern_dim) # 1 + attr(combine_array, 'Files') <- do.call(abind::abind, c(sublist, along = bind_along)) + names(dim(attr(combine_array, 'Files'))) <- names(dim(attr_list[[1]]$Files)) + + ## $NotFoundFiles + sublist <- lapply(attr_list, '[[', 'NotFoundFiles') + if (!is.null(unlist(sublist))) { #Some of the dats have NotFoundFiles + attr(combine_array, 'NotFoundFiles') <- do.call(abind::abind, c(sublist, along = bind_along)) #bind_along is from $Files above + names(attr(combine_array, 'NotFoundFiles')) <- names(dim(attr_list[[1]]$Files)) + } + # For the value is list + ## $FileSelectors + attr(combine_array, 'FileSelectors') <- lapply(lapply(attr_list, '[', 'FileSelectors'), + '[[', 1) + names(attr(combine_array, 'FileSelectors')) <- dat_names + ## $ObjectBigmemory + attr(combine_array, 'ObjectBigmemory') <- lapply(lapply(attr_list, '[', 'ObjectBigmemory'), + '[[', 1) + names(attr(combine_array, 'ObjectBigmemory')) <- dat_names + ## $na_pos_record + attr(combine_array, 'na_pos_record') <- lapply(lapply(attr_list, '[[', 'na_pos_record'), + '[[', 1) + names(attr(combine_array, 'na_pos_record')) <- dat_names + + for (n_dat in 2:ndat) { + # $Variables + ## For $common, use the 1st data. Therefore, just add 2nd data $dat2 into + ## the metadata of 1st data + if (!all(names(attr(res[[n_dat]], 'Variables')) %in% 'common')) { + dat_list_name <- names(attr(res[[n_dat]], 'Variables'))[which(!names(attr(res[[n_dat]], 'Variables')) %in% 'common')] #'dat2' + attr(combine_array, 'Variables')[[dat_list_name]] <- attr(res[[n_dat]], 'Variables')[[dat_list_name]] + } + } + + return(combine_array) + + } else { # only 1 dataset + return(res[[1]]) + } + + } else { # retrieve = FALSE + for (n_dat in 1:ndat) { + # Add na_pos_record list to metadata + attr(res[[n_dat]], 'na_pos_record')[[dat_names[n_dat]]] <- na_pos_record[[n_dat]] + + # Correct dimensions if NA exists + if (sum(unlist(na_pos_record[[n_dat]])) != 0) { + have_na_dims <- which(unlist(lapply(lapply(na_pos_record[[n_dat]], '!=', 0), any))) + for (have_na_dim in 1:length(have_na_dims)) { + attr(res[[n_dat]], 'Dimensions')[have_na_dims[have_na_dim]] <- attr(res[[n_dat]], 'Dimensions')[have_na_dims[have_na_dim]] + length(na_pos_record[[n_dat]][[have_na_dims[have_na_dim]]]) + } + } + + } + + return(res) + } + +} + -- GitLab From 0d3b24b4f765942c36ddca13f8e2fb82cb5d22d6 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 15 Mar 2021 16:25:15 +0100 Subject: [PATCH 02/14] Can use multiStart and submit jobs to HPCs now. --- R/Start.R | 18 +++++- R/multiStart.R | 24 ++++++-- inst/chunking/load_process_save_chunk.R | 79 ++++++++++++++++++++++++- 3 files changed, 111 insertions(+), 10 deletions(-) diff --git a/R/Start.R b/R/Start.R index 2b8dca4..867a238 100644 --- a/R/Start.R +++ b/R/Start.R @@ -836,6 +836,15 @@ Start <- function(..., # dim = indices/selectors, Subset <- ClimProjDiags::Subset dim_params <- list(...) + + # If multiStart is used. matchcall is the correct Start call argument. + # In multiStart(), do.call is used and it creates incorrect call argument that + # makes trouble when retrive = F & job is sent to HPCs. 
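+  # Background for this workaround: when Start() is invoked through
+  # do.call(Start, args), match.call() records the evaluated argument values
+  # (e.g. full numeric vectors) instead of the original expressions, so the
+  # recorded call cannot be rebuilt and re-evaluated per chunk on the HPC
+  # side. multiStart() therefore builds the call itself and passes it in as
+  # 'matchcall'.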
+ if ('matchcall' %in% names(dim_params)) { + matchcall <- dim_params[['matchcall']] + dim_params[['matchcall']] <- NULL + } + # Take *_var parameters apart var_params <- take_var_params(dim_params) @@ -4375,7 +4384,14 @@ Start <- function(..., # dim = indices/selectors, if (!silent) { .message("Successfully discovered data dimensions.") } + + # If multiStart is used. Use 'matchcall' instead of generating it here. + if (exists('matchcall')) { + start_call <- matchcall + } else { # original code start_call <- match.call() + } + for (i in 2:length(start_call)) { if (class(start_call[[i]]) %in% c('name', 'call')) { start_call[[i]] <- eval.parent(start_call[[i]]) @@ -4415,11 +4431,9 @@ Start <- function(..., # dim = indices/selectors, ### TODO: Specify dependencies as parameter # suppressPackageStartupMessages({library(ncdf4)}) - #print("1") store_indices <- as.list(work_piece[['store_position']]) first_round_indices <- work_piece[['first_round_indices']] second_round_indices <- work_piece[['second_round_indices']] - #print("2") file_to_open <- work_piece[['file_path']] sub_array <- file_data_reader(file_to_open, NULL, work_piece[['file_selectors']], diff --git a/R/multiStart.R b/R/multiStart.R index 6042b9b..0be67a7 100644 --- a/R/multiStart.R +++ b/R/multiStart.R @@ -112,7 +112,7 @@ multiStart <- function(..., # dim = indices/selectors, } else { if (names(dim_params)[param_ind] == found_pattern_dim) { for (dat_ind in 1:ndat) { - dim_params_list[[dat_ind]][[param_ind]] <- dim_param[[dat_ind]] # vector + dim_params_list[[dat_ind]][[param_ind]] <- list(dim_param[[dat_ind]]) } } else { for (dat_ind in 1:ndat) { @@ -154,13 +154,27 @@ multiStart <- function(..., # dim = indices/selectors, silent = silent, debug = debug)) - # Put in Start() + #===============Put in Start()===================== res <- vector('list', ndat) # A list saving datasets separately + ## matchcall needs to be modified and put in Start call. + ## Because do.call is used below, match.call() in Start() will generate incorrect + ## call argument (it is correct only if Start(...) is used directly.) + matchcall <- match.call() + matchcall[[1]] <- as.name('Start') + for (n_dat in 1:ndat) { - res[[n_dat]] <- do.call(Start, dim_params_list[[n_dat]]) + + # Revise matchcall for each Start call + for (item in dim_names) { #only need to check '...' 
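+      # Hypothetical example: if '...' contained
+      #   time = list(list(name = 'dat1', time = indices(1:12)),
+      #               list(name = 'dat2', time = indices(1:10))),
+      # matchcall[['time']] still holds the whole list of lists here, so it
+      # is overwritten with the per-dataset selector from dim_params_list.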
+ if (!identical(matchcall[[item]], dim_params_list[[n_dat]][[item]])) { + matchcall[[item]] <- dim_params_list[[n_dat]][[item]] + } + } + + res[[n_dat]] <- do.call(Start, c(dim_params_list[[n_dat]], matchcall = quote(matchcall))) } - names(res) <- dat_names #c(hadgem3, mpi_esm) - + names(res) <- dat_names #e.g., c(hadgem3, mpi_esm) + #===================================================== if (retrieve) { have_na <- as.logical(lapply(lapply(na_pos_record, unlist), sum)) # a logical vector diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R index 55e47f0..ae01073 100644 --- a/inst/chunking/load_process_save_chunk.R +++ b/inst/chunking/load_process_save_chunk.R @@ -5,10 +5,7 @@ if (!is.null(lib_dir)) { } .libPaths(new = lib_dir) } -library(startR) - out_dir <- - debug <- start_calls <- start_calls_attrs <- @@ -65,6 +62,20 @@ for (input in 1:length(data)) { for (call_dim in names_dims_to_alter) { start_call[[call_dim]] <- startR:::.chunk(chunk_indices[call_dim], chunks[[call_dim]], eval(start_call[[call_dim]])) + #========================================= + # multiStart is used + # If n_chunks > 1 & call_dim has NA, extra process in Start() when chunking + if (!is.null(start_calls_attrs[[input]][['na_pos_record']])) { + if (chunks[[call_dim]] > 1 & + start_calls_attrs[[input]][['na_pos_record']][[1]][[call_dim]] != 0) { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 1) + # put na_pos_record in attributes + attr(start_call[[call_dim]], "na_pos_record") <- start_calls_attrs[[input]][['na_pos_record']][[1]][[call_dim]] + } else { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 0) + } + } + #========================================== } } if (!('num_procs' %in% names(start_call))) { @@ -98,6 +109,68 @@ for (input in 1:length(data)) { #write.table(attributes(data[[input]])$ObjectBigmemory, # file = paste0(task_path, '.filename.txt'), # col.names = FALSE, row.names = FALSE, quote = FALSE) + + #====================================================== + # If multiStart() is used, deal with NA insertion here + if (!is.null(attr(start_call, 'na_pos_record'))) { + na_pos_record_list <- start_calls_attrs[[input]][['na_pos_record']][[1]] + if (any(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any)))) { + have_na_dims <- which(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any))) + for (have_na_dim_name in names(have_na_dims)) { + + if (!have_na_dim_name %in% names(chunks)) { + not_chunked <- TRUE + } else if (chunks[[have_na_dim_name]] == 1) { + not_chunked <- TRUE + } else { + not_chunked <- FALSE + } + # If the dim is not chunked, insert NAs directly according to the list +Apply <- multiApply::Apply + if (not_chunked) { + + data[[input]] <- do.call("Apply", + c( + list(data[[input]], + target_dims = have_na_dim_name, + fun = startR:::.insertNAdim, + output_dims = have_na_dim_name, + na_pos = na_pos_record_list[[have_na_dim_name]], + ncores = threads_compute) + ) + )$output1 + + + } else { # chunk along this dim. Need to examine piece by piece. 
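+          # For reference, startR:::.insertNAdim() pads a vector with NAs at the
+          # recorded positions. Illustrative call:
+          #   startR:::.insertNAdim(c(10, 20, 30), na_pos = c(2, 4))
+          #   # [1] 10 NA 20 NA 30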
+ dim_na_pos_record <- attr(start_call[[have_na_dim_name]], "na_pos_record") + chunk_vector <- attr(start_call[[have_na_dim_name]], "chunk") + sub_chunk_indices <- startR:::chunk_indices(length(start_call[[have_na_dim_name]]) + length(dim_na_pos_record), + chunk = chunk_vector[1], + n_chunks = chunk_vector[2], + have_na_dim_name) + without_NA_dim_length <- dim(data[[input]])[have_na_dim_name] + if (without_NA_dim_length != length(sub_chunk_indices)) { # need to insert NA + sub_na_pos <- which(sub_chunk_indices %in% dim_na_pos_record) + + data[[input]] <- do.call("Apply", + c( + list(data[[input]], + target_dims = have_na_dim_name, + fun = startR:::.insertNAdim, + output_dims = have_na_dim_name, + na_pos = sub_na_pos, + ncores = threads_compute) + ) + )$output1 + } + } + } + } + + } + #========================================================= + + } t_end_load <- Sys.time() t_load <- as.numeric(difftime(t_end_load, t_begin_load, units = 'secs')) -- GitLab From 22c6a159d30d508d1ef7c2c05e2aa081e13bf242 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 15 Mar 2021 16:34:29 +0100 Subject: [PATCH 03/14] Add accidentally deleted lines back and correct some lines. --- inst/chunking/load_process_save_chunk.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R index ae01073..6bce181 100644 --- a/inst/chunking/load_process_save_chunk.R +++ b/inst/chunking/load_process_save_chunk.R @@ -5,6 +5,7 @@ if (!is.null(lib_dir)) { } .libPaths(new = lib_dir) } +library(startR) out_dir <- debug <- start_calls <- @@ -112,7 +113,7 @@ for (input in 1:length(data)) { #====================================================== # If multiStart() is used, deal with NA insertion here - if (!is.null(attr(start_call, 'na_pos_record'))) { + if (!is.null(start_calls_attrs[[input]][['na_pos_record']])) { na_pos_record_list <- start_calls_attrs[[input]][['na_pos_record']][[1]] if (any(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any)))) { have_na_dims <- which(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any))) @@ -169,8 +170,6 @@ Apply <- multiApply::Apply } #========================================================= - - } t_end_load <- Sys.time() t_load <- as.numeric(difftime(t_end_load, t_begin_load, units = 'secs')) -- GitLab From 325a5b0f9a14ef06e893d19bdbe4a6b4ac516203 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 15 Mar 2021 19:29:19 +0100 Subject: [PATCH 04/14] Update Rd file --- DESCRIPTION | 2 +- NAMESPACE | 1 + man/AddStep.Rd | 1 - man/CDORemapper.Rd | 1 - man/Collect.Rd | 1 - man/Compute.Rd | 16 +- man/NcCloser.Rd | 1 - man/NcDataReader.Rd | 10 +- man/NcDimReader.Rd | 10 +- man/NcOpener.Rd | 1 - man/NcVarReader.Rd | 10 +- man/SelectorChecker.Rd | 4 +- man/Sort.Rd | 9 +- man/Start.Rd | 922 +++++++++++++++++++++-------------------- man/Step.Rd | 10 +- man/indices.Rd | 1 - man/multiStart.Rd | 47 +++ man/values.Rd | 1 - 18 files changed, 563 insertions(+), 485 deletions(-) create mode 100644 man/multiStart.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 5a33c3c..2dfda5d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,4 +38,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo ecFlow -RoxygenNote: 5.0.0 +RoxygenNote: 7.0.1 diff --git a/NAMESPACE b/NAMESPACE index ccf783c..a61238a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(Sort) export(Start) export(Step) export(indices) +export(multiStart) export(values) 
import(PCICt) import(abind) diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 3eece05..0d0ce46 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } - diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd index 4f56baa..763be77 100644 --- a/man/CDORemapper.Rd +++ b/man/CDORemapper.Rd @@ -65,4 +65,3 @@ perform the interpolation, hence CDO is required to be installed. \seealso{ \code{\link[s2dverification]{CDORemap}} } - diff --git a/man/Collect.Rd b/man/Collect.Rd index 44a7dee..97b529b 100644 --- a/man/Collect.Rd +++ b/man/Collect.Rd @@ -83,4 +83,3 @@ of results as one data array when the execution is done. See more details on } } - diff --git a/man/Compute.Rd b/man/Compute.Rd index e07106a..7d6db4d 100644 --- a/man/Compute.Rd +++ b/man/Compute.Rd @@ -4,9 +4,18 @@ \alias{Compute} \title{Specify the execution parameters and trigger the execution} \usage{ -Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1, - cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, - silent = FALSE, debug = FALSE, wait = TRUE) +Compute( + workflow, + chunks = "auto", + threads_load = 1, + threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, + debug = FALSE, + wait = TRUE +) } \arguments{ \item{workflow}{A list of the class 'startR_workflow' returned by function @@ -104,4 +113,3 @@ arrays and additional metadata. res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) } - diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd index 65beab8..588f63a 100644 --- a/man/NcCloser.Rd +++ b/man/NcCloser.Rd @@ -32,4 +32,3 @@ NcCloser(connection) \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcDimReader}} \code{\link{NcVarReader}} } - diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd index a6d32c7..9014789 100644 --- a/man/NcDataReader.Rd +++ b/man/NcDataReader.Rd @@ -4,8 +4,13 @@ \alias{NcDataReader} \title{NetCDF file data reader for 'startR'} \usage{ -NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDataReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -61,4 +66,3 @@ in turn uses nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDimReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd index d539ffd..38dd870 100644 --- a/man/NcDimReader.Rd +++ b/man/NcDimReader.Rd @@ -4,8 +4,13 @@ \alias{NcDimReader} \title{NetCDF dimension reader for 'startR'} \usage{ -NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDimReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ This function uses the function NcReadDims() in the package 'easyNCDF'. 
\code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd index e46384c..30885fc 100644 --- a/man/NcOpener.Rd +++ b/man/NcOpener.Rd @@ -34,4 +34,3 @@ NcCloser(connection) \code{\link{NcDimReader}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd index c601907..fb093ae 100644 --- a/man/NcVarReader.Rd +++ b/man/NcVarReader.Rd @@ -4,8 +4,13 @@ \alias{NcVarReader} \title{NetCDF variable reader for 'startR'} \usage{ -NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - var_name = NULL, synonims) +NcVarReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + var_name = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcDimReader}} } - diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd index ef83575..e1cf112 100644 --- a/man/SelectorChecker.Rd +++ b/man/SelectorChecker.Rd @@ -4,8 +4,7 @@ \alias{SelectorChecker} \title{Translate a set of selectors into a set of numeric indices} \usage{ -SelectorChecker(selectors, var = NULL, return_indices = TRUE, - tolerance = NULL) +SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL) } \arguments{ \item{selectors}{A vector or a list of two of numeric indices or variable @@ -50,4 +49,3 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257] SelectorChecker(sub_array_of_selectors, sub_array_of_values) } - diff --git a/man/Sort.Rd b/man/Sort.Rd index 9ab516e..25a92fe 100644 --- a/man/Sort.Rd +++ b/man/Sort.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/Sort.R \name{Sort} -\alias{CircularSort} \alias{Sort} +\alias{CircularSort} \title{Sort the coordinate variable values in a Start() call} \usage{ Sort(...) @@ -10,12 +10,12 @@ Sort(...) CircularSort(start, end, ...) } \arguments{ +\item{\dots}{Additional parameters to adjust the reorderig. See function +sort() for more details.} + \item{start}{A numeric indicating the lower bound of the circular range.} \item{end}{A numeric indicating the upper bound of the circular range.} - -\item{\dots}{Additional parameters to adjust the reorderig. See function -sort() for more details.} } \value{ A list of 2 containing: @@ -57,4 +57,3 @@ range. This is useful for circular coordinates such as the Earth longitudes. 
retrieve = FALSE) } - diff --git a/man/Start.Rd b/man/Start.Rd index 680168e..4f750ec 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -4,408 +4,36 @@ \alias{Start} \title{Declare, discover, subset and retrieve multidimensional distributed data sets} \usage{ -Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, - file_var_reader = NcVarReader, file_dim_reader = NcDimReader, - file_data_reader = NcDataReader, file_closer = NcCloser, - transform = NULL, transform_params = NULL, transform_vars = NULL, - transform_extra_cells = 2, apply_indices_after_transform = FALSE, - pattern_dims = NULL, metadata_dims = NULL, - selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, largest_dims_length = FALSE, - retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, - silent = FALSE, debug = FALSE) +Start( + ..., + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = FALSE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, + debug = FALSE +) } \arguments{ -\item{return_vars}{A named list where the names are the names of the -variables to be fetched in the files, and the values are vectors of -character strings with the names of the file dimension which to retrieve each -variable for, or NULL if the variable has to be retrieved only once -from any (the first) of the involved files.\cr\cr -Apart from retrieving a multidimensional data array, retrieving auxiliary -variables inside the files can also be needed. The parameter -'return_vars' allows for requesting such variables, as long as a -'file_var_reader' function is also specified in the call to -Start() (see documentation on the corresponding parameter). -\cr\cr -In the case of the the item sales example (see documentation on parameter -\code{\dots)}, the store location variable is requested with the parameter\cr -\code{return_vars = list(store_location = NULL)}.\cr This will cause -Start() to fetch once the variable 'store_location' and return it in -the component\cr \code{$Variables$common$store_location},\cr and will be an -array of character strings with the location names, with the dimensions -\code{c('store' = 100)}. Although useless in this example, we could ask -Start() to fetch and return such variable for each file along the -items dimension as follows: \cr -\code{return_vars = list(store_location = c('item'))}.\cr In that case, the -variable will be fetched once from a file of each of the items, and will be -returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}. 
-\cr\cr -If a variable is requested along a file dimension that contains path pattern -specifications ('source' in the example), the fetched variable values will be -returned in the component\cr \code{$Variables$$}.\cr -For example: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item'))) -\cr # # Checking the structure of the returned variables -\cr # str(found_data$Variables) -\cr # Named list -\cr # ..$common: NULL -\cr # ..$sourceA: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # ..$sourceB: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # # Checking the dimensions of the returned variable -\cr # # for the source A -\cr # dim(found_data$Variables$sourceA) -\cr # item store -\cr # 3 3 -} -\cr\cr -The names of the requested variables do not necessarily have to match the -actual variable names inside the files. A list of alternative names to be -seeked can be specified via the parameter 'synonims'.} - -\item{synonims}{A named list where the names are the requested variable or -dimension names, and the values are vectors of character strings with -alternative names to seek for such dimension or variable.\cr\cr -In some requests, data from different sources may follow different naming -conventions for the dimensions or variables, or even files in the same source -could have varying names. This parameter is in order for Start() to -properly identify the dimensions or variables with different names. -\cr\cr -In the example used in parameter 'return_vars', it may be the case that -the two involved data sources follow slightly different naming conventions. -For example, source A uses 'sect' as name for the sections dimension, whereas -source B uses 'section'; source A uses 'store_loc' as variable name for the -store locations, whereas source B uses 'store_location'. This can be taken -into account as follows: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item')), -\cr # synonims = list( -\cr # section = c('sec', 'section'), -\cr # store_location = c('store_loc', -\cr # 'store_location') -\cr # )) -} -\cr} - -\item{file_opener}{A function that receives as a single parameter - 'file_path' a character string with the path to a file to be opened, - and returns an object with an open connection to the file (optionally with - header information) on success, or returns NULL on failure. -\cr\cr -This parameter takes by default NcOpener() (an opener function for NetCDF -files). 
-\cr\cr -See NcOpener() for a template to build a file opener for your own file -format.} - -\item{file_var_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, - \code{synonims} that returns an array with auxiliary data (i.e. data from a - variable) inside a file. Start() will provide automatically either a - 'file_path' or a 'file_object' to the 'file_var_reader' - function (the function has to be ready to work whichever of these two is - provided). The parameter 'file_selectors' will also be provided - automatically to the variable reader, containing a named list where the - names are the names of the file dimensions of the queried data set (see - documentation on \code{\dots}) and the values are single character strings - with the components used to build the path to the file being read (the one - provided in 'file_path' or 'file_object'). The parameter 'var_name' - will be filled in automatically by Start() also, with the name of one - of the variales to be read. The parameter 'synonims' will be filled in - with exactly the same value as provided in the parameter 'synonims' in - the call to Start(), and has to be used in the code of the variable - reader to check for alternative variable names inside the target file. The - 'file_var_reader' must return a (multi)dimensional array with named - dimensions, and optionally with the attribute 'variales' with other - additional metadata on the retrieved variable. -\cr\cr -Usually, the 'file_var_reader' should be a degenerate case of the -'file_data_reader' (see documentation on the corresponding parameter), -so it is recommended to code the 'file_data_reder' in first place. -\cr\cr -This parameter takes by default NcVarReader() (a variable reader function -for NetCDF files). -\cr\cr -See NcVarReader() for a template to build a variale reader for your own -file format.} - -\item{file_dim_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} - that returns a named numeric vector where the names are the names of the - dimensions of the multidimensional data array in the file and the values are - the sizes of such dimensions. Start() will provide automatically - either a 'file_path' or a 'file_object' to the - 'file_dim_reader' function (the function has to be ready to work - whichever of these two is provided). The parameter 'file_selectors' - will also be provided automatically to the dimension reader, containing a - named list where the names are the names of the file dimensions of the - queried data set (see documentation on \code{\dots}) and the values are - single character strings with the components used to build the path to the - file being read (the one provided in 'file_path' or 'file_object'). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and can optionally be used in advanced configurations. -\cr\cr -This parameter takes by default NcDimReader() (a dimension reader -function for NetCDF files). 
-\cr\cr -See NcDimReader() for (an advanced) template to build a dimension reader -for your own file format.} - -\item{file_data_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, - \code{inner_indices = NULL}, \code{synonims} that returns a subset of the - multidimensional data array inside a file (even if internally it is not an - array). Start() will provide automatically either a 'file_path' - or a 'file_object' to the 'file_data_reader' function (the - function has to be ready to work whichever of these two is provided). The - parameter 'file_selectors' will also be provided automatically to the - data reader, containing a named list where the names are the names of the - file dimensions of the queried data set (see documentation on \code{\dots}) - and the values are single character strings with the components used to - build the path to the file being read (the one provided in 'file_path' or - 'file_object'). The parameter 'inner_indices' will be filled in - automatically by Start() also, with a named list of numeric vectors, - where the names are the names of all the expected inner dimensions in a file - to be read, and the numeric vectors are the indices to be taken from the - corresponding dimension (the indices may not be consecutive nor in order). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and has to be used in the code of the data reader to check for alternative - dimension names inside the target file. The 'file_data_reader' must - return a (multi)dimensional array with named dimensions, and optionally with - the attribute 'variables' with other additional metadata on the retrieved - data. -\cr\cr -Usually, 'file_data_reader' should use 'file_dim_reader' -(see documentation on the corresponding parameter), so it is recommended to -code 'file_dim_reder' in first place. -\cr\cr -This parameter takes by default NcDataReader() (a data reader function -for NetCDF files). -\cr\cr -See NcDataReader() for a template to build a data reader for your own -file format.} - -\item{file_closer}{A function that receives as a single parameter - 'file_object' an open connection (as returned by 'file_opener') - to one of the files to be read, optionally with header information, and - closes the open connection. Always returns NULL. -\cr\cr -This parameter takes by default NcCloser() (a closer function for NetCDF -files). -\cr\cr -See NcCloser() for a template to build a file closer for your own file -format.} - -\item{transform}{A function with the header \code{dara_array}, -\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as -input, through the parameter \code{data_array}, a subset of a -multidimensional array (as returned by 'file_data_reader'), applies a -transformation to it and returns it, preserving the amount of dimensions but -potentially modifying their size. This transformation may require data from -other auxiliary variables, automatically provided to 'transform' -through the parameter 'variables', in the form of a named list where -the names are the variable names and the values are (multi)dimensional -arrays. Which variables need to be sent to 'transform' can be specified -with the parameter 'transform_vars' in Start(). 
The parameter -'file_selectors' will also be provided automatically to -'transform', containing a named list where the names are the names of -the file dimensions of the queried data set (see documentation on -\code{\dots}) and the values are single character strings with the -components used to build the path to the file the subset being processed -belongs to. The parameter \code{\dots} will be filled in with other -additional parameters to adjust the transformation, exactly as provided in -the call to Start() via the parameter 'transform_params'.} - -\item{transform_params}{A named list with additional parameters to be sent to -the 'transform' function (if specified). See documentation on parameter -'transform' for details.} - -\item{transform_vars}{A vector of character strings with the names of -auxiliary variables to be sent to the 'transform' function (if -specified). All the variables to be sent to 'transform' must also -have been requested as return variables in the parameter 'return_vars' -of Start().} - -\item{transform_extra_cells}{An integer of extra indices to retrieve from the -data set, beyond the requested indices in \code{\dots}, in order for -'transform' to dispose of additional information to properly apply -whichever transformation (if needed). As many as -'transform_extra_cells' will be retrieved beyond each of the limits for -each of those inner dimensions associated to a coordinate variable and sent -to 'transform' (i.e. present in 'transform_vars'). After -'transform' has finished, Start() will take again and return a -subset of the result, for the returned data to fall within the specified -bounds in \code{\dots}. The default value is 2.} - -\item{apply_indices_after_transform}{A logical value indicating when a -'transform' is specified in Start() and numeric indices are -provided for any of the inner dimensions that depend on coordinate variables, -these numeric indices can be made effective (retrieved) before applying the -transformation or after. The boolean flag allows to adjust this behaviour. -It takes FALSE by default (numeric indices are applied before sending -data to 'transform').} - -\item{pattern_dims}{A character string indicating the name of the dimension -with path pattern specifications (see \code{\dots} for details). If not -specified, Start() assumes the first provided dimension is the pattern -dimension, with a warning.} - -\item{metadata_dims}{A vector of character strings with the names of the file -dimensions which to return metadata for. As noted in 'file_data_reader', -the data reader can optionally return auxiliary data via the attribute -'variables' of the returned array. Start() by default returns the -auxiliary data read for only the first file of each source (or data set) in -the pattern dimension (see \code{\dots} for info on what the pattern -dimension is). However it can be configured to return the metadata for all -the files along any set of file dimensions. The default value is NULL, and -it will be assigned automatically as parameter 'pattern_dims'.} - -\item{selector_checker}{A function used internaly by Start() to -translate a set of selectors (values for a dimension associated to a -coordinate variable) into a set of numeric indices. It takes by default -SelectorChecker() and, in principle, it should not be required to -change it for customized file formats. The option to replace it is left open -for more versatility. 
See the code of SelectorChecker() for details on -the inputs, functioning and outputs of a selector checker.} - -\item{merge_across_dims}{A logical value indicating whether to merge -dimensions across which another dimension extends (according to the -'_across' parameters). Takes the value FALSE by default. For -example, if the dimension 'time' extends across the dimension 'chunk' and -\code{merge_across_dims = TRUE}, the resulting data array will only contain -only the dimension 'time' as long as all the chunks together.} - -\item{merge_across_dims_narm}{A logical value indicating whether to remove -the additional NAs from data when parameter 'merge_across_dims' is TRUE. -It is helpful when the length of the to-be-merged dimension is different -across another dimension. For example, if the dimension 'time' extends -across dimension 'chunk', and the time length along the first chunk is 2 -while along the second chunk is 10. Setting this parameter as TRUE can -remove the additional 8 NAs at position 3 to 10. The default value is FALSE.} - -\item{split_multiselected_dims}{A logical value indicating whether to split a -dimension that has been selected with a multidimensional array of selectors -into as many dimensions as present in the selector array. The default value -is FALSE.} - -\item{path_glob_permissive}{A logical value or an integer specifying how many - folder levels in the path pattern, beginning from the end, the shell glob - expressions must be preserved and worked out for each file. The default - value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr -When specifying a path pattern for a dataset, it might contain shell glob -experissions. For each dataset, the first file matching the path pattern is -found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. However, in -some cases, the values of the shell glob expressions may not be constant for -all files in a dataset, and they need to be worked out for each file -involved.\cr\cr -For example, a path pattern could be as follows: \cr -\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving -\code{path_glob_permissive = FALSE} will trigger automatic seek of the - contents to replace the asterisks (e.g. the first asterisk matches with - \code{'bar'} and the second with \code{'baz'}. The found contents will be - used for all files in the dataset (in the example, the path pattern will be - fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if - any of the files in the dataset have other contents in the position of the - asterisks, Start() will not find them (in the example, a file like \cr - \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be - found). Setting \code{path_glob_permissive = 1} would preserve global - expressions in the latest level (in the example, the fixed path pattern - would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the - problematic file mentioned before would be found), but of course this would - slow down the Start() call if the dataset involves a large number of - files. 
Setting \code{path_glob_permissive = 2} would leave the original path - pattern with the original glob expressions in the 1st and 2nd levels (in the - example, both asterisks would be preserved, thus would allow Start() - to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr -Note that each glob expression can only represent one possibility (Start() -chooses the first). Because /code{*} is not the tag, which means it cannot -be a dimension of the output array. Therefore, only one possibility can be -adopted. For example, if \cr -\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr -has two matches:\cr -\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr -\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr -only the first found file will be used.} - -\item{largest_dims_length}{A logical value or a named integer vector - indicating if Start() should examine all the files to get the largest - length of the inner dimensions (TRUE) or use the first valid file of each - dataset as the returned dimension length (FALSE). Since examining all the - files could be time-consuming, a vector can be used to explicitly specify - the expected length of the inner dimensions. For those inner dimensions not - specified, the first valid file will be used. The default value is FALSE.\cr\cr - This parameter is useful when the required files don't have consistent - inner dimension. For example, there are 10 required experimental data files - of a series of start dates. The data only contain 25 members for the first - 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, - the returned member dimension length will be 25 only. The 26th to 51st - members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, - the returned member dimension length will be 51. To save the resource, -\code{'largest_dims_length = c(member = 51)'} can also be used.} - -\item{retrieve}{A logical value indicating whether to retrieve the data -defined in the Start() call or to explore only its dimension lengths -and names, and the values for the file and inner dimensions. The default -value is FALSE.} - -\item{num_procs}{An integer of number of processes to be created for the -parallel execution of the retrieval/transformation/arrangement of the -multiple involved files in a call to Start(). If set to NULL, -takes the number of available cores (as detected by detectCores() in -the package 'future'). The default value is 1 (no parallel execution).} - -\item{ObjectBigmemory}{a character string to be included as part of the -bigmemory object name. This parameter is thought to be used internally by the -chunking capabilities of startR.} - -\item{silent}{A logical value of whether to display progress messages (FALSE) -or not (TRUE). The default value is FALSE.} - -\item{debug}{A logical value of whether to return detailed messages on the -progress and operations in a Start() call (TRUE) or not (FALSE). The -default value is FALSE.} - \item{\dots}{A selection of custemized parameters depending on the data format. When we retrieve data from one or a collection of data sets, the involved data can be perceived as belonging to a large multi-dimensional @@ -620,61 +248,450 @@ as they have been specified in the call. 
For example, the following call: \cr
# variable = 'all') }
\cr\cr
-would return an array with the following dimensions:
-\cr
-\command{
-\cr # source month store item section variable
-\cr # 1 24 100 3 2 2
-}
+would return an array with the following dimensions:
+\cr
+\command{
+\cr # source month store item section variable
+\cr # 1 24 100 3 2 2
+}
+\cr\cr
+Next, a more advanced example to retrieve data for only the sales records, for
+the first section ('electronics'), for the 1st and 3rd items and for the
+stores located in Barcelona (assuming the files contain the variable
+'store_location' with the name of the city each of the 100 stores are located
+at):
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr #                               '$section$/$item$.data'),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = NULL))
+}
+\cr\cr
+The defined names for the dimensions do not necessarily have to match the
+names of the dimensions inside the file. Lists of alternative names to be
+sought can be defined in the parameter 'synonims'.
+\cr\cr
+If data from multiple sources (not necessarily following the same structure)
+has to be retrieved, it can be done by providing a vector of character strings
+with path pattern specifications, or, in the extended form, by providing a
+list of lists with the components 'name' and 'path', and the name of the
+dataset and path pattern as values, respectively. For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = NULL))
+}
+\cr}
+
+\item{return_vars}{A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimensions for which to retrieve
+each variable, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.\cr\cr
+Apart from retrieving a multidimensional data array, retrieving auxiliary
+variables inside the files can also be needed. The parameter
+'return_vars' allows for requesting such variables, as long as a
+'file_var_reader' function is also specified in the call to
+Start() (see documentation on the corresponding parameter).
+\cr\cr
+In the case of the item sales example (see documentation on parameter
+\code{\dots}), the store location variable is requested with the parameter\cr
+\code{return_vars = list(store_location = NULL)}.\cr This will cause
+Start() to fetch once the variable 'store_location' and return it in
+the component\cr \code{$Variables$common$store_location},\cr and will be an
+array of character strings with the location names, with the dimensions
+\code{c('store' = 100)}. 
Although useless in this example, we could ask
+Start() to fetch and return such variable for each file along the
+items dimension as follows: \cr
+\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+variable will be fetched once from a file of each of the items, and will be
+returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+\cr\cr
+If a variable is requested along a file dimension that contains path pattern
+specifications ('source' in the example), the fetched variable values will be
+returned in the component\cr \code{$Variables$$}.\cr
+For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = c('source',
+\cr #                                                     'item')))
+\cr # # Checking the structure of the returned variables
+\cr # str(found_data$Variables)
+\cr # Named list
+\cr # ..$common: NULL
+\cr # ..$sourceA: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # ..$sourceB: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # # Checking the dimensions of the returned variable
+\cr # # for the source A
+\cr # dim(found_data$Variables$sourceA)
+\cr # item store
+\cr # 3 3
+}
+\cr\cr
+The names of the requested variables do not necessarily have to match the
+actual variable names inside the files. A list of alternative names to be
+sought can be specified via the parameter 'synonims'.}
+
+\item{synonims}{A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter allows Start() to
+properly identify the dimensions or variables with different names.
+\cr\cr
+In the example used in parameter 'return_vars', it may be the case that
+the two involved data sources follow slightly different naming conventions.
+For example, source A uses 'sec' as name for the sections dimension, whereas
+source B uses 'section'; source A uses 'store_loc' as variable name for the
+store locations, whereas source B uses 'store_location'. 
This can be taken
+into account as follows:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = c('source',
+\cr #                                                     'item')),
+\cr #               synonims = list(
+\cr #                  section = c('sec', 'section'),
+\cr #                  store_location = c('store_loc',
+\cr #                                     'store_location')
+\cr #               ))
+}
+\cr}
+
+\item{file_opener}{A function that receives as a single parameter
+  'file_path' a character string with the path to a file to be opened,
+  and returns an object with an open connection to the file (optionally with
+  header information) on success, or returns NULL on failure.
+\cr\cr
+This parameter takes by default NcOpener() (an opener function for NetCDF
+files).
+\cr\cr
+See NcOpener() for a template to build a file opener for your own file
+format.}
+
+\item{file_var_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
+  \code{synonims} that returns an array with auxiliary data (i.e. data from a
+  variable) inside a file. Start() will provide automatically either a
+  'file_path' or a 'file_object' to the 'file_var_reader'
+  function (the function has to be ready to work whichever of these two is
+  provided). The parameter 'file_selectors' will also be provided
+  automatically to the variable reader, containing a named list where the
+  names are the names of the file dimensions of the queried data set (see
+  documentation on \code{\dots}) and the values are single character strings
+  with the components used to build the path to the file being read (the one
+  provided in 'file_path' or 'file_object'). The parameter 'var_name'
+  will be filled in automatically by Start() also, with the name of one
+  of the variables to be read. The parameter 'synonims' will be filled in
+  with exactly the same value as provided in the parameter 'synonims' in
+  the call to Start(), and has to be used in the code of the variable
+  reader to check for alternative variable names inside the target file. The
+  'file_var_reader' must return a (multi)dimensional array with named
+  dimensions, and optionally with the attribute 'variables' with other
+  additional metadata on the retrieved variable.
+\cr\cr
+Usually, the 'file_var_reader' should be a degenerate case of the
+'file_data_reader' (see documentation on the corresponding parameter),
+so it is recommended to code the 'file_data_reader' first.
+\cr\cr
+This parameter takes by default NcVarReader() (a variable reader function
+for NetCDF files).
+\cr\cr
+See NcVarReader() for a template to build a variable reader for your own
+file format.}
+
+\item{file_dim_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+  that returns a named numeric vector where the names are the names of the
+  dimensions of the multidimensional data array in the file and the values are
+  the sizes of such dimensions. 
Start() will provide automatically
+  either a 'file_path' or a 'file_object' to the
+  'file_dim_reader' function (the function has to be ready to work
+  whichever of these two is provided). The parameter 'file_selectors'
+  will also be provided automatically to the dimension reader, containing a
+  named list where the names are the names of the file dimensions of the
+  queried data set (see documentation on \code{\dots}) and the values are
+  single character strings with the components used to build the path to the
+  file being read (the one provided in 'file_path' or 'file_object').
+  The parameter 'synonims' will be filled in with exactly the same value
+  as provided in the parameter 'synonims' in the call to Start(),
+  and can optionally be used in advanced configurations.
+\cr\cr
+This parameter takes by default NcDimReader() (a dimension reader
+function for NetCDF files).
+\cr\cr
+See NcDimReader() for (an advanced) template to build a dimension reader
+for your own file format.}
+
+\item{file_data_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL},
+  \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
+  multidimensional data array inside a file (even if internally it is not an
+  array). Start() will provide automatically either a 'file_path'
+  or a 'file_object' to the 'file_data_reader' function (the
+  function has to be ready to work whichever of these two is provided). The
+  parameter 'file_selectors' will also be provided automatically to the
+  data reader, containing a named list where the names are the names of the
+  file dimensions of the queried data set (see documentation on \code{\dots})
+  and the values are single character strings with the components used to
+  build the path to the file being read (the one provided in 'file_path' or
+  'file_object'). The parameter 'inner_indices' will be filled in
+  automatically by Start() also, with a named list of numeric vectors,
+  where the names are the names of all the expected inner dimensions in a file
+  to be read, and the numeric vectors are the indices to be taken from the
+  corresponding dimension (the indices may not be consecutive nor in order).
+  The parameter 'synonims' will be filled in with exactly the same value
+  as provided in the parameter 'synonims' in the call to Start(),
+  and has to be used in the code of the data reader to check for alternative
+  dimension names inside the target file. The 'file_data_reader' must
+  return a (multi)dimensional array with named dimensions, and optionally with
+  the attribute 'variables' with other additional metadata on the retrieved
+  data.
+\cr\cr
+Usually, 'file_data_reader' should use 'file_dim_reader'
+(see documentation on the corresponding parameter), so it is recommended to
+code 'file_dim_reader' first. 
\cr\cr
-Next, a more advanced example to retrieve data for only the sales records, for
-the first section ('electronics'), for the 1st and 3rd items and for the
-stores located in Barcelona (assuming the files contain the variable
-'store_location' with the name of the city each of the 100 stores are located
-at):
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr #                               '$section$/$item$.data'),
-\cr #               variable = 'sales',
-\cr #               section = 'first',
-\cr #               item = indices(c(1, 3)),
-\cr #               item_depends = 'section',
-\cr #               store = 'Barcelona',
-\cr #               store_var = 'store_location',
-\cr #               month = 'all',
-\cr #               return_vars = list(store_location = NULL))
-}
+This parameter takes by default NcDataReader() (a data reader function
+for NetCDF files).
\cr\cr
-The defined names for the dimensions do not necessarily have to match the
-names of the dimensions inside the file. Lists of alternative names to be
-seeked can be defined in the parameter 'synonims'.
+See NcDataReader() for a template to build a data reader for your own
+file format.}
+
+\item{file_closer}{A function that receives as a single parameter
+  'file_object' an open connection (as returned by 'file_opener')
+  to one of the files to be read, optionally with header information, and
+  closes the open connection. Always returns NULL.
\cr\cr
-If data from multiple sources (not necessarily following the same structure)
-has to be retrieved, it can be done by providing a vector of character strings
-with path pattern specifications, or, in the extended form, by providing a
-list of lists with the components 'name' and 'path', and the name of the
-dataset and path pattern as values, respectively. For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr #                list(name = 'sourceA',
-\cr #                     path = paste0('/sourceA/$variable$/',
-\cr #                                   '$section$/$item$.data')),
-\cr #                list(name = 'sourceB',
-\cr #                     path = paste0('/sourceB/$section$/',
-\cr #                                   '$variable$/$item$.data'))
-\cr #               ),
-\cr #               variable = 'sales',
-\cr #               section = 'first',
-\cr #               item = indices(c(1, 3)),
-\cr #               item_depends = 'section',
-\cr #               store = 'Barcelona',
-\cr #               store_var = 'store_location',
-\cr #               month = 'all',
-\cr #               return_vars = list(store_location = NULL))
-}
-\cr}
+This parameter takes by default NcCloser() (a closer function for NetCDF
+files).
+\cr\cr
+See NcCloser() for a template to build a file closer for your own file
+format.}
+
+\item{transform}{A function with the header \code{data_array},
+\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+input, through the parameter \code{data_array}, a subset of a
+multidimensional array (as returned by 'file_data_reader'), applies a
+transformation to it and returns it, preserving the amount of dimensions but
+potentially modifying their size. This transformation may require data from
+other auxiliary variables, automatically provided to 'transform'
+through the parameter 'variables', in the form of a named list where
+the names are the variable names and the values are (multi)dimensional
+arrays. Which variables need to be sent to 'transform' can be specified
+with the parameter 'transform_vars' in Start(). The parameter
+'file_selectors' will also be provided automatically to
+'transform', containing a named list where the names are the names of
+the file dimensions of the queried data set (see documentation on
+\code{\dots}) and the values are single character strings with the
+components used to build the path to the file the subset being processed
+belongs to. 
The parameter \code{\dots} will be filled in with other
+additional parameters to adjust the transformation, exactly as provided in
+the call to Start() via the parameter 'transform_params'.}
+
+\item{transform_params}{A named list with additional parameters to be sent to
+the 'transform' function (if specified). See documentation on parameter
+'transform' for details.}
+
+\item{transform_vars}{A vector of character strings with the names of
+auxiliary variables to be sent to the 'transform' function (if
+specified). All the variables to be sent to 'transform' must also
+have been requested as return variables in the parameter 'return_vars'
+of Start().}
+
+\item{transform_extra_cells}{An integer indicating the number of extra indices
+to retrieve from the
+data set, beyond the requested indices in \code{\dots}, in order for
+'transform' to dispose of additional information to properly apply
+whichever transformation (if needed). As many as
+'transform_extra_cells' will be retrieved beyond each of the limits for
+each of those inner dimensions associated to a coordinate variable and sent
+to 'transform' (i.e. present in 'transform_vars'). After
+'transform' has finished, Start() will take again and return a
+subset of the result, for the returned data to fall within the specified
+bounds in \code{\dots}. The default value is 2.}
+
+\item{apply_indices_after_transform}{A logical value indicating, when a
+'transform' is specified in Start() and numeric indices are
+provided for any of the inner dimensions that depend on coordinate variables,
+whether to make those numeric indices effective (retrieved) before applying the
+transformation or after. This flag allows adjusting the behaviour.
+It takes FALSE by default (numeric indices are applied before sending
+data to 'transform').}
+
+\item{pattern_dims}{A character string indicating the name of the dimension
+with path pattern specifications (see \code{\dots} for details). If not
+specified, Start() assumes the first provided dimension is the pattern
+dimension, with a warning.}
+
+\item{metadata_dims}{A vector of character strings with the names of the file
+dimensions for which to return metadata. As noted in 'file_data_reader',
+the data reader can optionally return auxiliary data via the attribute
+'variables' of the returned array. Start() by default returns the
+auxiliary data read for only the first file of each source (or data set) in
+the pattern dimension (see \code{\dots} for info on what the pattern
+dimension is). However, it can be configured to return the metadata for all
+the files along any set of file dimensions. The default value is NULL, and
+it will be assigned automatically as parameter 'pattern_dims'.}
+
+\item{selector_checker}{A function used internally by Start() to
+translate a set of selectors (values for a dimension associated to a
+coordinate variable) into a set of numeric indices. It takes by default
+SelectorChecker() and, in principle, it should not be required to
+change it for customized file formats. The option to replace it is left open
+for more versatility. See the code of SelectorChecker() for details on
+the inputs, functioning and outputs of a selector checker.}
+
+\item{merge_across_dims}{A logical value indicating whether to merge
+dimensions across which another dimension extends (according to the
+'_across' parameters). Takes the value FALSE by default. 
For
+example, if the dimension 'time' extends across the dimension 'chunk' and
+\code{merge_across_dims = TRUE}, the resulting data array will contain
+only the dimension 'time', as long as all the chunks together.}
+
+\item{merge_across_dims_narm}{A logical value indicating whether to remove
+the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+It is helpful when the length of the to-be-merged dimension is different
+across another dimension. For example, if the dimension 'time' extends
+across dimension 'chunk', the time length along the first chunk may be 2
+while along the second chunk 10. Setting this parameter to TRUE can
+remove the additional 8 NAs at positions 3 to 10. The default value is FALSE.}
+
+\item{split_multiselected_dims}{A logical value indicating whether to split a
+dimension that has been selected with a multidimensional array of selectors
+into as many dimensions as present in the selector array. The default value
+is FALSE.}
+
+\item{path_glob_permissive}{A logical value or an integer specifying for how many
+  folder levels in the path pattern, beginning from the end, the shell glob
+  expressions must be preserved and worked out for each file. The default
+  value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
+When specifying a path pattern for a dataset, it might contain shell glob
+expressions. For each dataset, the first file matching the path pattern is
+found, and the found file is used to work out fixed values for the glob
+expressions that will be used for all the files of the dataset. However, in
+some cases, the values of the shell glob expressions may not be constant for
+all files in a dataset, and they need to be worked out for each file
+involved.\cr\cr
+For example, a path pattern could be as follows: \cr
+\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+\code{path_glob_permissive = FALSE} will trigger automatic seek of the
+  contents to replace the asterisks (e.g. the first asterisk matches with
+  \code{'bar'} and the second with \code{'baz'}). The found contents will be
+  used for all files in the dataset (in the example, the path pattern will be
+  fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
+  any of the files in the dataset have other contents in the position of the
+  asterisks, Start() will not find them (in the example, a file like \cr
+  \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+  found). Setting \code{path_glob_permissive = 1} would preserve glob
+  expressions in the last level (in the example, the fixed path pattern
+  would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+  problematic file mentioned before would be found), but of course this would
+  slow down the Start() call if the dataset involves a large number of
+  files. Setting \code{path_glob_permissive = 2} would leave the original path
+  pattern with the original glob expressions in the 1st and 2nd levels (in the
+  example, both asterisks would be preserved, thus allowing Start()
+  to recognize files such as \cr
+  \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+Note that each glob expression can only represent one possibility (Start()
+chooses the first). Because \code{*} is not a tag, it cannot
+be a dimension of the output array. Therefore, only one possibility can be
+adopted. 
For example, if \cr
+\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+has two matches:\cr
+\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+only the first found file will be used.}
+
+\item{largest_dims_length}{A logical value or a named integer vector
+  indicating if Start() should examine all the files to get the largest
+  length of the inner dimensions (TRUE) or use the first valid file of each
+  dataset as the returned dimension length (FALSE). Since examining all the
+  files could be time-consuming, a vector can be used to explicitly specify
+  the expected length of the inner dimensions. For those inner dimensions not
+  specified, the first valid file will be used. The default value is FALSE.\cr\cr
+  This parameter is useful when the required files don't have consistent
+  inner dimension lengths. For example, there are 10 required experimental data files
+  of a series of start dates. The data contain only 25 members for the first
+  2 years but 51 members for the later years. If \code{'largest_dims_length = FALSE'},
+  the returned member dimension length will be 25 only. The 26th to 51st
+  members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+  the returned member dimension length will be 51. To save resources,
+\code{'largest_dims_length = c(member = 51)'} can also be used.}
+
+\item{retrieve}{A logical value indicating whether to retrieve the data
+defined in the Start() call or to explore only its dimension lengths
+and names, and the values for the file and inner dimensions. The default
+value is FALSE.}
+
+\item{num_procs}{An integer indicating the number of processes to be created for the
+parallel execution of the retrieval/transformation/arrangement of the
+multiple involved files in a call to Start(). If set to NULL,
+takes the number of available cores (as detected by detectCores() in
+the package 'future'). The default value is 1 (no parallel execution).}
+
+\item{ObjectBigmemory}{A character string to be included as part of the
+bigmemory object name. This parameter is thought to be used internally by the
+chunking capabilities of startR.}
+
+\item{silent}{A logical value of whether to display progress messages (FALSE)
+or not (TRUE). The default value is FALSE.}
+
+\item{debug}{A logical value of whether to return detailed messages on the
+progress and operations in a Start() call (TRUE) or not (FALSE). The
+default value is FALSE.}
 }
 \value{
 If \code{retrieve = TRUE} the involved data is loaded into RAM memory
@@ -830,4 +847,3 @@ file format.
 retrieve = FALSE)
 }
 
-
diff --git a/man/Step.Rd b/man/Step.Rd
index 65f0c72..c473ccb 100644
--- a/man/Step.Rd
+++ b/man/Step.Rd
@@ -4,8 +4,13 @@
 \alias{Step}
 \title{Define the operation applied on declared data.}
 \usage{
-Step(fun, target_dims, output_dims, use_libraries = NULL,
-  use_attributes = NULL)
+Step(
+  fun,
+  target_dims,
+  output_dims,
+  use_libraries = NULL,
+  use_attributes = NULL
+)
 }
 \arguments{
 \item{fun}{A function in R format defining the operation to be applied to the
@@ -70,4 +75,3 @@ to the expected order for this function.
 wf <- AddStep(data, step)
 }
 
-
diff --git a/man/indices.Rd b/man/indices.Rd
index a3d85ea..6233b71 100644
--- a/man/indices.Rd
+++ b/man/indices.Rd
@@ -39,4 +39,3 @@ original data. 
See details in the documentation of the parameter
\code{\dots}
\seealso{
\code{\link{values}}
}
-
diff --git a/man/multiStart.Rd b/man/multiStart.Rd
new file mode 100644
index 0000000..a4fa459
--- /dev/null
+++ b/man/multiStart.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/multiStart.R
+\name{multiStart}
+\alias{multiStart}
+\title{How to use:
+1. Allows reading multiple datasets with different selectors.
+2. A list of lists is used when different selectors are required, just like 'dat'.
+   The items in each list must be 'name' and the dimension name. 'name' is the
+   dataset name, same as the one in 'dat'.
+3. Only indices() is accepted. (For values(), use SelectorChecker() to transform?)
+4. NAs are allowed in the selectors. Those NAs will be removed when being fed to
+   Start(), and inserted back after Start().}
+\usage{
+multiStart(
+  ...,
+  return_vars = NULL,
+  synonims = NULL,
+  file_opener = NcOpener,
+  file_var_reader = NcVarReader,
+  file_dim_reader = NcDimReader,
+  file_data_reader = NcDataReader,
+  file_closer = NcCloser,
+  transform = NULL,
+  transform_params = NULL,
+  transform_vars = NULL,
+  transform_extra_cells = 2,
+  apply_indices_after_transform = FALSE,
+  pattern_dims = NULL,
+  metadata_dims = NULL,
+  selector_checker = SelectorChecker,
+  merge_across_dims = FALSE,
+  merge_across_dims_narm = FALSE,
+  split_multiselected_dims = FALSE,
+  path_glob_permissive = FALSE,
+  largest_dims_length = FALSE,
+  retrieve = FALSE,
+  num_procs = 1,
+  ObjectBigmemory = NULL,
+  silent = FALSE,
+  debug = FALSE
+)
+}
+\description{
+NOTE:
+1. No reshaping (merge_across_dims is supported, but other reshape options are not)
+2. The input check is not comprehensive
+}
diff --git a/man/values.Rd b/man/values.Rd
index 3300f19..31ce95a 100644
--- a/man/values.Rd
+++ b/man/values.Rd
@@ -41,4 +41,3 @@ coordinate variable. See details in the documentation of the parameter
 \seealso{
 \code{\link{indices}}
 }
-
-- 
GitLab


From 4171bf02e6a03a85483f26e5881110570dd95af8 Mon Sep 17 00:00:00 2001
From: aho
Date: Tue, 16 Mar 2021 12:47:35 +0100
Subject: [PATCH 05/14] Create use cases for multiStart
---
 .../ex1_13_multiStart_insert_NA_reshape.R     |  36 ++++
 .../doc/usecase/ex2_11_multiStart_insert_NA.R | 192 ++++++++++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R
 create mode 100644 inst/doc/usecase/ex2_11_multiStart_insert_NA.R

diff --git a/inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R b/inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R
new file mode 100644
index 0000000..cb94d6b
--- /dev/null
+++ b/inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R
@@ -0,0 +1,36 @@
+#-------------------------------------------------
+# Inserting an NA doesn't make much sense in this case, but it shows the ability
+# of multiStart() to deal with NAs while reshaping. If you have a more meaningful
+# example, please let us know.
+
+#NOTE: It may not work with 'split_multiselected_dims'. Need further tests.
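+#
+# What to expect from the call below: with merge_across_dims = TRUE, the time
+# steps of the three sdate files are merged into one 'time' dimension, and the
+# NA in indices(c(1, 2, NA, 3)) becomes an all-NA slice at the third position
+# of the merged 'time' dimension (see the printed output at the end).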
+#-------------------------------------------------
+
+#===================================
+# merge_across_dims; two variables
+#===================================
+
+path <- '/esarchive/recon/ecmwf/erainterim/monthly_mean/$var$_f6h/$var$_$sdate$.nc'
+sdate <- c('199801', '199802', '199803')
+time <- indices(c(1, 2, NA, 3))
+res <- multiStart(
+        dat = list(list(name = 'erainterim', path = path)),
+        var = c('ps', 'tas'),
+        sdate = sdate,
+        time = list(list(name = 'erainterim', time = time)),
+        time_across = 'sdate',
+        merge_across_dims = TRUE,
+        lat = indices(1), lon = indices(1),
+        synonims = list(lat = c('lat', 'latitude'),
+                        lon = c('lon', 'longitude')),
+        return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
+        retrieve = TRUE)
+
+dim(res)
+# dat  var time  lat  lon
+#   1    2    4    1    1
+res[1, , , 1, 1]
+#        [,1]        [,2] [,3]        [,4]
+#[1,] 102528.9219 101670.4375   NA 102194.1484
+#[2,]    244.3599    242.1576   NA    243.7157
+
diff --git a/inst/doc/usecase/ex2_11_multiStart_insert_NA.R b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R
new file mode 100644
index 0000000..16e2e7c
--- /dev/null
+++ b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R
@@ -0,0 +1,192 @@
+#------------------------------------------------------
+# This use case shows the basic usage of multiStart(), the wrapper of Start()
+# for multiple datasets retrieval that have different selector requirement.
+# The usage is almost the same as Start(). To assign different selectors for
+# each dataset, use the same format as the pattern_dim: a list of lists, first
+# item 'name' is the dataset name and the second item '' is the selector.
+#
+# This use case wants to put HadGEM3 and MPI-ESM datasets together in one call.
+# The major differences between them are the spatial grids and calendar type.
+# HadGEM3 uses 360_day calendar and MPI-ESM uses standard calendar. For the grid
+# difference, the 'transform' parameters can deal with it; while for the calendar
+# difference, Start() has difficulty aligning the time steps. We cannot assign NAs
+# as the selector in Start(), and Start() coerces the values to the nearest found
+# values, producing hidden errors.
+#
+# Therefore, we use multiStart() to insert NAs manually. Three days, 30th Dec.,
+# 31st Dec., and 1st Jan. are requested. HadGEM3 with 360_day calendar doesn't
+# have 31st Dec., so we need to insert one NA at the 2nd time step. Be sure that
+# the lengths (NAs included) of the two time selectors are equal.
+#
+# Here are some limitations for now:
+# 1. The selector for each dataset can only be indices(), not values() or 'all'.
+# If the selector is the common one (e.g., 'sdate', 'lat', and 'lon' in this
+# use case), then it follows the rules of Start().
+# 2. The reshape options are not all available. See ex1_13 for details.
+# 3. The checker of multiStart() is not comprehensive. Please carefully examine
+# the results you get.
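+#
+# A minimal sketch of the per-dataset selector format used in this script (the
+# dataset names and indices are the ones defined below):
+#   time = list(list(name = 'hadgem3', time = indices(c(60, NA, 61))),
+#               list(name = 'mpi_esm', time = indices(60:62)))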
+#------------------------------------------------------ + +#======================================================= +# Define the paths and selectors +#======================================================= + + # Define paths + # HadGEM3 daily data (360_day) + path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/', + 'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/', + '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc') + ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc + + # MPI-ESM1.2-HR (standard) + path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/', + 'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/', + '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') + ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc + + # Define selectors for each dataset + sdate <- '2000' + fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230') + fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231') + # To align the time steps, insert an NA in hadgem3 + time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1 + time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1 + + +#======================================================= +# retrieve = TRUE; interpolation +#======================================================= + + res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', + sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + transform = CDORemapper, + transform_params = list(grid = 'r360x181', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = TRUE) + + # Check the returned values. hadgem3 has NA in the 2nd time step. + dim(res) + # dat var sdate fyear time lat lon + # 2 1 1 1 3 61 51 + res[,1,1,1,,10,10] + # [,1] [,2] [,3] + #[1,] 292.5023 NA 292.0899 + #[2,] 293.0914 293.4608 292.8224 + + + +#======================================================= +# retrieve = FALSE; Compute() +#======================================================= +#NOTE: To show a clearer concept of multiStart usage, the data are not interpolated +# here, but they should be. If interpolated, the chunking dimensions cannot be +# lat and lon (or the results contain many NAs.) + +res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = indices(3:5), lon = indices(1), + return_vars = list(lat = 'dat', lon = 'dat', time = 'dat'), + retrieve = FALSE) + +str(res) +#List of 2 +# $ hadgem3: +# .. +# .. +# $ mpi_esm: +# .. +# .. 
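
# Each element of 'res' corresponds to one dataset's startR call. The positions
# of the NAs that were removed before the internal Start() call are recorded in
# the attribute 'na_pos_record', inspected below (0 means no NA in that dimension).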
+ +str(attr(res[[1]], 'na_pos_record')) +#List of 1 +# $ hadgem3:List of 7 +# ..$ dat : num 0 +# ..$ var : num 0 +# ..$ sdate: num 0 +# ..$ fyear: num 0 +# ..$ time : int 2 +# ..$ lat : num 0 +# ..$ lon : num 0 + +str(attr(res[[2]], 'na_pos_record')) +#List of 1 +# $ mpi_esm:List of 7 +# ..$ dat : num 0 +# ..$ var : num 0 +# ..$ sdate: num 0 +# ..$ fyear: num 0 +# ..$ time : num 0 +# ..$ lat : num 0 +# ..$ lon : num 0 + + +func_sum <- function(x, y) { + return(x + y) +} + +# 'time' and 'fyear' can also be the operational dimension +step <- Step(func_sum, target_dims = list(hadgem3 = c('var', 'lon'), + mpi_esm = c('var', 'lon')), + output_dims = c('var', 'lon')) + +wf <- AddStep(res, step) + +# Option 1: run locally +output <- Compute(wf, chunks = list(lat = 2)) +output <- Compute(wf, chunks = list(lat = 2, time = 2)) + + +# Option 2: run on Nord3 +#-------------------user-defined--------------------- + queue_host <- 'nord1' + temp_dir <- '/gpfs/scratch/bsc32/bsc32734/startR_hpc/' + ecflow_suite_dir <- '/home/Earth/aho/startR_local/' +#---------------------------------------------------- + + output <- Compute(wf, + chunks = list(lat = 2, time = 2), + threads_load = 2, + threads_compute = 4, + cluster = list(queue_host = queue_host, + queue_type = 'lsf', + temp_dir = temp_dir, + cores_per_job = 2, + job_wallclock = '05:00', + max_jobs = 4, + extra_queue_params = list('#BSUB -q bsc_es'), + bidirectional = FALSE, + polling_period = 10 + ), + ecflow_suite_dir = ecflow_suite_dir, + wait = TRUE + ) + +dim(output$output1) +# var lon time dat sdate fyear lat +# 1 1 3 1 1 1 3 +output$output1[1, 1, , 1, 1, 1, ] +# [,1] [,2] [,3] +#[1,] 503.0177 502.6464 503.0280 +#[2,] NA NA NA +#[3,] 507.7226 507.1748 507.9795 + -- GitLab From a7605acaa326e350a55243f414d9eacfca3721bb Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 16 Mar 2021 12:57:25 +0100 Subject: [PATCH 06/14] Create use cases for multiStart. --- inst/doc/usecase.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index ac4e4ad..af88ea3 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -54,6 +54,10 @@ You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata This script shows how to load and plot data in rotated coordinates using **Monarch-dust** simulations. + 13. [Use multiStart to read data with NA assigned](inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R) + The script shows a simple case that uses multiStart to read data when NA insertion is needed in certain selectors. + + 2. **Execute computation (use `Compute()`)** 1. [Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R) 2. [Function using attributes of the data](inst/doc/usecase/ex2_2_attr.R) @@ -72,8 +76,8 @@ You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata 10. [Apply an existing mask on data](inst/doc/usecase/ex2_10_existing_mask.R) This use case shows you how to apply the existing mask file on your data. If you need to create the mask file on your own, go to ex2_9_mask.R. - - + 11. [Use multiStart to read two datasets with different calendars in one call](inst/doc/usecase/ex2_11_multiStart_insert_NA.R) + This use case decribes the features of multiStart and how to use it. It shows how to read two datasets with different calendars aligned together with NA inserted. It is recommended to read it first if you want to use multiStart(). 
-- GitLab From e8c606db81e4a79ec067fb74f88ea22ce363b72e Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 16 Mar 2021 13:24:01 +0100 Subject: [PATCH 07/14] Add unit test for multiStart --- tests/testthat/test-multiStart_NA_insertion.R | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 tests/testthat/test-multiStart_NA_insertion.R diff --git a/tests/testthat/test-multiStart_NA_insertion.R b/tests/testthat/test-multiStart_NA_insertion.R new file mode 100644 index 0000000..0c95803 --- /dev/null +++ b/tests/testthat/test-multiStart_NA_insertion.R @@ -0,0 +1,146 @@ +context("multiStart() NA insertion") +# Same case as ex2_11. +#--------------------------------------------------------------- + + # Define paths + # HadGEM3 daily data (360_day) + path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/', + 'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/', + '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc') + ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc + + # MPI-ESM1.2-HR (standard) + path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/', + 'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/', + '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') + ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc + + # Define selectors for each dataset + sdate <- '2000' + fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230') + fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231') + # To align the time steps, insert an NA in hadgem3 + time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1 + time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1 + + + + +test_that("1. retrieve = T", { + +suppressWarnings( +res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', + sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = values(list(-10, 10)), + lon = values(list(0, 10)), + transform = CDORemapper, + transform_params = list(grid = 'r360x181', + method = 'conservative', + crop = c(0, 10, -10, 10)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = TRUE) +) + + +expect_equal( +dim(res), +c(dat = 2, var = 1, sdate = 1, fyear = 1, time = 3, lat = 21, lon = 11) +) +expect_equal( +length(which(is.na(res))), +231 +) +expect_equal( +res[1, 1, 1, 1, , 1, 1], +c(297.5703, NA, 297.5506), +tolerance = 0.0001 +) + +}) + +test_that("2. 
retrieve = F; Compute locally", { + +suppressWarnings( +res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = indices(3:5), lon = indices(1), + return_vars = list(lat = 'dat', lon = 'dat', time = 'dat'), + retrieve = FALSE) +) + +#--------------------------------------- +expect_equal( +attr(res[[1]], 'Dimensions'), +c(dat = 1, var = 1, sdate = 1, fyear = 1, time = 3, lat = 3, lon = 1) +) +expect_equal( +attr(res[[2]], 'Dimensions'), +c(dat = 1, var = 1, sdate = 1, fyear = 1, time = 3, lat = 3, lon = 1) +) +expect_equal( +attr(res[[1]], 'na_pos_record')[[1]]$time, +2 +) +expect_equal( +sum(unlist(attr(res[[2]], 'na_pos_record')[[1]])), +0 +) + +#--------------------------------------- + +func_sum <- function(x, y) { + return(x + y) +} + +# 'time' and 'fyear' can also be the operational dimension +step <- Step(func_sum, target_dims = list(hadgem3 = c('var', 'lon'), + mpi_esm = c('var', 'lon')), + output_dims = c('var', 'lon')) + +wf <- AddStep(res, step) + +output1 <- Compute(wf, chunks = list(lat = 2)) +output2 <- Compute(wf, chunks = list(lat = 2, time = 2)) + +expect_equal( +dim(output1$output1), +c(var = 1, lon = 1, time = 3, dat = 1, sdate = 1, fyear = 1, lat = 3) +) +expect_equal( +dim(output2$output1), +c(var = 1, lon = 1, time = 3, dat = 1, sdate = 1, fyear = 1, lat = 3) +) +expect_equal( +output1$output1[1,1,,1,1,1,1], +c(503.0177, NA, 507.7226), +tolerance = 0.001 +) +expect_equal( +all(is.na(output1$output1[1,1,2,1,1,1,])), +TRUE +) + +expect_equal( +as.vector(output1$output1), +as.vector(output2$output1) +) + + + +}) -- GitLab From f0ab206dbca76e758ed9942c2aa36eea14eddb55 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 16 Mar 2021 17:21:23 +0100 Subject: [PATCH 08/14] Add some messages. 
--- R/multiStart.R | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/R/multiStart.R b/R/multiStart.R index 0be67a7..f8f929f 100644 --- a/R/multiStart.R +++ b/R/multiStart.R @@ -171,9 +171,12 @@ multiStart <- function(..., # dim = indices/selectors, } } + message("* Use Start() to get dataset '", dat_names[n_dat], "' now...") res[[n_dat]] <- do.call(Start, c(dim_params_list[[n_dat]], matchcall = quote(matchcall))) + message("* Start() has successfully gotten dataset '", dat_names[n_dat], "'.") } names(res) <- dat_names #e.g., c(hadgem3, mpi_esm) + #===================================================== if (retrieve) { @@ -224,6 +227,26 @@ multiStart <- function(..., # dim = indices/selectors, combine_array <- do.call(abind::abind, abind_input_list) names(dim(combine_array)) <- dim_names + if (!silent) { + .message("Final dimension sizes after combination:") + final_dims_combined <- dim(combine_array) + longest_dim_len <- max(sapply(names(final_dims_combined), nchar)) + longest_size_len <- max(sapply(paste0(final_dims_combined, ''), nchar)) + sapply(names(final_dims_combined), + function(x) { + message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), + x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_combined[x], ''))), collapse = ''), + final_dims_combined[x])) + }) + bytes <- prod(c(final_dims_combined, 8)) + + .message(paste("Total size of combined data:")) + dim_sizes <- paste(final_dims_combined, collapse = ' x ') + .message(paste(dim_sizes, " x 8 bytes =", + format(structure(bytes, class = "object_size"), units = "auto")), + indent = 2) + } + #---- restore metadata ---- ## combine_array already has 'dim' attributes. The rest is lost lost_list_ind <- which(!names(attributes(res[[1]])) %in% names(attributes(combine_array))) @@ -292,7 +315,41 @@ multiStart <- function(..., # dim = indices/selectors, attr(res[[n_dat]], 'Dimensions')[have_na_dims[have_na_dim]] <- attr(res[[n_dat]], 'Dimensions')[have_na_dims[have_na_dim]] + length(na_pos_record[[n_dat]][[have_na_dims[have_na_dim]]]) } } - + } + # Check if the datasets can be combined + dim_list <- lapply(1:ndat, function(x) { attr(res[[x]], 'Dimensions') } ) + found_pattern_dim_ind <-lapply(lapply( + lapply(dim_list, names), + '==', found_pattern_dim), which) + dim_list_without_dat <- lapply(1:ndat, function(x) { + dim_list[[x]] <- dim_list[[x]][-found_pattern_dim_ind[[x]]]}) + if (!do.call(identical, dim_list_without_dat)) { + .warning(paste0("The datasets are detected to have inconsistent dimensions. 
", + "They may not be able to combine for the later operation.")) + } else { + if (!silent) { + dim_list_dat <- lapply(1:ndat, function(x) { + dim_list[[x]] <- dim_list[[x]][found_pattern_dim_ind[[x]]]}) + combined_dat_length <- do.call(sum, dim_list_dat) + final_dims_combined <- attr(res[[1]], 'Dimensions') + final_dims_combined[found_pattern_dim] <- do.call(sum, dim_list_dat) + .message("Detected final dimension sizes after combination:") + longest_dim_len <- max(sapply(names(final_dims_combined), nchar)) + longest_size_len <- max(sapply(paste0(final_dims_combined, ''), nchar)) + sapply(names(final_dims_combined), + function(x) { + message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), + x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_combined[x], ''))), collapse = ''), + final_dims_combined[x])) + }) + bytes <- prod(c(final_dims_combined, 8)) + + .message(paste("Total size of combined data:")) + dim_sizes <- paste(final_dims_combined, collapse = ' x ') + .message(paste(dim_sizes, " x 8 bytes =", + format(structure(bytes, class = "object_size"), units = "auto")), + indent = 2) + } } return(res) -- GitLab From 7806ac8467951b8715f4c5a4866d2c1b3bd128a0 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 18 Mar 2021 14:56:09 +0100 Subject: [PATCH 09/14] Use transform for retrieve = F case --- .../doc/usecase/ex2_11_multiStart_insert_NA.R | 61 ++++++++++--------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/inst/doc/usecase/ex2_11_multiStart_insert_NA.R b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R index 16e2e7c..7186e81 100644 --- a/inst/doc/usecase/ex2_11_multiStart_insert_NA.R +++ b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R @@ -92,29 +92,36 @@ #======================================================= # retrieve = FALSE; Compute() #======================================================= -#NOTE: To show a clearer concept of multiStart usage, the data are not interpolated -# here, but they should be. If interpolated, the chunking dimensions cannot be -# lat and lon (or the results contain many NAs.) - -res <- multiStart( + res <- multiStart( dat = list(list(name = 'hadgem3', path = path_hadgem3), list(name = 'mpi_esm', path = path_mpi_esm)), - var = 'tasmax', sdate = '2000', + var = 'tasmax', + sdate = '2000', fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), list(name = 'mpi_esm', fyear = fyear_mpi_esm)), time = list(list(name = 'hadgem3', time = time_hadgem3), list(name = 'mpi_esm', time = time_mpi_esm)), - lat = indices(3:5), lon = indices(1), - return_vars = list(lat = 'dat', lon = 'dat', time = 'dat'), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + transform = CDORemapper, + transform_params = list(grid = 'r360x181', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), retrieve = FALSE) str(res) #List of 2 -# $ hadgem3: +# $ hadgem3: language Start ... +# ..- attr(*, "Dimensions") ... # .. +# ..- attr(*, "Variables") ... # .. -# $ mpi_esm: +# $ mpi_esm: language Start ... +# ..- attr(*, "Dimensions") ... # .. +# ..- attr(*, "Variables") ... # .. 
str(attr(res[[1]], 'na_pos_record')) @@ -139,32 +146,29 @@ str(attr(res[[2]], 'na_pos_record')) # ..$ lat : num 0 # ..$ lon : num 0 - func_sum <- function(x, y) { return(x + y) } # 'time' and 'fyear' can also be the operational dimension -step <- Step(func_sum, target_dims = list(hadgem3 = c('var', 'lon'), - mpi_esm = c('var', 'lon')), - output_dims = c('var', 'lon')) +step <- Step(func_sum, target_dims = list(hadgem3 = c('var'), + mpi_esm = c('var')), + output_dims = c('var')) wf <- AddStep(res, step) # Option 1: run locally -output <- Compute(wf, chunks = list(lat = 2)) -output <- Compute(wf, chunks = list(lat = 2, time = 2)) - +output <- Compute(wf, chunks = list(time = 2)) # Option 2: run on Nord3 #-------------------user-defined--------------------- - queue_host <- 'nord1' - temp_dir <- '/gpfs/scratch/bsc32/bsc32734/startR_hpc/' - ecflow_suite_dir <- '/home/Earth/aho/startR_local/' + queue_host <- 'nord3' + temp_dir <- '/gpfs/scratch/bsc32/bsc32/startR_hpc/' + ecflow_suite_dir <- '/home/Earth//startR_local/' #---------------------------------------------------- - + output <- Compute(wf, - chunks = list(lat = 2, time = 2), + chunks = list(time = 2), threads_load = 2, threads_compute = 4, cluster = list(queue_host = queue_host, @@ -179,14 +183,11 @@ output <- Compute(wf, chunks = list(lat = 2, time = 2)) ), ecflow_suite_dir = ecflow_suite_dir, wait = TRUE - ) + ) dim(output$output1) -# var lon time dat sdate fyear lat -# 1 1 3 1 1 1 3 -output$output1[1, 1, , 1, 1, 1, ] -# [,1] [,2] [,3] -#[1,] 503.0177 502.6464 503.0280 -#[2,] NA NA NA -#[3,] 507.7226 507.1748 507.9795 +# var time dat sdate fyear lat lon +# 1 3 1 1 1 61 51 +output$output1[1, , 1, 1, 1, 1, 1] +[1] 584.9990 NA 587.8201 -- GitLab From 08c7d78a3bbe6235f0b11a805c9c015c37a1839e Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 29 Mar 2021 15:47:32 +0200 Subject: [PATCH 10/14] Consider _depends/across in multiStart() --- R/multiStart.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/R/multiStart.R b/R/multiStart.R index f8f929f..8485654 100644 --- a/R/multiStart.R +++ b/R/multiStart.R @@ -44,7 +44,11 @@ multiStart <- function(..., # dim = indices/selectors, ObjectBigmemory = NULL, silent = FALSE, debug = FALSE) { - dim_params <- list(...) + dim_params_all <- list(...) + inner_dims_across_files <- take_var_across(dim_params_all) + dim_params <- rebuild_dim_params(dim_params_all, merge_across_dims, + inner_dims_across_files) + extra_params <- dim_params_all[!dim_params_all %in% dim_params] dim_params_list <- NULL dim_names <- names(dim_params) @@ -127,7 +131,8 @@ multiStart <- function(..., # dim = indices/selectors, } # Combine ... 
with other params - dim_params_list<- lapply(dim_params_list, c, + dim_params_list <- lapply(dim_params_list, c, + c(extra_params, list(return_vars = return_vars, synonims = synonims, file_opener = file_opener, @@ -152,7 +157,7 @@ multiStart <- function(..., # dim = indices/selectors, num_procs = num_procs, ObjectBigmemory = ObjectBigmemory, silent = silent, - debug = debug)) + debug = debug))) #===============Put in Start()===================== res <- vector('list', ndat) # A list saving datasets separately -- GitLab From 6e78db3aa4fcc708585dd7e27cce81be761d41e2 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 29 Mar 2021 17:31:12 +0200 Subject: [PATCH 11/14] Add use case and unit tests for multiStart --- inst/doc/usecase.md | 4 +- .../doc/usecase/ex2_11_multiStart_insert_NA.R | 72 +++++--- .../ex2_12_multiStart_diff_init_dates.R | 118 +++++++++++++ .../test-multiStart_diff_init_dates.R | 160 ++++++++++++++++++ 4 files changed, 325 insertions(+), 29 deletions(-) create mode 100644 inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R create mode 100644 tests/testthat/test-multiStart_diff_init_dates.R diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index af88ea3..efb59e1 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -78,6 +78,6 @@ You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata If you need to create the mask file on your own, go to ex2_9_mask.R. 11. [Use multiStart to read two datasets with different calendars in one call](inst/doc/usecase/ex2_11_multiStart_insert_NA.R) This use case decribes the features of multiStart and how to use it. It shows how to read two datasets with different calendars aligned together with NA inserted. It is recommended to read it first if you want to use multiStart(). - - + 12. [Load monthly data from decadal prediction systems with different initial dates](inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R) + An example showing how to load and operate monthly data from decadal prediction systems that are initialised in different dates by multiStart(). diff --git a/inst/doc/usecase/ex2_11_multiStart_insert_NA.R b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R index 7186e81..068bc06 100644 --- a/inst/doc/usecase/ex2_11_multiStart_insert_NA.R +++ b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R @@ -1,9 +1,17 @@ #------------------------------------------------------ +# Author: An-Chi Ho +# Date: 29th March 2021 +#------------------------------------------------------ # This use case shows the basic usage of multiStart(), the wrapper of Start() # for multiple datasets retrieval that have different selector requirement. -# The usage is almost the same as Start(). To assign different selectors for -# each dataset, use the same format as the pattern_dim: a list of lists, first -# item 'name' is the dataset name and the second item '' is the selector. +# The usage is quite similar to Start() only with some differences. +# - To assign different selectors for each dataset, use the same format as the +# pattern_dim: a list of lists, first item 'name' is the dataset name and the second +# item '' is the selector. +# - When defining the operation and workflow, notice that the considered data array +# number is the dataset number. In this case, it is 2 (hadgem3 and mpi_esm). Even +# though multiStart() returns one object 'data', it actually contains the startR calls +# for each dataset separately. # # This use case wants to put HadGEM3 and MPI-ESM datasets together in one call. 
# The major differences between them are the spatial grids and calendar type.
 
 #=======================================================
 # retrieve = FALSE; Compute()
 #=======================================================
-  res <- multiStart(
-        dat = list(list(name = 'hadgem3', path = path_hadgem3),
-                   list(name = 'mpi_esm', path = path_mpi_esm)),
-        var = 'tasmax',
-        sdate = '2000',
-        fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
-                     list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
-        time = list(list(name = 'hadgem3', time = time_hadgem3),
-                    list(name = 'mpi_esm', time = time_mpi_esm)),
-        lat = values(list(-30, 30)),
-        lon = values(list(0, 50)),
-        transform = CDORemapper,
-        transform_params = list(grid = 'r360x181',
-                                method = 'conservative',
-                                crop = c(0, 50, -30, 30)),
-        transform_vars = c('lat', 'lon'),
-        return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
-        retrieve = FALSE)
-
-str(res)
+  data <- multiStart(
+        dat = list(list(name = 'hadgem3', path = path_hadgem3),
+                   list(name = 'mpi_esm', path = path_mpi_esm)),
+        var = 'tasmax',
+        sdate = '2000',
+        fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
+                     list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
+        time = list(list(name = 'hadgem3', time = time_hadgem3),
+                    list(name = 'mpi_esm', time = time_mpi_esm)),
+        lat = values(list(-30, 30)),
+        lon = values(list(0, 50)),
+        transform = CDORemapper,
+        transform_params = list(grid = 'r360x181',
+                                method = 'conservative',
+                                crop = c(0, 50, -30, 30)),
+        transform_vars = c('lat', 'lon'),
+        return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
+        retrieve = FALSE)
+
+str(data)
 #List of 2
 # $ hadgem3: language Start ...
 #  ..- attr(*, "Dimensions") ...
 # ..
 # ..
 # $ mpi_esm: language Start ...
 #  ..- attr(*, "Dimensions") ...
 # ..
 # ..
 
-str(attr(res[[1]], 'na_pos_record'))
+str(attr(data[[1]], 'na_pos_record'))
 #List of 1
 # $ hadgem3:List of 7
 #  ..$ dat  : num 0
 #  ..$ var  : num 0
 #  ..$ sdate: num 0
 #  ..$ fyear: num 0
 #  ..$ time : int 2
 #  ..$ lat  : num 0
 #  ..$ lon  : num 0
 
-str(attr(res[[2]], 'na_pos_record'))
+str(attr(data[[2]], 'na_pos_record'))
 #List of 1
 # $ mpi_esm:List of 7
 #  ..$ dat  : num 0
 #  ..$ var  : num 0
 #  ..$ sdate: num 0
 #  ..$ fyear: num 0
 #  ..$ time : num 0
 #  ..$ lat  : num 0
 #  ..$ lon  : num 0
 
+# Notice that even though multiStart() returns one object 'data', it actually
+# contains two startR calls inside.
+length(data)
+# 2
+names(data)
+#[1] "hadgem3" "mpi_esm"
+
+# Therefore, the operation should consider two datasets instead of one.
 func_sum <- function(x, y) {
   return(x + y)
 }
 
-# 'time' and 'fyear' can also be the operational dimension
+# The target_dims should be a list of 2. The dataset names can be arbitrary.
+# 'time' and 'fyear' can also be the operational dimension.
 step <- Step(func_sum, target_dims = list(hadgem3 = c('var'),
                                           mpi_esm = c('var')),
              output_dims = c('var'))
 
-wf <- AddStep(res, step)
+# The input is still one object 'data', but it contains two calls implicitly. 
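+# (When Compute() runs, the NA positions recorded in 'na_pos_record' are
+# inserted back into each loaded chunk, so the two datasets stay aligned
+# inside the operation.)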
+# The input is still one object 'data', but it contains two calls implicitly.
+wf <- AddStep(data, step)
 
 # Option 1: run locally
 output <- Compute(wf, chunks = list(time = 2))
diff --git a/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R b/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R
new file mode 100644
index 0000000..3c4d1f1
--- /dev/null
+++ b/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R
@@ -0,0 +1,118 @@
+#--------------------------------------------------------
+# Author: Carlos Delgado, An-Chi Ho
+# Date: 24th March 2021
+#--------------------------------------------------------
+# Example on how to load monthly data from decadal prediction systems that are
+# initialised on different dates:
+# - HadGEM3 (initialised in November)
+# - IPSL-CM6A-LR (initialised in January)
+# - NorCPM1 (initialised in October)
+# To align them on a common time axis, we use multiStart() to insert NAs for
+# the missing months manually.
+#
+# To see more details of multiStart() usage, check ex2_11.
+#--------------------------------------------------------
+library(startR)
+
+variable <- 'tas'
+sdates <- 2000:2004
+
+## Paths
+path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/',
+                       'MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Amon/$var$/gr/v20200316/',
+                       '$var$_*_s$sdate$_r1i1p1f2_gr_$fyear$.nc')
+path_ipsl <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/ipsl-cm6a-lr/cmip6-dcppA-hindcast_i1p1/DCPP/',
+                    'IPSL/IPSL-CM6A-LR/dcppA-hindcast/r1i1p1f1/Amon/$var$/gr/v20200504/',
+                    '$var$_Amon_IPSL-CM6A-LR_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc')
+path_norcpm1 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/norcpm1/cmip6-dcppA-hindcast_i1p1/DCPP/',
+                       'NCC/NorCPM1/dcppA-hindcast/r1i1p1f1/Amon/$var$/gn/v20200320/',
+                       '$var$_Amon_NorCPM1_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc')
+
+## Forecast months: six indices (from October to March)
+time_hadgem3 <- indices(c(NA, 1, 2, 3, 4, 5))
+time_ipsl <- indices(c(NA, NA, NA, 1, 2, 3))
+time_norcpm1 <- indices(c(1, 2, 3, 4, 5, 6))
+
+data <- multiStart(dat = list(list(name = 'hadgem3', path = path_hadgem3),
+                              list(name = 'ipsl', path = path_ipsl),
+                              list(name = 'norcpm1', path = path_norcpm1)),
+                   var = variable,
+                   sdate = paste0(sdates),
+                   fyear = indices(1),  ## there is one file per sdate
+                   fyear_depends = 'sdate',
+                   time = list(list(name = 'hadgem3', time = time_hadgem3),
+                               list(name = 'ipsl', time = time_ipsl),
+                               list(name = 'norcpm1', time = time_norcpm1)),
+                   lat = values(list(-30, 30)),
+                   lon = values(list(0, 50)),
+                   synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')),
+                   transform = CDORemapper,
+                   transform_params = list(grid = 'r180x90',
+                                           method = 'conservative',
+                                           crop = c(0, 50, -30, 30)),
+                   transform_vars = c('lat', 'lon'),
+                   return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
+                   retrieve = FALSE)
+
+
+# Notice that though multiStart() returns one object 'data', it actually contains
+# three startR calls.
+length(data)
+# 3
+names(data)
+#[1] "hadgem3" "ipsl"    "norcpm1"
+
+# Therefore, the operation should consider three data arrays rather than one.
+fun <- function(dat1, dat2, dat3) {
+  # dat: [dat = 1]
+  mean(c(dat1, dat2, dat3), na.rm = TRUE)
+}
+
+# As explained above, the number of data arrays is three, so the target_dims is
+# a list of three. The argument names (i.e., dat1, dat2, dat3) are arbitrary.
+step <- Step(fun = fun,
+             target_dims = list(dat1 = 'dat', dat2 = 'dat', dat3 = 'dat'),
+             output_dims = NULL)
+
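+# For intuition, a standalone sketch of what the step function receives for one
+# chunk: three length-1 'dat' inputs, in which the NAs inserted for the missing
+# months are dropped by na.rm = TRUE. The values here are illustrative only.
+fun(dat1 = 280, dat2 = NA, dat3 = 282)
+# [1] 281
+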
+# The input is still 'data', which contains three calls implicitly.
+wf <- AddStep(inputs = data, step_fun = step)
+
+
+# Option 1: run locally
+output <- Compute(workflow = wf,
+       chunks = list(sdate = 5))
+
+# Option 2: run on Nord3
+#-------------------user-defined---------------------
+ queue_host <- 'nord3'
+ temp_dir <- '/gpfs/scratch/bsc32/bsc32/startR_hpc/'
+ ecflow_suite_dir <- '/home/Earth//startR_local/'
+#----------------------------------------------------
+output <- Compute(workflow = wf,
+       chunks = list(sdate = 5),
+       threads_load = 1,
+       threads_compute = 5,
+       cluster = list(queue_host = queue_host,
+                      queue_type = 'lsf',
+                      temp_dir = tmp_dir,
+                      cores_per_job = 1,
+                      job_wallclock = '01:00',
+                      max_jobs = 5,
+                      extra_queue_params = list('#BSUB -q bsc_es -M 3000'),
+                      bidirectional = FALSE,
+                      polling_period = 10),
+       ecflow_suite_dir = ecflow_suite_dir,
+       wait = TRUE)
+
+
+# Results
+dim(output$output1)
+# time   var sdate fyear   lat   lon 
+#    6     1     5     1    30    26 
+
+summary(output$output1)
+#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#  279.4   293.4   296.6   295.5   298.7   304.1 
+
+
+
diff --git a/tests/testthat/test-multiStart_diff_init_dates.R b/tests/testthat/test-multiStart_diff_init_dates.R
new file mode 100644
index 0000000..9e0276f
--- /dev/null
+++ b/tests/testthat/test-multiStart_diff_init_dates.R
@@ -0,0 +1,160 @@
+context("multiStart() NA insertion")
+# Same case as ex2_12.
+#---------------------------------------------------------------
+
+variable <- 'tas'
+sdates <- 2000:2004
+
+## Paths
+path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/',
+                       'MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Amon/$var$/gr/v20200316/',
+                       '$var$_*_s$sdate$_r1i1p1f2_gr_$fyear$.nc')
+path_ipsl <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/ipsl-cm6a-lr/cmip6-dcppA-hindcast_i1p1/DCPP/',
+                    'IPSL/IPSL-CM6A-LR/dcppA-hindcast/r1i1p1f1/Amon/$var$/gr/v20200504/',
+                    '$var$_Amon_IPSL-CM6A-LR_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc')
+path_norcpm1 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/norcpm1/cmip6-dcppA-hindcast_i1p1/DCPP/',
+                       'NCC/NorCPM1/dcppA-hindcast/r1i1p1f1/Amon/$var$/gn/v20200320/',
+                       '$var$_Amon_NorCPM1_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc')
+
+## Forecast months: six indices (from October to March)
+time_hadgem3 <- indices(c(NA, 1, 2, 3, 4, 5))
+time_ipsl <- indices(c(NA, NA, NA, 1, 2, 3))
+time_norcpm1 <- indices(c(1, 2, 3, 4, 5, 6))
+
+
+test_that("1. retrieve = T", {
+
+suppressWarnings(
+data <- multiStart(dat = list(list(name = 'hadgem3', path = path_hadgem3),
+                              list(name = 'ipsl', path = path_ipsl),
+                              list(name = 'norcpm1', path = path_norcpm1)),
+                   var = variable,
+                   sdate = paste0(sdates),
+                   fyear = indices(1),  ## there is one file per sdate
+                   fyear_depends = 'sdate',
+                   time = list(list(name = 'hadgem3', time = time_hadgem3),
+                               list(name = 'ipsl', time = time_ipsl),
+                               list(name = 'norcpm1', time = time_norcpm1)),
+                   lat = values(list(-30, 30)),
+                   lon = values(list(0, 50)),
+                   synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')),
+                   transform = CDORemapper,
+                   transform_params = list(grid = 'r180x90',
+                                           method = 'conservative',
+                                           crop = c(0, 50, -30, 30)),
+                   transform_vars = c('lat', 'lon'),
+                   return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
+                   retrieve = TRUE)
+)
+
+
+expect_equal(
+dim(data),
+c(dat = 3, var = 1, sdate = 5, fyear = 1, time = 6, lat = 30, lon = 26)
+)
+expect_equal(
+mean(data, na.rm = T),
+295.4626,
+tolerance = 0.0001
+)
+expect_equal(
+length(which(is.na(data))),
+15600
+)
+expect_equal(
+data[1, 1, 2, 1, 1:4, 1, 1],
+c(NA, 290.1964, 292.6637, 293.2186),
+tolerance = 0.0001
+)
+expect_equal(
+data[2, 1, 2, 1, 1:4, 1, 1],
+c(NA, NA, NA, 294.2315),
+tolerance = 0.0001
+)
+expect_equal(
+data[3, 1, 2, 1, 1:4, 1, 1],
+c(288.6785, 289.5546, 290.4638, 291.7312),
+tolerance = 0.0001
+)
+
+})
+
+
+test_that("2. retrieve = F", {
+
+suppressWarnings(
+data <- multiStart(dat = list(list(name = 'hadgem3', path = path_hadgem3),
+                              list(name = 'ipsl', path = path_ipsl),
+                              list(name = 'norcpm1', path = path_norcpm1)),
+                   var = variable,
+                   sdate = paste0(sdates),
+                   fyear = indices(1),  ## there is one file per sdate
+                   fyear_depends = 'sdate',
+                   time = list(list(name = 'hadgem3', time = time_hadgem3),
+                               list(name = 'ipsl', time = time_ipsl),
+                               list(name = 'norcpm1', time = time_norcpm1)),
+                   lat = values(list(-30, 30)),
+                   lon = values(list(0, 50)),
+                   synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')),
+                   transform = CDORemapper,
+                   transform_params = list(grid = 'r180x90',
+                                           method = 'conservative',
+                                           crop = c(0, 50, -30, 30)),
+                   transform_vars = c('lat', 'lon'),
+                   return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
+                   retrieve = FALSE)
+)
+
+expect_equal(
+length(data),
+3
+)
+expect_equal(
+names(data),
+c("hadgem3", "ipsl", "norcpm1")
+)
+
+# The operation should consider three data arrays rather than one.
+fun <- function(dat1, dat2, dat3) {
+  # dat: [dat = 1]
+  mean(c(dat1, dat2, dat3), na.rm = TRUE)
+}
+
+# The number of data arrays is three, so the target_dims is a list of three.
+# The argument names (i.e., dat1, dat2, dat3) are arbitrary.
+step <- Step(fun = fun,
+             target_dims = list(dat1 = 'dat', dat2 = 'dat', dat3 = 'dat'),
+             output_dims = NULL)
+
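+# (A sketch of an extra structural check that could be added here: with
+# retrieve = FALSE, each element of 'data' should be an unevaluated Start() call.)
+expect_true(all(vapply(data, is.call, logical(1))))
+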
+# The input is still 'data', which contains three calls implicitly.
+wf <- AddStep(inputs = data, step_fun = step)
+
+
+# run locally
+output <- Compute(workflow = wf,
+                  chunks = list(sdate = 2))$output1
+
+
+expect_equal(
+dim(output),
+c(time = 6, var = 1, sdate = 5, fyear = 1, lat = 30, lon = 26)
+)
+
+expect_equal(
+mean(output),
+295.4511,
+tolerance = 0.0001
+)
+expect_equal(
+length(which(is.na(output))),
+0
+)
+expect_equal(
+output[1:5],
+c(288.9735, 290.6019, 291.8069, 293.3672, 293.4944),
+tolerance = 0.0001
+)
+
+
+})
-- 
GitLab


From 7db9d89509d898d2166d29e8febfb9b8f89c9d57 Mon Sep 17 00:00:00 2001
From: Carlos Delgado
Date: Tue, 30 Mar 2021 15:57:24 +0200
Subject: [PATCH 12/14] fixed typo in "temp_dir"

---
 inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R | 33 +++++++++----------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R b/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R
index 3c4d1f1..fe91f3b 100644
--- a/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R
+++ b/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R
@@ -80,7 +80,7 @@ wf <- AddStep(inputs = data, step_fun = step)
 
 # Option 1: run locally
 output <- Compute(workflow = wf,
-       chunks = list(sdate = 5))
+                  chunks = list(sdate = 5))
 
 # Option 2: run on Nord3
 #-------------------user-defined---------------------
@@ -89,20 +89,20 @@ output <- Compute(workflow = wf,
  ecflow_suite_dir <- '/home/Earth//startR_local/'
 #----------------------------------------------------
 output <- Compute(workflow = wf,
-       chunks = list(sdate = 5),
-       threads_load = 1,
-       threads_compute = 5,
-       cluster = list(queue_host = queue_host,
-                      queue_type = 'lsf',
-                      temp_dir = tmp_dir,
-                      cores_per_job = 1,
-                      job_wallclock = '01:00',
-                      max_jobs = 5,
-                      extra_queue_params = list('#BSUB -q bsc_es -M 3000'),
-                      bidirectional = FALSE,
-                      polling_period = 10),
-       ecflow_suite_dir = ecflow_suite_dir,
-       wait = TRUE)
+                  chunks = list(sdate = 5),
+                  threads_load = 1,
+                  threads_compute = 5,
+                  cluster = list(queue_host = queue_host,
+                                 queue_type = 'lsf',
+                                 temp_dir = temp_dir,
+                                 cores_per_job = 1,
+                                 job_wallclock = '01:00',
+                                 max_jobs = 5,
+                                 extra_queue_params = list('#BSUB -q bsc_es -M 3000'),
+                                 bidirectional = FALSE,
+                                 polling_period = 10),
+                  ecflow_suite_dir = ecflow_suite_dir,
+                  wait = TRUE)
 
 
 # Results
@@ -113,6 +113,3 @@ dim(output$output1)
 summary(output$output1)
 #   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 #  279.4   293.4   296.6   295.5   298.7   304.1 
-
-
-
-- 
GitLab


From bcf976aed7acbcc6c5043ecc77be4462addb45f9 Mon Sep 17 00:00:00 2001
From: aho
Date: Wed, 31 Mar 2021 15:03:17 +0200
Subject: [PATCH 13/14] Adjust dim names of the combined array. It caused an
 error before when merge_across_dims was used.

---
 R/multiStart.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/multiStart.R b/R/multiStart.R
index 8485654..e97468e 100644
--- a/R/multiStart.R
+++ b/R/multiStart.R
@@ -230,7 +230,7 @@ multiStart <- function(..., # dim = indices/selectors,
   }
   abind_input_list <- c(res, along = bind_along)
   combine_array <- do.call(abind::abind, abind_input_list)
-  names(dim(combine_array)) <- dim_names
+  names(dim(combine_array)) <- names(dim(res[[1]]))
 
   if (!silent) {
     .message("Final dimension sizes after combination:")
-- 
GitLab


From 685647ce8ae7c7b989bf6f3847a1cb28898bd474 Mon Sep 17 00:00:00 2001
From: aho
Date: Wed, 7 Apr 2021 16:03:45 +0200
Subject: [PATCH 14/14] Enable multiStart() to read EC-Earth3, which has
 multiple chunks for one sdate, together with other datasets.
---
 R/multiStart.R                                |  19 ++----
 inst/doc/usecase.md                           |   3 +-
 .../usecase/ex1_14_multiStart_ecearth_merge.R |  65 +++++++++++++++++++
 3 files changed, 74 insertions(+), 13 deletions(-)
 create mode 100644 inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R

diff --git a/R/multiStart.R b/R/multiStart.R
index e97468e..666d108 100644
--- a/R/multiStart.R
+++ b/R/multiStart.R
@@ -45,9 +45,9 @@ multiStart <- function(..., # dim = indices/selectors,
                        silent = FALSE, debug = FALSE) {
 
   dim_params_all <- list(...)
-  inner_dims_across_files <- take_var_across(dim_params_all)
-  dim_params <- rebuild_dim_params(dim_params_all, merge_across_dims,
-                                   inner_dims_across_files)
+  # No need to reorder the dims here, so set the last two params FALSE/NULL.
+  dim_params <- rebuild_dim_params(dim_params_all, merge_across_dims = FALSE,
+                                   inner_dims_across_files = NULL)
   extra_params <- dim_params_all[!dim_params_all %in% dim_params]
   dim_params_list <- NULL
   dim_names <- names(dim_params)
@@ -265,20 +265,15 @@ multiStart <- function(..., # dim = indices/selectors,
     attr_list[[n_dat]] <- attributes(res[[n_dat]])
   }
 
-  # For the value is array
   ## $Files
-  sublist <- lapply(attr_list, '[[', 'Files')  # list(dat1 = files1, dat2 = files2)
-  bind_along <- which(names(dim(attr(res[[1]], 'Files'))) == found_pattern_dim)  # 1
-  attr(combine_array, 'Files') <- do.call(abind::abind, c(sublist, along = bind_along))
-  names(dim(attr(combine_array, 'Files'))) <- names(dim(attr_list[[1]]$Files))
-
+  attr(combine_array, 'Files') <- lapply(lapply(attr_list, '[', 'Files'),
+                                         '[[', 1)
   ## $NotFoundFiles
   sublist <- lapply(attr_list, '[[', 'NotFoundFiles')
   if (!is.null(unlist(sublist))) {  #Some of the dats have NotFoundFiles
-    attr(combine_array, 'NotFoundFiles') <- do.call(abind::abind, c(sublist, along = bind_along))  #bind_along is from $Files above
-    names(attr(combine_array, 'NotFoundFiles')) <- names(dim(attr_list[[1]]$Files))
+    attr(combine_array, 'NotFoundFiles') <- lapply(lapply(attr_list, '[', 'NotFoundFiles'),
+                                                   '[[', 1)
  }
-  # For the value is list
   ## $FileSelectors
   attr(combine_array, 'FileSelectors') <- lapply(lapply(attr_list, '[', 'FileSelectors'),
                                                  '[[', 1)

diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index efb59e1..3e4f9fc 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -56,7 +56,8 @@
    13. [Use multiStart to read data with NA assigned](inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R)
    The script shows a simple case that uses multiStart to read data when NA insertion is needed in certain selectors.
-
+   14. [Use multiStart to read two datasets with different prediction lengths per file](inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R)
+   This use case shows how to use multiStart() to read together two datasets (EC-Earth3 and MPI-ESM) with different prediction lengths per file.
 
 2. **Execute computation (use `Compute()`)**
    1. [Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R)

diff --git a/inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R b/inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R
new file mode 100644
index 0000000..34c7aee
--- /dev/null
+++ b/inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R
@@ -0,0 +1,65 @@
+#--------------------------------------------------------
+# Author: An-Chi Ho
+# Date: 7th April 2021
+#---------------------------------------------
+# This use case shows how to use multiStart() to read together two datasets
+# with different prediction lengths per file. The two datasets, EC-Earth3 and
+# MPI-ESM1.2-HR, have the same start date and calendar, but the prediction
+# length per file is different: EC-Earth3 has several files for "sdate = 2000"
+# and each file has 12 time steps (i.e., 12 months), while MPI-ESM1.2-HR has
+# only one file for "sdate = 2000" and the file contains 122 time steps. To
+# read them together, 'merge_across_dims = TRUE' and time_across = 'fyear'
+# need to be used.
+#---------------------------------------------
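+
+# A plain-R sketch of the index bookkeeping behind the merge (illustrative
+# numbers only): EC-Earth3 spreads the 24 requested months over two fyear files
+# of 12 time steps each, while MPI-ESM1.2-HR keeps them in one file;
+# 'merge_across_dims = TRUE' flattens [time x fyear] into a single time
+# dimension so that both datasets align.
+ecearth3_idx <- matrix(1:24, nrow = 12)  # [time = 12, fyear = 2]
+mpi_esm_idx <- matrix(1:24, nrow = 24)   # [time = 24, fyear = 1]
+identical(as.vector(ecearth3_idx), as.vector(mpi_esm_idx))  # TRUE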
+
+library(startR)
+
+# Define selectors
+  #------- EC-Earth3 ---------
+  ## calendar: standard
+  ## Prediction period: 2000 11 - 2011 10 (11 years)
+  ## time steps per file: 12
+  path_ecearth3 <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/',
+                          'EC-Earth3/dcppA-hindcast/r1i1p1f1/Omon/$var$/gr/v20190713/',
+                          '$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc')
+  ## file name:
+  ## tos_Omon_EC-Earth3_dcppA-hindcast_s2000-r1i1p1f1_gr_200011-200110.nc
+  ## tos_Omon_EC-Earth3_dcppA-hindcast_s2000-r1i1p1f1_gr_200111-200210.nc
+
+  #-------- MPI-ESM1.2-HR --------
+  ## calendar: standard
+  ## Prediction period: 2000 11 - 2010 12 (10 years and 2 months)
+  ## time steps per file: 122
+  path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/',
+                         'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/',
+                         'r1i1p1f1/Omon/$var$/gr/v20200320/',
+                         '$var$_Omon_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc')
+  ## file name:
+  ## tos_Omon_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gr_200011-201012.nc
+
+  # Selectors
+  sdate <- '2000'
+  fyear <- 2000:2001
+  fyear_ecearth3 <- paste0(fyear, '11-', as.numeric(fyear) + 1, '10')
+  fyear_mpi_esm <- paste0(sdate, '11-', as.numeric(sdate) + 10, '12')
+  time_ecearth3 <- 1:24  #200011-200210
+  time_mpi_esm <- 1:24   #200011-200210
+
+
+res <- multiStart(
+        dat = list(list(name = 'ecearth3', path = path_ecearth3),
+                   list(name = 'mpi_esm', path = path_mpi_esm)),
+        var = 'tos', sdate = '2000',
+        fyear = list(list(name = 'ecearth3', fyear = fyear_ecearth3),
+                     list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
+        time = list(list(name = 'ecearth3', time = time_ecearth3),
+                    list(name = 'mpi_esm', time = time_mpi_esm)),
+        lat = indices(1), lon = indices(1),
+        time_across = 'fyear',
+        merge_across_dims = TRUE,
+        return_vars = list(lat = 'dat', lon = 'dat', time = 'sdate'),
+        retrieve = TRUE)
+
+dim(res)
+#  dat  var sdate  time  lat  lon 
+#    2    1     1    24    1    1 
-- 
GitLab