diff --git a/NAMESPACE b/NAMESPACE index ccf783cbe12b94f8b703a403b9cf16f4be0ff870..a61238ac24e5f136b0ffc518088aea883719b5f6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(Sort) export(Start) export(Step) export(indices) +export(multiStart) export(values) import(PCICt) import(abind) diff --git a/R/ByChunks.R b/R/ByChunks.R index dd101120d1b394ba8cae6eca7f8bf1cb43688f8f..32f44bbbc0f0f8269a1fc869ea9ede21a2b9af0d 100644 --- a/R/ByChunks.R +++ b/R/ByChunks.R @@ -708,6 +708,20 @@ ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', for (call_dim in names(attr(start_call, 'Dimensions'))[dims_to_alter]) { start_call[[call_dim]] <- .chunk(chunk_indices[call_dim], chunks[[call_dim]], eval(start_call[[call_dim]])) + #========================================= + # multiStart is used + # If n_chunks > 1 & call_dim has NA, extra process in Start() when chunking + if (!is.null(attr(start_call, 'na_pos_record'))) { + if (chunks[[call_dim]] > 1 & + attr(start_call, 'na_pos_record')[[1]][[call_dim]] != 0) { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 1) + # put na_pos_record in attributes + attr(start_call[[call_dim]], "na_pos_record") <- attributes(start_call)$na_pos_record[[1]][[call_dim]] + } else { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 0) + } + } + #========================================== } } start_call[['silent']] <- !debug @@ -715,6 +729,56 @@ ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto', start_call[['num_procs']] <- threads_load } data[[input]] <- eval(start_call) + + #====================================================== + # If multiStart() is used, deal with NA insertion here + if (!is.null(attr(start_call, 'na_pos_record'))) { + na_pos_record_list <- attr(start_call, 'na_pos_record')[[1]] + if (any(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any)))) { + have_na_dims <- which(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any))) + for (have_na_dim_name in names(have_na_dims)) { + + if (!have_na_dim_name %in% names(chunks)) { + not_chunked <- TRUE + } else if (chunks[[have_na_dim_name]] == 1) { + not_chunked <- TRUE + } else { + not_chunked <- FALSE + } + # If the dim is not chunked, insert NAs directly according to the list + if (not_chunked) { + data[[input]] <- Apply(data[[input]], + target_dims = have_na_dim_name, + output_dims = have_na_dim_name, + fun = .insertNAdim, + na_pos = na_pos_record_list[[have_na_dim_name]] + )$output1 + + } else { # chunk along this dim. Need to examine piece by piece. + dim_na_pos_record <- attr(start_call[[have_na_dim_name]], "na_pos_record") + chunk_vector <- attr(start_call[[have_na_dim_name]], "chunk") + sub_chunk_indices <- chunk_indices(length(start_call[[have_na_dim_name]]) + length(dim_na_pos_record), + chunk = chunk_vector[1], + n_chunks = chunk_vector[2], + have_na_dim_name) + without_NA_dim_length <- dim(data[[input]])[have_na_dim_name] + if (without_NA_dim_length != length(sub_chunk_indices)) { # need to insert NA + sub_na_pos <- which(sub_chunk_indices %in% dim_na_pos_record) + data[[input]] <- Apply(data[[input]], + target_dims = have_na_dim_name, + output_dims = have_na_dim_name, + fun = .insertNAdim, + na_pos = sub_na_pos + )$output1 + + } + } + } + } + + } + #========================================================= + } t_end_load <- Sys.time() timings[['load']] <- c(timings[['load']], diff --git a/R/Start.R b/R/Start.R index ff4b97831b5c7337cdab1da166b9016895ac92b3..ef384ea1f4d7c87a157f047036eebb5fdba480d5 100644 --- a/R/Start.R +++ b/R/Start.R @@ -837,6 +837,15 @@ Start <- function(..., # dim = indices/selectors, Subset <- ClimProjDiags::Subset dim_params <- list(...) + + # If multiStart is used. matchcall is the correct Start call argument. + # In multiStart(), do.call is used and it creates incorrect call argument that + # makes trouble when retrive = F & job is sent to HPCs. + if ('matchcall' %in% names(dim_params)) { + matchcall <- dim_params[['matchcall']] + dim_params[['matchcall']] <- NULL + } + # Take *_var parameters apart var_params <- take_var_params(dim_params) @@ -2976,28 +2985,79 @@ Start <- function(..., # dim = indices/selectors, } } - ## This 'if' runs in both Start() and Compute(). In Start(), it doesn't have any effect (no chunk). - ## In Compute(), it creates the indices for each chunk. For example, if 'sub_array_of_indices' - ## is c(5:10) and chunked into 2, 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) - ## for chunk = 2. If 'sub_array_of_indices' is list(55, 62) and chunked into 2, it becomes - ## list(55, 58) for chunk = 1 and list(59, 62) for chunk = 2. - ## TODO: The list can be turned into vector here? So afterward no need to judge if it is list - ## or vector. - if (!is.list(sub_array_of_indices)) { - sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), + #============================================================ + ## This part runs in both Start() and Compute(). + ## In Start(), it doesn't have any effect (no chunk). + ## In Compute(), it creates the indices for each chunk. For example, + ## if 'sub_array_of_indices' is c(5:10) and chunked into 2, + ## 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) + ## for chunk = 2. If 'sub_array_of_indices' is list(55, 62) + ## and chunked into 2, it becomes list(55, 58) for chunk = 1 + ## and list(59, 62) for chunk = 2. + + # Add one "if" to reduce redundant work. If n_chunks = 1, no need to chunk + if (chunks[[inner_dim]]["n_chunks"] > 1) { + # multiStart: If inner_dim needs to be chunked & has NAs, extra process + if (!(is.na(chunks[[inner_dim]]["chunk_with_NA"]) | + chunks[[inner_dim]]["chunk_with_NA"] == 0)) { + na_pos_record <- attr(dim_params[[inner_dim]], 'na_pos_record') # a vector + if (is.list(sub_array_of_indices)) { + # Turn list into vector + sub_array_of_indices <- c(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]) + sub_array_of_indices_is_list <- TRUE + } else { + sub_array_of_indices_is_list <- FALSE + } + + # Consider the real whole length (with NA) and chunk + indices_include_NA <- chunk_indices(length(sub_array_of_indices) + length(na_pos_record), + chunks[[inner_dim]]["chunk"], + chunks[[inner_dim]]["n_chunks"], + inner_dim) + # If the chunked piece has NA actually, remove and record it + if (any(indices_include_NA %in% na_pos_record)) { + sub_na_pos_record <- which(indices_include_NA %in% na_pos_record) + for (pos in 1:(length(indices_include_NA) - 1)) { + if (pos %in% sub_na_pos_record) { + indices_include_NA[c((pos + 1):length(indices_include_NA))] <- indices_include_NA[c((pos + 1):length(indices_include_NA))] - 1 + } + } + indices_exclude_NA <- indices_include_NA[-sub_na_pos_record] + } else { + indices_exclude_NA <- indices_include_NA + } + # Deduct if NAs exist in the previous chunks + if (any(na_pos_record < indices_include_NA[1])) { + indices_exclude_NA <- indices_exclude_NA - sum(na_pos_record < indices_include_NA[1]) + } + sub_array_of_indices <- sub_array_of_indices[indices_exclude_NA] + if (sub_array_of_indices_is_list) { + sub_array_of_indices <- list(sub_array_of_indices[1], sub_array_of_indices[length(sub_array_of_indices)]) + } + + } # extra process end + + else { # original code + if (!is.list(sub_array_of_indices)) { + sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices), chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], inner_dim)] - } else { - tmp <- chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]), - chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], - inner_dim) - vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - sub_array_of_indices[[1]] <- vect[tmp[1]] - sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] - } - # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. - + } else { + ## TODO: The list can be turned into vector here? So afterward no need to judge if it is list + ## or vector. BUT saving as list is memory efficient. + tmp <- chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]), + chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"], + inner_dim) + vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + sub_array_of_indices[[1]] <- vect[tmp[1]] + sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] + } + # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. + } + } # if n_chunks > 1 to avoid redundant work + #============================================================ + if (debug) { if (inner_dim %in% dims_to_check) { print("-> TRANSFORMATION REQUESTED?") @@ -4358,7 +4418,14 @@ Start <- function(..., # dim = indices/selectors, if (!silent) { .message("Successfully discovered data dimensions.") } + + # If multiStart is used. Use 'matchcall' instead of generating it here. + if (exists('matchcall')) { + start_call <- matchcall + } else { # original code start_call <- match.call() + } + for (i in 2:length(start_call)) { if (class(start_call[[i]]) %in% c('name', 'call')) { start_call[[i]] <- eval.parent(start_call[[i]]) @@ -4398,11 +4465,9 @@ Start <- function(..., # dim = indices/selectors, ### TODO: Specify dependencies as parameter # suppressPackageStartupMessages({library(ncdf4)}) - #print("1") store_indices <- as.list(work_piece[['store_position']]) first_round_indices <- work_piece[['first_round_indices']] second_round_indices <- work_piece[['second_round_indices']] - #print("2") file_to_open <- work_piece[['file_path']] sub_array <- file_data_reader(file_to_open, NULL, work_piece[['file_selectors']], diff --git a/R/Utils.R b/R/Utils.R index 3a6f6ea59feca0038389a4627aa9850bba170e4e..d7fcdde314606a84b876500f452965c264f9c8bf 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -675,7 +675,17 @@ MergeArrays <- .MergeArrays array_dims <- (dim(array_of_arrays)) dim_names <- names(array_dims) - + + # Reorder if the arrays are not at the same dim order + array_of_arrays_dimnames <- lapply(lapply(array_of_arrays, dim), names) + same_dim_order <- sapply(array_of_arrays_dimnames, identical, array_of_arrays_dimnames[[1]]) + if (any(!same_dim_order)) { + arrays_to_correct <- which(!same_dim_order) + for (i in arrays_to_correct) { + array_of_arrays[[i]] <- s2dv::Reorder(array_of_arrays[[i]], array_of_arrays_dimnames[[1]]) + } + } + # Merge the chunks. for (dim_index in 1:length(dim_names)) { dim_sub_array_of_chunks <- dim_sub_array_of_chunk_indices <- NULL @@ -844,3 +854,12 @@ .KnownLatNames <- function() { known_lat_names <- c('lat', 'latitude', 'y', 'j', 'nav_lat') } + +.insertNAdim <- function(data, na_pos) { + # data: [time] + output <- rep(NA, length(data) + length(na_pos)) + output[-na_pos] <- data + + return(output) +} + diff --git a/R/multiStart.R b/R/multiStart.R new file mode 100644 index 0000000000000000000000000000000000000000..666d108bcf4cca5cb4630bcb37badec76262af0e --- /dev/null +++ b/R/multiStart.R @@ -0,0 +1,359 @@ +#' How to use: +#' 1. Allow to read multiple datasets with different selectors +#' 2. A list of list is used when different selectors are required, just like 'dat'. +#' The items in each list must be 'name' and ''. 'name' is the +#' dataset name, same as the one in 'dat'. +#' 3. Only indices() is accepted. (If for values(), use selectorchecker to transform?) +#' 4. Allow to have NA in the selectors. Those NAs will be removed when being fed in +#' Start(), and be inserted back after Start(). +#' +#'NOTE: +#' 1. No reshape (merge_across_dim yes) +#' 2. The input check is not comprehensive +#' +#'@import abind multiApply +#'@export +multiStart <- function(..., # dim = indices/selectors, + # dim_var = 'var', + # dim_reorder = Sort/CircularSort, + # dim_tolerance = number, + # dim_depends = 'file_dim', + # dim_across = 'file_dim', + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = FALSE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, debug = FALSE) { + + dim_params_all <- list(...) + # No need to reorder the dims here, so set the last two params FALSE/NULL. + dim_params <- rebuild_dim_params(dim_params_all, merge_across_dims = FALSE, + inner_dims_across_files = NULL) + extra_params <- dim_params_all[!dim_params_all %in% dim_params] + dim_params_list <- NULL + dim_names <- names(dim_params) + +#NOTE: The check can be removed. When entering Start(), the check will run as well. +#===============line 1020 (down)========================= + # Check pattern_dims + if (is.null(pattern_dims)) { + .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '", + dim_names[1], "' as 'pattern_dims'.")) + pattern_dims <- dim_names[1] + } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) { + pattern_dims <- unique(pattern_dims) + } else { + stop("Parameter 'pattern_dims' must be a vector of character strings.") + } + + # Find the pattern dimension with the pattern specifications + found_pattern_dim <- NULL + for (pattern_dim in pattern_dims) { + # Check all specifications in pattern_dim are valid + dat <- datasets <- dim_params[[pattern_dim]] + if (is.list(dat) || any(sapply(dat, is.list))) { + if (is.null(found_pattern_dim)) { + found_pattern_dim <- pattern_dim + } else { + stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") + } + } + } + if (is.null(found_pattern_dim)) { + .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) + found_pattern_dim <- pattern_dims[1] + } +#===============line 1056 (up)========================= + ndat <- length(dim_params[[found_pattern_dim]]) # 2 + dat_names <- sapply(dat, '[[', 'name') #c(hadgem3, mpi_esm) + dim_params_list <- vector('list', ndat) + dim_params_list <- lapply(dim_params_list, '[<-', vector('list', length(dim_params))) + names(dim_params_list) <- dat_names + dim_names <- names(dim_params) #c(dat, var, sdate,...) + + # Create an array for NA record + na_pos_record <- vector('list', ndat) + na_pos_record <- lapply(na_pos_record, '[<-', vector('list', length(dim_params))) + names(na_pos_record) <- dat_names #c(hadgem3, mpi_esm) + for (n_dat in 1:ndat) { + na_pos_record[[n_dat]] <- as.list(rep(0, length(dim_params))) + names(na_pos_record[[n_dat]]) <- dim_names + } + + for (param_ind in 1:length(dim_params)) { + dim_param <- dim_params[[param_ind]] # $ dat :List of 2; $ var : chr "tas" + if (is.list(dim_param) & all(lapply(lapply(dim_param, names), '[', 1) == 'name') & + (names(dim_params)[param_ind] != found_pattern_dim)) { + for (dat_ind in 1:ndat) { + # If NA exists, take it out and record in an array + if (any(is.na(dim_param[[dat_ind]][[2]]))) { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param[[dat_ind]][[2]][!is.na(dim_param[[dat_ind]][[2]])] + attributes(dim_params_list[[dat_ind]][[param_ind]]) <- attributes(dim_param[[dat_ind]][[2]]) + na_pos_record[[dat_ind]][[param_ind]] <- which(is.na(dim_param[[dat_ind]][[2]])) + } else { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param[[dat_ind]][[2]] # 2nd item is value, 1st item is dat name + } + } + } else { + if (names(dim_params)[param_ind] == found_pattern_dim) { + for (dat_ind in 1:ndat) { + dim_params_list[[dat_ind]][[param_ind]] <- list(dim_param[[dat_ind]]) + } + } else { + for (dat_ind in 1:ndat) { + dim_params_list[[dat_ind]][[param_ind]] <- dim_param # vector + } + } + } + } + # Put dim name back + for (dat_ind in 1:ndat) { + names(dim_params_list[[dat_ind]]) <- dim_names + } + + # Combine ... with other params + dim_params_list <- lapply(dim_params_list, c, + c(extra_params, + list(return_vars = return_vars, + synonims = synonims, + file_opener = file_opener, + file_var_reader = file_var_reader, + file_dim_reader = file_dim_reader, + file_data_reader = file_data_reader, + file_closer = file_closer, + transform = transform, + transform_params = transform_params, + transform_vars = transform_vars, + transform_extra_cells = transform_extra_cells, + apply_indices_after_transform = apply_indices_after_transform, + pattern_dims = pattern_dims, + metadata_dims = metadata_dims, + selector_checker = selector_checker, + merge_across_dims = merge_across_dims, + merge_across_dims_narm = merge_across_dims_narm, + split_multiselected_dims = split_multiselected_dims, + path_glob_permissive = path_glob_permissive, + largest_dims_length = largest_dims_length, + retrieve = retrieve, + num_procs = num_procs, + ObjectBigmemory = ObjectBigmemory, + silent = silent, + debug = debug))) + + #===============Put in Start()===================== + res <- vector('list', ndat) # A list saving datasets separately + ## matchcall needs to be modified and put in Start call. + ## Because do.call is used below, match.call() in Start() will generate incorrect + ## call argument (it is correct only if Start(...) is used directly.) + matchcall <- match.call() + matchcall[[1]] <- as.name('Start') + + for (n_dat in 1:ndat) { + + # Revise matchcall for each Start call + for (item in dim_names) { #only need to check '...' + if (!identical(matchcall[[item]], dim_params_list[[n_dat]][[item]])) { + matchcall[[item]] <- dim_params_list[[n_dat]][[item]] + } + } + + message("* Use Start() to get dataset '", dat_names[n_dat], "' now...") + res[[n_dat]] <- do.call(Start, c(dim_params_list[[n_dat]], matchcall = quote(matchcall))) + message("* Start() has successfully gotten dataset '", dat_names[n_dat], "'.") + } + names(res) <- dat_names #e.g., c(hadgem3, mpi_esm) + + #===================================================== + + if (retrieve) { + have_na <- as.logical(lapply(lapply(na_pos_record, unlist), sum)) # a logical vector + + for (n_dat in 1:ndat) { + if (have_na[n_dat]) { #need to insert NA + + # Save metadata for later use + saved_metadata <- attributes(res[[n_dat]]) #attr(res[[n_dat]], 'Variables') + + no_na_dim_name <- names(unlist(lapply(lapply(na_pos_record[[n_dat]], '==', 0), which))) + na_dim_name_ind <- which(!names(na_pos_record[[n_dat]]) %in% no_na_dim_name) + na_dims <- na_pos_record[[n_dat]][na_dim_name_ind] # list + correct_dim_order <- names(dim(res[[n_dat]])) # if reshape, here is diff from dim_names + for (na_dim in 1:length(na_dims)) { + na_dim_name <- names(na_dims)[na_dim] # 'time' + res[[n_dat]] <- Apply(res[[n_dat]], + target_dims = na_dim_name, + output_dims = na_dim_name, + fun = .insertNAdim, + na_pos = na_dims[[na_dim]] # a vector. b = c(2, 4) + )$output1 + res[[n_dat]] <- s2dv::Reorder(res[[n_dat]], correct_dim_order) + } + # Put metadata back + ## res[[n_dat]] already has 'dim' attributes. The rest is lost + lost_list_ind <- which(!names(saved_metadata) %in% names(attributes(res[[n_dat]]))) + attributes(res[[n_dat]]) <- c(attributes(res[[n_dat]]), + saved_metadata[lost_list_ind]) + } + + # Add na_pos_record list to metadata + attr(res[[n_dat]], 'na_pos_record')[[dat_names[n_dat]]] <- na_pos_record[[n_dat]] + } + + # If > 1 dataset, combine datasets. + if (length(res) > 1) { + bind_along <- which(dim_names == found_pattern_dim) + # Check if the dimensions are identical + array_dims <- lapply(res, dim) + names(array_dims) <- NULL + + if (!do.call(identical, array_dims)) { + stop("The expected dimensions of datasets are not identical. Cannot combine them together.") + } + abind_input_list <- c(res, along = bind_along) + combine_array <- do.call(abind::abind, abind_input_list) + names(dim(combine_array)) <- names(dim(res[[1]])) + + if (!silent) { + .message("Final dimension sizes after combination:") + final_dims_combined <- dim(combine_array) + longest_dim_len <- max(sapply(names(final_dims_combined), nchar)) + longest_size_len <- max(sapply(paste0(final_dims_combined, ''), nchar)) + sapply(names(final_dims_combined), + function(x) { + message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), + x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_combined[x], ''))), collapse = ''), + final_dims_combined[x])) + }) + bytes <- prod(c(final_dims_combined, 8)) + + .message(paste("Total size of combined data:")) + dim_sizes <- paste(final_dims_combined, collapse = ' x ') + .message(paste(dim_sizes, " x 8 bytes =", + format(structure(bytes, class = "object_size"), units = "auto")), + indent = 2) + } + + #---- restore metadata ---- + ## combine_array already has 'dim' attributes. The rest is lost + lost_list_ind <- which(!names(attributes(res[[1]])) %in% names(attributes(combine_array))) + attributes(combine_array) <- c(attributes(combine_array), + attributes(res[[1]])[lost_list_ind]) + + #---- Combine metadata which is not common ---- + #NOTE: $dim, $PatternDim, $class are common attributes. + attr_list <- vector('list', length = ndat) + for (n_dat in 1:ndat) { + attr_list[[n_dat]] <- attributes(res[[n_dat]]) + } + + ## $Files + attr(combine_array, 'Files') <- lapply(lapply(attr_list, '[', 'Files'), + '[[', 1) + ## $NotFoundFiles + sublist <- lapply(attr_list, '[[', 'NotFoundFiles') + if (!is.null(unlist(sublist))) { #Some of the dats have NotFoundFiles + attr(combine_array, 'NotFoundFiles') <- lapply(lapply(attr_list, '[', 'NotFoundFiles'), + '[[', 1) + } + ## $FileSelectors + attr(combine_array, 'FileSelectors') <- lapply(lapply(attr_list, '[', 'FileSelectors'), + '[[', 1) + names(attr(combine_array, 'FileSelectors')) <- dat_names + ## $ObjectBigmemory + attr(combine_array, 'ObjectBigmemory') <- lapply(lapply(attr_list, '[', 'ObjectBigmemory'), + '[[', 1) + names(attr(combine_array, 'ObjectBigmemory')) <- dat_names + ## $na_pos_record + attr(combine_array, 'na_pos_record') <- lapply(lapply(attr_list, '[[', 'na_pos_record'), + '[[', 1) + names(attr(combine_array, 'na_pos_record')) <- dat_names + + for (n_dat in 2:ndat) { + # $Variables + ## For $common, use the 1st data. Therefore, just add 2nd data $dat2 into + ## the metadata of 1st data + if (!all(names(attr(res[[n_dat]], 'Variables')) %in% 'common')) { + dat_list_name <- names(attr(res[[n_dat]], 'Variables'))[which(!names(attr(res[[n_dat]], 'Variables')) %in% 'common')] #'dat2' + attr(combine_array, 'Variables')[[dat_list_name]] <- attr(res[[n_dat]], 'Variables')[[dat_list_name]] + } + } + + return(combine_array) + + } else { # only 1 dataset + return(res[[1]]) + } + + } else { # retrieve = FALSE + for (n_dat in 1:ndat) { + # Add na_pos_record list to metadata + attr(res[[n_dat]], 'na_pos_record')[[dat_names[n_dat]]] <- na_pos_record[[n_dat]] + + # Correct dimensions if NA exists + if (sum(unlist(na_pos_record[[n_dat]])) != 0) { + have_na_dims <- which(unlist(lapply(lapply(na_pos_record[[n_dat]], '!=', 0), any))) + for (have_na_dim in 1:length(have_na_dims)) { + attr(res[[n_dat]], 'Dimensions')[have_na_dims[have_na_dim]] <- attr(res[[n_dat]], 'Dimensions')[have_na_dims[have_na_dim]] + length(na_pos_record[[n_dat]][[have_na_dims[have_na_dim]]]) + } + } + } + # Check if the datasets can be combined + dim_list <- lapply(1:ndat, function(x) { attr(res[[x]], 'Dimensions') } ) + found_pattern_dim_ind <-lapply(lapply( + lapply(dim_list, names), + '==', found_pattern_dim), which) + dim_list_without_dat <- lapply(1:ndat, function(x) { + dim_list[[x]] <- dim_list[[x]][-found_pattern_dim_ind[[x]]]}) + if (!do.call(identical, dim_list_without_dat)) { + .warning(paste0("The datasets are detected to have inconsistent dimensions. ", + "They may not be able to combine for the later operation.")) + } else { + if (!silent) { + dim_list_dat <- lapply(1:ndat, function(x) { + dim_list[[x]] <- dim_list[[x]][found_pattern_dim_ind[[x]]]}) + combined_dat_length <- do.call(sum, dim_list_dat) + final_dims_combined <- attr(res[[1]], 'Dimensions') + final_dims_combined[found_pattern_dim] <- do.call(sum, dim_list_dat) + .message("Detected final dimension sizes after combination:") + longest_dim_len <- max(sapply(names(final_dims_combined), nchar)) + longest_size_len <- max(sapply(paste0(final_dims_combined, ''), nchar)) + sapply(names(final_dims_combined), + function(x) { + message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), + x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_combined[x], ''))), collapse = ''), + final_dims_combined[x])) + }) + bytes <- prod(c(final_dims_combined, 8)) + + .message(paste("Total size of combined data:")) + dim_sizes <- paste(final_dims_combined, collapse = ' x ') + .message(paste(dim_sizes, " x 8 bytes =", + format(structure(bytes, class = "object_size"), units = "auto")), + indent = 2) + } + } + + return(res) + } + +} + diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R index 55e47f07716f476dcaf86b0b036099a9d9238906..6bce18113ee81ed9d5a9461a2b6c47960958ef9c 100644 --- a/inst/chunking/load_process_save_chunk.R +++ b/inst/chunking/load_process_save_chunk.R @@ -6,9 +6,7 @@ if (!is.null(lib_dir)) { .libPaths(new = lib_dir) } library(startR) - out_dir <- - debug <- start_calls <- start_calls_attrs <- @@ -65,6 +63,20 @@ for (input in 1:length(data)) { for (call_dim in names_dims_to_alter) { start_call[[call_dim]] <- startR:::.chunk(chunk_indices[call_dim], chunks[[call_dim]], eval(start_call[[call_dim]])) + #========================================= + # multiStart is used + # If n_chunks > 1 & call_dim has NA, extra process in Start() when chunking + if (!is.null(start_calls_attrs[[input]][['na_pos_record']])) { + if (chunks[[call_dim]] > 1 & + start_calls_attrs[[input]][['na_pos_record']][[1]][[call_dim]] != 0) { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 1) + # put na_pos_record in attributes + attr(start_call[[call_dim]], "na_pos_record") <- start_calls_attrs[[input]][['na_pos_record']][[1]][[call_dim]] + } else { + attr(start_call[[call_dim]], "chunk") <- c(attr(start_call[[call_dim]], "chunk"), chunk_with_NA = 0) + } + } + #========================================== } } if (!('num_procs' %in% names(start_call))) { @@ -98,6 +110,66 @@ for (input in 1:length(data)) { #write.table(attributes(data[[input]])$ObjectBigmemory, # file = paste0(task_path, '.filename.txt'), # col.names = FALSE, row.names = FALSE, quote = FALSE) + + #====================================================== + # If multiStart() is used, deal with NA insertion here + if (!is.null(start_calls_attrs[[input]][['na_pos_record']])) { + na_pos_record_list <- start_calls_attrs[[input]][['na_pos_record']][[1]] + if (any(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any)))) { + have_na_dims <- which(unlist(lapply(lapply(na_pos_record_list, '!=', 0), any))) + for (have_na_dim_name in names(have_na_dims)) { + + if (!have_na_dim_name %in% names(chunks)) { + not_chunked <- TRUE + } else if (chunks[[have_na_dim_name]] == 1) { + not_chunked <- TRUE + } else { + not_chunked <- FALSE + } + # If the dim is not chunked, insert NAs directly according to the list +Apply <- multiApply::Apply + if (not_chunked) { + + data[[input]] <- do.call("Apply", + c( + list(data[[input]], + target_dims = have_na_dim_name, + fun = startR:::.insertNAdim, + output_dims = have_na_dim_name, + na_pos = na_pos_record_list[[have_na_dim_name]], + ncores = threads_compute) + ) + )$output1 + + + } else { # chunk along this dim. Need to examine piece by piece. + dim_na_pos_record <- attr(start_call[[have_na_dim_name]], "na_pos_record") + chunk_vector <- attr(start_call[[have_na_dim_name]], "chunk") + sub_chunk_indices <- startR:::chunk_indices(length(start_call[[have_na_dim_name]]) + length(dim_na_pos_record), + chunk = chunk_vector[1], + n_chunks = chunk_vector[2], + have_na_dim_name) + without_NA_dim_length <- dim(data[[input]])[have_na_dim_name] + if (without_NA_dim_length != length(sub_chunk_indices)) { # need to insert NA + sub_na_pos <- which(sub_chunk_indices %in% dim_na_pos_record) + + data[[input]] <- do.call("Apply", + c( + list(data[[input]], + target_dims = have_na_dim_name, + fun = startR:::.insertNAdim, + output_dims = have_na_dim_name, + na_pos = sub_na_pos, + ncores = threads_compute) + ) + )$output1 + } + } + } + } + + } + #========================================================= } t_end_load <- Sys.time() t_load <- as.numeric(difftime(t_end_load, t_begin_load, units = 'secs')) diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index 06f2c044eb64673cedaf1141a00c0cc69bc2b5b5..8a202cc49d55ae4d1a9d01d82b6afad35177699e 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -54,6 +54,11 @@ You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata This script shows how to load and plot data in rotated coordinates using **Monarch-dust** simulations. + 13. [Use multiStart to read data with NA assigned](inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R) + The script shows a simple case that uses multiStart to read data when NA insertion is needed in certain selectors. + 14. [Use multiStart to read two datasets with different prediction lengths per file](inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R) + This use case shows how to use multiStart() to read two datasets (EC-Earth3 and MPI-ESM) with different prediction length per file together. + 2. **Execute computation (use `Compute()`)** 1. [Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R) 2. [Function using attributes of the data](inst/doc/usecase/ex2_2_attr.R) @@ -72,8 +77,8 @@ You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata 10. [Apply an existing mask on data](inst/doc/usecase/ex2_10_existing_mask.R) This use case shows you how to apply the existing mask file on your data. If you need to create the mask file on your own, go to ex2_9_mask.R. - - - - + 11. [Use multiStart to read two datasets with different calendars in one call](inst/doc/usecase/ex2_11_multiStart_insert_NA.R) + This use case decribes the features of multiStart and how to use it. It shows how to read two datasets with different calendars aligned together with NA inserted. It is recommended to read it first if you want to use multiStart(). + 12. [Load monthly data from decadal prediction systems with different initial dates](inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R) + An example showing how to load and operate monthly data from decadal prediction systems that are initialised in different dates by multiStart(). diff --git a/inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R b/inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R new file mode 100644 index 0000000000000000000000000000000000000000..cb94d6b1d7272044ac9f797a2d06c0e93f6fdc45 --- /dev/null +++ b/inst/doc/usecase/ex1_13_multiStart_insert_NA_reshape.R @@ -0,0 +1,36 @@ +#------------------------------------------------- +# This case doesn't make much sense to insert an NA. But it shows the availability +# of multiStart() to deal with NAs while reshaping. If you have a more meaningful +# example, please let us know. + +#NOTE: It may not work with 'split_multiselected_dims'. Need further tests. +#------------------------------------------------- + +#=================================== +# merge_across_dims; two variables +#=================================== + +path <- '/esarchive/recon/ecmwf/erainterim/monthly_mean/$var$_f6h/$var$_$sdate$.nc' +sdate <- c('199801', '199802', '199803') +time <- indices(c(1, 2, NA, 3)) +res <- multiStart( + dat = list(list(name = 'erainterim', path = path)), + var = c('ps', 'tas'), + sdate = sdate, + time = list(list(name = 'erainterim', time = time)), + time_across = 'sdate', + merge_across_dims = TRUE, + lat = indices(1), lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = TRUE) + +dim(res) +# dat var time lat lon +# 1 2 4 1 1 +res[1, , , 1, 1] + [,1] [,2] [,3] [,4] +[1,] 102528.9219 101670.4375 NA 102194.1484 +[2,] 244.3599 242.1576 NA 243.7157 + diff --git a/inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R b/inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R new file mode 100644 index 0000000000000000000000000000000000000000..34c7aee1d5a5c2e676903ae1af3a47d2468dde99 --- /dev/null +++ b/inst/doc/usecase/ex1_14_multiStart_ecearth_merge.R @@ -0,0 +1,65 @@ +#-------------------------------------------------------- +# Author: An-Chi Ho +# Date: 7th April 2021 +#--------------------------------------------- +# This use case shows how to use multiStart() to read two datasets with different +# prediction length per file together. The two datasets, EC-Earth3 and MPI-ESM1.2-HR, +# have the same start date and calendar, but the prediction length per file is different. +# EC-Earth3 has seveal files for "sdate = 2000" and each file has 12 time steps (i.e., +# 12 months); MPI-ESM1.2 has only one file for "sdate = 2000" and the file contains +# 122 time steps. To read them together, 'merge_across_dims = TRUE' and 'time_across = 'fyear' +# need to be used. +#--------------------------------------------- + +library(startR) + +# Define selectors + #------- EC-Earth3 --------- + ## calendar: standard + ## Prediction period: 2000 11 - 2011 10 (11 years) + ## time steps per file: 12 + path_ecearth3 <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/', + 'EC-Earth3/dcppA-hindcast/r1i1p1f1/Omon/$var$/gr/v20190713/', + '$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc') + ## file name: + ## tos_Omon_EC-Earth3_dcppA-hindcast_s2000-r1i1p1f1_gr_200011-200110.nc + ## tos_Omon_EC-Earth3_dcppA-hindcast_s2000-r1i1p1f1_gr_200111-200210.nc + + #-------- MPI-ESM1.2-HR -------- + ## calendar: standard + ## Prediction period: 2000 11 - 2010 12 (10 years and 2 months) + ## time steps per file: 122 + path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/', + 'r1i1p1f1/Omon/$var$/gr/v20200320/', + '$var$_Omon_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc') + ## file name: + ## tos_Omon_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gr_200011-201012.nc + + # Selectors + sdate <- '2000' + fyear <- 2000:2001 + fyear_ecearth3 <- paste0(fyear, '11-', as.numeric(fyear) + 1, '10') + fyear_mpi_esm <- paste0(sdate, '11-', as.numeric(sdate) + 10, '12') + time_ecearth3 <- 1:24 #200011-200210 + time_mpi_esm <- 1:24 #200011-200210 + + + +res <- multiStart( + dat = list(list(name = 'ecearth3', path = path_ecearth3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tos', sdate = '2000', + fyear = list(list(name = 'ecearth3', fyear = fyear_ecearth3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'ecearth3', time = time_ecearth3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = indices(1), lon = indices(1), + time_across = 'fyear', + merge_across_dims = TRUE, + return_vars = list(lat = 'dat', lon = 'dat', time = 'sdate'), + retrieve = TRUE) + +dim(res) +# dat var sdate time lat lon +# 2 1 1 24 1 1 diff --git a/inst/doc/usecase/ex2_11_multiStart_insert_NA.R b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R new file mode 100644 index 0000000000000000000000000000000000000000..068bc06f531e4a38d0e66996700a2ee1a2f28f9d --- /dev/null +++ b/inst/doc/usecase/ex2_11_multiStart_insert_NA.R @@ -0,0 +1,211 @@ +#------------------------------------------------------ +# Author: An-Chi Ho +# Date: 29th March 2021 +#------------------------------------------------------ +# This use case shows the basic usage of multiStart(), the wrapper of Start() +# for multiple datasets retrieval that have different selector requirement. +# The usage is quite similar to Start() only with some differences. +# - To assign different selectors for each dataset, use the same format as the +# pattern_dim: a list of lists, first item 'name' is the dataset name and the second +# item '' is the selector. +# - When defining the operation and workflow, notice that the considered data array +# number is the dataset number. In this case, it is 2 (hadgem3 and mpi_esm). Even +# though multiStart() returns one object 'data', it actually contains the startR calls +# for each dataset separately. +# +# This use case wants to put HadGEM3 and MPI-ESM datasets together in one call. +# The major differences between them are the spatial grids and calendar type. +# HadGEM3 uses 360_day calendar and MPI-ESM uses standard calendar. For the grid +# difference, the 'tramsform' parameters can deal with it; while for the calendar +# difference, Start() has difficulty align the time steps. We cannot assign NAs +# as the selector in Start(), and Start() coarces the values to the nearest found +# values, producing the hidden errors. +# +# Therefore, we use multiStart() to insert NAs manually. Three days, 30th Dec., +# 31st Dec., and 1st Jan. are requested. HadGEM3 with 360_day calendar doesn't +# have 31st Dec., so we need to insert one NA at the 2nd time step. Be sure that +# the length (NA included) of two time selectors are equal. +# +# Here are some limitations for now: +# 1. The selector for each dataset can only be indices(), not values() or 'all'. +# If the selector is the common one (e.g., 'sdate', 'lat', and 'lon' in this +# use case), then it follows the rule of Start(). +# 2. The reshape options are not all available. See ex1_13 for details. +# 3. The checker of multiStart() is not comprehensive. Please carefully examine +# the results you get. +#------------------------------------------------------ + +#======================================================= +# Define the paths and selectors +#======================================================= + + # Define paths + # HadGEM3 daily data (360_day) + path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/', + 'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/', + '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc') + ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc + + # MPI-ESM1.2-HR (standard) + path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/', + 'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/', + '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') + ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc + + # Define selectors for each dataset + sdate <- '2000' + fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230') + fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231') + # To align the time steps, insert an NA in hadgem3 + time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1 + time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1 + + +#======================================================= +# retrieve = TRUE; interpolation +#======================================================= + + res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', + sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + transform = CDORemapper, + transform_params = list(grid = 'r360x181', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = TRUE) + + # Check the returned values. hadgem3 has NA in the 2nd time step. + dim(res) + # dat var sdate fyear time lat lon + # 2 1 1 1 3 61 51 + res[,1,1,1,,10,10] + # [,1] [,2] [,3] + #[1,] 292.5023 NA 292.0899 + #[2,] 293.0914 293.4608 292.8224 + + + +#======================================================= +# retrieve = FALSE; Compute() +#======================================================= + data <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', + sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + transform = CDORemapper, + transform_params = list(grid = 'r360x181', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = FALSE) + +str(data) +#List of 2 +# $ hadgem3: language Start ... +# ..- attr(*, "Dimensions") ... +# .. +# ..- attr(*, "Variables") ... +# .. +# $ mpi_esm: language Start ... +# ..- attr(*, "Dimensions") ... +# .. +# ..- attr(*, "Variables") ... +# .. + +str(attr(data[[1]], 'na_pos_record')) +#List of 1 +# $ hadgem3:List of 7 +# ..$ dat : num 0 +# ..$ var : num 0 +# ..$ sdate: num 0 +# ..$ fyear: num 0 +# ..$ time : int 2 +# ..$ lat : num 0 +# ..$ lon : num 0 + +str(attr(data[[2]], 'na_pos_record')) +#List of 1 +# $ mpi_esm:List of 7 +# ..$ dat : num 0 +# ..$ var : num 0 +# ..$ sdate: num 0 +# ..$ fyear: num 0 +# ..$ time : num 0 +# ..$ lat : num 0 +# ..$ lon : num 0 + +# Notice that even though multiStart() returns one object 'data', it actually +# contain two startR calls inside. +length(data) +# 2 +names(data) +#[1] "hadgem3" "mpi_esm" + +# Therefore, the operation should consider two datasets instead of one. +func_sum <- function(x, y) { + return(x + y) +} + +# The target_dims should be a list of 2. The dataset names can be random. +# 'time' and 'fyear' can also be the operational dimension. +step <- Step(func_sum, target_dims = list(hadgem3 = c('var'), + mpi_esm = c('var')), + output_dims = c('var')) + +# The inputs is still one object 'data', but it contains three calls implicitly. +wf <- AddStep(data, step) + +# Option 1: run locally +output <- Compute(wf, chunks = list(time = 2)) + +# Option 2: run on Nord3 +#-------------------user-defined--------------------- + queue_host <- 'nord3' + temp_dir <- '/gpfs/scratch/bsc32/bsc32/startR_hpc/' + ecflow_suite_dir <- '/home/Earth//startR_local/' +#---------------------------------------------------- + + output <- Compute(wf, + chunks = list(time = 2), + threads_load = 2, + threads_compute = 4, + cluster = list(queue_host = queue_host, + queue_type = 'lsf', + temp_dir = temp_dir, + cores_per_job = 2, + job_wallclock = '05:00', + max_jobs = 4, + extra_queue_params = list('#BSUB -q bsc_es'), + bidirectional = FALSE, + polling_period = 10 + ), + ecflow_suite_dir = ecflow_suite_dir, + wait = TRUE + ) + +dim(output$output1) +# var time dat sdate fyear lat lon +# 1 3 1 1 1 61 51 +output$output1[1, , 1, 1, 1, 1, 1] +[1] 584.9990 NA 587.8201 + diff --git a/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R b/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R new file mode 100644 index 0000000000000000000000000000000000000000..fe91f3be25e93dee5db98b5135183957fe02b928 --- /dev/null +++ b/inst/doc/usecase/ex2_12_multiStart_diff_init_dates.R @@ -0,0 +1,115 @@ +#-------------------------------------------------------- +# Author: Carlos Delgado, An-Chi Ho +# Date: 24th March 2021 +#-------------------------------------------------------- +# Example on how to load monthly data from decadal prediction systems that are +# initialised in different dates: +# - HadGEM3 (initialised in November) +# - IPSL-CM6A-LR (initialised in January) +# - NorCPM1 (initialised in October) +# Therefore, we use multiStart() to insert NAs at missing months manually. +# +# To see more details of multiStart() usage, check ex2_11. +#-------------------------------------------------------- +library(startR) + +variable <- 'tas' +sdates <- 2000:2004 + +## Paths +path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/', + 'MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Amon/$var$/gr/v20200316/', + '$var$_*_s$sdate$_r1i1p1f2_gr_$fyear$.nc') +path_ipsl <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/ipsl-cm6a-lr/cmip6-dcppA-hindcast_i1p1/DCPP/', + 'IPSL/IPSL-CM6A-LR/dcppA-hindcast/r1i1p1f1/Amon/$var$/gr/v20200504/', + '$var$_Amon_IPSL-CM6A-LR_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc') +path_norcpm1 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/norcpm1/cmip6-dcppA-hindcast_i1p1/DCPP/', + 'NCC/NorCPM1/dcppA-hindcast/r1i1p1f1/Amon/$var$/gn/v20200320/', + '$var$_Amon_NorCPM1_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') + +## Forecast months: six indices (from October to March) +time_hadgem3 <- indices(c(NA, 1, 2, 3, 4, 5)) +time_ipsl <- indices(c(NA, NA, NA, 1, 2, 3)) +time_norcpm1 <- indices(c(1, 2, 3, 4, 5, 6)) + +data <- multiStart(dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'ipsl', path = path_ipsl), + list(name = 'norcpm1', path = path_norcpm1)), + var = variable, + sdate = paste0(sdates), + fyear = indices(1), ## there is one file per sdate + fyear_depends = 'sdate', + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'ipsl', time = time_ipsl), + list(name = 'norcpm1', time = time_norcpm1)), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), + transform = CDORemapper, + transform_params = list(grid = 'r180x90', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = FALSE) + + +# Notice that though multiStart() returns one object 'data', it actually contains +# three startR calls. +length(data) +# 3 +names(data) +#[1] "hadgem3" "ipsl" "norcpm1" + +# Therefore, the operation should consider three data arrays +# rather than one. +fun <- function(dat1, dat2, dat3) { + # dat: [dat = 1] + mean(c(dat1, dat2, dat3), na.rm = TRUE) +} + +# As explained above, the number of data arrays is three, so the target_dims is a +# list of three. The names of dataset (i.e., dat1, dat2, dat3) can be random. +step <- Step(fun = fun, + target_dims = list(dat1 = 'dat', dat2 = 'dat', dat3 = 'dat'), + output_dims = NULL) + +# The inputs is still 'data', which contains three calls implicitly. +wf <- AddStep(inputs = data, step_fun = step) + + +# Option 1: run locally +output <- Compute(workflow = wf, + chunks = list(sdate = 5)) + +# Option 2: run on Nord3 +#-------------------user-defined--------------------- + queue_host <- 'nord3' + temp_dir <- '/gpfs/scratch/bsc32/bsc32/startR_hpc/' + ecflow_suite_dir <- '/home/Earth//startR_local/' +#---------------------------------------------------- +output <- Compute(workflow = wf, + chunks = list(sdate = 5), + threads_load = 1, + threads_compute = 5, + cluster = list(queue_host = queue_host, + queue_type = 'lsf', + temp_dir = temp_dir, + cores_per_job = 1, + job_wallclock = '01:00', + max_jobs = 5, + extra_queue_params = list('#BSUB -q bsc_es -M 3000'), + bidirectional = FALSE, + polling_period = 10), + ecflow_suite_dir = ecflow_suite_dir, + wait = TRUE) + + +# Results +dim(output$output1) +# time var sdate fyear lat lon +# 6 1 5 1 30 26 + +summary(output$output1) +# Min. 1st Qu. Median Mean 3rd Qu. Max. +# 279.4 293.4 296.6 295.5 298.7 304.1 diff --git a/man/multiStart.Rd b/man/multiStart.Rd new file mode 100644 index 0000000000000000000000000000000000000000..a4fa459560972ac4669c9b72d1ed25ec66ab0f9c --- /dev/null +++ b/man/multiStart.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multiStart.R +\name{multiStart} +\alias{multiStart} +\title{How to use: +1. Allow to read multiple datasets with different selectors +2. A list of list is used when different selectors are required, just like 'dat'. + The items in each list must be 'name' and ''. 'name' is the + dataset name, same as the one in 'dat'. +3. Only indices() is accepted. (If for values(), use selectorchecker to transform?) +4. Allow to have NA in the selectors. Those NAs will be removed when being fed in + Start(), and be inserted back after Start().} +\usage{ +multiStart( + ..., + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = FALSE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, + debug = FALSE +) +} +\description{ +NOTE: +1. No reshape (merge_across_dim yes) +2. The input check is not comprehensive +} diff --git a/tests/testthat/test-multiStart_NA_insertion.R b/tests/testthat/test-multiStart_NA_insertion.R new file mode 100644 index 0000000000000000000000000000000000000000..0c95803396696e4e8edf446be7d7f267d6e2c24a --- /dev/null +++ b/tests/testthat/test-multiStart_NA_insertion.R @@ -0,0 +1,146 @@ +context("multiStart() NA insertion") +# Same case as ex2_11. +#--------------------------------------------------------------- + + # Define paths + # HadGEM3 daily data (360_day) + path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/', + 'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/', + '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc') + ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc + + # MPI-ESM1.2-HR (standard) + path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/', + 'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/', + 'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/', + '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') + ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc + + # Define selectors for each dataset + sdate <- '2000' + fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230') + fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231') + # To align the time steps, insert an NA in hadgem3 + time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1 + time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1 + + + + +test_that("1. retrieve = T", { + +suppressWarnings( +res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', + sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = values(list(-10, 10)), + lon = values(list(0, 10)), + transform = CDORemapper, + transform_params = list(grid = 'r360x181', + method = 'conservative', + crop = c(0, 10, -10, 10)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = TRUE) +) + + +expect_equal( +dim(res), +c(dat = 2, var = 1, sdate = 1, fyear = 1, time = 3, lat = 21, lon = 11) +) +expect_equal( +length(which(is.na(res))), +231 +) +expect_equal( +res[1, 1, 1, 1, , 1, 1], +c(297.5703, NA, 297.5506), +tolerance = 0.0001 +) + +}) + +test_that("2. retrieve = F; Compute locally", { + +suppressWarnings( +res <- multiStart( + dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'mpi_esm', path = path_mpi_esm)), + var = 'tasmax', sdate = '2000', + fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), + list(name = 'mpi_esm', fyear = fyear_mpi_esm)), + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'mpi_esm', time = time_mpi_esm)), + lat = indices(3:5), lon = indices(1), + return_vars = list(lat = 'dat', lon = 'dat', time = 'dat'), + retrieve = FALSE) +) + +#--------------------------------------- +expect_equal( +attr(res[[1]], 'Dimensions'), +c(dat = 1, var = 1, sdate = 1, fyear = 1, time = 3, lat = 3, lon = 1) +) +expect_equal( +attr(res[[2]], 'Dimensions'), +c(dat = 1, var = 1, sdate = 1, fyear = 1, time = 3, lat = 3, lon = 1) +) +expect_equal( +attr(res[[1]], 'na_pos_record')[[1]]$time, +2 +) +expect_equal( +sum(unlist(attr(res[[2]], 'na_pos_record')[[1]])), +0 +) + +#--------------------------------------- + +func_sum <- function(x, y) { + return(x + y) +} + +# 'time' and 'fyear' can also be the operational dimension +step <- Step(func_sum, target_dims = list(hadgem3 = c('var', 'lon'), + mpi_esm = c('var', 'lon')), + output_dims = c('var', 'lon')) + +wf <- AddStep(res, step) + +output1 <- Compute(wf, chunks = list(lat = 2)) +output2 <- Compute(wf, chunks = list(lat = 2, time = 2)) + +expect_equal( +dim(output1$output1), +c(var = 1, lon = 1, time = 3, dat = 1, sdate = 1, fyear = 1, lat = 3) +) +expect_equal( +dim(output2$output1), +c(var = 1, lon = 1, time = 3, dat = 1, sdate = 1, fyear = 1, lat = 3) +) +expect_equal( +output1$output1[1,1,,1,1,1,1], +c(503.0177, NA, 507.7226), +tolerance = 0.001 +) +expect_equal( +all(is.na(output1$output1[1,1,2,1,1,1,])), +TRUE +) + +expect_equal( +as.vector(output1$output1), +as.vector(output2$output1) +) + + + +}) diff --git a/tests/testthat/test-multiStart_diff_init_dates.R b/tests/testthat/test-multiStart_diff_init_dates.R new file mode 100644 index 0000000000000000000000000000000000000000..9e0276f220d9944586b657f8f646b1057144db3c --- /dev/null +++ b/tests/testthat/test-multiStart_diff_init_dates.R @@ -0,0 +1,160 @@ +context("multiStart() NA insertion") +# Same case as ex2_12. +#--------------------------------------------------------------- + +variable <- 'tas' +sdates <- 2000:2004 + +## Paths +path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/', + 'MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Amon/$var$/gr/v20200316/', + '$var$_*_s$sdate$_r1i1p1f2_gr_$fyear$.nc') +path_ipsl <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/ipsl-cm6a-lr/cmip6-dcppA-hindcast_i1p1/DCPP/', + 'IPSL/IPSL-CM6A-LR/dcppA-hindcast/r1i1p1f1/Amon/$var$/gr/v20200504/', + '$var$_Amon_IPSL-CM6A-LR_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$fyear$.nc') +path_norcpm1 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/norcpm1/cmip6-dcppA-hindcast_i1p1/DCPP/', + 'NCC/NorCPM1/dcppA-hindcast/r1i1p1f1/Amon/$var$/gn/v20200320/', + '$var$_Amon_NorCPM1_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') + +## Forecast months: six indices (from October to March) +time_hadgem3 <- indices(c(NA, 1, 2, 3, 4, 5)) +time_ipsl <- indices(c(NA, NA, NA, 1, 2, 3)) +time_norcpm1 <- indices(c(1, 2, 3, 4, 5, 6)) + + +test_that("1. retrieve = T", { + +suppressWarnings( +data <- multiStart(dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'ipsl', path = path_ipsl), + list(name = 'norcpm1', path = path_norcpm1)), + var = variable, + sdate = paste0(sdates), + fyear = indices(1), ## there is one file per sdate + fyear_depends = 'sdate', + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'ipsl', time = time_ipsl), + list(name = 'norcpm1', time = time_norcpm1)), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), + transform = CDORemapper, + transform_params = list(grid = 'r180x90', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = TRUE) +) + + +expect_equal( +dim(data), +c(dat = 3, var = 1, sdate = 5, fyear = 1, time = 6, lat = 30, lon = 26) +) +expect_equal( +mean(data, na.rm = T), +295.4626, +tolerance = 0.0001 +) +expect_equal( +length(which(is.na(data))), +15600 +) +expect_equal( +data[1, 1, 2, 1, 1:4, 1, 1], +c(NA, 290.1964, 292.6637, 293.2186), +tolerance = 0.0001 +) +expect_equal( +data[2, 1, 2, 1, 1:4, 1, 1], +c(NA, NA, NA, 294.2315), +tolerance = 0.0001 +) +expect_equal( +data[3, 1, 2, 1, 1:4, 1, 1], +c(288.6785, 289.5546, 290.4638, 291.7312), +tolerance = 0.0001 +) + +}) + + +test_that("2. retrieve = F", { + +suppressWarnings( +data <- multiStart(dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'ipsl', path = path_ipsl), + list(name = 'norcpm1', path = path_norcpm1)), + var = variable, + sdate = paste0(sdates), + fyear = indices(1), ## there is one file per sdate + fyear_depends = 'sdate', + time = list(list(name = 'hadgem3', time = time_hadgem3), + list(name = 'ipsl', time = time_ipsl), + list(name = 'norcpm1', time = time_norcpm1)), + lat = values(list(-30, 30)), + lon = values(list(0, 50)), + synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), + transform = CDORemapper, + transform_params = list(grid = 'r180x90', + method = 'conservative', + crop = c(0, 50, -30, 30)), + transform_vars = c('lat', 'lon'), + return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), + retrieve = FALSE) +) + +expect_equal( +length(data), +3 +) +expect_equal( +names(data), +c("hadgem3", "ipsl", "norcpm1") +) + +# Therefore, the operation should consider three data arrays +# rather than one. +fun <- function(dat1, dat2, dat3) { + # dat: [dat = 1] + mean(c(dat1, dat2, dat3), na.rm = TRUE) +} + +# As explained above, the number of data arrays is three, so the target_dims is a +# list of three. The names of dataset (i.e., dat1, dat2, dat3) can be random. +step <- Step(fun = fun, + target_dims = list(dat1 = 'dat', dat2 = 'dat', dat3 = 'dat'), + output_dims = NULL) + +# The inputs is still 'data', which contains three calls implicitly. +wf <- AddStep(inputs = data, step_fun = step) + + +# run locally +output <- Compute(workflow = wf, + chunks = list(sdate = 2))$output1 + + +expect_equal( +dim(output), +c(time = 6, var = 1, sdate = 5, fyear = 1, lat = 30, lon = 26) +) + +expect_equal( +mean(output), +295.4511, +tolerance = 0.0001 +) +expect_equal( +length(which(is.na(output))), +0 +) +expect_equal( +output[1:5], +c(288.9735, 290.6019, 291.8069, 293.3672, 293.4944), +tolerance = 0.0001 +) + + +})