diff --git a/DESCRIPTION b/DESCRIPTION
index a75b2cfedb8c97faf8bc273c812c97f75afb21f6..bf0ed00992f66d66b2701e9199c1d41db7c593fc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: startR
 Title: Automatically Retrieve Multidimensional Distributed Data Sets
-Version: 2.0.1
+Version: 2.1.0
 Authors@R: c(
     person("BSC-CNS", role = c("aut", "cph")),
     person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")),
@@ -9,15 +9,15 @@ Authors@R: c(
     person("Javier", "Vegas", , "javier.vegas@bsc.es", role = c("ctb")),
     person("Pierre-Antoine", "Bretonniere", , "pierre-antoine.bretonniere@bsc.es", role = c("ctb")),
     person("Roberto", "Serrano", , "rsnotivoli@gmal.com", role = c("ctb")))
-Description: Tool to automatically fetch, transform and arrange subsets of multi-
-    dimensional data sets (collections of files) stored in local and/or remote
-    file systems or servers, using multicore capabilities where possible. The tool
-    provides an interface to perceive a collection of data sets as a single large
-    multidimensional data array, and enables the user to request for automatic
+Description: Tool to automatically fetch, transform and arrange subsets of
+    multi-dimensional data sets (collections of files) stored in local and/or
+    remote file systems or servers, using multicore capabilities where possible.
+    The tool provides an interface to perceive a collection of data sets as a single
+    large multidimensional data array, and enables the user to request for automatic
     retrieval, processing and arrangement of subsets of the large array. Wrapper
     functions to add support for custom file formats can be plugged in/out, making
-    the tool suitable for any research field where large multidimensional data
-    sets are involved.
+    the tool suitable for any research field where large multidimensional data sets
+    are involved.
 Depends:
     R (>= 3.2.0)
 Imports:
@@ -36,5 +36,5 @@ License: LGPL-3
 URL: https://earth.bsc.es/gitlab/es/startR/
 BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues
 LazyData: true
-SystemRequirements: cdo
+SystemRequirements: cdo ecFlow
 RoxygenNote: 5.0.0
diff --git a/NEWS.md b/NEWS.md
index c348ddaa134d2f9e18c08b6eb97f37ef29fa4ce2..8c7a11636c700e54e7c663ffedbc9295be22601b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,13 @@
+# startR v2.1.0 (Release date: 2020-10-30)
+- Bugfix for metadata retrieval when there is more than one dataset and one of them is missing.
+- Bugfix for when the Start() parameter 'metadata_dims' is set to a non-dat dimension.
+- Bugfix for wildcard reading when the Start() parameter 'path_glob_permissive' is used.
+- /dev/shm automatic cleaning on Compute(). Solves the error 'No space left on device' which happened when jobs were aborted.
+- Add new parameter 'largest_dims_length' in Start(). It can examine all the files to find the largest inner dimension length. It is useful when an inner dimension does not have a consistent length among the files (e.g., a varying ensemble number).
+
+# startR v2.0.1 (Release date: 2020-09-10)
+- /dev/shm automatic cleaning on Compute()
+
 # startR v2.0.1 (Release date: 2020-08-25)
 - Bugfix for the function .chunk(). Its name was chunk() before v2.0.0, and there were two parts not renamed to .chunk() in v2.0.0.
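For illustration, a minimal sketch of a Start() call using the new 'largest_dims_length' parameter announced above; the path, variable, and dimension names here are hypothetical and should be adapted to the target archive:

```r
library(startR)

# Hypothetical archive where early start dates hold 25 ensemble members
# and later ones hold 51; adjust the path and dimension names to your data.
repos <- '/path/to/exp/monthly_mean/$var$_f6h/$var$_$sdate$.nc'

data <- Start(dat = repos,
              var = 'tas',
              sdate = c('19990101', '20170101'),
              member = 'all',
              time = 'all',
              lat = 'all',
              lon = 'all',
              # TRUE scans every file for the largest 'member' length;
              # largest_dims_length = c(member = 51) would skip the scan
              # when the expected maximum is already known.
              largest_dims_length = TRUE,
              retrieve = FALSE)
```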
diff --git a/R/ByChunks.R b/R/ByChunks.R
index 5f0bba5783974ee912825b25c2449e26893d5f72..dd101120d1b394ba8cae6eca7f8bf1cb43688f8f 100644
--- a/R/ByChunks.R
+++ b/R/ByChunks.R
@@ -551,6 +551,8 @@ ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto',
               ecflow_suite_dir_suite)
     file.copy(system.file('chunking/tail.h', package = 'startR'),
               ecflow_suite_dir_suite)
+    #file.copy(system.file('chunking/clean_devshm.sh', package = 'startR'),
+    #          ecflow_suite_dir_suite)
   }
 
   add_line <- function(suite, line, tabs) {
diff --git a/R/Start.R b/R/Start.R
index 8243fda0d5826a1da7ba09e65b0524dab2ecac4e..39e1e6cbd8588c37f555eaea8cb268ec45645e82 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -677,6 +677,21 @@
 #'\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
 #'\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
 #'only the first found file will be used.
+#'@param largest_dims_length A logical value or a named integer vector
+#'  indicating whether Start() should examine all the files to get the largest
+#'  length of the inner dimensions (TRUE) or use the first valid file of each
+#'  dataset to determine the returned dimension lengths (FALSE). Since examining
+#'  all the files could be time-consuming, a vector can be used to explicitly
+#'  specify the expected length of the inner dimensions. For those inner dimensions
+#'  not specified, the first valid file will be used. The default value is FALSE.\cr\cr
+#'  This parameter is useful when the required files don't have consistent
+#'  inner dimension lengths. For example, suppose 10 experimental data files
+#'  covering a series of start dates are required. The data contain only 25 members
+#'  for the first 2 years but 51 members for the later years. If \code{'largest_dims_length = FALSE'},
+#'  the returned member dimension length will be 25 only. The 26th to 51st
+#'  members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+#'  the returned member dimension length will be 51. To save resources,
+#'  \code{'largest_dims_length = c(member = 51)'} can also be used.
 #'@param retrieve A logical value indicating whether to retrieve the data
 #'  defined in the Start() call or to explore only its dimension lengths
 #'  and names, and the values for the file and inner dimensions. The default
@@ -686,6 +701,9 @@
 #'  multiple involved files in a call to Start(). If set to NULL,
 #'  takes the number of available cores (as detected by detectCores() in
 #'  the package 'future'). The default value is 1 (no parallel execution).
+#'@param ObjectBigmemory A character string to be included as part of the
+#'  bigmemory object name. This parameter is intended to be used internally by
+#'  the chunking capabilities of startR.
 #'@param silent A logical value of whether to display progress messages (FALSE)
 #'  or not (TRUE). The default value is FALSE.
 #'@param debug A logical value of whether to return detailed messages on the
@@ -805,8 +823,10 @@ Start <- function(..., # dim = indices/selectors,
                   merge_across_dims_narm = FALSE,
                   split_multiselected_dims = FALSE,
                   path_glob_permissive = FALSE,
+                  largest_dims_length = FALSE,
                   retrieve = FALSE,
                   num_procs = 1,
+                  ObjectBigmemory = NULL,
                   silent = FALSE,
                   debug = FALSE) { #, config_file = NULL
   #dictionary_dim_names = ,
@@ -1360,6 +1380,19 @@ Start <- function(..., # dim = indices/selectors,
     stop("Parameter 'path_glob_permissive' must be of length 1.")
   }
 
+  # Check largest_dims_length
+  if (!is.numeric(largest_dims_length) && !is.logical(largest_dims_length)) {
+    stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
+  }
+  if (is.numeric(largest_dims_length)) {
+    if (any(largest_dims_length %% 1 != 0) || any(largest_dims_length < 0) || is.null(names(largest_dims_length))) {
+      stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
+    }
+  }
+  if (is.logical(largest_dims_length) && length(largest_dims_length) != 1) {
+    stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
+  }
+
   # Check retrieve
   if (!is.logical(retrieve)) {
     stop("Parameter 'retrieve' must be TRUE or FALSE.")
@@ -1768,11 +1801,23 @@ Start <- function(..., # dim = indices/selectors,
                                                  dim = files_to_load)
         names(dim(sub_array_of_not_found_files)) <- known_dims
         j <- 1
-        selector_indices_save <- vector('list', prod(files_to_load))
+        if (!exists('selector_indices_save')) {
+          selector_indices_save <- vector('list', length = length(dat))
+        }
+        if (!exists('selectors_total_list')) {
+          selectors_total_list <- vector('list', length = length(dat))
+        }
+        selector_indices_save[[i]] <- vector('list', length = prod(files_to_load))
+        selectors_total_list[[i]] <- vector('list', length = prod(files_to_load))
+
         while (j <= prod(files_to_load)) {
           selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ]
           names(selector_indices) <- known_dims
-          selector_indices_save[[j]] <- selector_indices
+
+          tmp <- selector_indices
+          tmp[which(known_dims == found_pattern_dim)] <- i
+          selector_indices_save[[i]][[j]] <- tmp
+
           selectors <- sapply(1:length(known_dims),
                               function (x) {
                                 vector_to_pick <- 1
@@ -1782,6 +1827,9 @@ Start <- function(..., # dim = indices/selectors,
                                 dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]]
                               })
           names(selectors) <- known_dims
+          selectors_total_list[[i]][[j]] <- selectors
+          names(selectors_total_list[[i]][[j]]) <- known_dims
+
           replace_values[known_dims] <- selectors
           if (!dataset_has_files[i]) {
             if (any(is.na(selectors))) {
@@ -2210,29 +2258,85 @@ Start <- function(..., # dim = indices/selectors,
     if (dataset_has_files[i]) {
       indices <- indices_of_first_files_with_data[[i]]
       if (!is.null(indices)) {
-        file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]])))
-        # The following 5 lines should go several lines below, but were moved
-        # here for better performance.
-        # If any of the dimensions comes without defining variable, then we read
-        # the data dimensions.
-        data_dims <- NULL
-        if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) {
-          file_to_open <- file_path
-          data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]],
-                                       lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1),
-                                       synonims)
+        if (is.numeric(largest_dims_length) || largest_dims_length == FALSE) {
+          # old code: use the 1st valid file to determine the dims
+          file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]])))
+          # The following 5 lines should go several lines below, but were moved
+          # here for better performance.
+          # If any of the dimensions comes without defining variable, then we read
+          # the data dimensions.
+          data_dims <- NULL
+          if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) {
+            file_to_open <- file_path
+            data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]],
+                                         lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1),
+                                         synonims)
+            # file_dim_reader returns dimension names as found in the file.
+            # Need to translate according to synonims:
+            names(data_dims) <- sapply(names(data_dims),
+                                       function(x) {
+                                         which_entry <- which(sapply(synonims, function(y) x %in% y))
+                                         if (length(which_entry) > 0) {
+                                           names(synonims)[which_entry]
+                                         } else {
+                                           x
+                                         }
+                                       })
+          }
+
+          if (is.numeric(largest_dims_length)) { # largest_dims_length is a named vector
+            # Check if the names fit the inner dimension names
+            if (!all(names(largest_dims_length) %in% names(data_dims))) {
+              #NOTE: stop or warning?
+              stop("Parameter 'largest_dims_length' has inconsistent names with inner dimensions.")
+            } else {
+              match_ind <- match(names(largest_dims_length), names(data_dims))
+              data_dims[match_ind] <- largest_dims_length
+            }
+          }
+
+        } else { # largest_dims_length == TRUE
+          # Open and get all the dims from all the files
+          data_dims_all_files <- vector('list', length = length(selectors_total_list[[i]]))
+
+          for (selectors_kk in 1:length(data_dims_all_files)) {
+            file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[i]][[selectors_kk]])))
+            file_to_open <- file_path
+
+            data_dims_all_files[[selectors_kk]] <- try(file_dim_reader(file_to_open, NULL, selectors_total_list[[i]][[selectors_kk]],
+                                                                       lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1),
+                                                                       synonims), silent = TRUE)
+
+          }
+
+          # Remove the missing files (i.e., failed try above)
+          if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) {
+            tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error')
+            data_dims_all_files <- data_dims_all_files[-tmp]
+          }
+
+          # Find the longest dimensions from all the files
+          largest_data_dims <- rep(0, length(data_dims_all_files[[1]]))
+          for (kk in 1:length(data_dims_all_files[[1]])) {
+            largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk))
+          }
+          names(largest_data_dims) <- names(data_dims_all_files[[1]])
+
          # file_dim_reader returns dimension names as found in the file.
          # Need to translate according to synonims:
-          names(data_dims) <- sapply(names(data_dims),
-                                     function(x) {
-                                       which_entry <- which(sapply(synonims, function(y) x %in% y))
-                                       if (length(which_entry) > 0) {
-                                         names(synonims)[which_entry]
-                                       } else {
-                                         x
-                                       }
-                                     })
-        }
+          names(largest_data_dims) <- sapply(names(largest_data_dims),
+                                             function(x) {
+                                               which_entry <- which(sapply(synonims, function(y) x %in% y))
+                                               if (length(which_entry) > 0) {
+                                                 names(synonims)[which_entry]
+                                               } else {
+                                                 x
+                                               }
+                                             })
+
+          # replace data_dims with largest_data_dims
+          data_dims <- largest_data_dims
+        } # end of if (largest_dims_length == TRUE)
+
         # Transform the variables if needed and keep them apart.
         if (!is.null(transform) && (length(transform_vars) > 0)) {
           if (!all(transform_vars %in% c(names(picked_vars[[i]]), names(picked_common_vars)))) {
@@ -2250,7 +2354,7 @@ Start <- function(..., # dim = indices/selectors,
             if (length(which_are_ordered) > 0) {
               tmp <- which(!is.na(match(names(picked_vars_ordered[[i]]), names(which_are_ordered))))
               new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[[i]][tmp]
-
+
             }
             vars_to_transform <- c(vars_to_transform, new_vars_to_transform)
           }
@@ -3715,8 +3819,24 @@ Start <- function(..., # dim = indices/selectors,
     # TODO: try performance of storing all in cols instead of rows
     # Create the shared memory array, and a pointer to it, to be sent
     # to the work pieces.
-    data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1)
+    if (is.null(ObjectBigmemory)) {
+      data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1)
+    } else {
+      data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1,
+                                          backingfile = ObjectBigmemory)
+    }
     shared_matrix_pointer <- bigmemory::describe(data_array)
+    if (is.null(ObjectBigmemory)) {
+      name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$sharedName
+    } else {
+      name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$filename
+    }
+
+    #warning(paste("SharedName:", attr(shared_matrix_pointer, 'description')$sharedName))
+    #warning(paste("Filename:", attr(shared_matrix_pointer, 'description')$filename))
+    #if (!is.null(ObjectBigmemory)) {
+    #  attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory
+    #}
     if (is.null(num_procs)) {
       num_procs <- future::availableCores()
     }
@@ -4121,9 +4241,19 @@ Start <- function(..., # dim = indices/selectors,
         loaded_metadata_count <- 1
         for (kk in 1:length(return_metadata)) {
           for (jj in 1:length(return_metadata[[kk]])) {
-            return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]]
-            names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]])
-            loaded_metadata_count <- loaded_metadata_count + 1
+
+            if (dataset_has_files[kk]) {
+              if (loaded_metadata_count %in% loaded_metadata_files) {
+                return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]]
+                names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]])
+              } else {
+                return_metadata[[kk]][jj] <- NULL
+              }
+              loaded_metadata_count <- loaded_metadata_count + 1
+            } else {
+              return_metadata[[kk]][jj] <- NULL
+            }
+
           }
         }
       }
@@ -4210,27 +4340,37 @@ Start <- function(..., # dim = indices/selectors,
         var_backup <- attr(data_array, 'Variables')
         for (kk in 1:length(var_backup)) {
           sublist_names <- lapply(var_backup, names)[[kk]]
-          for (jj in 1:length(sublist_names)) {
-            picked_vars[[kk]][[sublist_names[jj]]] <- var_backup[[kk]][[jj]]
+          if (!is.null(sublist_names)) {
+            for (jj in 1:length(sublist_names)) {
+              picked_vars[[kk]][[sublist_names[jj]]] <- var_backup[[kk]][[jj]]
+            }
           }
         }
         var_backup <- NULL
       } else {  #(1)
        var_backup <- attr(data_array, 'Variables')
-        var_backup_names <- unlist(lapply(var_backup, names))
-        new_list <- vector('list', length = length(var_backup_names))
+        len <- unlist(lapply(var_backup, length))
+        len <- sum(len) + length(which(len == 0))  #0 means NULL
+        name_list <- lapply(var_backup, names)
+        new_list <- vector('list', length = len)
        count <- 1
+
        for (kk in 1:length(var_backup)) {
-          for (jj in 1:length(var_backup[[kk]])) {
-            new_list[[count]] <- var_backup[[kk]][[jj]]
+          if (length(var_backup[[kk]]) == 0) {  #NULL
            count <- count + 1
+          } else {
+            for (jj in 1:length(var_backup[[kk]])) {
+              new_list[[count]] <- var_backup[[kk]][[jj]]
+              names(new_list)[count] <- name_list[[kk]][jj]
+              count <- count + 1
+            }
           }
         }
-        names(new_list) <- var_backup_names
         var_backup <- new_list
       }
     }
+
     attr(data_array, 'Variables') <- NULL
     attributes(data_array) <- c(attributes(data_array),
                                 list(Variables = c(list(common = c(picked_common_vars, var_backup)),
@@ -4238,7 +4378,8 @@ Start <- function(..., # dim = indices/selectors,
                                      Files = array_of_files_to_load,
                                      NotFoundFiles = array_of_not_found_files,
                                      FileSelectors = file_selectors,
-                                     PatternDim = found_pattern_dim)
+                                     PatternDim = found_pattern_dim,
+                                     ObjectBigmemory = name_bigmemory_obj)  #attr(shared_matrix_pointer, 'description')$sharedName)
                                )
     attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class'))
     data_array
@@ -4281,6 +4422,7 @@ Start <- function(..., # dim = indices/selectors,
                           file_data_reader, synonims, transform, transform_params,
                           silent = FALSE, debug = FALSE) {
+  #warning(attr(shared_matrix_pointer, 'description')$sharedName)
   #  suppressPackageStartupMessages({library(bigmemory)})
   ### TODO: Specify dependencies as parameter
   #  suppressPackageStartupMessages({library(ncdf4)})
diff --git a/R/Utils.R b/R/Utils.R
index a4255c1790057b2328dba51c076842064514883c..3a6f6ea59feca0038389a4627aa9850bba170e4e 100644
--- a/R/Utils.R
+++ b/R/Utils.R
@@ -203,8 +203,12 @@
       if (!is.null(left)) {
         left_match <- regexpr(paste0(left, replace_values[[tag]], right_known), actual_path)
         match_len <- attr(left_match, 'match.length')
-        left_match_limits <- c(left_match + match_len - 1 - nchar(clean(right_known)) - nchar(replace_values[[tag]]) + 1,
-                               left_match + match_len - 1 - nchar(clean(right_known)))
+
+        right_known_nchar <- nchar(clean(right_known))
+        if (identical(right_known_nchar, integer(0))) right_known_nchar <- 0
+        left_match_limits <- c(left_match + match_len - 1 - right_known_nchar - nchar(replace_values[[tag]]) + 1,
+                               left_match + match_len - 1 - right_known_nchar)
+
         if (!(left_match < 1)) {
           match_limits <- left_match_limits
         }
@@ -213,8 +217,11 @@
       if (!is.null(right)) {
         right_match <- regexpr(paste0(left_known, replace_values[[tag]], right), actual_path)
         match_len <- attr(right_match, 'match.length')
-        right_match_limits <- c(right_match + nchar(clean(left_known)),
-                                right_match + nchar(clean(left_known)) + nchar(replace_values[[tag]]) - 1)
+
+        left_known_nchar <- nchar(clean(left_known))
+        if (identical(left_known_nchar, integer(0))) left_known_nchar <- 0
+        right_match_limits <- c(right_match + left_known_nchar,
+                                right_match + left_known_nchar + nchar(replace_values[[tag]]) - 1)
         if (is.null(match_limits) && !(right_match < 1)) {
           match_limits <- right_match_limits
         }
diff --git a/README.md b/README.md
index ab9968e7938b1c75f6d79beda02b84bf0452bb46..2bf1c579ab5b08d0c585f94eaa102ee53944bd70 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,13 @@
 
 startR is an R package developed at the Barcelona Supercomputing Center which implements the MapReduce paradigm (a.k.a. domain decomposition) on HPCs in a way transparent to the user and specially oriented to complex multidimensional datasets.
+Useful information for learning startR:
+- [**practical guide**](inst/doc/practical_guide.md)
+- [**use cases**](inst/doc/usecase.md)
+- [**FAQs**](inst/doc/faq.md)
+- [**tutorial**](inst/doc/tutorial/startR_tutorial_20200902.pdf)
+
+
 Following the startR framework, the user can represent in a one-page startR script all the information that defines a use case, including:
 - the involved (multidimensional) data sources and the distribution of the data files
 - the workflow of operations to be applied, over which data sources and dimensions
diff --git a/inst/chunking/Chunk.ecf b/inst/chunking/Chunk.ecf
index 96b7645e54fadeca3fadcb1ef56c90869313e834..60bd051a657d28ef957876c28c8ab2a45686f579 100644
--- a/inst/chunking/Chunk.ecf
+++ b/inst/chunking/Chunk.ecf
@@ -15,4 +15,7 @@ task_path=%REMOTE_ECF_HOME%/%ECF_NAME%
 Rscript load_process_save_chunk.R --args $task_path insert_indices
 
 #include_transfer_back_and_rm
+#clean temporary folder
+#bash %REMOTE_ECF_HOME%clean_devshm.sh $task_path
+
 %include "./tail.h"
diff --git a/inst/chunking/clean_devshm.sh b/inst/chunking/clean_devshm.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a2f317ba5a4171da4c37936832fcbe092438d4e8
--- /dev/null
+++ b/inst/chunking/clean_devshm.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Take the filename
+path=$1
+name=.filename.txt
+remote=$1$name
+echo "$remote"
+while IFS= read -r line
+do
+  echo "$line"
+  parti='/dev/shm/'
+  filename=$parti$line
+  # Check whether the file exists or not
+  echo "$filename"
+  if [[ -f $filename ]]
+  then
+    # Remove the file
+    rm "$filename"
+  else
+    echo File does not exist
+  fi
+done < "$remote"
diff --git a/inst/chunking/load_process_save_chunk.R b/inst/chunking/load_process_save_chunk.R
index e68b8699ad8309d938431d5d82f5fc6f30fdbb74..55e47f07716f476dcaf86b0b036099a9d9238906 100644
--- a/inst/chunking/load_process_save_chunk.R
+++ b/inst/chunking/load_process_save_chunk.R
@@ -70,7 +70,34 @@ for (input in 1:length(data)) {
   if (!('num_procs' %in% names(start_call))) {
     start_call[['num_procs']] <- threads_load
   }
-  data[[input]] <- eval(start_call)
+  # Create a name for the temporary file using the chunk numbers:
+  nameMemoryObject <- gsub("[^0-9.-]", "_", gsub(out_dir, "", task_path))
+  nameMemoryObject <- substr(nameMemoryObject, 2, nchar(nameMemoryObject))
+  removeRS <- function(str) paste(rle(strsplit(str, "")[[1]])$values, collapse = "")
+  nameMemoryObject <- removeRS(nameMemoryObject)
+  start_call[['ObjectBigmemory']] <- nameMemoryObject
+  data[[input]] <- tryCatch(eval(start_call),
+                            # Handler when an error occurs:
+                            error = function(e) {
+                              message("The data cannot be loaded.")
+                              message("See the original error message:")
+                              message(e)
+                              message("\n Current files in /dev/shm:")
+                              noreturn <- lapply(list.files("/dev/shm"), function (x) {
+                                            info <- file.info(paste0("/dev/shm/", x))
+                                            message(paste("file:", rownames(info),
+                                                          "size:", info$size,
+                                                          "uname:", info$uname))})
+                              message(getwd())
+                              file.remove(nameMemoryObject)
+                              file.remove(paste0(nameMemoryObject, ".desc"))
+                              message(paste("Files", nameMemoryObject, "have been removed."))
+                              stop("The job has failed while loading data. See the original error reported above.")
+                            })
+  warning(attributes(data[[input]])$ObjectBigmemory)
+  #write.table(attributes(data[[input]])$ObjectBigmemory,
+  #            file = paste0(task_path, '.filename.txt'),
+  #            col.names = FALSE, row.names = FALSE, quote = FALSE)
 }
 t_end_load <- Sys.time()
 t_load <- as.numeric(difftime(t_end_load, t_begin_load, units = 'secs'))
diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index 53320d55adda9a365f8901af068e5eefd53ca002..947ad9f5c5591aa72b1db79d22d0dc0d884aea8e 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -23,7 +23,8 @@ This document intends to be the first reference for any doubts that you may have
   17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split_multiselected_dims-in-start)
   18. [Use glob expression '*' to define the file path](#18-use-glob-expression-to-define-the-file-path)
   19. [Get metadata when the first file does not exist](#19-get-metadata-when-the-first-file-does-not-exist)
-  20. [Use 'metadata_dims' to retrieve variable metadata](#19-use-metadata_dims-to-retrieve-variable-metadata)
+  20. [Use 'metadata_dims' to retrieve variable metadata](#20-use-metadata_dims-to-retrieve-variable-metadata)
+  21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files)
 
 2. **Something goes wrong...**
 
@@ -804,6 +805,26 @@ only the first data set will have the variable's metadata.
 
 Please find the relevant use cases in [ex1_10](inst/doc/usecase/ex1_10_metadata_dims.R).
 
+### 21. Retrieve the complete data when the dimension length varies among files
+By default, Start() uses the first valid file of each data set to determine the dimensions
+of the returned data array. However, the inner dimension length among the files may not be the
+same. For example, the member number in one experiment is 25 in the early years while
+it increases to 51 later. If you assign `member = 'all'` in the Start() call, the returned member
+dimension length will be 25 only.
+
+The parameter 'largest_dims_length' is for this case. Its default value is `FALSE`, meaning
+that Start() only uses the first valid file to decide the dimensions. If it is changed to
+`TRUE`, Start() will examine all the required files to find the largest length for all the inner
+dimensions. It is time- and resource-consuming, but useful when you are not sure what the dimensions
+in all the files look like.
+
+If you know the expected dimension length, it is recommended to assign a named integer vector
+to 'largest_dims_length', for example, `largest_dims_length = c(member = 51)`. Start() will
+adopt the provided lengths and use the first valid file to decide the rest of the dimensions.
+In this way, the efficiency can be similar to `largest_dims_length = FALSE`.
+
+
+
 # Something goes wrong...
 
 ### 1. No space left on device
diff --git a/inst/doc/tutorial/crps.png b/inst/doc/tutorial/crps.png
new file mode 100644
index 0000000000000000000000000000000000000000..5aa35413982a8e13e9bd6093f075023e8dcde434
Binary files /dev/null and b/inst/doc/tutorial/crps.png differ
diff --git a/inst/doc/tutorial/hands-on_part1.md b/inst/doc/tutorial/hands-on_part1.md
new file mode 100644
index 0000000000000000000000000000000000000000..a704bdc8250b0016b38590c9f4f07caa9b30f7bf
--- /dev/null
+++ b/inst/doc/tutorial/hands-on_part1.md
@@ -0,0 +1,315 @@
+# Hands-on Part 1
+## Goal
+Know how to use different parameters of Start() to generate the desired output array.
+
+## 1. Commonly used parameters
+Several parameters of Start() are used in almost every script. They help you
+set up common rules and avoid mistakes.
+
+- **retrieve**
+*Document definition: A logical value indicating whether to retrieve the data
+ defined in the Start() call or to explore only its dimension lengths
+ and names, and the values for the file and inner dimensions. The default
+ value is FALSE.*
+
+If you want to retrieve data to your workstation only, rather than using the whole
+startR workflow, set `retrieve = TRUE`. However, it is always recommended to use
+`retrieve = FALSE` first to check the data size, so you can avoid crashing the
+workstation accidentally.
+
+- **synonims**
+*Document definition: A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter is in order for Start() to
+properly identify the dimensions or variables with different names.*
+
+The most common synonims are lon/longitude and lat/latitude. You can also use this
+parameter to rename a dimension. For example, if the variable in the netCDF file has
+the time dimension named 'time' while you want to have an 'ftime' dimension instead,
+you can use 'ftime' as the inner dimension and specify `synonims = list(ftime = c('time'))`.
+
+
+- **return_vars**
+*Document definition: A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimension which to retrieve each
+variable for, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.*
+
+The commonly used ones are `time = 'sdate'`, `longitude = NULL`, and `latitude = NULL`.
+See FAQ [How-to-16](https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/faq.md#16-use-parameter-return_vars-in-start) for more details.
+
+
+```r
+library(startR)
+
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101', '20170201')
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                lat = 'all',
+                lon = 'all',
+                synonims = list(lat = c('lat', 'latitude'),
+                                lon = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = NULL,
+                                   latitude = NULL),
+                retrieve = FALSE
+               )
+```
+
+1. Run the above script. What is the data size? Check the dimensions of the returned object using `attr(data, 'Dimensions')`.
+
+2. Comment out the parameter 'synonims'. Run the script and see what will happen.
+Check the original dimension name of 'tas' in the netCDF file by `ncdump -h /esarchive/exp/ecmwf/system5_m1/monthly_mean/tas_f6h/tas_20170101.nc |less`.
+
+3. Recover the script. How can you rename the inner dimension 'ensemble' to 'ens'?
+
+4. Use `str(data)` to check the data structure. Where can you find the information
+of 'time', 'longitude', and 'latitude'? Can you retrieve the 'time' array?
+
+5. Comment out the parameter 'return_vars'. Can you still find the information of 'time', 'longitude', and 'latitude'?
+
+6. Change 'return_vars' to `return_vars = list(time = 'sdate', longitude = 'dat', latitude = 'dat')` and check the data structure again. What is different from the original script?
+When will this change be beneficial? Try the following script and check again.
+
+```r
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  repos2 <- "/esarchive/exp/ecmwf/system4_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101', '20170201')
+
+  data <- Start(dat = list(list(name = 'system5_m1', path = repos),
+                           list(name = 'system4_m1', path = repos2)),
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                lat = 'all',
+                lon = 'all',
+                synonims = list(lat = c('lat', 'latitude'),
+                                lon = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = 'dat',
+                                   latitude = 'dat'),
+                retrieve = FALSE
+               )
+```
+
+
+## 2. Transformation/interpolation parameters
+You can easily interpolate the data within Start(). The default transformation function
+is startR::CDORemapper, a wrapper function of s2dverification::CDORemap that uses cdo inside.
+
+The current startR version only allows the transformation with the longitude and latitude
+assigned by values(). The other two ways, 'all' and indices(), are not supported for now.
+
+We are going to learn how to:
+(1) assign longitude and latitude by values(),
+(2) use the parameter '_reorder', which is highly recommended along with (1), and
+(3) use transform-related parameters in the following practice.
+
+```r
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101')
+  lon.min <- 0
+  lon.max <- 359.9
+  lat.min <- -90
+  lat.max <- 90
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                latitude = values(list(lat.min, lat.max)),
+                latitude_reorder = Sort(),
+                longitude = values(list(lon.min, lon.max)),
+                longitude_reorder = CircularSort(0, 360),
+                synonims = list(latitude = c('lat', 'latitude'),
+                                longitude = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = 'dat',
+                                   latitude = 'dat'),
+                retrieve = FALSE
+               )
+```
+
+1. Run the above script. Check the dimensions, the warning messages, and the values of
+longitude and latitude. What is the range of longitude and latitude?
+
+2. Why is 'lon.max' 359.9 and not 360? What will happen if it is 360?
+
+3. Now, change
+   - `latitude_reorder = Sort()` to `latitude_reorder = Sort(decreasing = TRUE)`
+   - `longitude_reorder = CircularSort(0, 360)` to `longitude_reorder = CircularSort(-180, 180)`
+   - Set `lon.min <- -180` and `lon.max <- 179.9`
+
+Check the values of longitude and latitude again. Are they different from the original script?
+
+
+
+
+Now, let us add in the transformation parameters.
+
+```r
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101')
+  lon.min <- 0
+  lon.max <- 359.9
+  lat.min <- -90
+  lat.max <- 90
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                latitude = values(list(lat.min, lat.max)),
+                latitude_reorder = Sort(),
+                longitude = values(list(lon.min, lon.max)),
+                longitude_reorder = CircularSort(0, 360),
+                ## transformation
+                transform = CDORemapper,
+                transform_extra_cells = 2,
+                transform_params = list(grid = 'r360x181',
+                                        method = 'conservative',
+                                        crop = c(lon.min, lon.max,
+                                                 lat.min, lat.max)),
+                transform_vars = c('latitude', 'longitude'),
+                apply_indices_after_transform = FALSE,
+                ##
+                synonims = list(latitude = c('lat', 'latitude'),
+                                longitude = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = NULL,
+                                   latitude = NULL),
+                retrieve = FALSE
+               )
+```
+4. Run the above script. Check the dimensions and the values of longitude and latitude.
+
+
+## 3. Reshape parameters
+Start() provides three parameters for reshaping the output dimensions:
+'merge_across_dims' and 'merge_across_dims_narm' for merging the dimensions,
+and 'split_multiselected_dims' for splitting one dimension into multiple ones.
+
+- merge:
+The parameter 'xxx_across = yyy' indicates that the inner dimension 'xxx' is continuous along the file dimension 'yyy'.
+A common example is 'time_across = chunk', when the experiment runs through years and is saved as several chunk files.
+If the dependent relationship is indicated, you can specify the 'xxx' selector along 'yyy', not confined to one single chunk.
+
+Sometimes, it is more convenient to connect all 'xxx' together as one dimension instead of keeping the two dimensions 'xxx' and 'yyy'.
+To achieve this, specify `merge_across_dims = TRUE` to merge all 'xxx' together as one dimension; the 'yyy' dimension will disappear.
+
+One thing to keep in mind is that the length of the chosen 'xxx' at each 'yyy' should be the same, or Start() will fill NAs into the shorter 'xxx'. So, even if 'xxx' is continuous, there may be some NAs in the final 'xxx' dimension.
+If the NAs are not wanted, you can use `merge_across_dims_narm = TRUE` to remove them.
+
+- split:
+`split_multiselected_dims = TRUE` can split one dimension into n dimensions if this dimension is specified by an n-dimensional array.
+It is useful when using an experimental data attribute as the selector for the corresponding observational data, or vice versa.
+
+```r
+# An El Niño event (Apr 1957 - Mar 1959)
+
+repos <- paste0('/esarchive/exp/ecearth/a1tr/cmorfiles/CMIP/EC-Earth-Consortium/EC-Earth3/',
+                'historical/r24i1p1f1/Omon/$var$/gr/v20190312/',
+                '$var$_Omon_EC-Earth3_historical_r24i1p1f1_gr_$chunk$.nc')
+
+data <- Start(dat = repos,
+              var = 'tos',
+              chunk = c('195701-195712', '195801-195812', '195901-195912'),
+              time = indices(4:27), # Apr 1957 to Mar 1959
+              time_across = 'chunk',
+              merge_across_dims = FALSE,
+              lat = values(list(-20, 20)),
+              lat_reorder = Sort(),
+              lon = values(list(120, 220)),
+              lon_reorder = CircularSort(0, 360),
+              synonims = list(lat = c('lat', 'latitude'),
+                              lon = c('lon', 'longitude')),
+              return_vars = list(time = 'chunk', lat = NULL, lon = NULL),
+              retrieve = FALSE)
+```
+
+1. Run the above script. Check the dimensions and dimension lengths.
+
+2. Comment out `time_across = 'chunk'`. What is the error message?
+
+3. Recover the script, and change 'merge_across_dims' to TRUE.
+What are the dimensions and dimension lengths now? Retrieve the data, and type `data[1, 1, , 10, 1]`. What is the NA pattern?
+
+4. The time length we want is actually 24 months, from April 1957 to March 1959. To remove those additional NAs, add `merge_across_dims_narm = TRUE` to the above code.
+What are the dimensions and dimension lengths now? Check `data[1, 1, , 10, 1]` again. Is there any NA?
+
+
+In the following script, 'merge' and 'split' parameters are used. We are going to load the experimental data first, then use its attribute to
+load the corresponding observational data.
+
+```r
+# exp
+repos_exp <- paste0('/esarchive/exp/ecearth/a1tr/cmorfiles/CMIP/EC-Earth-Consortium/EC-Earth3/',
+                    'historical/r24i1p1f1/Amon/$var$/gr/v20190312/',
+                    '$var$_Amon_EC-Earth3_historical_r24i1p1f1_gr_$sdate$01-$sdate$12.nc')
+
+exp <- Start(dat = repos_exp,
+             var = 'tas',
+             sdate = as.character(c(2005:2008)),
+             time = indices(1:3),
+             lat = 'all',
+             lon = 'all',
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'sdate'),
+             retrieve = FALSE)
+
+lats <- attr(exp, 'Variables')$common$lat
+lons <- attr(exp, 'Variables')$common$lon
+dates <- attr(exp, 'Variables')$common$time
+
+# obs
+repos_obs <- '/esarchive/recon/ecmwf/erainterim/monthly_mean/$var$_f6h/$var$_$date$.nc'
+
+obs <- Start(dat = repos_obs,
+             var = 'tas',
+             date = unique(format(dates, '%Y%m')),
+             time = values(dates),
+             time_across = 'date',
+             merge_across_dims = TRUE,
+             split_multiselected_dims = TRUE,
+             lat = values(lats),
+             lon = values(lons),
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'date'),
+             retrieve = FALSE)
+
+```
+
+1. Check the dimensions of 'dates', the time attribute of the experimental data. How many dimensions does it have?
+
+2. We want to get the same dimension structure for the observational data as the experimental one.
+However, the netCDF files of the two are different. Each experimental file has 12 months while each observational file has 1 month.
+To generate the same dimensions, we need to **merge 'time' across 'date' first, then split 'time' into 'sdate' and 'time'.**
+Check the two output dimensions of the above script. Are they the same?
+
+3. Comment out `split_multiselected_dims = TRUE` in obs. What is the dimension now?
+
+4. What will happen if `split_multiselected_dims = TRUE` but `merge_across_dims = FALSE`? Does it make sense?
+
+
+## 4. Others
+The parameters we did not mention in this hands-on (but you might use): 'pattern_dims', 'metadata_dims', 'path_glob_permissive'.
+You can check the startR documentation or go to the FAQs and use cases on the startR GitLab to learn more.
diff --git a/inst/doc/tutorial/hands-on_part1_ans.md b/inst/doc/tutorial/hands-on_part1_ans.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d769238f2897eb8f9052ca4ca3d6fa981be1c76
--- /dev/null
+++ b/inst/doc/tutorial/hands-on_part1_ans.md
@@ -0,0 +1,480 @@
+# Hands-on Part 1
+## Goal
+Know how to use different parameters of Start() to generate the desired output array.
+
+## 1. Commonly used parameters
+Several parameters of Start() are used in almost every script. They help you
+set up common rules and avoid mistakes.
+
+- **retrieve**
+*Document definition: A logical value indicating whether to retrieve the data
+ defined in the Start() call or to explore only its dimension lengths
+ and names, and the values for the file and inner dimensions. The default
+ value is FALSE.*
+
+If you want to retrieve data to your workstation only, rather than using the whole
+startR workflow, set `retrieve = TRUE`. However, it is always recommended to use
+`retrieve = FALSE` first to check the data size, so you can avoid crashing the
+workstation accidentally.
+
+- **synonims**
+*Document definition: A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter is in order for Start() to
+properly identify the dimensions or variables with different names.*
+
+The most common synonims are lon/longitude and lat/latitude. You can also use this
+parameter to rename a dimension. For example, if the variable in the netCDF file has
+the time dimension named 'time' while you want to have an 'ftime' dimension instead,
+you can use 'ftime' as the inner dimension and specify `synonims = list(ftime = c('time'))`.
+
+
+- **return_vars**
+*Document definition: A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimension which to retrieve each
+variable for, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.*
+
+The commonly used ones are `time = 'sdate'`, `longitude = NULL`, and `latitude = NULL`.
+See FAQ [How-to-16](https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/faq.md#16-use-parameter-return_vars-in-start) for more details.
+
+
+```r
+library(startR)
+
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101', '20170201')
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                lat = 'all',
+                lon = 'all',
+                synonims = list(lat = c('lat', 'latitude'),
+                                lon = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = NULL,
+                                   latitude = NULL),
+                retrieve = FALSE
+               )
+```
+
+1. Run the above script. What is the data size? Check the dimensions of the returned object using `attr(data, 'Dimensions')`.
+```r
+attr(data, 'Dimensions')
+  dat   var sdate ensemble  time   lat   lon
+    1     1     2       50     7   640  1296
+```
+
+2. Comment out the parameter 'synonims'. Run the script and see what will happen.
+Check the original dimension name of 'tas' in the netCDF file by `ncdump -h /esarchive/exp/ecmwf/system5_m1/monthly_mean/tas_f6h/tas_20170101.nc |less`.
+```r
+Error in Start(dat = repos, var = var, sdate = sdate, ensemble = indices(1:50),  :
+  Could not find the dimension 'lat' in the file. Either change the dimension name in your request, adjust the parameter 'dim_names_in_files' or fix the dimension name in the file.
+/esarchive/exp/ecmwf/system5_m1/monthly_mean/tas_f6h/tas_20170101.nc
+```
+
+3. Recover the script. How can you rename the inner dimension 'ensemble' to 'ens'?
+```r
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ens = indices(1:50),
+                time = 'all',
+                lat = 'all',
+                lon = 'all',
+                synonims = list(lat = c('lat', 'latitude'),
+                                lon = c('lon', 'longitude'),
+                                ens = c('ens', 'ensemble')),
+                return_vars = list(time = 'sdate',
+                                   longitude = NULL,
+                                   latitude = NULL),
+                retrieve = FALSE
+               )
+```
+
+4. Use `str(data)` to check the data structure.
+Where can you find the information of 'time', 'longitude', and 'latitude'? Can you retrieve the 'time' array?
+```r
+time <- attr(data, 'Variables')$common$time
+longitude <- attr(data, 'Variables')$common$longitude
+latitude <- attr(data, 'Variables')$common$latitude
+```
+
+5. Comment out the parameter 'return_vars'. Can you still find the information of 'time', 'longitude', and 'latitude'?
+No. There is no information under $common or $dat1.
+
+6. Change 'return_vars' to `return_vars = list(time = 'sdate', longitude = 'dat', latitude = 'dat')` and check the data structure again. What is different from the original script? When will this change be beneficial?
+The longitude and latitude attributes go to $dat1. The values are the same.
+```r
+longitude <- attr(data, 'Variables')$dat1$longitude
+latitude <- attr(data, 'Variables')$dat1$latitude
+```
+It has no advantage over the original script (i.e., using NULL) because here we only have one 'dat'.
+
+7. Following the previous question, try the script below and check again. Does it make more sense to use `return_vars = list(longitude = 'dat', latitude = 'dat')` here?
+```r
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  repos2 <- "/esarchive/exp/ecmwf/system4_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101', '20170201')
+
+  data <- Start(dat = list(list(name = 'system5_m1', path = repos),
+                           list(name = 'system4_m1', path = repos2)),
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                lat = 'all',
+                lon = 'all',
+                synonims = list(lat = c('lat', 'latitude'),
+                                lon = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = 'dat',
+                                   latitude = 'dat'),
+                retrieve = FALSE
+               )
+```
+Using 'dat' makes a difference now because we have two 'dat' here and they have different longitude and latitude. You can find the respective longitude/latitude under $system5_m1 and $system4_m1.
+If we still use 'NULL', the longitude/latitude will be listed under $common, and only the values of the first 'dat' (i.e., system5_m1) will be recorded.
+```r
+# system5_m1
+longitude_system5 <- attr(data, 'Variables')$system5_m1$longitude
+latitude_system5 <- attr(data, 'Variables')$system5_m1$latitude
+length(longitude_system5)
+[1] 1296
+length(latitude_system5)
+[1] 640
+
+# system4_m1
+longitude_system4 <- attr(data, 'Variables')$system4_m1$longitude
+latitude_system4 <- attr(data, 'Variables')$system4_m1$latitude
+length(longitude_system4)
+[1] 512
+length(latitude_system4)
+[1] 256
+
+```
+
+## 2. Transformation/interpolation parameters
+You can easily interpolate the data within Start(). The default transformation function
+is startR::CDORemapper, a wrapper function of s2dverification::CDORemap that uses cdo inside.
+
+The current startR version only allows the transformation with the longitude and latitude
+assigned by values(). The other two ways, 'all' and indices(), are not supported for now.
+
+We are going to learn how to:
+(1) assign longitude and latitude by values(),
+(2) use the parameter '_reorder', which is highly recommended along with (1), and
+(3) use transform-related parameters in the following practice.
+
+```r
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101')
+  lon.min <- 0
+  lon.max <- 359.9
+  lat.min <- -90
+  lat.max <- 90
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                latitude = values(list(lat.min, lat.max)),
+                latitude_reorder = Sort(),
+                longitude = values(list(lon.min, lon.max)),
+                longitude_reorder = CircularSort(0, 360),
+                synonims = list(latitude = c('lat', 'latitude'),
+                                longitude = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = 'dat',
+                                   latitude = 'dat'),
+                retrieve = FALSE
+               )
+```
+
+1. Run the above script. Check the dimensions, the warning messages, and the values of
+longitude and latitude. What is the range of longitude and latitude?
+```r
+attr(data, 'Dimensions')
+  dat   var sdate ensemble  time latitude longitude
+    1     1     1       50     7      640      1296
+
+longitude <- attr(data, 'Variables')$dat1$longitude
+range(longitude)
+[1]   0.0000 359.7222
+
+latitude <- attr(data, 'Variables')$dat1$latitude
+latitude[1]
+[1] -89.78488
+latitude[640]
+[1] 89.78488
+```
+
+2. Why is 'lon.max' 359.9 and not 360? What will happen if it is 360?
+If it is 360, you only get one point of longitude. Because of `longitude_reorder = CircularSort(0, 360)`, Start() regards 0 and 360 as the same point. Therefore, we need to set a number slightly smaller than 360 but bigger than the maximum value in the original data, so we can get the whole range.
+
+3. Now, change
+   - `latitude_reorder = Sort()` to `latitude_reorder = Sort(decreasing = TRUE)`
+   - `longitude_reorder = CircularSort(0, 360)` to `longitude_reorder = CircularSort(-180, 180)`
+   - Set `lon.min <- -180` and `lon.max <- 179.9`
+Check the values of longitude and latitude again. Are they different from the original script?
+```r
+attr(data, 'Dimensions')
+  dat   var sdate ensemble  time latitude longitude
+    1     1     1       50     7      640      1296
+
+longitude <- attr(data, 'Variables')$dat1$longitude
+range(longitude)
+[1] -180.0000  179.7222
+
+latitude <- attr(data, 'Variables')$dat1$latitude
+latitude[1]
+[1] 89.78488
+latitude[640]
+[1] -89.78488
+
+```
+The dimensions are the same. The longitude range changes to [-180, 180], and the latitude is sorted from 90 to -90.
+
+
+Now, let us add in the transformation parameters.
+
+```r
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101')
+  lon.min <- 0
+  lon.max <- 359.9
+  lat.min <- -90
+  lat.max <- 90
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                latitude = values(list(lat.min, lat.max)),
+                latitude_reorder = Sort(),
+                longitude = values(list(lon.min, lon.max)),
+                longitude_reorder = CircularSort(0, 360),
+                ## transformation
+                transform = CDORemapper,
+                transform_extra_cells = 2,
+                transform_params = list(grid = 'r360x181',
+                                        method = 'conservative',
+                                        crop = c(lon.min, lon.max,
+                                                 lat.min, lat.max)),
+                transform_vars = c('latitude', 'longitude'),
+                apply_indices_after_transform = FALSE,
+                ##
+                synonims = list(latitude = c('lat', 'latitude'),
+                                longitude = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = NULL,
+                                   latitude = NULL),
+                retrieve = FALSE
+               )
+```
+
+4. Run the above script. Check the dimensions and the values of longitude and latitude.
+```r
+attr(data, 'Dimensions')
+  dat   var sdate ensemble  time latitude longitude
+    1     1     1       50     7      181       360
+
+longitude <- attr(data, 'Variables')$common$longitude
+range(longitude)
+[1]   0 359
+
+latitude <- attr(data, 'Variables')$common$latitude
+range(latitude)
+[1] -90  90
+```
+
+
+## 3. Reshape parameters
+Start() provides three parameters for reshaping the output dimensions:
+'merge_across_dims' and 'merge_across_dims_narm' for merging the dimensions,
+and 'split_multiselected_dims' for splitting one dimension into multiple ones.
+
+- merge:
+The parameter 'xxx_across = yyy' indicates that the inner dimension 'xxx' is continuous along the file dimension 'yyy'.
+A common example is 'time_across = chunk', when the experiment runs through years and is saved as several chunk files.
+If the dependent relationship is indicated, you can specify the 'xxx' selector along 'yyy', not confined to one single chunk.
+
+Sometimes, it is more convenient to connect all 'xxx' together as one dimension instead of keeping the two dimensions 'xxx' and 'yyy'.
+To achieve this, specify `merge_across_dims = TRUE` to merge all 'xxx' together as one dimension; the 'yyy' dimension will disappear.
+
+One thing to keep in mind is that the length of the chosen 'xxx' at each 'yyy' should be the same, or Start() will fill NAs into the shorter 'xxx'. So, even if 'xxx' is continuous, there may be some NAs in the final 'xxx' dimension.
+If the NAs are not wanted, you can use 'merge_across_dims_narm = TRUE' to remove them.
+
+- split:
+`split_multiselected_dims = TRUE` can split one dimension into n dimensions if this dimension is specified by an n-dimensional array.
+It is useful when using an experimental data attribute as the selector for the corresponding observational data, or vice versa.
+
+```r
+# An El Niño event (Apr 1957 - Mar 1959)
+
+repos <- paste0('/esarchive/exp/ecearth/a1tr/cmorfiles/CMIP/EC-Earth-Consortium/EC-Earth3/',
+                'historical/r24i1p1f1/Omon/$var$/gr/v20190312/',
+                '$var$_Omon_EC-Earth3_historical_r24i1p1f1_gr_$chunk$.nc')
+
+data <- Start(dat = repos,
+              var = 'tos',
+              chunk = c('195701-195712', '195801-195812', '195901-195912'),
+              time = indices(4:27), # Apr 1957 to Mar 1959
+              time_across = 'chunk',
+              merge_across_dims = FALSE,
+              lat = values(list(-20, 20)),
+              lat_reorder = Sort(),
+              lon = values(list(120, 220)),
+              lon_reorder = CircularSort(0, 360),
+              synonims = list(lat = c('lat', 'latitude'),
+                              lon = c('lon', 'longitude')),
+              return_vars = list(time = 'chunk', lat = NULL, lon = NULL),
+              retrieve = FALSE)
+```
+
+1. Run the above script. Check the dimensions and dimension lengths.
+```r
+attr(data, 'Dimensions')
+  dat  var chunk  time   lat   lon
+    1    1     3    12    58   142
+```
+
+2. Comment out `time_across = 'chunk'`. What is the error message?
+```r
+Error in Start(dat = repos, var = "tos", chunk = c("195701-195712", "195801-195812",  :
+  Provided indices out of range for dimension 'time' for dataset 'dat1' (accepted range: 1 to 12).
+```
+
+3. Recover the script, and change 'merge_across_dims' to TRUE. What are the dimensions and dimension lengths now? Retrieve the data, and type `data[1, 1, , 10, 1]`. What is the NA pattern?
+```r
+attr(data, 'Dimensions')
+  dat  var time  lat  lon
+    1    1   36   58  142
+
+data[1, 1, , 10, 1]
+ [1] 301.5078 300.4698 299.5681 298.6258 298.5635 299.4374 301.2602 303.2887
+ [9] 304.1350       NA       NA       NA 302.9555 302.4082 302.5160 302.0977
+[17] 301.0403 299.6315 299.1873 299.2484 300.3640 302.4520 304.1331 304.7454
+[25] 303.7984 302.4535 301.9276       NA       NA       NA       NA       NA
+[33]       NA       NA       NA       NA
+
+```
+The NAs appear at [10:12], which are the three blank months in 1957 (Jan to Mar),
+and at [28:36], which are the nine blank months in 1959 (Apr to Dec).
+Start() puts the NAs at the end of each year (chunk).
+
+4. The time length we want is actually 24 months, from April 1957 to March 1959. To remove those additional NAs, add `merge_across_dims_narm = TRUE` to the above code.
+What are the dimensions and dimension lengths now? Check `data[1, 1, , 10, 1]` again. Is there any NA?
+```r
+dim(data)
+ dat  var time  lat  lon
+   1    1   24   58  142
+
+data[1, 1, , 10, 1]
+ [1] 301.5078 300.4698 299.5681 298.6258 298.5635 299.4374 301.2602 303.2887
+ [9] 304.1350 302.9555 302.4082 302.5160 302.0977 301.0403 299.6315 299.1873
+[17] 299.2484 300.3640 302.4520 304.1331 304.7454 303.7984 302.4535 301.9276
+```
+With `merge_across_dims_narm = TRUE`, the additional NAs are removed. The time length is 24 now.
+
+
+In the following script, 'merge' and 'split' parameters are used. We are going to load the experimental data first, then use its attribute to
+load the corresponding observational data.
+
+```r
+# exp
+repos_exp <- paste0('/esarchive/exp/ecearth/a1tr/cmorfiles/CMIP/EC-Earth-Consortium/EC-Earth3/',
+                    'historical/r24i1p1f1/Amon/$var$/gr/v20190312/',
+                    '$var$_Amon_EC-Earth3_historical_r24i1p1f1_gr_$sdate$01-$sdate$12.nc')
+
+exp <- Start(dat = repos_exp,
+             var = 'tas',
+             sdate = as.character(c(2005:2008)),
+             time = indices(1:3),
+             lat = 'all',
+             lon = 'all',
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'sdate'),
+             retrieve = FALSE)
+
+lats <- attr(exp, 'Variables')$common$lat
+lons <- attr(exp, 'Variables')$common$lon
+dates <- attr(exp, 'Variables')$common$time
+
+# obs
+repos_obs <- '/esarchive/recon/ecmwf/erainterim/monthly_mean/$var$_f6h/$var$_$date$.nc'
+
+obs <- Start(dat = repos_obs,
+             var = 'tas',
+             date = unique(format(dates, '%Y%m')),
+             time = values(dates),
+             time_across = 'date',
+             merge_across_dims = TRUE,
+             split_multiselected_dims = TRUE,
+             lat = values(lats),
+             lon = values(lons),
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'date'),
+             retrieve = FALSE)
+
+```
+
+1. Check the dimensions of 'dates', the time attribute of the experimental data. How many dimensions does it have?
+```r
+dim(dates)
+sdate  time
+    4     3
+```
+It has two dimensions.
+
+2. We want to get the same dimension structure for the observational data as the experimental one.
+However, the netCDF files of the two are different. Each experimental file has 12 months while each observational file has 1 month.
+To generate the same dimensions, we need to **merge 'time' across 'date' first, then split 'time' into 'sdate' and 'time'.**
+Check the two output dimensions of the above script. Are they the same?
+```r
+attr(exp, 'Dimensions')
+  dat   var sdate  time   lat   lon
+    1     1     4     3   256   512
+
+attr(obs, 'Dimensions')
+  dat   var sdate  time   lat   lon
+    1     1     4     3   256   512
+
+```
+
+3. Comment out `split_multiselected_dims = TRUE` in obs. What is the dimension now?
+```r
+attr(obs, 'Dimensions')
+  dat   var  time   lat   lon
+    1     1    12   256   512
+```
+
+4. What will happen if `split_multiselected_dims = TRUE` but `merge_across_dims = FALSE`? Does it make sense?
+```r
+attr(obs, 'Dimensions')
+  dat   var  date sdate  time   lat   lon
+    1     1    12     4     3   256   512
+```
+It does not make sense to have the 'sdate' and 'time' dimensions together with the 'date' dimension. They are repetitive. Always merge the dimensions before splitting!
+
+## 4. Others
+The parameters we did not mention in this hands-on (but you might use): 'pattern_dims', 'metadata_dims', 'path_glob_permissive'. You can check the startR documentation or go to the FAQs and use cases on the startR GitLab to learn more.
diff --git a/inst/doc/tutorial/hands-on_part2.md b/inst/doc/tutorial/hands-on_part2.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c5a9758d6fcf044f3f5afd50a8dfc5f1160b296
--- /dev/null
+++ b/inst/doc/tutorial/hands-on_part2.md
@@ -0,0 +1,145 @@
+# Hands-on Part 2
+## Goal
+Learn how to use the whole startR workflow to calculate skill scores.
+In this case, the ensemble-adjusted Continuous Ranked Probability Score (CRPS) and the root mean square error skill score (RMSSS) are used.
+To make the process faster, the required data size is small here, and we can run the execution on the workstation.
+
+## 1. Define data
+We want to verify the experimental data against the observational data with the skill scores.
+The first step is to use Start() to define the data.
+
+Hints:
+(1) The experimental data files are under `/esarchive/exp/ecmwf/system4_m1/monthly_mean/`, and the observational data files are under `/esarchive/recon/ecmwf/erainterim/monthly_mean/`.
+(2) The variable we want to use is 'tas' under 'tas_f6h/'.
+(3) The required time (sdate) period is November 1981 to 2016.
+(4) Read global data.
+(5) Take only the first time step.
+(6) To make the data smaller, we only take the first two ensemble members from the experimental data.
+(7) Because we are going to use the whole startR workflow, we only need to create a pointer to the data repository rather than retrieve the data to the workstation.
+
+```r
+library(startR)
+
+# exp data
+  repos_exp <- paste0('/esarchive/exp/ecmwf/system4_m1/monthly_mean/',
+                      ______)
+  sdates_exp <- ______
+
+  exp <- Start(dat = repos_exp,
+               var = ______,
+               sdate = sdates_exp,
+               time = ______,
+               ensemble = ______,
+               latitude = ______,
+               longitude = ______,
+               synonims = list(longitude = c('lon', 'longitude'),
+                               latitude = c('lat', 'latitude')),
+               retrieve = ______)
+
+# obs data
+  repos_obs <- paste0('/esarchive/recon/ecmwf/erainterim/monthly_mean/',
+                      ______)
+  sdates_obs <- ______
+
+  obs <- Start(dat = repos_obs,
+               var = ______,
+               sdate = sdates_obs,
+               time = ______,
+               latitude = ______,
+               longitude = ______,
+               synonims = list(longitude = c('lon', 'longitude'),
+                               latitude = c('lat', 'latitude')),
+               retrieve = ______)
+```
+
+Question:
+
+1. What are the dimensions of the two data sets?
+
+2. What is the size of the two data sets?
+
+
+## 2. Define operation and workflow
+It is recommended to define the function and write Step() together, because the latter helps you clarify the input and output dimensions of the function.
+
+In the self-defined function, we want to use the two functions, 'SpecsVerification::EnsCrps' and 's2dv:::.RMSSS', to calculate the skill scores.
+You can check the first function by typing `?SpecsVerification::EnsCrps`, and the second function on [s2dv GitLab](https://earth.bsc.es/gitlab/es/s2dv/-/blob/master/R/RMSSS.R).
+
+Hint:
+(1) The self-defined function is for the exp and obs data. Therefore, target_dims and output_dims in Step() should be a list with two vector elements.
+(2) Check the functions 'EnsCrps' and '.RMSSS'. What are the minimum required input dimensions of each function? These dimensions should be the target_dims in Step().
+(3) To return the two skill scores together, put them in a list with two elements.
+(4) What are the output dimensions of each function? These dimensions should be the output_dims in Step().
+(5) The first input of AddStep() should also be a list containing exp and obs.
+
+```r
+  # self-defined function
+  func <- function(x, y) {
+    # CRPS
+    crps <- mean(SpecsVerification::EnsCrps(x, y, R.new = Inf))
+
+    # RMSSS
+    # obs only has [sdate] now. Add one more dim for it.
+    y <- s2dv::InsertDim(y, posdim = 2, lendim = 1, name = 'ensemble')
+    rmsss <- mean(s2dv:::.RMSSS(x, y, time_dim = 'sdate', memb_dim = 'ensemble', pval = FALSE)$rmsss)
+
+    return(______)
+  }
+  step <- Step(func, target_dims = ______,
+               output_dims = ______)
+  wf <- AddStep(______, step)
+```
+
+Question:
+1. Which dimensions are used in the operation? Which dimensions are free?
+
+2. Can we use more dimensions as target_dims? What are the disadvantages?
+
+
+## 3. Execute locally
+To avoid potential technical problems with the connection and configuration, we choose to run the execution locally. Please remember that it is recommended to submit the job to an HPC if the operation is heavy.
+
+Hint:
+(1) Use the free dimensions to chunk.
+(2) It is safe to divide the data into pieces, each around 1/2 to 1/3 of the available RAM. In this case, the data size is only around 100 MB, so you can play with it without worrying about crashing your workstation.
+
+```r
+  res <- Compute(wf$crps,
+                 chunks = list(______))
+```
+
+## 4. Check the results
+
+1. Check the list structure. What is the length of the list?
+
+2. What do the dimensions look like?
+
+3. Plot the map
+Use `s2dv::PlotEquiMap` or other tools to plot the map.
+```r
+  library(s2dv)
+  # manually create the lon and lat vectors
+  lon <- seq(from = 0, to = 359, length.out = 512)
+  lat <- seq(from = 90, to = -90, length.out = 256)
+
+  # Set the color bar
+  brks_crps <- seq(min(res$crps, na.rm = TRUE), max(res$crps, na.rm = TRUE), length.out = 11)
+  brks_rmsss <- seq(min(res$rmsss, na.rm = TRUE), max(res$rmsss, na.rm = TRUE), length.out = 11)
+
+  # Plot crps
+  PlotEquiMap(______, lon, lat,
+              color_fun = clim.palette('yellowred'), brks = brks_crps,
+              filled.continents = FALSE, triangle_ends = c(TRUE, TRUE),
+              toptitle = 'ECMWF monthly mean tas CRPS 2012-2016', title_scale = 0.6)
+
+  # Plot rmsss
+  PlotEquiMap(______, lon, lat,
+              color_fun = clim.palette('yellowred'), brks = brks_rmsss,
+              filled.continents = FALSE, triangle_ends = c(TRUE, TRUE),
+              toptitle = 'ECMWF monthly mean tas RMSSS 2012-2016', title_scale = 0.6)
+
+```
+
+4. (bonus) We created the lat and lon vectors manually above. However, the actual values are slightly different.
+Can you get the lon and lat vectors from the data attributes?
+
diff --git a/inst/doc/tutorial/hands-on_part2_ans.md b/inst/doc/tutorial/hands-on_part2_ans.md
new file mode 100644
index 0000000000000000000000000000000000000000..08d8ef084cc62c312472f52b9390fe5adde9c4f3
--- /dev/null
+++ b/inst/doc/tutorial/hands-on_part2_ans.md
@@ -0,0 +1,197 @@
+# Hands-on Part 2
+## Goal
+Learn how to use the whole startR workflow to calculate skill scores.
+In this case, the ensemble-adjusted Continuous Ranked Probability Score (CRPS) and the root mean square error skill score (RMSSS) are used.
+To make the process faster, the required data size is small here, and we can run the execution on a workstation.
+
+## 1. Define data
+We want to verify the experimental data against the observational data with the skill scores.
+The first step is to use Start() to define the data.
+
+Hints:
+(1) The experimental data files are under `/esarchive/exp/ecmwf/system4_m1/monthly_mean/`, and the observational data files are under `/esarchive/recon/ecmwf/erainterim/monthly_mean/`.
+(2) The variable we want to use is 'tas' under 'tas_f6h/'.
+(3) The required start dates (sdate) are the Novembers from 1981 to 2016.
+(4) Read global data.
+(5) Take only the first time step.
+(6) To make the data smaller, we only take the first two ensemble members from the experimental data.
+(7) Because we are going to use the whole startR workflow, we only need to create a pointer to the data repository rather than retrieve the data to the workstation.
+
+```r
+library(startR)
+
+# exp data
+  repos_exp <- paste0('/esarchive/exp/ecmwf/system4_m1/monthly_mean/',
+                      '$var$_f6h/$var$_$sdate$.nc')
+  sdates_exp <- sapply(1981:2016, function(x) paste0(x, '1101'))
+
+  exp <- Start(dat = repos_exp,
+               var = 'tas',
+               sdate = sdates_exp,
+               time = indices(1),
+               ensemble = indices(1:2),
+               latitude = 'all',
+               longitude = 'all',
+               synonims = list(longitude = c('lon', 'longitude'),
+                               latitude = c('lat', 'latitude')),
+               retrieve = FALSE)
+
+# obs data
+  repos_obs <- paste0('/esarchive/recon/ecmwf/erainterim/monthly_mean/',
+                      '$var$_f6h/$var$_$sdate$.nc')
+  sdates_obs <- sapply(1981:2016, function(x) paste0(x, '11'))
+
+  obs <- Start(dat = repos_obs,
+               var = 'tas',
+               sdate = sdates_obs,
+               time = indices(1),
+               latitude = 'all',
+               longitude = 'all',
+               synonims = list(longitude = c('lon', 'longitude'),
+                               latitude = c('lat', 'latitude')),
+               retrieve = FALSE)
+```
+
+Question:
+
+1. What are the dimensions of these two data sets?
+```r
+attr(exp, 'Dimensions')
+      dat       var     sdate      time  ensemble  latitude longitude 
+        1         1        36         1         2       256       512 
+
+attr(obs, 'Dimensions')
+      dat       var     sdate      time  latitude longitude 
+        1         1        36         1       256       512 
+```
+
+2. What is the size of these two data sets?
+exp: 72 Mb; obs: 36 Mb
+
+
+## 2. Define operation and workflow
+It is recommended to define the function and write Step() together, because the latter helps you clarify the input and output dimensions of the function.
+
+In the self-defined function, we want to use two functions, 'SpecsVerification::EnsCrps' and 's2dv:::.RMSSS', to calculate the skill scores.
+You can check the first function by typing `?SpecsVerification::EnsCrps`, and the second function on [s2dv GitLab](https://earth.bsc.es/gitlab/es/s2dv/-/blob/master/R/RMSSS.R).
+
+Hint:
+(1) The self-defined function is for the exp and obs data. Therefore, target_dims and output_dims in Step() should be a list with two vector elements.
+(2) Check the functions 'EnsCrps' and '.RMSSS'. What are the minimum required input dimensions of each function? These dimensions should be the target_dims in Step().
+(3) To return the two skill scores together, put them in a list with two elements.
+(4) What are the output dimensions of each function? These dimensions should be the output_dims in Step().
+(5) The first input of AddStep() should also be a list containing exp and obs.
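+
+(A side note before the answer: the sketch below, with illustrative sizes, shows what `s2dv::InsertDim` does to the obs chunk inside the function -- it adds a length-1 'ensemble' dimension so that obs matches what '.RMSSS' expects.)
+```r
+y <- array(rnorm(36), dim = c(sdate = 36))  # a toy obs chunk: [sdate]
+dim(s2dv::InsertDim(y, posdim = 2, lendim = 1, name = 'ensemble'))
+#    sdate ensemble 
+#       36        1 
+```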
+
+```r
+  # self-defined function
+  func <- function(x, y) {
+    # CRPS
+    crps <- mean(SpecsVerification::EnsCrps(x, y, R.new = Inf))
+
+    # RMSSS
+    # .RMSSS() expects exp: [sdate, ensemble] and obs: [sdate, ensemble],
+    # but obs only has [sdate] now. Add one more dim to it.
+    y <- s2dv::InsertDim(y, posdim = 2, lendim = 1, name = 'ensemble')
+    rmsss <- mean(s2dv:::.RMSSS(x, y, time_dim = 'sdate', memb_dim = 'ensemble', pval = FALSE)$rmsss)
+
+    return(list(crps = crps, rmsss = rmsss))
+  }
+  step <- Step(func, target_dims = list(c('sdate', 'ensemble'), c('sdate')),
+               output_dims = list(crps = NULL, rmsss = NULL))
+  wf <- AddStep(list(exp, obs), step)
+```
+
+Question:
+1. Which dimensions are used in the operation? Which dimensions are free?
+'sdate' and 'ensemble' are used in the operation. The other dimensions (dat, var, time, latitude, and longitude) are not necessary for the function. These free dimensions will be iterated over by multiApply during the execution.
+
+2. Can we use more dimensions as target_dims? What are the disadvantages?
+It still works if we put more dimensions in target_dims, but we will lose chunking choices in the next step. Also, the lighter the function is, the quicker the operation runs.
+
+
+
+## 3. Execute locally
+To avoid potential technical problems with the connection and configuration, we choose to run the execution locally. Please remember that it is recommended to submit the job to an HPC if the operation is heavy.
+
+Hint:
+(1) Use the free dimensions to chunk.
+(2) It is safe to divide the data into pieces, each around 1/2 to 1/3 of the available RAM. In this case, the data size is only around 100 MB, so you can play with it without worrying about crashing your workstation.
+
+```r
+  res <- Compute(wf$crps,
+                 chunks = list(latitude = 2,
+                               longitude = 2))
+```
+
+## 4. Check the results
+
+1. Check the list structure. What is the length of the list?
+```r
+  str(res)
+List of 2
+ $ crps : num [1, 1, 1, 1:256, 1:512] 1.02 1.06 1.09 1.12 1.1 ...
+ $ rmsss: num [1, 1, 1, 1:256, 1:512] 0.991 0.991 0.99 0.989 0.989 ...
+```
+
+2. What do the dimensions look like?
+```r
+dim(res$crps)
+      dat       var      time  latitude longitude 
+        1         1         1       256       512 
+dim(res$rmsss)
+      dat       var      time  latitude longitude 
+        1         1         1       256       512 
+```
+
+3. Plot the map
+Use `s2dv::PlotEquiMap` or other tools to plot the map.
+```r
+  library(s2dv)
+  # manually create the lon and lat vectors
+  lon <- seq(from = 0, to = 359, length.out = 512)
+  lat <- seq(from = 90, to = -90, length.out = 256)
+
+  # Set the color bar
+  brks_crps <- seq(min(res$crps, na.rm = TRUE), max(res$crps, na.rm = TRUE), length.out = 11)
+  brks_rmsss <- seq(min(res$rmsss, na.rm = TRUE), max(res$rmsss, na.rm = TRUE), length.out = 11)
+
+  # Plot crps
+  PlotEquiMap(res$crps[1, 1, 1, , ], lon, lat,
+              color_fun = clim.palette('yellowred'), brks = brks_crps,
+              filled.continents = FALSE, triangle_ends = c(TRUE, TRUE),
+              toptitle = 'ECMWF monthly mean tas CRPS 2012-2016', title_scale = 0.6)
+
+  # Plot rmsss
+  PlotEquiMap(res$rmsss[1, 1, 1, , ], lon, lat,
+              color_fun = clim.palette('yellowred'), brks = brks_rmsss,
+              filled.continents = FALSE, triangle_ends = c(TRUE, TRUE),
+              toptitle = 'ECMWF monthly mean tas RMSSS 2012-2016', title_scale = 0.6)
+
+```
+
+4. (bonus) We created the lat and lon vectors manually above. However, the actual values are slightly different.
+Can you get the lon and lat vectors from the data attributes?
+
+```r
+  # the same Start() call as in section 1, but with 'return_vars' added
+  repos_exp <- paste0('/esarchive/exp/ecmwf/system4_m1/monthly_mean/',
+                      '$var$_f6h/$var$_$sdate$.nc')
+  sdates_exp <- sapply(1981:2016, function(x) paste0(x, '1101'))
+
+  exp <- Start(dat = repos_exp,
+               var = 'tas',
+               sdate = sdates_exp,
+               time = indices(1),
+               ensemble = indices(1:2),
+               latitude = 'all',
+               longitude = 'all',
+               synonims = list(longitude = c('lon', 'longitude'),
+                               latitude = c('lat', 'latitude')),
+               return_vars = list(latitude = NULL, longitude = NULL),
+               retrieve = FALSE)
+
+  lon <- as.vector(attr(exp, 'Variables')$common$longitude)
+  lat <- as.vector(attr(exp, 'Variables')$common$latitude)
+
+```
+
diff --git a/inst/doc/tutorial/nord3_demo.R b/inst/doc/tutorial/nord3_demo.R
new file mode 100644
index 0000000000000000000000000000000000000000..dc1e36a2183c80f47cdaef0288ee8e15ad2b0d52
--- /dev/null
+++ b/inst/doc/tutorial/nord3_demo.R
@@ -0,0 +1,74 @@
+library(startR)
+
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  var <- 'tas'
+  sdate <- c('20170101', '20170201')
+  lon.min <- 0
+  lon.max <- 359.9
+  lat.min <- -90
+  lat.max <- 90
+
+  data <- Start(dat = repos,
+                var = var,
+                sdate = sdate,
+                ensemble = indices(1:50),
+                time = 'all',
+                latitude = values(list(lat.min, lat.max)),
+                latitude_reorder = Sort(),
+                longitude = values(list(lon.min, lon.max)),
+                longitude_reorder = CircularSort(0, 360),
+                synonims = list(latitude = c('lat', 'latitude'),
+                                longitude = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = NULL, latitude = NULL),
+                retrieve = FALSE
+               )
+
+  func <- function(x, ave_dim) {
+    a <- apply(x, ave_dim, mean)
+    dim(a) <- c(time = length(a))  # name the output dimension!
+    return(a)
+  }
+  step <- Step(func, target_dims = c('ensemble', 'time'),
+               output_dims = c('time'))
+  wf <- AddStep(data, step, ave_dim = 2)
+
+#-------------------user-defined---------------------
+  queue_host <- 'nord1'
+  temp_dir <- '/gpfs/scratch/bsc32/bsc32734/startR_hpc/'
+  ecflow_suite_dir <- '/home/Earth/aho/startR_local/'
+#----------------------------------------------------
+
+  res <- Compute(wf,
+                 chunks = list(latitude = 2,
+                               longitude = 4),
+                 threads_load = 2,
+                 threads_compute = 4,
+                 cluster = list(queue_host = queue_host,
+                                queue_type = 'lsf',
+                                temp_dir = temp_dir,
+                                cores_per_job = 2,
+                                job_wallclock = '05:00',
+                                max_jobs = 4,
+                                extra_queue_params = list('#BSUB -q bsc_es'),
+                                bidirectional = FALSE,
+                                polling_period = 10
+                               ),
+                 ecflow_suite_dir = ecflow_suite_dir,
+                 wait = FALSE
+                )  # the collected result will be stored under $output1
+
+
+# Save the header of res
+save_path <- '/home/Earth/aho/aho-testtest/startR/tutorial/'
+save_file <- paste0(save_path, 'nord3_demo_header.Rds')
+saveRDS(res, file = save_file)
+
+# You can leave now.
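+# (With 'wait = FALSE' above, Compute() returns this header right away while the
+# chunks keep running on the HPC; saving the header is what lets a later R
+# session reconnect to the jobs with Collect().)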
+# When you come back next time, just read the .Rds file.
+res_collect <- readRDS(save_file)
+result <- Collect(res_collect, wait = TRUE)
+
+saveRDS(result, file = paste0(save_path, 'nord3_demo_result.Rds'))
+
+
+
diff --git a/inst/doc/tutorial/rmsss.png b/inst/doc/tutorial/rmsss.png
new file mode 100644
index 0000000000000000000000000000000000000000..304c52c94a8b801a015d9aa36908cebe553fa4d3
Binary files /dev/null and b/inst/doc/tutorial/rmsss.png differ
diff --git a/inst/doc/tutorial/startR_tutorial_20200902.pdf b/inst/doc/tutorial/startR_tutorial_20200902.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..9c4f6f967ebca1bda8e4d691a1daf660f74fec93
Binary files /dev/null and b/inst/doc/tutorial/startR_tutorial_20200902.pdf differ
diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index 4e446311af471ec13ba027335e9d383315e9ebc7..82e0bf73cc76924270c8ecd20c2377595ef956de 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -47,6 +47,9 @@ You can also find information in [FAQ How-to-18](inst/doc/faq.md#18-use-glob-expression-to-define-the-file-path)
 You will see four difference cases and learn the rules. You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata_dims-to-retrieve-variable-metadata).
 
+  11. [Three methods to load experimental files with different member and version](inst/doc/usecase/ex1_11_expid_member_version.R)
+  This script shows three ways to load data with different expid - member - version combinations. It is useful for climate prediction with multiple experiments.
+
 2. **Execute computation (use `Compute()`)**
     1. [Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R)
diff --git a/inst/doc/usecase/ex1_11_expid_member_version.R b/inst/doc/usecase/ex1_11_expid_member_version.R
new file mode 100644
index 0000000000000000000000000000000000000000..1accfff126518b9c03be4f972c04828aa0ae857a
--- /dev/null
+++ b/inst/doc/usecase/ex1_11_expid_member_version.R
@@ -0,0 +1,110 @@
+# Author: An-Chi Ho
+# Date: 7th Oct. 2020
+#---------------------------------------------------------------------
+# This script shows three ways to load data with different expid - member - version
+# combinations. It is useful for climate prediction with multiple experiments.
+# In this case, the two datasets have the following combination:
+#   | expid | member    | version   |
+#   |-------|-----------|-----------|
+#   | a1st  | r7i1p1f1  | v20190302 |
+#   | a1sx  | r10i1p1f1 | v20190308 |
+#
+# The three methods to load the data are:
+# (1) dependencies
+# (2) glob expression
+# (3) dataset
+#---------------------------------------------------------------------
+
+library(startR)
+
+# (1) dependencies
+# Because the three file dimensions 'expid', 'member', and 'version' depend on
+# each other, we can use the parameter 'xxx_depends' to specify the relationship.
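+# (Following the table above, member_depends = 'expid' means the member values
+# are searched separately under each expid -- r7i1p1f1 under a1st and r10i1p1f1
+# under a1sx -- instead of crossing every member with every expid.)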
+
+repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/',
+                'EC-Earth3/historical/$member$/Omon/$var$/gn/$version$/',
+                '$var$_Omon_EC-Earth3_historical_$member$_gn_$year$.nc')
+yrh1 <- 1960
+yrh2 <- 1961
+years <- paste0(c(yrh1 : yrh2), '01-', c(yrh1 : yrh2), '12')
+
+data <- Start(dat = repos,
+              var = 'tosmean',
+              expid = c('a1st', 'a1sx'),
+              member = 'all',
+              version = 'all',
+              member_depends = 'expid',
+              version_depends = 'expid',
+              # (one '_depends' per dimension is enough here: both member and
+              # version vary with expid)
+              year = years,
+              time = 'all',
+              region = 'all',
+              return_vars = list(time = NULL, region = NULL),
+              retrieve = TRUE)
+
+dim(data)
+#  dat    var  expid member version   year   time region 
+#    1      1      2      1       2      2     12     14 
+
+
+# (2) glob expression
+# The parameter 'path_glob_permissive' allows us to use '*' in the path definition.
+# Note that '*' can only represent one possibility. Because each expid only has
+# one member and one version, we can use '*' to define the path.
+# See the Start() documentation for more details of 'path_glob_permissive'.
+
+repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/',
+                'EC-Earth3/historical/*/Omon/$var$/gn/v*/',
+                '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc')
+
+yrh1 <- 1960
+yrh2 <- 1961
+years <- paste0(c(yrh1 : yrh2), '01-', c(yrh1 : yrh2), '12')
+
+data <- Start(dat = repos,
+              var = 'tosmean',
+              expid = c('a1st', 'a1sx'),
+              year = years,
+              time = 'all',
+              region = 'all',
+              path_glob_permissive = 6,  # to preserve * for the last 6 folder layers (the 6th is $member$ originally)
+              return_vars = list(time = NULL, region = NULL),
+              retrieve = TRUE)
+
+dim(data)
+#  dat  var expid year time region 
+#    1    1     2    2   12     14 
+
+
+# (3) dataset
+# We can simply define the two expids as two datasets. Therefore, the member and version
+# can be specified in each path directly.
+# The following script is a bit different from the above two: it reads two versions
+# for the first dataset. Therefore, the result dimension is 'version = 2', and the second
+# dataset has NAs along [version = 2] since it only has one version.
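+# (A hedged note: when the datasets do not share the same length along a
+# requested dimension -- here 'version' -- Start() returns the longest length
+# and pads the shorter dataset with NAs, as the dimensions below show.)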
+
+repos <- list(list(name = 'a1st',
+                   path = paste0('/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/',
+                                 'EC-Earth3/historical/r7i1p1f1/Omon/$var$/gn/$version$/',
+                                 '$var$_Omon_EC-Earth3_historical_r7i1p1f1_gn_$year$.nc')),
+              list(name = 'a1sx',
+                   path = paste0('/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/',
+                                 'EC-Earth3/historical/r10i1p1f1/Omon/$var$/gn/$version$/',
+                                 '$var$_Omon_EC-Earth3_historical_r10i1p1f1_gn_$year$.nc'))
+             )
+yrh1 <- 1960
+yrh2 <- 1961  #2014
+years <- paste0(c(yrh1 : yrh2), '01-', c(yrh1 : yrh2), '12')
+
+data <- Start(dat = repos,
+              var = 'tosmean',
+              year = years,
+              version = indices(1:2),  #'all',
+              time = 'all',
+              region = 'all',
+              retrieve = TRUE)
+
+dim(data)
+#  dat  var year version time region 
+#    2    1    2       2   12     14 
diff --git a/man/Start.Rd b/man/Start.Rd
index c41c9619f4489bb9f6655b4e3203cfb4616c532d..680168e44903df0b1ae68c2294096bd0c5ca99c4 100644
--- a/man/Start.Rd
+++ b/man/Start.Rd
@@ -12,7 +12,8 @@ Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener,
   pattern_dims = NULL, metadata_dims = NULL,
   selector_checker = SelectorChecker, merge_across_dims = FALSE,
   merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE,
-  path_glob_permissive = FALSE, retrieve = FALSE, num_procs = 1,
+  path_glob_permissive = FALSE, largest_dims_length = FALSE,
+  retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL,
   silent = FALSE, debug = FALSE)
}
\arguments{
@@ -367,6 +368,22 @@ has two matches:\cr
 \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
 only the first found file will be used.}
 
+\item{largest_dims_length}{A logical value or a named integer vector
+ indicating if Start() should examine all the files to get the largest
+ length of the inner dimensions (TRUE) or use the first valid file of each
+ dataset as the returned dimension length (FALSE). Since examining all the
+ files could be time-consuming, a vector can be used to explicitly specify
+ the expected length of the inner dimensions. For those inner dimensions not
+ specified, the first valid file will be used. The default value is FALSE.\cr\cr
+ This parameter is useful when the required files don't have consistent
+ inner dimension. For example, there are 10 required experimental data files
+ of a series of start dates. The data only contain 25 members for the first
+ 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'},
+ the returned member dimension length will be 25 only. The 26th to 51st
+ members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+ the returned member dimension length will be 51. To save the resource,
+\code{'largest_dims_length = c(member = 51)'} can also be used.}
+
 \item{retrieve}{A logical value indicating whether to retrieve the data
 defined in the Start() call or to explore only its dimension lengths
 and names, and the values for the file and inner dimensions. The default
@@ -378,6 +395,10 @@ multiple involved files in a call to Start(). If set to NULL,
 takes the number of available cores (as detected by detectCores() in
 the package 'future'). The default value is 1 (no parallel execution).}
 
+\item{ObjectBigmemory}{a character string to be included as part of the
+bigmemory object name. This parameter is thought to be used internally by the
+chunking capabilities of startR.}
+
 \item{silent}{A logical value of whether to display progress messages (FALSE)
 or not (TRUE).
The default value is FALSE.} diff --git a/startR-manual.pdf b/startR-manual.pdf index d8749b9c70344724b637231f2c63e0dcc6e034ec..0e205e687143fcabaa6c6c47c17b14e302458c9c 100644 Binary files a/startR-manual.pdf and b/startR-manual.pdf differ diff --git a/tests/testthat/test-Start-first_file_missing.R b/tests/testthat/test-Start-first_file_missing.R index 95319aee359cc3fc462ad72aaeaef50a4a5d397f..25f4d0241cea4b25d642db5863f79fd8df08e0f6 100644 --- a/tests/testthat/test-Start-first_file_missing.R +++ b/tests/testthat/test-Start-first_file_missing.R @@ -12,7 +12,7 @@ sdates1 <- c('20130611') #exists sdates2 <- c('20130618') #does not exist sdates3 <- c("20130611", "20130618") #1st exists, 2nd missing sdates4 <- c("20130618", "20130611") #1st missing, 2nd exists - +sdates5 <- c("20130611", "20130612") #both exist test_that("1. first file missing, no assign parameter 'metadata_dims'", { @@ -69,7 +69,7 @@ data <- Start(dat = file, return_vars = list(latitude = 'dat', longitude = 'dat', time = 'file_date'), -# metadata_dims = c('file_date'), + metadata_dims = c('file_date'), retrieve = T) expect_equal( @@ -87,11 +87,11 @@ data <- Start(dat = file, ) expect_equal( names(attr(data, 'Variables')$common), - c('time') + c('time', NA, 'tas') ) }) -test_that("3. Use parameter 'metadata_dims', all common attributes", { +test_that("3. Use parameter 'metadata_dims', all common attributes, 1st file missing", { data <- Start(dat = file, var = var, @@ -112,7 +112,7 @@ data <- Start(dat = file, expect_equal( names(attr(data, 'Variables')$common), - c('latitude', 'longitude', 'time', 'tas', NA) + c('latitude', 'longitude', 'time', NA, 'tas') ) expect_equal( as.vector(attr(data, 'NotFoundFiles')), @@ -121,3 +121,62 @@ data <- Start(dat = file, }) +test_that("4. Use parameter 'metadata_dims', all common attributes, 2nd file missing", { + +data <- Start(dat = file, + var = var, + file_date = sdates3, + time = indices(1:4), + latitude = values(list(20, 30)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-20, -10)), + longitude_reorder = CircularSort(-180, 180), + ensemble = indices(1), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + metadata_dims = c('file_date'), + retrieve = T) + + expect_equal( + names(attr(data, 'Variables')$common), + c('latitude', 'longitude', 'time', 'tas', NA) + ) + expect_equal( + as.vector(attr(data, 'NotFoundFiles')), + c(NA, "/esarchive/exp/ncep/cfs-v2/weekly_mean/s2s/tas_f24h/tas_20130618.nc") + ) + +}) + +test_that("5. 
Use parameter 'metadata_dims', all common attributes, no file missing", {

+data <- Start(dat = file,
+              var = var,
+              file_date = sdates5,
+              time = indices(1:4),
+              latitude = values(list(20, 30)),
+              latitude_reorder = Sort(decreasing = TRUE),
+              longitude = values(list(-20, -10)),
+              longitude_reorder = CircularSort(-180, 180),
+              ensemble = indices(1),
+              synonims = list(latitude = c('lat', 'latitude'),
+                              longitude = c('lon', 'longitude')),
+              return_vars = list(latitude = NULL,
+                                 longitude = NULL,
+                                 time = 'file_date'),
+              metadata_dims = c('file_date'),
+              retrieve = T)
+
+  expect_equal(
+  names(attr(data, 'Variables')$common),
+  c('latitude', 'longitude', 'time', 'tas', 'tas')
+  )
+  expect_equal(
+  as.vector(attr(data, 'NotFoundFiles')),
+  NULL
+  )
+
+})
diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R
new file mode 100644
index 0000000000000000000000000000000000000000..fe0899e8d19d46ff5372f2d1ee2796f15974b27f
--- /dev/null
+++ b/tests/testthat/test-Start-largest_dims_length.R
@@ -0,0 +1,133 @@
+context("Start() largest_dims_length check")
+# When certain inner dimensions are not consistent across the files, the parameter
+# 'largest_dims_length' can be used to ensure the returned array has the largest length of them.
+
+test_that("1. inconsistent member length", {
+# system5c3s: sdate = '19931101' has 25 members, '20200901' has 51 members.
+# system3: 40 members.
+
+repos <- list(list(name = 'system5c3s',
+                   path = "/esarchive/exp/ecmwf/system5c3s/monthly_mean/g500_f12h/$var$_$sdate$.nc"),
+              list(name = 'system3_m1-c3s',
+                   path = "/esarchive/exp/cmcc/system3_m1-c3s/monthly_mean/g500_f12h/$var$_$sdate$.nc"))
+
+# largest_dims_length = FALSE
+dat1 <- Start(dataset = repos,
+              var = "g500",
+              sdate = c("19931101","20200901"),
+              time = 'all',
+              ensemble = 'all',
+              latitude = values(list(10, 12)),
+              latitude_reorder = Sort(decreasing = T),
+              longitude = values(list(10, 11)),
+              longitude_reorder = CircularSort(0, 360),
+              largest_dims_length = FALSE,
+              synonims = list(longitude = c('lon', 'longitude'),
+                              latitude = c('lat', 'latitude')),
+              return_vars = list( time = NULL,
+                                  latitude = 'dataset',
+                                  longitude = 'dataset'),
+              retrieve = T)
+
+  expect_equal(
+  dim(dat1),
+  c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 40, latitude = 3, longitude = 2)
+  )
+  expect_equal(
+  length(dat1[is.na(dat1)]),
+  3000
+  )
+  expect_equal(
+  sum(as.numeric(is.na(dat1[1, 1, 1, , , , ]))),
+  540
+  )
+  expect_equal(
+  sum(as.numeric(is.na(dat1[1, 1, 2, , , , ]))),
+  540
+  )
+
+# largest_dims_length = TRUE
+dat2 <- Start(dataset = repos,
+              var = "g500",
+              sdate = c("19931101","20200901"),
+              time = 'all',
+              ensemble = 'all',
+              latitude = values(list(10, 12)),
+              latitude_reorder = Sort(decreasing = T),
+              longitude = values(list(10, 11)),
+              longitude_reorder = CircularSort(0, 360),
+              largest_dims_length = TRUE,
+              synonims = list(longitude = c('lon', 'longitude'),
+                              latitude = c('lat', 'latitude')),
+              return_vars = list( time = NULL,
+                                  latitude = 'dataset',
+                                  longitude = 'dataset'),
+              retrieve = T)
+
+  expect_equal(
+  dim(dat2),
+  c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 51, latitude = 3, longitude = 2)
+  )
+  expect_equal(
+  length(dat2[is.na(dat2)]),
+  3528
+  )
+  expect_equal(
+  sum(as.numeric(is.na(dat2[1, 1, 1, , , , ]))),
+  936
+  )
+  expect_equal(
+  sum(as.numeric(is.na(dat2[1, 1, 2, , , , ]))),
+  0
+  )
+
+# largest_dims_length = c(ensemble = 51)
+dat3 <- Start(dataset = repos,
+              var = "g500",
+              sdate = c("19931101","20200901"),
+              time = 
'all',
+              ensemble = 'all',
+              latitude = values(list(10, 12)),
+              latitude_reorder = Sort(decreasing = T),
+              longitude = values(list(10, 11)),
+              longitude_reorder = CircularSort(0, 360),
+              largest_dims_length = c(ensemble = 51),
+              synonims = list(longitude = c('lon', 'longitude'),
+                              latitude = c('lat', 'latitude')),
+              return_vars = list( time = NULL,
+                                  latitude = 'dataset',
+                                  longitude = 'dataset'),
+              retrieve = T)
+
+  expect_equal(
+  dim(dat3),
+  c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 51, latitude = 3, longitude = 2)
+  )
+  expect_equal(
+  length(dat3[is.na(dat3)]),
+  3528
+  )
+  expect_equal(
+  sum(as.numeric(is.na(dat3[1, 1, 1, , , , ]))),
+  936
+  )
+  expect_equal(
+  sum(as.numeric(is.na(dat3[1, 1, 2, , , , ]))),
+  0
+  )
+
+# Compare dat1, dat2, and dat3
+  expect_equal(
+  mean(dat2, na.rm = T),
+  mean(dat3, na.rm = T)
+  )
+  expect_equal(
+  mean(dat1[, , , , 1:25, , ], na.rm = T),
+  mean(dat2[, , , , 1:25, , ], na.rm = T)
+  )
+  expect_equal(
+  mean(dat1[, , , , 1:25, , ], na.rm = T),
+  mean(dat3[, , , , 1:25, , ], na.rm = T)
+  )
+
+})
diff --git a/tests/testthat/test-Start-metadata_dims.R b/tests/testthat/test-Start-metadata_dims.R
index b4ef3cf91ead8fd03a5beaee7af50d36db9ab2f4..1c0bc912705c088b30f3aecafced117375f6bc48 100644
--- a/tests/testthat/test-Start-metadata_dims.R
+++ b/tests/testthat/test-Start-metadata_dims.R
@@ -329,3 +329,116 @@ data <- Start(repos = mask_path,
   )
 
 })
+
+test_that("7. Two data sets, one of which is missing", {
+  repos <- "/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc"
+  # incorrect path, so repos2 doesn't have any valid files
+  repos2 <- "/esarchive/exp/ecmwf/system4_m1/monthly_mean/$var$_f2h/$var$_$sdate$.nc"  # the correct one is _f6h
+  var <- 'tas'
+  data <- Start(dat = list(list(name = 'system4_m1', path = repos2),
+                           list(name = 'system5_m1', path = repos)),
+                var = var,
+                sdate = '20170101',
+                ensemble = indices(1),
+                time = indices(1),
+                lat = 'all',
+                lon = 'all',
+                synonims = list(lat = c('lat', 'latitude'),
+                                lon = c('lon', 'longitude')),
+                return_vars = list(time = 'sdate',
+                                   longitude = 'dat',
+                                   latitude = 'dat'),
+                metadata_dims = 'dat',  # it can be omitted since it is automatically specified as 'dat'
+                retrieve = T
+               )
+
+  expect_equal(
+  length(data[is.na(data)]),
+  829440
+  )
+  expect_equal(
+  attr(data, "Variables")$system4_m1,
+  NULL
+  )
+  expect_equal(
+  length(attr(data, "Variables")$system5_m1$longitude),
+  1296
+  )
+  expect_equal(
+  length(attr(data, "Variables")$system5_m1),
+  3
+  )
+  expect_equal(
+  attr(data, 'Files'),
+  array(c(NA, "/esarchive/exp/ecmwf/system5_m1/monthly_mean/tas_f6h/tas_20170101.nc"),
+        dim = c(dat = 2, var = 1, sdate = 1))
+  )
+})
+
+test_that("8. 
Two data sets, both have files but the first file is missing", {
+path_list <- list(
+  MPI = list(name = 'MPI_ESM',
+             path = paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/',
+                           'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/',
+                           'dcppA-hindcast/$member$/day/$var$/gn/v20200101/',
+                           '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-$member$_gn_$chunk$.nc')),
+  Had = list(name = 'HadGEM3',
+             path = paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/',
+                           'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/',
+                           'dcppA-hindcast/$member$/day/$var$/gn/v20200101/',
+                           '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-$member$_gn_$chunk$.nc')))
+
+data <- Start(dataset = path_list,
+              var = 'tasmin',
+              member = list(c('r1i1p1f1', 'r2i1p1f2')),
+              sdate = paste0(2018),
+              chunk = list(c('20181101-20281231', '20181101-20281230')),
+              time = indices(1), #'all',
+              lat = values(list(0, 14)),
+              lon = values(list(0, 28)),
+              synonims = list(time = c('fmonth','time'),
+                              lon = c('lon', 'longitude'),
+                              lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dataset', lon = 'dataset'),
+              lat_reorder = Sort(),
+              num_procs = 1,
+              retrieve = T)
+
+  expect_equal(
+  length(data[is.na(data)]),
+  5500
+  )
+  expect_equal(
+  length(attr(data, "Variables")$MPI_ESM),
+  3
+  )
+  expect_equal(
+  length(attr(data, "Variables")$MPI_ESM$lon),
+  30
+  )
+  expect_equal(
+  names(attr(data, "Variables")$MPI_ESM),
+  c('lat', 'lon', 'tasmin')
+  )
+  expect_equal(
+  length(attr(data, "Variables")$HadGEM3),
+  2
+  )
+  expect_equal(
+  length(attr(data, "Variables")$HadGEM3$lon),
+  34
+  )
+  expect_equal(
+  names(attr(data, "Variables")$HadGEM3),
+  c('lat', 'lon')
+  )
+  expect_equal(
+  attr(data, 'Files'),
+  array(c("/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/r1i1p1f1/day/tasmin/gn/v20200101/tasmin_day_MPI-ESM1-2-HR_dcppA-hindcast_s2018-r1i1p1f1_gn_20181101-20281231.nc",
+          NA, NA, NA, NA, NA, NA,
+          "/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r2i1p1f2/day/tasmin/gn/v20200101/tasmin_day_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r2i1p1f2_gn_20181101-20281230.nc"),
+        dim = c(dataset = 2, var = 1, member = 2, sdate = 1, chunk = 2))
+  )
+
+
+})
diff --git a/tests/testthat/test-Start-path_glob_permissive.R b/tests/testthat/test-Start-path_glob_permissive.R
index 9e14c0252df03d7a1b9503a12fc2b219cc2baeac..2809d73585a4364fc06db41afb76a802e05a4244 100644
--- a/tests/testthat/test-Start-path_glob_permissive.R
+++ b/tests/testthat/test-Start-path_glob_permissive.R
@@ -36,9 +36,10 @@ data <- Start(dat = repos,
                "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"),
              dim = c(dat = 1, var = 1, expid = 2, year = 2))
   )
+# NOTE: in R_3.2.0, the result of the following subsetting doesn't keep the dimension names. In R_3.6.2 it does.
   expect_equal(
   data[1, 1, , , 1, 1],
-  matrix(c(18.60422, 17.13862, 18.52348, 17.21780), 2, 2),
+  array(c(18.60422, 17.13862, 18.52348, 17.21780), dim = c(expid = 2, year = 2)),
   tolerance = 0.0001
   )
 
@@ -77,9 +78,10 @@ data <- Start(dat = repos,
                "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"),
              dim = c(dat = 1, var = 1, expid = 2, year = 2, member = 1))
   )
+# NOTE: in R_3.2.0, the result of the following subsetting doesn't keep the dimension names. In R_3.6.2 it does.
 expect_equal(
 data[1, 1, , , 1, 1, 1],
-  matrix(c(18.60422, 17.13862, 18.52348, 17.21780), 2, 2),
+  array(c(18.60422, 17.13862, 18.52348, 17.21780), dim = c(expid = 2, year = 2)),
   tolerance = 0.0001
   )
   expect_equal(
@@ -88,3 +90,50 @@ data <- Start(dat = repos,
   )
 
 })
+
+
+test_that("2. tag at the end", {
+# Without the layer that path_glob_permissive allows to contain '*', the last item in the path is a tag. In the example below, the path without the path_glob_permissive layer is
+# "/esarchive/oper/S2S4E-data/weekly_statistics/S2S/$var$/$sdate$/", whose last item is "$sdate$".
+
+sdates.seq.thu <- format(seq(as.Date(paste(2020, 06, 11, sep = '-')), as.Date(paste(2020, 09, 17, sep = '-')),
+                         by = 'weeks'), format='%Y%m%d')
+path <- "/esarchive/oper/S2S4E-data/weekly_statistics/S2S/$var$/$sdate$/$var$_$sdate$_*.nc"
+
+exp <- Start(dat = path,
+             var = "tas",
+             sdate = sdates.seq.thu,
+             time = 'all',
+             ensemble = "all",
+             latitude = indices(1:2),
+             longitude = indices(1:2),
+             path_glob_permissive = 1,
+             retrieve = F)
+
+  exp_files <- as.list(attr(exp, 'ExpectedFiles'))
+  file_names <- sapply(sapply(exp_files, strsplit, '/'), '[[', 9)
+  files <- paste0('tas_', sdates.seq.thu, '_', 24:38, '.nc')
+  expect_equal(
+  file_names, files
+  )
+
+exp <- Start(dat = path,
+             var = "tas",
+             sdate = sdates.seq.thu,
+             time = 'all',
+             ensemble = "all",
+             latitude = indices(1:2),
+             longitude = indices(1:2),
+             path_glob_permissive = FALSE,
+             retrieve = F)
+
+  exp_files <- as.list(attr(exp, 'ExpectedFiles'))
+  file_names <- sapply(sapply(exp_files, strsplit, '/'), '[[', 9)
+  files <- paste0('tas_', sdates.seq.thu, '_', 24, '.nc')
+  expect_equal(
+  file_names, files
+  )
+
+
+})
+