From fb2d1c59c0822e1f5a53b8393ffa6d5a0105f6ad Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 16 Oct 2020 17:35:37 +0200 Subject: [PATCH 01/10] Add code to find the longest dimensions among the files --- R/Start.R | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/R/Start.R b/R/Start.R index 718ae42..2c0f360 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1773,6 +1773,8 @@ Start <- function(..., # dim = indices/selectors, names(dim(sub_array_of_not_found_files)) <- known_dims j <- 1 selector_indices_save <- vector('list', prod(files_to_load)) + selectors_total_list <- vector('list', prod(files_to_load)) + while (j <= prod(files_to_load)) { selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] names(selector_indices) <- known_dims @@ -1786,6 +1788,10 @@ Start <- function(..., # dim = indices/selectors, dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] }) names(selectors) <- known_dims + + selectors_total_list[[j]] <- selectors + names(selectors_total_list[[j]]) <- known_dims + replace_values[known_dims] <- selectors if (!dataset_has_files[i]) { if (any(is.na(selectors))) { @@ -2237,6 +2243,40 @@ Start <- function(..., # dim = indices/selectors, } }) } + + # Open and get all the dim from all the files + data_dims_all_files <- vector('list', length = length(selectors_total_list)) + for (selectors_kk in 1:length(data_dims_all_files)) { + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[selectors_kk]]))) + file_to_open <- file_path + + data_dims_all_files[[selectors_kk]] <- file_dim_reader(file_to_open, NULL, selectors_total_list[[selectors_kk]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims) + + } + # Find the longest dimensions from all the files + largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) + for (kk in 1:length(data_dims_all_files[[1]])) { + largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) + } + names(largest_data_dims) <- names(data_dims_all_files[[1]]) + + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(largest_data_dims) <- sapply(names(largest_data_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + + # replace data_dims with largest_data_dims + data_dims <- largest_data_dims + # Transform the variables if needed and keep them apart. if (!is.null(transform) && (length(transform_vars) > 0)) { if (!all(transform_vars %in% c(names(picked_vars[[i]]), names(picked_common_vars)))) { -- GitLab From daa05bf239ea9c3ceedfe209c427998741f0b47d Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 19 Oct 2020 17:07:14 +0200 Subject: [PATCH 02/10] Further revise for unit testing --- R/Start.R | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/R/Start.R b/R/Start.R index 2c0f360..3cf9819 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1772,13 +1772,23 @@ Start <- function(..., # dim = indices/selectors, dim = files_to_load) names(dim(sub_array_of_not_found_files)) <- known_dims j <- 1 - selector_indices_save <- vector('list', prod(files_to_load)) - selectors_total_list <- vector('list', prod(files_to_load)) + if (!exists('selector_indices_save')) { + selector_indices_save <- vector('list', length = length(dat)) + } + if (!exists('selectors_total_list')) { + selectors_total_list <- vector('list', length = length(dat)) + } + selector_indices_save[[i]] <- vector('list', length = prod(files_to_load)) + selectors_total_list[[i]] <- vector('list', length = prod(files_to_load)) while (j <= prod(files_to_load)) { selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] names(selector_indices) <- known_dims - selector_indices_save[[j]] <- selector_indices + + tmp <- selector_indices + tmp[which(known_dims == found_pattern_dim)] <- i + selector_indices_save[[i]][[j]] <- tmp + selectors <- sapply(1:length(known_dims), function (x) { vector_to_pick <- 1 @@ -1788,9 +1798,8 @@ Start <- function(..., # dim = indices/selectors, dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] }) names(selectors) <- known_dims - - selectors_total_list[[j]] <- selectors - names(selectors_total_list[[j]]) <- known_dims + selectors_total_list[[i]][[j]] <- selectors + names(selectors_total_list[[i]][[j]]) <- known_dims replace_values[known_dims] <- selectors if (!dataset_has_files[i]) { @@ -2245,16 +2254,24 @@ Start <- function(..., # dim = indices/selectors, } # Open and get all the dim from all the files - data_dims_all_files <- vector('list', length = length(selectors_total_list)) - for (selectors_kk in 1:length(data_dims_all_files)) { - file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[selectors_kk]]))) - file_to_open <- file_path + data_dims_all_files <- vector('list', length = length(selectors_total_list[[i]])) + + for (selectors_kk in 1:length(data_dims_all_files)) { + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[i]][[selectors_kk]]))) + file_to_open <- file_path - data_dims_all_files[[selectors_kk]] <- file_dim_reader(file_to_open, NULL, selectors_total_list[[selectors_kk]], + data_dims_all_files[[selectors_kk]] <- try(file_dim_reader(file_to_open, NULL, selectors_total_list[[i]][[selectors_kk]], lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), - synonims) + synonims), silent = TRUE) + + } + + # Remove the missing files (i.e., fail try above) + if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { + tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') + data_dims_all_files <- data_dims_all_files[-tmp] + - } # Find the longest dimensions from all the files largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) for (kk in 1:length(data_dims_all_files[[1]])) { -- GitLab From 7d0c32eebc954b3d469b56d3bcb41c309b4b6e3c Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 19 Oct 2020 17:07:31 +0200 Subject: [PATCH 03/10] Add NOTE --- tests/testthat/test-Start-path_glob_permissive.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/testthat/test-Start-path_glob_permissive.R b/tests/testthat/test-Start-path_glob_permissive.R index ca6bbc0..2809d73 100644 --- a/tests/testthat/test-Start-path_glob_permissive.R +++ b/tests/testthat/test-Start-path_glob_permissive.R @@ -36,6 +36,7 @@ data <- Start(dat = repos, "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), dim = c(dat = 1, var = 1, expid = 2, year = 2)) ) +# NOTE: in R_3.2.0, the following test doesn't have dimension. In R_3.6.2 it does. expect_equal( data[1, 1, , , 1, 1], array(c(18.60422, 17.13862, 18.52348, 17.21780), dim = c(expid = 2, year = 2)), @@ -77,6 +78,7 @@ data <- Start(dat = repos, "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), dim = c(dat = 1, var = 1, expid = 2, year = 2, member = 1)) ) +# NOTE: in R_3.2.0, the following test doesn't have dimension. In R_3.6.2 it does. expect_equal( data[1, 1, , , 1, 1, 1], array(c(18.60422, 17.13862, 18.52348, 17.21780), dim = c(expid = 2, year = 2)), -- GitLab From 299304f4cd8f1626945fcf4ca01ed5b8a552935e Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 19 Oct 2020 17:21:03 +0200 Subject: [PATCH 04/10] Fix syntax error --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index 3cf9819..3adfc93 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2270,7 +2270,7 @@ Start <- function(..., # dim = indices/selectors, if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') data_dims_all_files <- data_dims_all_files[-tmp] - + } # Find the longest dimensions from all the files largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) -- GitLab From 54597c573c96ad6103d83e5e1159e7ff2c6a3f8d Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 20 Oct 2020 13:43:45 +0200 Subject: [PATCH 05/10] Add new params 'largest_dims_length' to determine if Start() should examine all the files to get the dim or not --- R/Start.R | 157 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 100 insertions(+), 57 deletions(-) diff --git a/R/Start.R b/R/Start.R index 3adfc93..bfd2ca4 100644 --- a/R/Start.R +++ b/R/Start.R @@ -677,6 +677,21 @@ #'\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr #'\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr #'only the first found file will be used. +#'@param largest_dims_length A logical value or a named integer vector +#' indicating if Start() should examine all the files to get the largest +#' length of the inner dimensions (TRUE) or use the first valid file of each +#' dataset as the returned dimension length (FALSE). Since examining all the +#' files could be time-consuming, a vector can be used to explicitly specify +#' the expected length of the inner dimensions. For those inner dimensions not +#' specified, the first valid file will be used. The default value is FALSE.\cr\cr +#' This parameter is useful when the required files don't have consistent +#' inner dimension. For example, there are 10 required experimental data files +#' of a series of start dates. The data only contain 25 members for the first +#' 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, +#' the returned member dimension length will be 25 only. The 26th to 51st +#; members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, +#' the returned member dimension length will be 51. To save the resource, +#' \code{'largest_dims_length = c(member = 51)'} can also be used. #'@param retrieve A logical value indicating whether to retrieve the data #' defined in the Start() call or to explore only its dimension lengths #' and names, and the values for the file and inner dimensions. The default @@ -808,6 +823,7 @@ Start <- function(..., # dim = indices/selectors, merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, + largest_dims_length = FALSE, retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, @@ -1364,6 +1380,19 @@ Start <- function(..., # dim = indices/selectors, stop("Parameter 'path_glob_permissive' must be of length 1.") } + # Check largest_dims_length + if (!is.numeric(largest_dims_length) && !is.logical(largest_dims_length)) { + stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.") + } + if (is.numeric(largest_dims_length)) { + if (any(largest_dims_length %% 1 != 0) | any(largest_dims_length < 0) | is.null(names(largest_dims_length))) { + stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.") + } + } + if (is.logical(largest_dims_length) && length(largest_dims_length) != 1) { + stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or a named integer vector.") + } + # Check retrieve if (!is.logical(retrieve)) { stop("Parameter 'retrieve' must be TRUE or FALSE.") @@ -2229,70 +2258,84 @@ Start <- function(..., # dim = indices/selectors, if (dataset_has_files[i]) { indices <- indices_of_first_files_with_data[[i]] if (!is.null(indices)) { - file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) - # The following 5 lines should go several lines below, but were moved - # here for better performance. - # If any of the dimensions comes without defining variable, then we read - # the data dimensions. - data_dims <- NULL - if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { - file_to_open <- file_path - data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], - lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), - synonims) - # file_dim_reader returns dimension names as found in the file. - # Need to translate accoridng to synonims: - names(data_dims) <- sapply(names(data_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) - } + if (largest_dims_length == FALSE | is.numeric(largest_dims_length)) { #old code. use the 1st valid file to determine the dims + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) + # The following 5 lines should go several lines below, but were moved + # here for better performance. + # If any of the dimensions comes without defining variable, then we read + # the data dimensions. + data_dims <- NULL + if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { + file_to_open <- file_path + data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims) + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(data_dims) <- sapply(names(data_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + } - # Open and get all the dim from all the files - data_dims_all_files <- vector('list', length = length(selectors_total_list[[i]])) + if (is.numeric(largest_dims_length)) { # largest_dims_length is a named vector + # Check if the names fit the inner dimension names + if (!all(names(largest_dims_length) %in% names(data_dims))) { + #NOTE: stop or warning? + stop("Parameter 'largest_dims_length' has inconsistent names with inner dimensions.") + } else { + match_ind <- match(names(largest_dims_length), names(data_dims)) + data_dims[match_ind] <- largest_dims_length + } + } - for (selectors_kk in 1:length(data_dims_all_files)) { - file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[i]][[selectors_kk]]))) - file_to_open <- file_path + } else { # largest_dims_length == TRUE + # Open and get all the dim from all the files + data_dims_all_files <- vector('list', length = length(selectors_total_list[[i]])) - data_dims_all_files[[selectors_kk]] <- try(file_dim_reader(file_to_open, NULL, selectors_total_list[[i]][[selectors_kk]], - lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), - synonims), silent = TRUE) + for (selectors_kk in 1:length(data_dims_all_files)) { + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[i]][[selectors_kk]]))) + file_to_open <- file_path - } + data_dims_all_files[[selectors_kk]] <- try(file_dim_reader(file_to_open, NULL, selectors_total_list[[i]][[selectors_kk]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims), silent = TRUE) - # Remove the missing files (i.e., fail try above) - if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { - tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') - data_dims_all_files <- data_dims_all_files[-tmp] - } + } - # Find the longest dimensions from all the files - largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) - for (kk in 1:length(data_dims_all_files[[1]])) { - largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) - } - names(largest_data_dims) <- names(data_dims_all_files[[1]]) + # Remove the missing files (i.e., fail try above) + if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { + tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') + data_dims_all_files <- data_dims_all_files[-tmp] + } - # file_dim_reader returns dimension names as found in the file. - # Need to translate accoridng to synonims: - names(largest_data_dims) <- sapply(names(largest_data_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) + # Find the longest dimensions from all the files + largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) + for (kk in 1:length(data_dims_all_files[[1]])) { + largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) + } + names(largest_data_dims) <- names(data_dims_all_files[[1]]) - # replace data_dims with largest_data_dims - data_dims <- largest_data_dims + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(largest_data_dims) <- sapply(names(largest_data_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + + # replace data_dims with largest_data_dims + data_dims <- largest_data_dims + } # end of if (largest_dims_length == TRUE) # Transform the variables if needed and keep them apart. if (!is.null(transform) && (length(transform_vars) > 0)) { @@ -2311,7 +2354,7 @@ Start <- function(..., # dim = indices/selectors, if (length(which_are_ordered) > 0) { tmp <- which(!is.na(match(names(picked_vars_ordered[[i]]), names(which_are_ordered)))) new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[[i]][tmp] - + } vars_to_transform <- c(vars_to_transform, new_vars_to_transform) } -- GitLab From 48a72d91263307b11b21cc0d41aca18d7ca4bdc7 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 20 Oct 2020 14:50:45 +0200 Subject: [PATCH 06/10] Update --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0a32038..86ec2f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -37,4 +37,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo -RoxygenNote: 5.0.0 +RoxygenNote: 7.0.1 -- GitLab From e65a1871ae27c4f81d3897c3b42198c369a8ddaf Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 20 Oct 2020 14:54:17 +0200 Subject: [PATCH 07/10] Update --- man/AddStep.Rd | 1 - 1 file changed, 1 deletion(-) diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 3eece05..0d0ce46 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } - -- GitLab From a83886d638e8e85e5c07930736751103fd4d1f53 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 20 Oct 2020 14:57:46 +0200 Subject: [PATCH 08/10] Update .Rd file --- DESCRIPTION | 2 +- man/AddStep.Rd | 1 + man/Start.Rd | 20 ++++++++++++++++++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 86ec2f1..0a32038 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -37,4 +37,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo -RoxygenNote: 7.0.1 +RoxygenNote: 5.0.0 diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 0d0ce46..3eece05 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,3 +54,4 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } + diff --git a/man/Start.Rd b/man/Start.Rd index 76510ad..4651e91 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -12,8 +12,9 @@ Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, pattern_dims = NULL, metadata_dims = NULL, selector_checker = SelectorChecker, merge_across_dims = FALSE, merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, retrieve = FALSE, num_procs = 1, - ObjectBigmemory = NULL, silent = FALSE, debug = FALSE) + path_glob_permissive = FALSE, largest_dims_length = FALSE, + retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, + silent = FALSE, debug = FALSE) } \arguments{ \item{return_vars}{A named list where the names are the names of the @@ -367,6 +368,21 @@ has two matches:\cr \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr only the first found file will be used.} +\item{largest_dims_length}{A logical value or a named integer vector + indicating if Start() should examine all the files to get the largest + length of the inner dimensions (TRUE) or use the first valid file of each + dataset as the returned dimension length (FALSE). Since examining all the + files could be time-consuming, a vector can be used to explicitly specify + the expected length of the inner dimensions. For those inner dimensions not + specified, the first valid file will be used. The default value is FALSE.\cr\cr + This parameter is useful when the required files don't have consistent + inner dimension. For example, there are 10 required experimental data files + of a series of start dates. The data only contain 25 members for the first + 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, + the returned member dimension length will be 25 only. The 26th to 51st + the returned member dimension length will be 51. To save the resource, +\code{'largest_dims_length = c(member = 51)'} can also be used.} + \item{retrieve}{A logical value indicating whether to retrieve the data defined in the Start() call or to explore only its dimension lengths and names, and the values for the file and inner dimensions. The default -- GitLab From 282e2c0bea7f507a17d756cfdf9e88f7834b25f1 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 20 Oct 2020 15:10:36 +0200 Subject: [PATCH 09/10] Typo fixed --- R/Start.R | 2 +- man/Start.Rd | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index bfd2ca4..45a9f61 100644 --- a/R/Start.R +++ b/R/Start.R @@ -689,7 +689,7 @@ #' of a series of start dates. The data only contain 25 members for the first #' 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, #' the returned member dimension length will be 25 only. The 26th to 51st -#; members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, +#' members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, #' the returned member dimension length will be 51. To save the resource, #' \code{'largest_dims_length = c(member = 51)'} can also be used. #'@param retrieve A logical value indicating whether to retrieve the data diff --git a/man/Start.Rd b/man/Start.Rd index 4651e91..680168e 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -380,6 +380,7 @@ only the first found file will be used.} of a series of start dates. The data only contain 25 members for the first 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, the returned member dimension length will be 25 only. The 26th to 51st + members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, the returned member dimension length will be 51. To save the resource, \code{'largest_dims_length = c(member = 51)'} can also be used.} -- GitLab From 52da0dd218947225718ba624077b901eb1f92247 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 21 Oct 2020 15:45:41 +0200 Subject: [PATCH 10/10] Update NEWS.md for largest_dims_length and create unit test and FAQ --- NEWS.md | 1 + inst/doc/faq.md | 23 ++- .../testthat/test-Start-largest_dims_length.R | 133 ++++++++++++++++++ 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test-Start-largest_dims_length.R diff --git a/NEWS.md b/NEWS.md index 8b65f5e..d43ad5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ - Bugfix for metadata_dims along non-dat dim. - Bugfix for wildcard reading when parameter 'path_glob_permissive' is used. - /dev/shm automatic cleaning on Compute(). Solve the error 'No space left on device' which happened when the jobs are aborted. +- Add new paramter 'largest_dims_length' in Start(). It can examine all the files to find the largest inner dimension length. It is useful when certain inner dimension among the files does not have consistent length (e.g., different ensemble number). # startR v2.0.1 (Release date: 2020-09-10) - /dev/shm automatic cleaning on Compute() diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 53320d5..947ad9f 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -23,7 +23,8 @@ This document intends to be the first reference for any doubts that you may have 17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split_multiselected_dims-in-start) 18. [Use glob expression '*' to define the file path](#18-use-glob-expression-to-define-the-file-path) 19. [Get metadata when the first file does not exist](#19-get-metadata-when-the-first-file-does-not-exist) - 20. [Use 'metadata_dims' to retrieve variable metadata](#19-use-metadata_dims-to-retrieve-variable-metadata) + 20. [Use 'metadata_dims' to retrieve variable metadata](#20-use-metadata_dims-to-retrieve-variable-metadata) + 21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files) 2. **Something goes wrong...** @@ -804,6 +805,26 @@ only the first data set will have the variable's metadata. Please find the relevant use cases in [ex1_10](inst/doc/usecase/ex1_10_metadata_dims.R). +### 21. Retrieve the complete data when the dimension length varies among files +By default, Start() uses the first valid file of each data set to determine the dimensions +of the return data array. However, the inner dimension length among the files may not be the +same. For example, the member number in one experiment is 25 in the early years while +increase to 51 later. If you assign `member = 'all'` in Start() call, the returned member +dimension length will be 25 only. + +The parameter 'largest_dims_length' is for this case. Its default value is `FALSE`, meaning +that Start() can only use the first valid file to decide the dimensions. If it is changed to +`TRUE`, Start() will examine all the required files to find the largest length for all the inner +dimensions. It is time- and resource-consuming, but useful when you are not sure how the dimensions +in all the files look like. + +If you know the expected dimension length, it is recommended to assign 'largest_dims_length' +by a named integer vector, for example, `largest_dims_length = c(member = 51)`. Start() will +adopt the provided ones and use the first valid file to decide the rest of dimensions. +By this means, the efficiency can be similar to `largest_dims_length = FALSE`. + + + # Something goes wrong... ### 1. No space left on device diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R new file mode 100644 index 0000000..fe0899e --- /dev/null +++ b/tests/testthat/test-Start-largest_dims_length.R @@ -0,0 +1,133 @@ +context("Start() largest_dims_length check") +# When certain inner dim of files is not consistent, the parameter 'largest_dims_length' can +# be used to ensure the returned array has the largest length of inner dimensions. + +test_that("1. inconsistent member length", { +# system5c3s: sdate = '19931101' has 25 members, '20200901' has 51 members. +# system3: 40 members. + +repos <- list(list(name = 'system5c3s', + path = "/esarchive/exp/ecmwf/system5c3s/monthly_mean/g500_f12h/$var$_$sdate$.nc"), + list(name = 'system3_m1-c3s', + path = "/esarchive/exp/cmcc/system3_m1-c3s/monthly_mean/g500_f12h/$var$_$sdate$.nc")) + +# largest_dims_length = FALSE +dat1 <- Start(dataset = repos, + var = "g500", + sdate = c("19931101","20200901"), + time = 'all', + ensemble = 'all', + latitude = values(list(10, 12)), + latitude_reorder = Sort(decreasing = T), + longitude = values(list(10, 11)), + longitude_reorder = CircularSort(0, 360), + largest_dims_length = FALSE, + synonims = list(longitude = c('lon', 'longitude'), + latitude = c('lat', 'latitude')), + return_vars = list( time = NULL, + latitude = 'dataset', + longitude = 'dataset'), + retrieve = T) + + expect_equal( + dim(dat1), + c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 40, latitude = 3, longitude = 2) + ) + expect_equal( + length(dat1[is.na(dat1)]), + 3000 + ) + expect_equal( + sum(as.numeric(is.na(dat1[1, 1, 1, , , , ]))), + 540 + ) + expect_equal( + sum(as.numeric(is.na(dat1[1, 1, 2, , , , ]))), + 540 + ) + +# largest_dims_length = TRUE +dat2 <- Start(dataset = repos, + var = "g500", + sdate = c("19931101","20200901"), + time = 'all', + ensemble = 'all', + latitude = values(list(10, 12)), + latitude_reorder = Sort(decreasing = T), + longitude = values(list(10, 11)), + longitude_reorder = CircularSort(0, 360), + largest_dims_length = TRUE, + synonims = list(longitude = c('lon', 'longitude'), + latitude = c('lat', 'latitude')), + return_vars = list( time = NULL, + latitude = 'dataset', + longitude = 'dataset'), + retrieve = T) + + expect_equal( + dim(dat2), + c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 51, latitude = 3, longitude = 2) + ) + expect_equal( + length(dat2[is.na(dat2)]), + 3528 + ) + expect_equal( + sum(as.numeric(is.na(dat2[1, 1, 1, , , , ]))), + 936 + ) + expect_equal( + sum(as.numeric(is.na(dat2[1, 1, 2, , , , ]))), + 0 + ) + +# largest_dims_length = c(ensemble = 51) +dat3 <- Start(dataset = repos, + var = "g500", + sdate = c("19931101","20200901"), + time = 'all', + ensemble = 'all', + latitude = values(list(10, 12)), + latitude_reorder = Sort(decreasing = T), + longitude = values(list(10, 11)), + longitude_reorder = CircularSort(0, 360), + largest_dims_length = c(ensemble = 51), + synonims = list(longitude = c('lon', 'longitude'), + latitude = c('lat', 'latitude')), + return_vars = list( time = NULL, + latitude = 'dataset', + longitude = 'dataset'), + retrieve = T) + + expect_equal( + dim(dat3), + c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 51, latitude = 3, longitude = 2) + ) + expect_equal( + length(dat3[is.na(dat3)]), + 3528 + ) + expect_equal( + sum(as.numeric(is.na(dat3[1, 1, 1, , , , ]))), + 936 + ) + expect_equal( + sum(as.numeric(is.na(dat3[1, 1, 2, , , , ]))), + 0 + ) + +# Compare dat1, dat2, and dat3 + expect_equal( + mean(dat2, na.rm = T), + mean(dat3, na.rm = T) + ) + expect_equal( + mean(dat1[, , , , 1:25, , ], na.rm = T), + mean(dat2[, , , , 1:25, , ], na.rm = T) + ) + expect_equal( + mean(dat1[, , , , 1:25, , ], na.rm = T), + mean(dat3[, , , , 1:25, , ], na.rm = T) + ) + +}) -- GitLab