diff --git a/NEWS.md b/NEWS.md index 8b65f5efada6d88723a0be848243e1aab0f35580..d43ad5aa20cf4aa889fa3057581366b4b6acf64c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ - Bugfix for metadata_dims along non-dat dim. - Bugfix for wildcard reading when parameter 'path_glob_permissive' is used. - /dev/shm automatic cleaning on Compute(). Solve the error 'No space left on device' which happened when the jobs are aborted. +- Add new paramter 'largest_dims_length' in Start(). It can examine all the files to find the largest inner dimension length. It is useful when certain inner dimension among the files does not have consistent length (e.g., different ensemble number). # startR v2.0.1 (Release date: 2020-09-10) - /dev/shm automatic cleaning on Compute() diff --git a/R/Start.R b/R/Start.R index 718ae42654791c500df7fa63bcb60806bf9efa1e..45a9f611c2ceddc755acb06575eec91c550db14d 100644 --- a/R/Start.R +++ b/R/Start.R @@ -677,6 +677,21 @@ #'\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr #'\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr #'only the first found file will be used. +#'@param largest_dims_length A logical value or a named integer vector +#' indicating if Start() should examine all the files to get the largest +#' length of the inner dimensions (TRUE) or use the first valid file of each +#' dataset as the returned dimension length (FALSE). Since examining all the +#' files could be time-consuming, a vector can be used to explicitly specify +#' the expected length of the inner dimensions. For those inner dimensions not +#' specified, the first valid file will be used. The default value is FALSE.\cr\cr +#' This parameter is useful when the required files don't have consistent +#' inner dimension. For example, there are 10 required experimental data files +#' of a series of start dates. The data only contain 25 members for the first +#' 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, +#' the returned member dimension length will be 25 only. The 26th to 51st +#' members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, +#' the returned member dimension length will be 51. To save the resource, +#' \code{'largest_dims_length = c(member = 51)'} can also be used. #'@param retrieve A logical value indicating whether to retrieve the data #' defined in the Start() call or to explore only its dimension lengths #' and names, and the values for the file and inner dimensions. The default @@ -808,6 +823,7 @@ Start <- function(..., # dim = indices/selectors, merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, + largest_dims_length = FALSE, retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, @@ -1364,6 +1380,19 @@ Start <- function(..., # dim = indices/selectors, stop("Parameter 'path_glob_permissive' must be of length 1.") } + # Check largest_dims_length + if (!is.numeric(largest_dims_length) && !is.logical(largest_dims_length)) { + stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.") + } + if (is.numeric(largest_dims_length)) { + if (any(largest_dims_length %% 1 != 0) | any(largest_dims_length < 0) | is.null(names(largest_dims_length))) { + stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.") + } + } + if (is.logical(largest_dims_length) && length(largest_dims_length) != 1) { + stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or a named integer vector.") + } + # Check retrieve if (!is.logical(retrieve)) { stop("Parameter 'retrieve' must be TRUE or FALSE.") @@ -1772,11 +1801,23 @@ Start <- function(..., # dim = indices/selectors, dim = files_to_load) names(dim(sub_array_of_not_found_files)) <- known_dims j <- 1 - selector_indices_save <- vector('list', prod(files_to_load)) + if (!exists('selector_indices_save')) { + selector_indices_save <- vector('list', length = length(dat)) + } + if (!exists('selectors_total_list')) { + selectors_total_list <- vector('list', length = length(dat)) + } + selector_indices_save[[i]] <- vector('list', length = prod(files_to_load)) + selectors_total_list[[i]] <- vector('list', length = prod(files_to_load)) + while (j <= prod(files_to_load)) { selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] names(selector_indices) <- known_dims - selector_indices_save[[j]] <- selector_indices + + tmp <- selector_indices + tmp[which(known_dims == found_pattern_dim)] <- i + selector_indices_save[[i]][[j]] <- tmp + selectors <- sapply(1:length(known_dims), function (x) { vector_to_pick <- 1 @@ -1786,6 +1827,9 @@ Start <- function(..., # dim = indices/selectors, dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] }) names(selectors) <- known_dims + selectors_total_list[[i]][[j]] <- selectors + names(selectors_total_list[[i]][[j]]) <- known_dims + replace_values[known_dims] <- selectors if (!dataset_has_files[i]) { if (any(is.na(selectors))) { @@ -2214,29 +2258,85 @@ Start <- function(..., # dim = indices/selectors, if (dataset_has_files[i]) { indices <- indices_of_first_files_with_data[[i]] if (!is.null(indices)) { - file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) - # The following 5 lines should go several lines below, but were moved - # here for better performance. - # If any of the dimensions comes without defining variable, then we read - # the data dimensions. - data_dims <- NULL - if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { - file_to_open <- file_path - data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], - lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), - synonims) + if (largest_dims_length == FALSE | is.numeric(largest_dims_length)) { #old code. use the 1st valid file to determine the dims + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) + # The following 5 lines should go several lines below, but were moved + # here for better performance. + # If any of the dimensions comes without defining variable, then we read + # the data dimensions. + data_dims <- NULL + if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { + file_to_open <- file_path + data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims) + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(data_dims) <- sapply(names(data_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + } + + if (is.numeric(largest_dims_length)) { # largest_dims_length is a named vector + # Check if the names fit the inner dimension names + if (!all(names(largest_dims_length) %in% names(data_dims))) { + #NOTE: stop or warning? + stop("Parameter 'largest_dims_length' has inconsistent names with inner dimensions.") + } else { + match_ind <- match(names(largest_dims_length), names(data_dims)) + data_dims[match_ind] <- largest_dims_length + } + } + + } else { # largest_dims_length == TRUE + # Open and get all the dim from all the files + data_dims_all_files <- vector('list', length = length(selectors_total_list[[i]])) + + for (selectors_kk in 1:length(data_dims_all_files)) { + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[i]][[selectors_kk]]))) + file_to_open <- file_path + + data_dims_all_files[[selectors_kk]] <- try(file_dim_reader(file_to_open, NULL, selectors_total_list[[i]][[selectors_kk]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims), silent = TRUE) + + } + + # Remove the missing files (i.e., fail try above) + if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { + tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') + data_dims_all_files <- data_dims_all_files[-tmp] + } + + # Find the longest dimensions from all the files + largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) + for (kk in 1:length(data_dims_all_files[[1]])) { + largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) + } + names(largest_data_dims) <- names(data_dims_all_files[[1]]) + # file_dim_reader returns dimension names as found in the file. # Need to translate accoridng to synonims: - names(data_dims) <- sapply(names(data_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) - } + names(largest_data_dims) <- sapply(names(largest_data_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + + # replace data_dims with largest_data_dims + data_dims <- largest_data_dims + } # end of if (largest_dims_length == TRUE) + # Transform the variables if needed and keep them apart. if (!is.null(transform) && (length(transform_vars) > 0)) { if (!all(transform_vars %in% c(names(picked_vars[[i]]), names(picked_common_vars)))) { @@ -2254,7 +2354,7 @@ Start <- function(..., # dim = indices/selectors, if (length(which_are_ordered) > 0) { tmp <- which(!is.na(match(names(picked_vars_ordered[[i]]), names(which_are_ordered)))) new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[[i]][tmp] - + } vars_to_transform <- c(vars_to_transform, new_vars_to_transform) } diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 53320d55adda9a365f8901af068e5eefd53ca002..947ad9f5c5591aa72b1db79d22d0dc0d884aea8e 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -23,7 +23,8 @@ This document intends to be the first reference for any doubts that you may have 17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split_multiselected_dims-in-start) 18. [Use glob expression '*' to define the file path](#18-use-glob-expression-to-define-the-file-path) 19. [Get metadata when the first file does not exist](#19-get-metadata-when-the-first-file-does-not-exist) - 20. [Use 'metadata_dims' to retrieve variable metadata](#19-use-metadata_dims-to-retrieve-variable-metadata) + 20. [Use 'metadata_dims' to retrieve variable metadata](#20-use-metadata_dims-to-retrieve-variable-metadata) + 21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files) 2. **Something goes wrong...** @@ -804,6 +805,26 @@ only the first data set will have the variable's metadata. Please find the relevant use cases in [ex1_10](inst/doc/usecase/ex1_10_metadata_dims.R). +### 21. Retrieve the complete data when the dimension length varies among files +By default, Start() uses the first valid file of each data set to determine the dimensions +of the return data array. However, the inner dimension length among the files may not be the +same. For example, the member number in one experiment is 25 in the early years while +increase to 51 later. If you assign `member = 'all'` in Start() call, the returned member +dimension length will be 25 only. + +The parameter 'largest_dims_length' is for this case. Its default value is `FALSE`, meaning +that Start() can only use the first valid file to decide the dimensions. If it is changed to +`TRUE`, Start() will examine all the required files to find the largest length for all the inner +dimensions. It is time- and resource-consuming, but useful when you are not sure how the dimensions +in all the files look like. + +If you know the expected dimension length, it is recommended to assign 'largest_dims_length' +by a named integer vector, for example, `largest_dims_length = c(member = 51)`. Start() will +adopt the provided ones and use the first valid file to decide the rest of dimensions. +By this means, the efficiency can be similar to `largest_dims_length = FALSE`. + + + # Something goes wrong... ### 1. No space left on device diff --git a/man/Start.Rd b/man/Start.Rd index 76510ad20e6cc88d38226dba126e0692fec814de..680168e44903df0b1ae68c2294096bd0c5ca99c4 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -12,8 +12,9 @@ Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, pattern_dims = NULL, metadata_dims = NULL, selector_checker = SelectorChecker, merge_across_dims = FALSE, merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, retrieve = FALSE, num_procs = 1, - ObjectBigmemory = NULL, silent = FALSE, debug = FALSE) + path_glob_permissive = FALSE, largest_dims_length = FALSE, + retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, + silent = FALSE, debug = FALSE) } \arguments{ \item{return_vars}{A named list where the names are the names of the @@ -367,6 +368,22 @@ has two matches:\cr \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr only the first found file will be used.} +\item{largest_dims_length}{A logical value or a named integer vector + indicating if Start() should examine all the files to get the largest + length of the inner dimensions (TRUE) or use the first valid file of each + dataset as the returned dimension length (FALSE). Since examining all the + files could be time-consuming, a vector can be used to explicitly specify + the expected length of the inner dimensions. For those inner dimensions not + specified, the first valid file will be used. The default value is FALSE.\cr\cr + This parameter is useful when the required files don't have consistent + inner dimension. For example, there are 10 required experimental data files + of a series of start dates. The data only contain 25 members for the first + 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, + the returned member dimension length will be 25 only. The 26th to 51st + members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, + the returned member dimension length will be 51. To save the resource, +\code{'largest_dims_length = c(member = 51)'} can also be used.} + \item{retrieve}{A logical value indicating whether to retrieve the data defined in the Start() call or to explore only its dimension lengths and names, and the values for the file and inner dimensions. The default diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R new file mode 100644 index 0000000000000000000000000000000000000000..fe0899e8d19d46ff5372f2d1ee2796f15974b27f --- /dev/null +++ b/tests/testthat/test-Start-largest_dims_length.R @@ -0,0 +1,133 @@ +context("Start() largest_dims_length check") +# When certain inner dim of files is not consistent, the parameter 'largest_dims_length' can +# be used to ensure the returned array has the largest length of inner dimensions. + +test_that("1. inconsistent member length", { +# system5c3s: sdate = '19931101' has 25 members, '20200901' has 51 members. +# system3: 40 members. + +repos <- list(list(name = 'system5c3s', + path = "/esarchive/exp/ecmwf/system5c3s/monthly_mean/g500_f12h/$var$_$sdate$.nc"), + list(name = 'system3_m1-c3s', + path = "/esarchive/exp/cmcc/system3_m1-c3s/monthly_mean/g500_f12h/$var$_$sdate$.nc")) + +# largest_dims_length = FALSE +dat1 <- Start(dataset = repos, + var = "g500", + sdate = c("19931101","20200901"), + time = 'all', + ensemble = 'all', + latitude = values(list(10, 12)), + latitude_reorder = Sort(decreasing = T), + longitude = values(list(10, 11)), + longitude_reorder = CircularSort(0, 360), + largest_dims_length = FALSE, + synonims = list(longitude = c('lon', 'longitude'), + latitude = c('lat', 'latitude')), + return_vars = list( time = NULL, + latitude = 'dataset', + longitude = 'dataset'), + retrieve = T) + + expect_equal( + dim(dat1), + c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 40, latitude = 3, longitude = 2) + ) + expect_equal( + length(dat1[is.na(dat1)]), + 3000 + ) + expect_equal( + sum(as.numeric(is.na(dat1[1, 1, 1, , , , ]))), + 540 + ) + expect_equal( + sum(as.numeric(is.na(dat1[1, 1, 2, , , , ]))), + 540 + ) + +# largest_dims_length = TRUE +dat2 <- Start(dataset = repos, + var = "g500", + sdate = c("19931101","20200901"), + time = 'all', + ensemble = 'all', + latitude = values(list(10, 12)), + latitude_reorder = Sort(decreasing = T), + longitude = values(list(10, 11)), + longitude_reorder = CircularSort(0, 360), + largest_dims_length = TRUE, + synonims = list(longitude = c('lon', 'longitude'), + latitude = c('lat', 'latitude')), + return_vars = list( time = NULL, + latitude = 'dataset', + longitude = 'dataset'), + retrieve = T) + + expect_equal( + dim(dat2), + c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 51, latitude = 3, longitude = 2) + ) + expect_equal( + length(dat2[is.na(dat2)]), + 3528 + ) + expect_equal( + sum(as.numeric(is.na(dat2[1, 1, 1, , , , ]))), + 936 + ) + expect_equal( + sum(as.numeric(is.na(dat2[1, 1, 2, , , , ]))), + 0 + ) + +# largest_dims_length = c(ensemble = 51) +dat3 <- Start(dataset = repos, + var = "g500", + sdate = c("19931101","20200901"), + time = 'all', + ensemble = 'all', + latitude = values(list(10, 12)), + latitude_reorder = Sort(decreasing = T), + longitude = values(list(10, 11)), + longitude_reorder = CircularSort(0, 360), + largest_dims_length = c(ensemble = 51), + synonims = list(longitude = c('lon', 'longitude'), + latitude = c('lat', 'latitude')), + return_vars = list( time = NULL, + latitude = 'dataset', + longitude = 'dataset'), + retrieve = T) + + expect_equal( + dim(dat3), + c(dataset = 2, var = 1, sdate = 2, time = 6, ensemble = 51, latitude = 3, longitude = 2) + ) + expect_equal( + length(dat3[is.na(dat3)]), + 3528 + ) + expect_equal( + sum(as.numeric(is.na(dat3[1, 1, 1, , , , ]))), + 936 + ) + expect_equal( + sum(as.numeric(is.na(dat3[1, 1, 2, , , , ]))), + 0 + ) + +# Compare dat1, dat2, and dat3 + expect_equal( + mean(dat2, na.rm = T), + mean(dat3, na.rm = T) + ) + expect_equal( + mean(dat1[, , , , 1:25, , ], na.rm = T), + mean(dat2[, , , , 1:25, , ], na.rm = T) + ) + expect_equal( + mean(dat1[, , , , 1:25, , ], na.rm = T), + mean(dat3[, , , , 1:25, , ], na.rm = T) + ) + +}) diff --git a/tests/testthat/test-Start-path_glob_permissive.R b/tests/testthat/test-Start-path_glob_permissive.R index ca6bbc03debf19b0b2f6201938070ad2e16e2f45..2809d73585a4364fc06db41afb76a802e05a4244 100644 --- a/tests/testthat/test-Start-path_glob_permissive.R +++ b/tests/testthat/test-Start-path_glob_permissive.R @@ -36,6 +36,7 @@ data <- Start(dat = repos, "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), dim = c(dat = 1, var = 1, expid = 2, year = 2)) ) +# NOTE: in R_3.2.0, the following test doesn't have dimension. In R_3.6.2 it does. expect_equal( data[1, 1, , , 1, 1], array(c(18.60422, 17.13862, 18.52348, 17.21780), dim = c(expid = 2, year = 2)), @@ -77,6 +78,7 @@ data <- Start(dat = repos, "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), dim = c(dat = 1, var = 1, expid = 2, year = 2, member = 1)) ) +# NOTE: in R_3.2.0, the following test doesn't have dimension. In R_3.6.2 it does. expect_equal( data[1, 1, , , 1, 1, 1], array(c(18.60422, 17.13862, 18.52348, 17.21780), dim = c(expid = 2, year = 2)),