diff --git a/.gitlab/issue_templates/Default.md b/.gitlab/issue_templates/Default.md index 6d47b7150d6ddd446c2013022b5bfbcdc94abce4..e16e0c7ce417fc74877ae733e1e4d7e913c135fd 100644 --- a/.gitlab/issue_templates/Default.md +++ b/.gitlab/issue_templates/Default.md @@ -1,6 +1,6 @@ (This is a template to report problems or suggest a new development. Please fill in the relevant information and remove the rest.) -Hi @aho, +Hi @vagudets, #### Summary (Bug: Summarize the bug and explain briefly the expected and the current behavior.) diff --git a/DESCRIPTION b/DESCRIPTION index 8fd5ee18350d5b284ae22abc5ca64e50af7b7184..35bb1b9f57c6d28367f2e1ad3933cb76a66c3c9b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: startR Title: Automatically Retrieve Multidimensional Distributed Data Sets -Version: 2.3.1 +Version: 2.4.0 Authors@R: c( person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")), - person("An-Chi", "Ho", , "an.ho@bsc.es", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4182-5258")), + person("An-Chi", "Ho", , "an.ho@bsc.es", role = c("aut"), comment = c(ORCID = "0000-0002-4182-5258")), person("Nuria", "Perez-Zanon", , "nuria.perez@bsc.es", role = c("aut"), comment = c(ORCID = "0000-0001-8568-3071")), person("Eva", "Rifa", , "eva.rifarovira@bsc.es", role = "ctb"), - person("Victoria", "Agudetse", , "victoria.agudetse@bsc.es", role = "ctb"), + person("Victoria", "Agudetse", , "victoria.agudetse@bsc.es", role = c("cre", "ctb")), person("Bruno", "de Paula Kinoshita", , "bruno.depaulakinoshita@bsc.es", role = "ctb"), person("Javier", "Vegas", , "javier.vegas@bsc.es", role = c("ctb")), person("Pierre-Antoine", "Bretonniere", , "pierre-antoine.bretonniere@bsc.es", role = c("ctb")), @@ -44,5 +44,5 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues SystemRequirements: cdo ecFlow Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 diff --git a/NEWS.md b/NEWS.md index c19d7a3389ad581d2f70855758fa7417908d21bc..ad8b0957497246722f2ac438f6720c4b18267ead 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# startR v2.4.0 (Release date: 2024-09-10) +- Allow chunking along inner dimensions that go across file dimensions +- Allow more than one file dimension to be specified in "metadata_dims" +- Add check and warning for when special wildcard "$var$" is missing in the path +- Bugfix: Start() retrieve correct time steps when time is across file dimension and the time steps of the first files are skipped +- Bugfix: Generate correct file paths when a file dimension has multiple depending dimensions + # startR v2.3.1 (Release date: 2023-12-22) - Use Autosubmit as workflow manager on hub - New feature: Collect result by Collect() on HPCs diff --git a/R/NcDataReader.R b/R/NcDataReader.R index 25e33d412dca7545f1878b18eabe90d07e0432d4..47817c65e54684380670ab35b2bd928268afce62 100644 --- a/R/NcDataReader.R +++ b/R/NcDataReader.R @@ -62,7 +62,6 @@ NcDataReader <- function(file_path = NULL, file_object = NULL, if (is.null(file_to_read)) { return(NULL) } - var_requested <- is.null(inner_indices) drop_var_dim <- FALSE diff --git a/R/Start.R b/R/Start.R index 5bfb3bfc67487ff764e7debe1fb7881c325ba610..75e352cbcf78bbea193fe832cce68c7c869e7a43 100644 --- a/R/Start.R +++ b/R/Start.R @@ -750,6 +750,9 @@ #' components used to build up the paths to each of the files in the data #' sources. #' } +#' \item{PatternDim}{ +#' Character string containing the name of the file pattern dimension. +#' } #'If \code{retrieve = FALSE} the involved data is not loaded into RAM memory and #'an object of the class 'startR_header' with the following components is #' returned:\cr @@ -766,7 +769,7 @@ #' multidimensional array with named dimensions, and potentially with the #' attribute 'variables' with additional auxiliary data. #' } -#' \item{Files}{ +#' \item{ExpectedFiles}{ #' Multidimensonal character string array with named dimensions. Its dimensions #' are the file dimensions (as requested in \dots). Each cell in this array #' contains a path to a file to be retrieved (which may exist or not). @@ -777,6 +780,9 @@ #' components used to build up the paths to each of the files in the data #' sources. #' } +#' \item{PatternDim}{ +#' Character string containing the name of the file pattern dimension. +#' } #' \item{StartRCall}{ #' List of parameters sent to the Start() call, with the parameter #' 'retrieve' set to TRUE. Intended for calling in order to @@ -874,7 +880,6 @@ Start <- function(..., # dim = indices/selectors, if (!merge_across_dims & merge_across_dims_narm) { merge_across_dims_narm <- FALSE } - # Leave alone the dimension parameters in the variable dim_params dim_params <- rebuild_dim_params(dim_params, merge_across_dims, inner_dims_across_files) @@ -887,6 +892,12 @@ Start <- function(..., # dim = indices/selectors, found_pattern_dim <- found_pattern_dims(pattern_dims, dim_names, var_params, dim_params, dim_reorder_params) + # Check if file pattern contains '$var$' substring + if (any(!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE))) { + .warning(paste("The special wildcard '$var$' is not present in the file", + "path. This might cause Start() to fail if it cannot parse", + "the inner dimensions in all the files.")) + } # Check all *_reorder are NULL or functions, and that they all have # a matching dimension param. i <- 1 @@ -954,15 +965,13 @@ Start <- function(..., # dim = indices/selectors, metadata_dims <- c(pattern_dims, metadata_dims[-which(metadata_dims == pattern_dims)]) } # Check if metadata_dims has more than 2 elements - if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) { - .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - "function. Keep '", metadata_dims[1], "' and '", metadata_dims[2], "' only.")) - metadata_dims <- metadata_dims[1:2] - } else if (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1) { - .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - "function. Keep '", metadata_dims[1], "' only.")) - metadata_dims <- metadata_dims[1] - } + if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2) || + (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1)) { + .warning(paste0("Parameter 'metadata_dims' contains some elements which", + "might serve a repetitive purpose: ", + metadata_dims[which(metadata_dims != pattern_dims)], + ". This could impact the performance of Start().")) + } # Once the pattern dimension with dataset specifications is found, # the variable 'dat' is mounted with the information of each @@ -2492,6 +2501,10 @@ Start <- function(..., # dim = indices/selectors, } # Find the largest length of each time step inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat) + ## NOTE: NA values can be present if the size of a depending + ## dimension varies along its depended dim. Removing them allows + ## retrieval of the common indices. Could cause other issues? + inner_dim_lengths <- inner_dim_lengths[which(!is.na(inner_dim_lengths))] } fri <- first_round_indices <- NULL @@ -2768,10 +2781,10 @@ Start <- function(..., # dim = indices/selectors, !is.null(dim_reorder_params[[inner_dim]])) { if ('circular' %in% names(attributes(dim_reorder_params[[inner_dim]]))) { is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") - if (is_circular_dim & is.list(sub_array_of_selectors)) { - tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix - goes_across_prime_meridian <- tmp[1] > tmp[2] - } + if (is_circular_dim & is.list(sub_array_of_selectors)) { + tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix + goes_across_prime_meridian <- tmp[1] > tmp[2] + } } } @@ -2918,7 +2931,7 @@ Start <- function(..., # dim = indices/selectors, sub_array_of_indices[[1]] <- vect[tmp[1]] sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] } - } + } # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. #//////////////////////////////////////////////////////////// @@ -3195,10 +3208,12 @@ Start <- function(..., # dim = indices/selectors, } } # If "_across = + merge_across_dims = FALSE + chunk over ", return error because this instance is not logically correct. - if (chunks[[inner_dim]]["n_chunks"] > 1 & inner_dim %in% inner_dims_across_files) { + if (chunks[[inner_dim]]["n_chunks"] > 1 & inner_dim %in% inner_dims_across_files & + merge_across_dims == FALSE) { stop("Chunk over dimension '", inner_dim, "' is not allowed because '", inner_dim, "' is across '", - names(inner_dims_across_files)[which(inner_dim %in% inner_dims_across_files)], "'.") + names(inner_dims_across_files)[which(inner_dim %in% inner_dims_across_files)], + "' and 'merge_across_dims' is set to FALSE'.") } if (inner_dim %in% names(dim(sub_array_of_selectors))) { @@ -3273,8 +3288,11 @@ Start <- function(..., # dim = indices/selectors, indices_chunk <- c(indices_chunk, rep(item, length(tmp) - length(indices_chunk))) } sub_array_of_indices_by_file <- split(sub_array_of_indices, indices_chunk) - for (item in 2:length(sub_array_of_indices_by_file)) { - sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[item - 1] + for (item in names((sub_array_of_indices_by_file))) { + # If item is 1, cumsum(inner_dim_lengths)[item - 1] returns numeric(0) + if (as.numeric(item) > 1) { + sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[as.numeric(item) - 1] + } } transformed_indices <- unlist(sub_array_of_indices_by_file, use.names = FALSE) } diff --git a/R/zzz.R b/R/zzz.R index f098a3b11651e260dc72a23f5ec490e5e76a7320..f198746362c9dc6876d3b98a722c38696cae5d20 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -151,7 +151,7 @@ rebuild_dim_params <- function(dim_params, merge_across_dims, # Look for chunked dims look_for_chunks <- function(dim_params, dim_names) { - chunks <- vector('list', length(dim_names)) + chunks <- vector('list', length(dim_names)) names(chunks) <- dim_names for (dim_name in dim_names) { if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) { @@ -382,6 +382,16 @@ find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))] replace_values[depending_dims] <- rep('*', length(depending_dims)) } + # If u_file_dim depends on the same depended dimension as another depending + # dimension, then the value of the depending dim should be replaced with '*' + # to avoid only the first value being used, which can result in the wrong + # path specification. + other_depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == u_file_dim)] + if (length(depending_file_dims) > 1 && + any(unlist(other_depending_file_dims) == depended_dim)) { + depending_dims <- names(other_depending_file_dims)[which(other_depending_file_dims == depended_dim)] + replace_values[depending_dims] <- rep('*', length(depending_dims)) + } for (j in 1:length(depended_dim_values)) { parsed_values <- c() if (!is.null(depended_dim)) { @@ -1021,7 +1031,6 @@ build_work_pieces <- function(work_pieces, i, selectors, file_dims, inner_dims, } j <- j + 1 } - return(work_pieces) } diff --git a/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md b/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md index 85015130a5c60462493cb96dd168f410d7d995af..e554c66ddbb3d8f86334b111a7bb34b69cbb8874 100644 --- a/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md +++ b/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md @@ -18,7 +18,7 @@ After loading startR package, the paths to the hindcast should be defined, inclu library(startR) ecmwf_path <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensforhc/', - 'weekly_mean/$var$_f24h/$sdate$/$var$_$syear$.nc') + 'weekly_mean/$var$_f6h/$sdate$/$var$_$syear$.nc') ``` Now, create the sequence of start dates in 2016 following the scheme in this figure: diff --git a/man/Start.Rd b/man/Start.Rd index 640c5a9e8b6a2b27b827231fac0ede40c4258b94..275ae9bb391c36f379294dcf43a8f05997729a9a 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -734,6 +734,9 @@ If \code{retrieve = TRUE} the involved data is loaded into RAM memory components used to build up the paths to each of the files in the data sources. } + \item{PatternDim}{ + Character string containing the name of the file pattern dimension. + } If \code{retrieve = FALSE} the involved data is not loaded into RAM memory and an object of the class 'startR_header' with the following components is returned:\cr @@ -750,7 +753,7 @@ returned:\cr multidimensional array with named dimensions, and potentially with the attribute 'variables' with additional auxiliary data. } - \item{Files}{ + \item{ExpectedFiles}{ Multidimensonal character string array with named dimensions. Its dimensions are the file dimensions (as requested in \dots). Each cell in this array contains a path to a file to be retrieved (which may exist or not). @@ -761,6 +764,9 @@ returned:\cr components used to build up the paths to each of the files in the data sources. } + \item{PatternDim}{ + Character string containing the name of the file pattern dimension. + } \item{StartRCall}{ List of parameters sent to the Start() call, with the parameter 'retrieve' set to TRUE. Intended for calling in order to diff --git a/tests/testthat/test-Compute-chunk_across_dim.R b/tests/testthat/test-Compute-chunk_across_dim.R new file mode 100644 index 0000000000000000000000000000000000000000..0fb3382facf9e6dcadf687ec2c64024362038db7 --- /dev/null +++ b/tests/testthat/test-Compute-chunk_across_dim.R @@ -0,0 +1,129 @@ +suppressMessages({ +# This unit test tests the chunking over dimension that goes across files. +# 1. across dim is a vector +# a. merge_across_dims is TRUE +# b. merge_across_dims is FALSE +# Note that 1.b. doesn't work. + +path <- '/esarchive/exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$chunk$.nc' +path <- paste0('/esarchive/scratch/aho/startR_unittest_files/', path) + +sdates <- c('2016', '2017', '2018') + +# retrieve = T for verification +suppressWarnings( + data_T <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:24), + i = indices(450:452), + j = indices(650:651), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +test_that("1.a. across dim is a vector, merge_across_dims = TRUE", { + +suppressWarnings( +data <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:24), + i = indices(450:452), + j = indices(650:651), + merge_across_dims = TRUE, + largest_dims_length = TRUE, + time_across = 'chunk', + return_vars = list(time = 'sdate'), + retrieve = FALSE) +) + +fun <- function(x) { +return(x) +} + +step <- Step(fun = fun, + target_dims = 'dat', output_dims = 'dat') +wf <- AddStep(inputs = data, step_fun = step) +suppressWarnings( +res1 <- Compute(workflow = wf, chunks = list(time = 1))$output1 +) +suppressWarnings( +res2 <- Compute(workflow = wf, chunks = list(time = 2))$output1 +) +suppressWarnings( +res3 <- Compute(workflow = wf, chunks = list(time = 3))$output1 +) + +expect_equal( +as.vector(data_T), +as.vector(res1) +) +expect_equal( +res1, +res2 +) +expect_equal( +res1, +res3 +) + +expect_equal( +as.vector(drop(res1)[ , 1, 1, 1]), +c(29.91125, 29.94805, 30.35584), +tolerance = 0.0001 +) +expect_equal( +as.vector(drop(res1)[ , 2, 1, 1]), +c(29.53878, 29.72491, 30.34167), +tolerance = 0.0001 +) + +}) + +################################################################# +################################################################# +################################################################# + +test_that("1.b. across dim is a vector, merge_across_dims = FALSE", { + +suppressWarnings( +data <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:24), + i = indices(450:452), + j = indices(650:651), + merge_across_dims = FALSE, + largest_dims_length = TRUE, + time_across = 'chunk', + return_vars = list(time = 'sdate'), + retrieve = FALSE) +) + +fun <- function(x) { +return(x) +} + +step <- Step(fun = fun, + target_dims = 'dat', output_dims = 'dat') +wf <- AddStep(inputs = data, step_fun = step) + +expect_error( +suppressWarnings( +res <- Compute(workflow = wf, chunks = list(time = 2))$output1), +"Chunk over dimension 'time' is not allowed because 'time' is across 'chunk' and 'merge_across_dims' is set to FALSE'." +) + +}) + +}) #suppressMessages diff --git a/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R b/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R new file mode 100644 index 0000000000000000000000000000000000000000..843ea98de3da653d488a705328c1872742e5fd1d --- /dev/null +++ b/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R @@ -0,0 +1,63 @@ +suppressMessages({ +test_that("Chunks of DCPP files with largest_dims_length = TRUE - Local execution", { + path <- '/esarchive/exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$chunk$.nc' + path <- paste0('/esarchive/scratch/aho/startR_unittest_files/', path) + + sdates <- c('2017', '2018') +suppressWarnings( + dat1 <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:15), + i = indices(450:452), + j = indices(650:651), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +# Start at chunk 2 (skip time steps in the first file) +suppressWarnings( + dat2 <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(3:15), + i = indices(450:452), + j = indices(650:651), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +expect_equal(dat1[1, 1, 1:2, 3:15, , ], dat2[1, 1, 1:2, , , ]) + +# Start at chunk 3 (skip time steps in the first and second files) +suppressWarnings( + dat3 <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(15), + i = indices(450:452), + j = indices(650:651), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +expect_equal(dat1[1, 1, 1:2, 15, , ], dat3[1, 1, 1:2, , , ]) + +}) + +}) #suppressMessages diff --git a/tests/testthat/test-Start-multiple_depends.R b/tests/testthat/test-Start-multiple_depends.R new file mode 100644 index 0000000000000000000000000000000000000000..473d0614522458decee0aaae21a4d3bc36f72f20 --- /dev/null +++ b/tests/testthat/test-Start-multiple_depends.R @@ -0,0 +1,65 @@ +suppressMessages({ +# This unit test tests the case where a depended dimension has multiple +# depending dimensions and the 'all' selector is used for a depending dim. + +path <- "/esarchive/exp/CMIP6/$dcpp$/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/$dcpp$/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_$dcpp$_s$sdate$-r1i1p1f2_gn_$chunk$.nc" + +path <- paste0('/esarchive/scratch/aho/startR_unittest_files', path) + +sdates <- c('2018', '2019') + +test_that("1. ", { +suppressWarnings( +dat1 <- Start(dat = path, + var = 'tos', + chunk = 'all', + time = indices(1:14), + time_across = 'chunk', + sdate = sdates, + dcpp = list('2018' = "dcppA-hindcast", '2019' = "dcppB-forecast"), + dcpp_depends = 'sdate', + chunk_depends = 'sdate', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + i = indices(450:460), + j = indices(685:700), + return_vars = list(time = c('chunk', 'sdate')), + retrieve = TRUE) +) + +suppressWarnings( +dat2 <- Start(dat = path, + var = 'tos', + chunk = list('2018' = c('201811-201812', '201901-201912'), + '2019' = c('201911-201912', '202001-202012')), + time = 'all', + time_across = 'chunk', + sdate = sdates, + dcpp = list('2018' = "dcppA-hindcast", '2019' = "dcppB-forecast"), + dcpp_depends = 'sdate', + chunk_depends = 'sdate', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + i = indices(450:460), + j = indices(685:700), + return_vars = list(time = c('chunk', 'sdate')), + retrieve = TRUE) +) + +expect_equal( + as.vector(dat1), + as.vector(dat2) +) +expect_equal( + mean(dat2, na.rm = T), + 29.21144, + tolerance = 0.0001 +) +expect_equal( + dat1[1, 1, 2, 2, 1, 1:3, 10], + c(28.84955, 28.84827, 28.84126), + tolerance = 0.0001 + ) +}) + +}) #suppressMessages