From eeb7cd4cc975033a06c0b1c44f2777467a4901ba Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 19 Feb 2024 09:57:48 +0100 Subject: [PATCH 01/24] Add 'ExpectedFiles' and 'PatternDim' to Start() output documentation --- DESCRIPTION | 2 +- R/Start.R | 8 +++++++- man/Start.Rd | 8 +++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8fd5ee1..88bfb62 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,5 +44,5 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues SystemRequirements: cdo ecFlow Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 diff --git a/R/Start.R b/R/Start.R index 5bfb3bf..2be297b 100644 --- a/R/Start.R +++ b/R/Start.R @@ -750,6 +750,9 @@ #' components used to build up the paths to each of the files in the data #' sources. #' } +#' \item{PatternDim}{ +#' Character string containing the name of the file pattern dimension. +#' } #'If \code{retrieve = FALSE} the involved data is not loaded into RAM memory and #'an object of the class 'startR_header' with the following components is #' returned:\cr @@ -766,7 +769,7 @@ #' multidimensional array with named dimensions, and potentially with the #' attribute 'variables' with additional auxiliary data. #' } -#' \item{Files}{ +#' \item{ExpectedFiles}{ #' Multidimensonal character string array with named dimensions. Its dimensions #' are the file dimensions (as requested in \dots). Each cell in this array #' contains a path to a file to be retrieved (which may exist or not). @@ -777,6 +780,9 @@ #' components used to build up the paths to each of the files in the data #' sources. #' } +#' \item{PatternDim}{ +#' Character string containing the name of the file pattern dimension. +#' } #' \item{StartRCall}{ #' List of parameters sent to the Start() call, with the parameter #' 'retrieve' set to TRUE. Intended for calling in order to diff --git a/man/Start.Rd b/man/Start.Rd index 640c5a9..275ae9b 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -734,6 +734,9 @@ If \code{retrieve = TRUE} the involved data is loaded into RAM memory components used to build up the paths to each of the files in the data sources. } + \item{PatternDim}{ + Character string containing the name of the file pattern dimension. + } If \code{retrieve = FALSE} the involved data is not loaded into RAM memory and an object of the class 'startR_header' with the following components is returned:\cr @@ -750,7 +753,7 @@ returned:\cr multidimensional array with named dimensions, and potentially with the attribute 'variables' with additional auxiliary data. } - \item{Files}{ + \item{ExpectedFiles}{ Multidimensonal character string array with named dimensions. Its dimensions are the file dimensions (as requested in \dots). Each cell in this array contains a path to a file to be retrieved (which may exist or not). @@ -761,6 +764,9 @@ returned:\cr components used to build up the paths to each of the files in the data sources. } + \item{PatternDim}{ + Character string containing the name of the file pattern dimension. + } \item{StartRCall}{ List of parameters sent to the Start() call, with the parameter 'retrieve' set to TRUE. Intended for calling in order to -- GitLab From c7bde326275404372d29a97604872bf8e97a7d5d Mon Sep 17 00:00:00 2001 From: abatalla Date: Thu, 16 May 2024 10:48:46 +0200 Subject: [PATCH 02/24] Bugfix: changed path in use case ex3_1_SubseasonalECVHindcast.md to resolve 'No data files found' error --- inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md b/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md index 8501513..e554c66 100644 --- a/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md +++ b/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md @@ -18,7 +18,7 @@ After loading startR package, the paths to the hindcast should be defined, inclu library(startR) ecmwf_path <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensforhc/', - 'weekly_mean/$var$_f24h/$sdate$/$var$_$syear$.nc') + 'weekly_mean/$var$_f6h/$sdate$/$var$_$syear$.nc') ``` Now, create the sequence of start dates in 2016 following the scheme in this figure: -- GitLab From d496e87eb8dc8ea4806697ef6f1656f5ce703207 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 29 May 2024 16:43:02 +0200 Subject: [PATCH 03/24] Update issue template to change reference person from aho to vagudets --- .gitlab/issue_templates/Default.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/issue_templates/Default.md b/.gitlab/issue_templates/Default.md index 6d47b71..e16e0c7 100644 --- a/.gitlab/issue_templates/Default.md +++ b/.gitlab/issue_templates/Default.md @@ -1,6 +1,6 @@ (This is a template to report problems or suggest a new development. Please fill in the relevant information and remove the rest.) -Hi @aho, +Hi @vagudets, #### Summary (Bug: Summarize the bug and explain briefly the expected and the current behavior.) -- GitLab From 53a982fd953097fef1e7519e03750ee329a8ef33 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 8 Jul 2024 15:03:07 +0200 Subject: [PATCH 04/24] Bugfix: Retrieve correct time steps when time is across file dimension and the time steps of the first files are skipped --- .Rbuildignore | 2 +- R/Start.R | 7 ++- ...-DCPP-across-depends-largest_dims_length.R | 63 +++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R diff --git a/.Rbuildignore b/.Rbuildignore index aa7059a..39771b3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,7 +9,7 @@ ^inst/doc$ ^\.gitlab-ci\.yml$ ## unit tests should be ignored when building the package for CRAN -^tests$ +# ^tests$ ^inst/PlotProfiling\.R$ ^.gitlab$ # Suggested by http://r-pkgs.had.co.nz/package.html diff --git a/R/Start.R b/R/Start.R index 2be297b..42f72b6 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3279,8 +3279,11 @@ Start <- function(..., # dim = indices/selectors, indices_chunk <- c(indices_chunk, rep(item, length(tmp) - length(indices_chunk))) } sub_array_of_indices_by_file <- split(sub_array_of_indices, indices_chunk) - for (item in 2:length(sub_array_of_indices_by_file)) { - sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[item - 1] + for (item in names((sub_array_of_indices_by_file))) { + # If item is 1, cumsum(inner_dim_lengths)[item - 1] returns numeric(0) + if (as.numeric(item) > 1) { + sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[as.numeric(item) - 1] + } } transformed_indices <- unlist(sub_array_of_indices_by_file, use.names = FALSE) } diff --git a/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R b/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R new file mode 100644 index 0000000..3e10c55 --- /dev/null +++ b/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R @@ -0,0 +1,63 @@ +suppressMessages({ +test_that("Chunks of DCPP files with largest_dims_length = TRUE - Local execution", { + path <- '/esarchive/exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$chunk$.nc' + path <- paste0('/esarchive/scratch/aho/startR_unittest_files/', path) + + sdates <- c('2017', '2018') +suppressWarnings( + dat1 <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:15), + i = indices(1:10), + j = indices(1:10), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +# Start at chunk 2 (skip time steps in the first file) +suppressWarnings( + dat2 <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(3:15), + i = indices(1:10), + j = indices(1:10), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +expect_equal(dat1[1, 1, 1:2, 3:15, 10, 10], dat2[1, 1, 1:2, , 10, 10]) + +# Start at chunk 3 (skip time steps in the first and second files) +suppressWarnings( + dat3 <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(15), + i = indices(1:10), + j = indices(1:10), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +expect_equal(dat1[1, 1, 1:2, 15, 10, 10], dat3[1, 1, 1:2, , 10, 10]) + +}) + +}) #suppressMessages -- GitLab From c3b80f3344895ee418814d1ef3be77a925f97d4e Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 9 Jul 2024 12:30:46 +0200 Subject: [PATCH 05/24] Allow chunking along _across inner dim if merge_across_dims = TRUE --- R/NcDataReader.R | 1 - R/Start.R | 17 +-- R/zzz.R | 3 +- .../testthat/test-Compute-chunk_across_dim.R | 129 ++++++++++++++++++ ...-DCPP-across-depends-largest_dims_length.R | 16 +-- 5 files changed, 147 insertions(+), 19 deletions(-) create mode 100644 tests/testthat/test-Compute-chunk_across_dim.R diff --git a/R/NcDataReader.R b/R/NcDataReader.R index 25e33d4..47817c6 100644 --- a/R/NcDataReader.R +++ b/R/NcDataReader.R @@ -62,7 +62,6 @@ NcDataReader <- function(file_path = NULL, file_object = NULL, if (is.null(file_to_read)) { return(NULL) } - var_requested <- is.null(inner_indices) drop_var_dim <- FALSE diff --git a/R/Start.R b/R/Start.R index 42f72b6..3616f23 100644 --- a/R/Start.R +++ b/R/Start.R @@ -880,7 +880,6 @@ Start <- function(..., # dim = indices/selectors, if (!merge_across_dims & merge_across_dims_narm) { merge_across_dims_narm <- FALSE } - # Leave alone the dimension parameters in the variable dim_params dim_params <- rebuild_dim_params(dim_params, merge_across_dims, inner_dims_across_files) @@ -2774,10 +2773,10 @@ Start <- function(..., # dim = indices/selectors, !is.null(dim_reorder_params[[inner_dim]])) { if ('circular' %in% names(attributes(dim_reorder_params[[inner_dim]]))) { is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") - if (is_circular_dim & is.list(sub_array_of_selectors)) { - tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix - goes_across_prime_meridian <- tmp[1] > tmp[2] - } + if (is_circular_dim & is.list(sub_array_of_selectors)) { + tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix + goes_across_prime_meridian <- tmp[1] > tmp[2] + } } } @@ -2924,7 +2923,7 @@ Start <- function(..., # dim = indices/selectors, sub_array_of_indices[[1]] <- vect[tmp[1]] sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]] } - } + } # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk. #//////////////////////////////////////////////////////////// @@ -3201,10 +3200,12 @@ Start <- function(..., # dim = indices/selectors, } } # If "_across = + merge_across_dims = FALSE + chunk over ", return error because this instance is not logically correct. - if (chunks[[inner_dim]]["n_chunks"] > 1 & inner_dim %in% inner_dims_across_files) { + if (chunks[[inner_dim]]["n_chunks"] > 1 & inner_dim %in% inner_dims_across_files & + merge_across_dims == FALSE) { stop("Chunk over dimension '", inner_dim, "' is not allowed because '", inner_dim, "' is across '", - names(inner_dims_across_files)[which(inner_dim %in% inner_dims_across_files)], "'.") + names(inner_dims_across_files)[which(inner_dim %in% inner_dims_across_files)], + "' and 'merge_across_dims' is set to FALSE'.") } if (inner_dim %in% names(dim(sub_array_of_selectors))) { diff --git a/R/zzz.R b/R/zzz.R index f098a3b..267c27f 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -151,7 +151,7 @@ rebuild_dim_params <- function(dim_params, merge_across_dims, # Look for chunked dims look_for_chunks <- function(dim_params, dim_names) { - chunks <- vector('list', length(dim_names)) + chunks <- vector('list', length(dim_names)) names(chunks) <- dim_names for (dim_name in dim_names) { if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) { @@ -1021,7 +1021,6 @@ build_work_pieces <- function(work_pieces, i, selectors, file_dims, inner_dims, } j <- j + 1 } - return(work_pieces) } diff --git a/tests/testthat/test-Compute-chunk_across_dim.R b/tests/testthat/test-Compute-chunk_across_dim.R new file mode 100644 index 0000000..0fb3382 --- /dev/null +++ b/tests/testthat/test-Compute-chunk_across_dim.R @@ -0,0 +1,129 @@ +suppressMessages({ +# This unit test tests the chunking over dimension that goes across files. +# 1. across dim is a vector +# a. merge_across_dims is TRUE +# b. merge_across_dims is FALSE +# Note that 1.b. doesn't work. + +path <- '/esarchive/exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$chunk$.nc' +path <- paste0('/esarchive/scratch/aho/startR_unittest_files/', path) + +sdates <- c('2016', '2017', '2018') + +# retrieve = T for verification +suppressWarnings( + data_T <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:24), + i = indices(450:452), + j = indices(650:651), + time_across = 'chunk', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) +) + +test_that("1.a. across dim is a vector, merge_across_dims = TRUE", { + +suppressWarnings( +data <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:24), + i = indices(450:452), + j = indices(650:651), + merge_across_dims = TRUE, + largest_dims_length = TRUE, + time_across = 'chunk', + return_vars = list(time = 'sdate'), + retrieve = FALSE) +) + +fun <- function(x) { +return(x) +} + +step <- Step(fun = fun, + target_dims = 'dat', output_dims = 'dat') +wf <- AddStep(inputs = data, step_fun = step) +suppressWarnings( +res1 <- Compute(workflow = wf, chunks = list(time = 1))$output1 +) +suppressWarnings( +res2 <- Compute(workflow = wf, chunks = list(time = 2))$output1 +) +suppressWarnings( +res3 <- Compute(workflow = wf, chunks = list(time = 3))$output1 +) + +expect_equal( +as.vector(data_T), +as.vector(res1) +) +expect_equal( +res1, +res2 +) +expect_equal( +res1, +res3 +) + +expect_equal( +as.vector(drop(res1)[ , 1, 1, 1]), +c(29.91125, 29.94805, 30.35584), +tolerance = 0.0001 +) +expect_equal( +as.vector(drop(res1)[ , 2, 1, 1]), +c(29.53878, 29.72491, 30.34167), +tolerance = 0.0001 +) + +}) + +################################################################# +################################################################# +################################################################# + +test_that("1.b. across dim is a vector, merge_across_dims = FALSE", { + +suppressWarnings( +data <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = 'all', + chunk_depends = 'sdate', + time = indices(1:24), + i = indices(450:452), + j = indices(650:651), + merge_across_dims = FALSE, + largest_dims_length = TRUE, + time_across = 'chunk', + return_vars = list(time = 'sdate'), + retrieve = FALSE) +) + +fun <- function(x) { +return(x) +} + +step <- Step(fun = fun, + target_dims = 'dat', output_dims = 'dat') +wf <- AddStep(inputs = data, step_fun = step) + +expect_error( +suppressWarnings( +res <- Compute(workflow = wf, chunks = list(time = 2))$output1), +"Chunk over dimension 'time' is not allowed because 'time' is across 'chunk' and 'merge_across_dims' is set to FALSE'." +) + +}) + +}) #suppressMessages diff --git a/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R b/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R index 3e10c55..843ea98 100644 --- a/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R +++ b/tests/testthat/test-Start-DCPP-across-depends-largest_dims_length.R @@ -11,8 +11,8 @@ suppressWarnings( chunk = 'all', chunk_depends = 'sdate', time = indices(1:15), - i = indices(1:10), - j = indices(1:10), + i = indices(450:452), + j = indices(650:651), time_across = 'chunk', merge_across_dims = TRUE, largest_dims_length = TRUE, @@ -28,8 +28,8 @@ suppressWarnings( chunk = 'all', chunk_depends = 'sdate', time = indices(3:15), - i = indices(1:10), - j = indices(1:10), + i = indices(450:452), + j = indices(650:651), time_across = 'chunk', merge_across_dims = TRUE, largest_dims_length = TRUE, @@ -37,7 +37,7 @@ suppressWarnings( return_vars = list(time = 'sdate')) ) -expect_equal(dat1[1, 1, 1:2, 3:15, 10, 10], dat2[1, 1, 1:2, , 10, 10]) +expect_equal(dat1[1, 1, 1:2, 3:15, , ], dat2[1, 1, 1:2, , , ]) # Start at chunk 3 (skip time steps in the first and second files) suppressWarnings( @@ -47,8 +47,8 @@ suppressWarnings( chunk = 'all', chunk_depends = 'sdate', time = indices(15), - i = indices(1:10), - j = indices(1:10), + i = indices(450:452), + j = indices(650:651), time_across = 'chunk', merge_across_dims = TRUE, largest_dims_length = TRUE, @@ -56,7 +56,7 @@ suppressWarnings( return_vars = list(time = 'sdate')) ) -expect_equal(dat1[1, 1, 1:2, 15, 10, 10], dat3[1, 1, 1:2, , 10, 10]) +expect_equal(dat1[1, 1, 1:2, 15, , ], dat3[1, 1, 1:2, , , ]) }) -- GitLab From b64925a513778f2a8395487d9b60d02c9e587ba4 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 15 Jul 2024 12:49:42 +0200 Subject: [PATCH 06/24] Add if condition to generate correct paths when a dimension has multiple depending dimensions and the 'all' selector is used --- R/zzz.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/zzz.R b/R/zzz.R index 267c27f..b25f94c 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -382,6 +382,14 @@ find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))] replace_values[depending_dims] <- rep('*', length(depending_dims)) } + # If u_file_dim depends on the same depended dimension as another depending + # dimension, then the value of the depending dim should be replaced with '*' + # to avoid only the first value being used, which can result in the wrong + # path specification. + if (depending_file_dims[[-which(names(depending_file_dims) == u_file_dim)]] == depended_dim) { + depending_dims <- names(depending_file_dims)[-which(names(depending_file_dims) == u_file_dim)] + replace_values[depending_dims] <- rep('*', length(depending_dims)) + } for (j in 1:length(depended_dim_values)) { parsed_values <- c() if (!is.null(depended_dim)) { -- GitLab From 96e28a26aaef041f968338bedaa053cf1e5418e7 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 15 Jul 2024 13:59:37 +0200 Subject: [PATCH 07/24] Fix if condition --- R/zzz.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/zzz.R b/R/zzz.R index b25f94c..5518474 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -386,7 +386,8 @@ find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, # dimension, then the value of the depending dim should be replaced with '*' # to avoid only the first value being used, which can result in the wrong # path specification. - if (depending_file_dims[[-which(names(depending_file_dims) == u_file_dim)]] == depended_dim) { + if (length(depending_file_dims) > 1 && + depending_file_dims[[-which(names(depending_file_dims) == u_file_dim)]] == depended_dim) { depending_dims <- names(depending_file_dims)[-which(names(depending_file_dims) == u_file_dim)] replace_values[depending_dims] <- rep('*', length(depending_dims)) } -- GitLab From 3d253a71a8ae3e78a281619dfece19b1e712c748 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 16 Jul 2024 16:35:32 +0200 Subject: [PATCH 08/24] Bugfix: consider case where there are multiple depended dims or more than two depending dims --- R/zzz.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/zzz.R b/R/zzz.R index 5518474..5b11f97 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -386,9 +386,10 @@ find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, # dimension, then the value of the depending dim should be replaced with '*' # to avoid only the first value being used, which can result in the wrong # path specification. + other_depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == u_file_dim)] if (length(depending_file_dims) > 1 && - depending_file_dims[[-which(names(depending_file_dims) == u_file_dim)]] == depended_dim) { - depending_dims <- names(depending_file_dims)[-which(names(depending_file_dims) == u_file_dim)] + any(unlist(other_depending_file_dims) == depended_dim)) { + depending_dims <- names(other_depending_file_dims)[which(other_depending_file_dims == "syear")] replace_values[depending_dims] <- rep('*', length(depending_dims)) } for (j in 1:length(depended_dim_values)) { -- GitLab From ba06cfcc968ac53ce3e44671c99a8f1a9822ca5d Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 16 Jul 2024 17:02:40 +0200 Subject: [PATCH 09/24] Bugfix: hardcoded depended dim --- R/zzz.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/zzz.R b/R/zzz.R index 5b11f97..f198746 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -389,7 +389,7 @@ find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, other_depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == u_file_dim)] if (length(depending_file_dims) > 1 && any(unlist(other_depending_file_dims) == depended_dim)) { - depending_dims <- names(other_depending_file_dims)[which(other_depending_file_dims == "syear")] + depending_dims <- names(other_depending_file_dims)[which(other_depending_file_dims == depended_dim)] replace_values[depending_dims] <- rep('*', length(depending_dims)) } for (j in 1:length(depended_dim_values)) { -- GitLab From edecacfe9be49301b5a656d46d64ba7cb5814192 Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 26 Jul 2024 12:25:37 +0200 Subject: [PATCH 10/24] Allow multiple file dimensions to be specified as metadata_dims --- R/Start.R | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/R/Start.R b/R/Start.R index 3616f23..14f5839 100644 --- a/R/Start.R +++ b/R/Start.R @@ -959,15 +959,21 @@ Start <- function(..., # dim = indices/selectors, metadata_dims <- c(pattern_dims, metadata_dims[-which(metadata_dims == pattern_dims)]) } # Check if metadata_dims has more than 2 elements - if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) { - .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - "function. Keep '", metadata_dims[1], "' and '", metadata_dims[2], "' only.")) - metadata_dims <- metadata_dims[1:2] - } else if (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1) { - .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - "function. Keep '", metadata_dims[1], "' only.")) - metadata_dims <- metadata_dims[1] - } + if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2) || + (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1)) { + .warning(paste0("Parameter 'metadata_dims' has several many elements which", + " might serve a repetitive purpose. This could impact the ", + "performance of Start().")) + } + # if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) { + # .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", + # "function. Keep '", metadata_dims[1], "' and '", metadata_dims[2], "' only.")) + # metadata_dims <- metadata_dims[1:2] + # } else if (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1) { + # .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", + # "function. Keep '", metadata_dims[1], "' only.")) + # metadata_dims <- metadata_dims[1] + # } # Once the pattern dimension with dataset specifications is found, # the variable 'dat' is mounted with the information of each -- GitLab From 62d0439f6db9ab440456a7f7b3b495b17b2a686d Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 26 Jul 2024 14:17:48 +0200 Subject: [PATCH 11/24] Remove commented code, improve warning message --- R/Start.R | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/R/Start.R b/R/Start.R index 14f5839..3cde199 100644 --- a/R/Start.R +++ b/R/Start.R @@ -961,19 +961,11 @@ Start <- function(..., # dim = indices/selectors, # Check if metadata_dims has more than 2 elements if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2) || (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1)) { - .warning(paste0("Parameter 'metadata_dims' has several many elements which", - " might serve a repetitive purpose. This could impact the ", - "performance of Start().")) + .warning(paste0("Parameter 'metadata_dims' contains some elements which", + "might serve a repetitive purpose: ", + metadata_dims[which(metadata_dims != pattern_dims)], + ". This could impact the performance of Start().")) } - # if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) { - # .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - # "function. Keep '", metadata_dims[1], "' and '", metadata_dims[2], "' only.")) - # metadata_dims <- metadata_dims[1:2] - # } else if (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1) { - # .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - # "function. Keep '", metadata_dims[1], "' only.")) - # metadata_dims <- metadata_dims[1] - # } # Once the pattern dimension with dataset specifications is found, # the variable 'dat' is mounted with the information of each -- GitLab From e69a69bbea3f5888dc63ebeddd7e8bdf867fd61c Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 30 Jul 2024 16:08:25 +0200 Subject: [PATCH 12/24] Create unit tests for multiple depending dims --- tests/testthat/test-Start-multiple_depends.R | 65 ++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/testthat/test-Start-multiple_depends.R diff --git a/tests/testthat/test-Start-multiple_depends.R b/tests/testthat/test-Start-multiple_depends.R new file mode 100644 index 0000000..bf600e5 --- /dev/null +++ b/tests/testthat/test-Start-multiple_depends.R @@ -0,0 +1,65 @@ +suppressMessages({ +# This unit test tests the case where a depended dimension has multiple +# depending dimensions and the 'all' selector is used for a depending dim. + +path <- "/esarchive/exp/CMIP6/$dcpp$/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/$dcpp$/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_$dcpp$_s$sdate$-r1i1p1f2_gn_$chunk$.nc" + +path <- paste0('/esarchive/scratch/aho/startR_unittest_files/', path) + +sdates <- c('2018', '2019') + +test_that("1. ", { +suppressWarnings( +dat1 <- Start(dat = path, + var = 'tos', + chunk = 'all', + time = indices(1:14), + time_across = 'chunk', + sdate = sdates, + dcpp = list('2018' = "dcppA-hindcast", '2019' = "dcppB-forecast"), + dcpp_depends = 'sdate', + chunk_depends = 'sdate', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + i = indices(450:460), + j = indices(685:700), + return_vars = list(time = c('chunk', 'sdate')), + retrieve = TRUE) +) + +suppressWarnings( +dat2 <- Start(dat = path, + var = 'tos', + chunk = list('2018' = c('201811-201812', '201901-201912'), + '2019' = c('201911-201912', '202001-202012')), + time = 'all', + time_across = 'chunk', + sdate = sdates, + dcpp = list('2018' = "dcppA-hindcast", '2019' = "dcppB-forecast"), + dcpp_depends = 'sdate', + chunk_depends = 'sdate', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + i = indices(450:460), + j = indices(685:700), + return_vars = list(time = c('chunk', 'sdate')), + retrieve = TRUE) +) + +expect_equal( + as.vector(dat1), + as.vector(dat2) +) +expect_equal( + mean(dat2, na.rm = T), + 29.21144, + tolerance = 0.0001 +) +expect_equal( + dat1[1, 1, 2, 2, 1, 1:3, 10], + c(28.84955, 28.84827, 28.84126), + tolerance = 0.0001 + ) +}) + +}) #suppressMessages -- GitLab From f3f7ed96bd2c5c09830bd7db246e94af83299106 Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 23 Aug 2024 16:21:04 +0200 Subject: [PATCH 13/24] Add check for special wildcard '$var$' --- R/Start.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/Start.R b/R/Start.R index 3616f23..c741de2 100644 --- a/R/Start.R +++ b/R/Start.R @@ -892,6 +892,12 @@ Start <- function(..., # dim = indices/selectors, found_pattern_dim <- found_pattern_dims(pattern_dims, dim_names, var_params, dim_params, dim_reorder_params) + # Check if file pattern contains '$var$' substring + if (!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE)) { + .warning(paste0("The special wildcard '$var$' is not present in the file ", + "path. This might cause Start() to fail if it cannot parse", + "the inner dimensions in all the files.")) + } # Check all *_reorder are NULL or functions, and that they all have # a matching dimension param. i <- 1 -- GitLab From 46634b41f5ec0f5724df544ac75bcb8b1a543498 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 2 Sep 2024 16:11:33 +0200 Subject: [PATCH 14/24] Update unit test --- tests/testthat/test-Start-multiple_depends.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-Start-multiple_depends.R b/tests/testthat/test-Start-multiple_depends.R index bf600e5..473d061 100644 --- a/tests/testthat/test-Start-multiple_depends.R +++ b/tests/testthat/test-Start-multiple_depends.R @@ -4,7 +4,7 @@ suppressMessages({ path <- "/esarchive/exp/CMIP6/$dcpp$/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/$dcpp$/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_$dcpp$_s$sdate$-r1i1p1f2_gn_$chunk$.nc" -path <- paste0('/esarchive/scratch/aho/startR_unittest_files/', path) +path <- paste0('/esarchive/scratch/aho/startR_unittest_files', path) sdates <- c('2018', '2019') -- GitLab From a110112a395d55b5bfadfbe9297a0d418d118644 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 2 Sep 2024 16:15:40 +0200 Subject: [PATCH 15/24] Remove NA values from inner_dim_lengths --- R/Start.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/Start.R b/R/Start.R index c741de2..839c130 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2503,6 +2503,7 @@ Start <- function(..., # dim = indices/selectors, } # Find the largest length of each time step inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat) + inner_dim_lengths <- inner_dim_lengths[which(!is.na(inner_dim_lengths)] } fri <- first_round_indices <- NULL -- GitLab From c7f35b3e175c76cf8dc5964096fe0b64131115ab Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 3 Sep 2024 10:44:35 +0200 Subject: [PATCH 16/24] Fix condition to check $ --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index c741de2..f1a1991 100644 --- a/R/Start.R +++ b/R/Start.R @@ -893,7 +893,7 @@ Start <- function(..., # dim = indices/selectors, dim_params, dim_reorder_params) # Check if file pattern contains '$var$' substring - if (!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE)) { + if (any(!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE))) { .warning(paste0("The special wildcard '$var$' is not present in the file ", "path. This might cause Start() to fail if it cannot parse", "the inner dimensions in all the files.")) -- GitLab From e325e5aa0e0f5807a55fb4184d3d9fdfba3b1737 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 3 Sep 2024 12:17:44 +0200 Subject: [PATCH 17/24] Create unit tests for multiple depending dimensions case --- R/Start.R | 2 +- tests/testthat/test-Start-multiple_depends.R | 65 ++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test-Start-multiple_depends.R diff --git a/R/Start.R b/R/Start.R index c741de2..f1a1991 100644 --- a/R/Start.R +++ b/R/Start.R @@ -893,7 +893,7 @@ Start <- function(..., # dim = indices/selectors, dim_params, dim_reorder_params) # Check if file pattern contains '$var$' substring - if (!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE)) { + if (any(!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE))) { .warning(paste0("The special wildcard '$var$' is not present in the file ", "path. This might cause Start() to fail if it cannot parse", "the inner dimensions in all the files.")) diff --git a/tests/testthat/test-Start-multiple_depends.R b/tests/testthat/test-Start-multiple_depends.R new file mode 100644 index 0000000..473d061 --- /dev/null +++ b/tests/testthat/test-Start-multiple_depends.R @@ -0,0 +1,65 @@ +suppressMessages({ +# This unit test tests the case where a depended dimension has multiple +# depending dimensions and the 'all' selector is used for a depending dim. + +path <- "/esarchive/exp/CMIP6/$dcpp$/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/$dcpp$/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_$dcpp$_s$sdate$-r1i1p1f2_gn_$chunk$.nc" + +path <- paste0('/esarchive/scratch/aho/startR_unittest_files', path) + +sdates <- c('2018', '2019') + +test_that("1. ", { +suppressWarnings( +dat1 <- Start(dat = path, + var = 'tos', + chunk = 'all', + time = indices(1:14), + time_across = 'chunk', + sdate = sdates, + dcpp = list('2018' = "dcppA-hindcast", '2019' = "dcppB-forecast"), + dcpp_depends = 'sdate', + chunk_depends = 'sdate', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + i = indices(450:460), + j = indices(685:700), + return_vars = list(time = c('chunk', 'sdate')), + retrieve = TRUE) +) + +suppressWarnings( +dat2 <- Start(dat = path, + var = 'tos', + chunk = list('2018' = c('201811-201812', '201901-201912'), + '2019' = c('201911-201912', '202001-202012')), + time = 'all', + time_across = 'chunk', + sdate = sdates, + dcpp = list('2018' = "dcppA-hindcast", '2019' = "dcppB-forecast"), + dcpp_depends = 'sdate', + chunk_depends = 'sdate', + merge_across_dims = TRUE, + largest_dims_length = TRUE, + i = indices(450:460), + j = indices(685:700), + return_vars = list(time = c('chunk', 'sdate')), + retrieve = TRUE) +) + +expect_equal( + as.vector(dat1), + as.vector(dat2) +) +expect_equal( + mean(dat2, na.rm = T), + 29.21144, + tolerance = 0.0001 +) +expect_equal( + dat1[1, 1, 2, 2, 1, 1:3, 10], + c(28.84955, 28.84827, 28.84126), + tolerance = 0.0001 + ) +}) + +}) #suppressMessages -- GitLab From 3d4eb815e185a14e267b658ca29ce2449e9c6376 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 3 Sep 2024 12:30:14 +0200 Subject: [PATCH 18/24] Fix bug (missing parenthesis) --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index 4cf76dd..dfa0034 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2503,7 +2503,7 @@ Start <- function(..., # dim = indices/selectors, } # Find the largest length of each time step inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat) - inner_dim_lengths <- inner_dim_lengths[which(!is.na(inner_dim_lengths)] + inner_dim_lengths <- inner_dim_lengths[which(!is.na(inner_dim_lengths))] } fri <- first_round_indices <- NULL -- GitLab From 46b20dd7ed767de7fe0e02e23e638e549a0d7b5f Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 6 Sep 2024 10:29:23 +0200 Subject: [PATCH 19/24] Add comment --- R/Start.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/Start.R b/R/Start.R index dfa0034..7eb43af 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2503,6 +2503,9 @@ Start <- function(..., # dim = indices/selectors, } # Find the largest length of each time step inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat) + ## NOTE: NA values can be present if the size of a depending + ## dimension varies along its depended dim. Removing them allows + ## retrieval of the common indices. Could cause other issues? inner_dim_lengths <- inner_dim_lengths[which(!is.na(inner_dim_lengths))] } -- GitLab From 172ce367785aba8497054addd6f40f5f8b951fb1 Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 6 Sep 2024 14:12:02 +0200 Subject: [PATCH 20/24] Fix typo --- R/Start.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/Start.R b/R/Start.R index 1b03f4d..51cacd5 100644 --- a/R/Start.R +++ b/R/Start.R @@ -894,9 +894,9 @@ Start <- function(..., # dim = indices/selectors, # Check if file pattern contains '$var$' substring if (any(!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE))) { - .warning(paste0("The special wildcard '$var$' is not present in the file ", - "path. This might cause Start() to fail if it cannot parse", - "the inner dimensions in all the files.")) + .warning(paste("The special wildcard '$var$' is not present in the file", + "path. This might cause Start() to fail if it cannot parse", + "the inner dimensions in all the files.") } # Check all *_reorder are NULL or functions, and that they all have # a matching dimension param. -- GitLab From f4a39212df4555a36d1904f4d9e8d21eb2d12a3c Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 6 Sep 2024 14:21:21 +0200 Subject: [PATCH 21/24] Add missing parenthesis --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index f8c36e2..75e352c 100644 --- a/R/Start.R +++ b/R/Start.R @@ -896,7 +896,7 @@ Start <- function(..., # dim = indices/selectors, if (any(!grepl("$var$", dim_params[[found_pattern_dim]], fixed = TRUE))) { .warning(paste("The special wildcard '$var$' is not present in the file", "path. This might cause Start() to fail if it cannot parse", - "the inner dimensions in all the files.") + "the inner dimensions in all the files.")) } # Check all *_reorder are NULL or functions, and that they all have # a matching dimension param. -- GitLab From f6579cc7016ad2fbc19fcb6f38d0c58412cccf85 Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 6 Sep 2024 15:00:00 +0200 Subject: [PATCH 22/24] Update NEWS.md and DESCRIPTION --- DESCRIPTION | 6 +++--- NEWS.md | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 88bfb62..35bb1b9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: startR Title: Automatically Retrieve Multidimensional Distributed Data Sets -Version: 2.3.1 +Version: 2.4.0 Authors@R: c( person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")), - person("An-Chi", "Ho", , "an.ho@bsc.es", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4182-5258")), + person("An-Chi", "Ho", , "an.ho@bsc.es", role = c("aut"), comment = c(ORCID = "0000-0002-4182-5258")), person("Nuria", "Perez-Zanon", , "nuria.perez@bsc.es", role = c("aut"), comment = c(ORCID = "0000-0001-8568-3071")), person("Eva", "Rifa", , "eva.rifarovira@bsc.es", role = "ctb"), - person("Victoria", "Agudetse", , "victoria.agudetse@bsc.es", role = "ctb"), + person("Victoria", "Agudetse", , "victoria.agudetse@bsc.es", role = c("cre", "ctb")), person("Bruno", "de Paula Kinoshita", , "bruno.depaulakinoshita@bsc.es", role = "ctb"), person("Javier", "Vegas", , "javier.vegas@bsc.es", role = c("ctb")), person("Pierre-Antoine", "Bretonniere", , "pierre-antoine.bretonniere@bsc.es", role = c("ctb")), diff --git a/NEWS.md b/NEWS.md index c19d7a3..013d39f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# startR v2.4.0 (Release date: ) +- Allow chunking along inner dimensions that go across file dimensions +- Allow more than one file dimension to be specified in "metadata_dims" +- Add check and warning for when special wildcard "$var$" is missing in the path +- Bugfix: Start() retrieve correct time steps when time is across file dimension and the time steps of the first files are skipped +- Bugfix: Generate correct file paths when a file dimension has multiple depending dimensions + # startR v2.3.1 (Release date: 2023-12-22) - Use Autosubmit as workflow manager on hub - New feature: Collect result by Collect() on HPCs -- GitLab From 3da36cc972006a717f96f623d4f05e137485773e Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 6 Sep 2024 15:00:12 +0200 Subject: [PATCH 23/24] Ignore tests to build for CRAN --- .Rbuildignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index 39771b3..aa7059a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,7 +9,7 @@ ^inst/doc$ ^\.gitlab-ci\.yml$ ## unit tests should be ignored when building the package for CRAN -# ^tests$ +^tests$ ^inst/PlotProfiling\.R$ ^.gitlab$ # Suggested by http://r-pkgs.had.co.nz/package.html -- GitLab From d86d248cd34fbc4ccaae0a996451bebceb8dccce Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 10 Sep 2024 14:01:29 +0200 Subject: [PATCH 24/24] Add date to NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 013d39f..ad8b095 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# startR v2.4.0 (Release date: ) +# startR v2.4.0 (Release date: 2024-09-10) - Allow chunking along inner dimensions that go across file dimensions - Allow more than one file dimension to be specified in "metadata_dims" - Add check and warning for when special wildcard "$var$" is missing in the path -- GitLab