From 906093bed81e6a81ac76334ffd697850611d2b88 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 31 Mar 2021 15:27:22 +0200 Subject: [PATCH 1/4] Choose the right indices when selector is value and has dimension of file dim. --- R/Start.R | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/R/Start.R b/R/Start.R index 5139ac5..27663d5 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1853,16 +1853,21 @@ Start <- function(..., # dim = indices/selectors, (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]])) ) { if (is.character(file_dim_as_selector_array_dim)) { #(1) if (file_dim_as_selector_array_dim %in% found_pattern_dim) { - return_vars[[inner_dim]] <- file_dim_as_selector_array_dim + stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '", + found_pattern_dim, "', which is not allowed. To assign the dependency on the pattern dim, ", + "use 'return_vars = list(", inner_dim, " = 'dat')' instead.")) } else { common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim + tmp <- file_dim_as_selector_array_dim } } else if (inner_dim %in% inner_dims_across_files) { #(2) file_dim_name <- names(which(inner_dim == inner_dims_across_files)) if (file_dim_name %in% found_pattern_dim) { - return_vars[[inner_dim]] <- file_dim_name + stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '", + found_pattern_dim, "', which is not allowed.")) } else { common_return_vars[[inner_dim]] <- file_dim_name + tmp <- file_dim_name } } .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp, @@ -3851,7 +3856,13 @@ Start <- function(..., # dim = indices/selectors, first_round_indices <- lapply(inner_dims, function (x) { if (is.null(file_dim_across_files[[x]])) { - selectors[[x]][['fri']][[1]] + x_dim_name <- attr(attr(selectors[[x]][['fri']], "dim"), "names") + if (!is.null(x_dim_name)) { + which_chunk <- file_to_load_sub_indices[x_dim_name] + 
selectors[[x]][['fri']][[which_chunk]] + } else { + selectors[[x]][['fri']][[1]] + } } else { which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] selectors[[x]][['fri']][[which_chunk]] @@ -3861,7 +3872,13 @@ Start <- function(..., # dim = indices/selectors, second_round_indices <- lapply(inner_dims, function (x) { if (is.null(file_dim_across_files[[x]])) { - selectors[[x]][['sri']][[1]] + x_dim_name <- attr(attr(selectors[[x]][['sri']], "dim"), "names") + if (!is.null(x_dim_name)) { + which_chunk <- file_to_load_sub_indices[x_dim_name] + selectors[[x]][['sri']][[which_chunk]] + } else { + selectors[[x]][['sri']][[1]] + } } else { which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] selectors[[x]][['sri']][[which_chunk]] -- GitLab From 4445b9d9b072c7b86ae3f6ba4b2d84b4acfd8252 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 31 Mar 2021 16:14:28 +0200 Subject: [PATCH 2/4] Unit test for region with different index between files --- tests/testthat/test-Start-selector_with_dim.R | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/testthat/test-Start-selector_with_dim.R diff --git a/tests/testthat/test-Start-selector_with_dim.R b/tests/testthat/test-Start-selector_with_dim.R new file mode 100644 index 0000000..47762d3 --- /dev/null +++ b/tests/testthat/test-Start-selector_with_dim.R @@ -0,0 +1,70 @@ +#--------------------------------------------------- +# If a selector is assigned an array that has a file dim as a dimension, Start() reads +# the values depending on the file dim. +#--------------------------------------------------- +context("Start() implicit inner dimension") + + +test_that("1. region with different index between files", { + +path <- paste0('/esarchive/exp/ecearth/a35b/diags/DCPP/EC-Earth-Consortium/', + 'EC-Earth3-HR/dcppA-hindcast/r1i1p1f1/Omon/$var$_mixed/gn/v20201107/', + '$var$_Omon_EC-Earth3-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$chunk$.nc') + +# two sdates have different index for Nino3. 
+region <- array('Nino3', dim = c(sdate = 2, region = 1)) + +data <- Start(dat = path, + var = 'tosmean', + sdate = c('1993', '2013'), + chunk = indices(1:2), + chunk_depends = 'sdate', + region = region, + time = 'all', + time_across = 'chunk', + merge_across_dims = TRUE, + return_vars = list(time = c('sdate', 'chunk'), + region = 'sdate'), + retrieve = T) + +data1 <- Start(dat = path, + var = 'tosmean', + sdate = c('1993'), + chunk = indices(1:2), + chunk_depends = 'sdate', + region = 'Nino3', + time = 'all', #c(1:length(forecast_month)), + time_across = 'chunk', + merge_across_dims = TRUE, + return_vars = list(time = c('sdate', 'chunk'), + region = NULL), + retrieve = T) + +data2 <- Start(dat = path, + var = 'tosmean', + sdate = c('2013'), + chunk = indices(1:2), + chunk_depends = 'sdate', + region = 'Nino3', + time = 'all', #c(1:length(forecast_month)), + time_across = 'chunk', + merge_across_dims = TRUE, + return_vars = list(time = c('sdate', 'chunk'), + region = NULL), + retrieve = T) + +expect_equal( +dim(data), +c(dat = 1, var = 1, sdate = 2, region = 1, time = 2) +) +expect_equal( +data[1, 1, 1, 1, ], +data1[1, 1, 1, 1, ] +) +expect_equal( +data[1, 1, 2, 1, ], +data2[1, 1, 1, 1, ] +) + + +}) -- GitLab From 0b8e7d4fc3acfb16e751a3d06a6b4483992634d8 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 31 Mar 2021 16:58:54 +0200 Subject: [PATCH 3/4] Add FAQ about how to define selector if the indices in the files are not aligned --- inst/doc/faq.md | 68 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 3b6d5fa..1770d49 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -24,7 +24,8 @@ This document intends to be the first reference for any doubts that you may have 18. [Use glob expression '*' to define the file path](#18-use-glob-expression-to-define-the-file-path) 19. 
[Get metadata when the first file does not exist](#19-get-metadata-when-the-first-file-does-not-exist) 20. [Use 'metadata_dims' to retrieve variable metadata](#20-use-metadata_dims-to-retrieve-variable-metadata) - 21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files) + 21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files) + 22. [Define the selector when the indices in the files are not aligned](#22-define-the-selector-when-the-indices-in-the-files-are-not-aligned) 2. **Something goes wrong...** @@ -848,6 +849,71 @@ adopt the provided ones and use the first valid file to decide the rest of dimen By this means, the efficiency can be similar to `largest_dims_length = FALSE`. +### 22. Define the selector when the indices in the files are not aligned +When the data structure is not identical across the requested files, we need to give different +selectors to each file. We can do this by using an array as the selector and defining the parameter +'return_vars' accordingly. There are two scenarios: (1) the indices differ between datasets, and (2) the indices differ along a certain file dim. + +(1) Different between datasets +We don't need to (and can't) define the selectors with the pattern dim as a dimension. We can use +the value as the selector and specify `return_vars = list(<inner_dim> = 'dat')`. From 'return_vars', +Start() knows that this inner_dim differs among the datasets, so it examines all the files to get +the correct values. See more details of 'return_vars' at [How-to-16](#16-use-parameter-return_vars-in-start). + +For example, the two datasets, HadGEM3 and NorCPM1, have different initial dates. HadGEM3 is initialised +in November while NorCPM1 is initialised in October. To retrieve them aligned, we can define the time selector +with the value "2000-11-16 UTC" and define 'return_vars' properly. 
+ +```r +# HadGEM3 (initialised in November) +# NorCPM1 (initialised in October) + +data <- Start(dat = list(list(name = 'hadgem3', path = path_hadgem3), + list(name = 'norcpm1', path = path_norcpm1)), + var = 'tas', + sdate = '2000', + time = as.POSIXct("2000-11-16", tz = 'UTC'), + lat = 'all', + lon = 'all', + synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), + return_vars = list(lat = 'dat', lon = 'dat', + time = 'dat'), + retrieve = TRUE) + +``` + +(2) Different along a certain file dim +If the indices differ among the files within the same dataset, we can use an array with +named dimensions +to define the selector, and define 'return_vars' with the file dim along which the indices differ. + +For example, the number of regions in the earlier experiments (sdate < 2013) is smaller than in the later experiment (sdate = 2013), +so some regions have different indices between the experiments. The region selector array +should be two-dimensional, with one dimension 'sdate' and the other 'region'. The values of the +array can be either the region name as a character string or the index of the region in each sdate. +In addition, the dependency should be specified by `return_vars = list(region = 'sdate')`. + +```r +# 'Nino3' in 1st sdate file is index 9 while in 2nd sdate file is index 11 +# Defining the selector with either 'Nino3' or the corresponding indices works +region <- array('Nino3', dim = c(sdate = 2, region = 1)) +region <- array(c(indices(9), indices(11)), dim = c(sdate = 2, region = 1)) + +data <- Start(dat = path, + var = 'tosmean', + sdate = c('1993', '2013'), + chunk = 'all', + chunk_depends = 'sdate', + region = region, + time = 'all', + time_across = 'chunk', + merge_across_dims = TRUE, + return_vars = list(time = c('sdate', 'chunk'), + region = 'sdate'), + retrieve = T) +``` + + # Something goes wrong... ### 1. 
No space left on device -- GitLab From 7ea2e0f0c062d191da11e4e2ccde8dd167f8a414 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 31 Mar 2021 17:58:26 +0200 Subject: [PATCH 4/4] Add TODO --- R/Start.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/Start.R b/R/Start.R index 853dcaf..9cecb87 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3483,6 +3483,8 @@ Start <- function(..., # dim = indices/selectors, vars_to_crop <- picked_vars_ordered[[i]] common_vars_to_crop <- picked_common_vars_ordered } else { + #TODO: If fri has different indices in each list, the crop_indices should be + # separated for each list. Otherwise, picked_common_vars later will be wrong. crop_indices <- unique(unlist(fri)) vars_to_crop <- picked_vars[[i]] common_vars_to_crop <- picked_common_vars -- GitLab