From 906093bed81e6a81ac76334ffd697850611d2b88 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Wed, 31 Mar 2021 15:27:22 +0200
Subject: [PATCH 1/4] Choose the right indices when selector is value and has
 dimension of file dim.

---
 R/Start.R | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index 5139ac5..27663d5 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -1853,16 +1853,21 @@ Start <- function(..., # dim = indices/selectors,
             (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]])) ) {
           if (is.character(file_dim_as_selector_array_dim)) { #(1)
             if (file_dim_as_selector_array_dim %in% found_pattern_dim) {
-              return_vars[[inner_dim]] <- file_dim_as_selector_array_dim
+              stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '",
+                          found_pattern_dim, "', which is not allowed. To assign the dependency on the pattern dim, ",
+                          "use 'return_vars = list(", inner_dim, " = 'dat')' instead."))
             } else {
               common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim
+              tmp <- file_dim_as_selector_array_dim
             }
           } else if (inner_dim %in% inner_dims_across_files) { #(2)
             file_dim_name <- names(which(inner_dim == inner_dims_across_files))
             if (file_dim_name %in% found_pattern_dim) {
-              return_vars[[inner_dim]] <- file_dim_name
+              stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '",
+                          found_pattern_dim, "', which is not allowed."))
             } else {
               common_return_vars[[inner_dim]] <- file_dim_name
+              tmp <- file_dim_name
             }
           }
           .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp,
@@ -3851,7 +3856,13 @@ Start <- function(..., # dim = indices/selectors,
             first_round_indices <- lapply(inner_dims, 
                                           function (x) {
                                             if (is.null(file_dim_across_files[[x]])) {
-                                              selectors[[x]][['fri']][[1]]
+                                              x_dim_name <- attr(attr(selectors[[x]][['fri']], "dim"), "names")
+                                              if (!is.null(x_dim_name)) {
+                                                which_chunk <- file_to_load_sub_indices[x_dim_name]
+                                                selectors[[x]][['fri']][[which_chunk]]
+                                              } else {
+                                                selectors[[x]][['fri']][[1]]
+                                              }
                                             } else {
                                               which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] 
                                               selectors[[x]][['fri']][[which_chunk]]
@@ -3861,7 +3872,13 @@ Start <- function(..., # dim = indices/selectors,
             second_round_indices <- lapply(inner_dims, 
                                            function (x) {
                                              if (is.null(file_dim_across_files[[x]])) {
-                                               selectors[[x]][['sri']][[1]]
+                                              x_dim_name <- attr(attr(selectors[[x]][['sri']], "dim"), "names")
+                                              if (!is.null(x_dim_name)) {
+                                                which_chunk <- file_to_load_sub_indices[x_dim_name]
+                                                selectors[[x]][['sri']][[which_chunk]]
+                                              } else {
+                                                selectors[[x]][['sri']][[1]]
+                                              }
                                              } else {
                                                which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]]
                                                selectors[[x]][['sri']][[which_chunk]]
-- 
GitLab


From 4445b9d9b072c7b86ae3f6ba4b2d84b4acfd8252 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Wed, 31 Mar 2021 16:14:28 +0200
Subject: [PATCH 2/4] Unit test for region with different index between files

---
 tests/testthat/test-Start-selector_with_dim.R | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 tests/testthat/test-Start-selector_with_dim.R

diff --git a/tests/testthat/test-Start-selector_with_dim.R b/tests/testthat/test-Start-selector_with_dim.R
new file mode 100644
index 0000000..47762d3
--- /dev/null
+++ b/tests/testthat/test-Start-selector_with_dim.R
@@ -0,0 +1,70 @@
+#---------------------------------------------------
+# If a selector is assigned an array that has a file dim as one of its dimensions,
+# Start() reads the values depending on that file dim.
+#---------------------------------------------------
+context("Start() implicit inner dimension")
+
+
+test_that("1. region with different index between files", {
+# The files are read from /esarchive, so this test needs access to that file system.
+path <- paste0('/esarchive/exp/ecearth/a35b/diags/DCPP/EC-Earth-Consortium/',
+               'EC-Earth3-HR/dcppA-hindcast/r1i1p1f1/Omon/$var$_mixed/gn/v20201107/',
+               '$var$_Omon_EC-Earth3-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$chunk$.nc')
+
+# The two sdates have different indices for 'Nino3', so the selector carries an 'sdate' dim.
+region <- array('Nino3', dim = c(sdate = 2, region = 1))
+# Both sdates in one call, with 'region' declared as depending on 'sdate'.
+data <- Start(dat = path,
+              var = 'tosmean',
+              sdate = c('1993', '2013'),
+              chunk = indices(1:2),
+              chunk_depends = 'sdate',
+              region = region,
+              time = 'all',
+              time_across = 'chunk',
+              merge_across_dims = TRUE,
+              return_vars = list(time = c('sdate', 'chunk'),
+                                 region = 'sdate'),
+              retrieve = TRUE)
+# Each sdate on its own with a plain value selector; used as the reference below.
+data1 <- Start(dat = path,
+              var = 'tosmean',
+              sdate = c('1993'),
+              chunk = indices(1:2),
+              chunk_depends = 'sdate',
+              region = 'Nino3',
+              time = 'all',
+              time_across = 'chunk',
+              merge_across_dims = TRUE,
+              return_vars = list(time = c('sdate', 'chunk'),
+                                 region = NULL),
+              retrieve = TRUE)
+
+data2 <- Start(dat = path,
+              var = 'tosmean',
+              sdate = c('2013'),
+              chunk = indices(1:2),
+              chunk_depends = 'sdate',
+              region = 'Nino3',
+              time = 'all',
+              time_across = 'chunk',
+              merge_across_dims = TRUE,
+              return_vars = list(time = c('sdate', 'chunk'),
+                                 region = NULL),
+              retrieve = TRUE)
+# The combined call must return, for each sdate, the same values as the single-sdate call.
+expect_equal(
+dim(data),
+c(dat = 1, var = 1, sdate = 2, region = 1, time = 2)
+)
+expect_equal(
+data[1, 1, 1, 1, ],
+data1[1, 1, 1, 1, ]
+)
+expect_equal(
+data[1, 1, 2, 1, ],
+data2[1, 1, 1, 1, ]
+)
+
+
+})
-- 
GitLab


From 0b8e7d4fc3acfb16e751a3d06a6b4483992634d8 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Wed, 31 Mar 2021 16:58:54 +0200
Subject: [PATCH 3/4] Add FAQ about how to define selector if the indices in
 the files are not aligned

---
 inst/doc/faq.md | 68 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index 3b6d5fa..1770d49 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -24,7 +24,8 @@ This document intends to be the first reference for any doubts that you may have
    18. [Use glob expression '*' to define the file path](#18-use-glob-expression-to-define-the-file-path)
    19. [Get metadata when the first file does not exist](#19-get-metadata-when-the-first-file-does-not-exist)
    20. [Use 'metadata_dims' to retrieve variable metadata](#20-use-metadata_dims-to-retrieve-variable-metadata)
-   21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files)
+   21. [Retrieve the complete data when the dimension length varies among files](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files)  
+   22. [Define the selector when the indices in the files are not aligned](#22-define-the-selector-when-the-indices-in-the-files-are-not-aligned)
 </b>
 
 2. **Something goes wrong...**
@@ -848,6 +849,71 @@ adopt the provided ones and use the first valid file to decide the rest of dimen
 By this means, the efficiency can be similar to `largest_dims_length = FALSE`.
 
 
+### 22. Define the selector when the indices in the files are not aligned  
+When the data structure is not identical among the requested files, we need to give different
+selectors to each file. We can do this by using arrays as the selectors and defining the parameter
+'return_vars' accordingly. There are two scenarios: (1) the indices differ between datasets; (2) the indices differ along a certain file dim.  
+
+(1) Different between datasets  
+We don't need to (and can't) define the selectors with the pattern dim as a dimension. We can use
+the value as the selector and specify `return_vars = list(<inner_dim> = 'dat')`. By 'return_vars',
+Start() knows that this inner_dim differs among the datasets so it examines all the files to get
+the correct values. See more details of 'return_vars' at [How-to-16](#16-use-parameter-return_vars-in-start).  
+
+For example, the two datasets, HadGEM3 and NorCPM1, have different initial dates. HadGEM3 is initialised
+in November while NorCPM1 is initialised in October. To retrieve them aligned, we can define the time selector
+with the value "2000-11-16 UTC" and define 'return_vars' properly.  
+
+```r
+# HadGEM3 (initialised in November)
+# NorCPM1 (initialised in October)
+
+data <- Start(dat = list(list(name = 'hadgem3', path = path_hadgem3),
+                         list(name = 'norcpm1', path = path_norcpm1)),
+              var = 'tas',
+              sdate = '2000',
+              time = as.POSIXct("2000-11-16", tz = 'UTC'),
+              lat = 'all',
+              lon = 'all',
+              synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dat', lon = 'dat',
+                                 time = 'dat'),
+              retrieve = TRUE)
+
+```
+
+(2) Different along certain file dim  
+If the indices differ among the files of the same dataset, we can use an array with
+named dimensions
+to define the selector, and define 'return_vars' with the file dim along which the indices differ.
+
+For example, the number of regions in the earlier experiments (sdate < 2013) is smaller than in the later experiments (sdate = 2013),
+so some regions have different indices between the experiments. The region selector array
+should be two-dimensional, with one dimension 'sdate' and the other 'region'. The values of the
+array can be either the character string of the region name or the index in each sdate.
+In addition, the dependency should be specified by `return_vars = list(region = 'sdate')`.  
+
+```r 
+# 'Nino3' in 1st sdate file is index 9 while in 2nd sdate file is index 11
+# Either define with 'Nino3' or the corresponding index works
+region <- array('Nino3', dim = c(sdate = 2, region = 1))
+region <- array(c(indices(9), indices(11)), dim = c(sdate = 2, region = 1))
+
+data <- Start(dat = path,
+              var = 'tosmean',
+              sdate = c('1993', '2013'),
+              chunk = 'all',
+              chunk_depends = 'sdate',
+              region = region, 
+              time = 'all', 
+              time_across = 'chunk',
+              merge_across_dims = TRUE, 
+              return_vars = list(time = c('sdate', 'chunk'),
+                                 region = 'sdate'),
+              retrieve = T)
+```
+
+
 # Something goes wrong...
 
 ### 1. No space left on device
-- 
GitLab


From 7ea2e0f0c062d191da11e4e2ccde8dd167f8a414 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Wed, 31 Mar 2021 17:58:26 +0200
Subject: [PATCH 4/4] Add TODO

---
 R/Start.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/Start.R b/R/Start.R
index 853dcaf..9cecb87 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -3483,6 +3483,8 @@ Start <- function(..., # dim = indices/selectors,
               vars_to_crop <- picked_vars_ordered[[i]]
               common_vars_to_crop <- picked_common_vars_ordered
             } else {
+              #TODO: If fri has different indices in each list, the crop_indices should be 
+              #      separated for each list. Otherwise, picked_common_vars later will be wrong.
               crop_indices <- unique(unlist(fri))
               vars_to_crop <- picked_vars[[i]]
               common_vars_to_crop <- picked_common_vars
-- 
GitLab