From 74b4d95bd836e2f3f5cf2e9a06ccf098c0600224 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Tue, 25 Jan 2022 14:15:02 +0100
Subject: [PATCH 1/4] Start() can correctly merge files along the dimension
 that has different length each file, without additional NAs.

---
 R/Start.R                                     | 102 ++++++++++++---
 R/zzz.R                                       |   3 +-
 .../testthat/test-Start-largest_dims_length.R | 122 ++++++++++++++++++
 3 files changed, 206 insertions(+), 21 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index d977e71..fefd6ae 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -1929,8 +1929,19 @@ Start <- function(..., # dim = indices/selectors,
       if (!is.null(indices)) {
         #//////////////////////////////////////////////////
         # Find data_dims
-        ## old code. use the 1st valid file to determine the dims
-        if (!largest_dims_length | is.numeric(largest_dims_length)) {
+        ## If largest_dims_length is a number & !merge_across_dims,
+        ## directly assign this dim as the number;
+        ## If largest_dims_length is a number & this dim is across files, find the dim length of each file
+        find_largest_dims_length_by_files <- FALSE
+        if (is.numeric(largest_dims_length)) {
+          if (names(largest_dims_length) %in% inner_dims_across_files) {
+            find_largest_dims_length_by_files <- TRUE
+          }
+        } else if (largest_dims_length) {
+          find_largest_dims_length_by_files <- TRUE
+        } 
+
+        if (!find_largest_dims_length_by_files) { # old code
           file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]])))
           # The following 5 lines should go several lines below, but were moved
           # here for better performance.
@@ -1958,12 +1969,16 @@ Start <- function(..., # dim = indices/selectors,
             }
           }
 
-        ## largest_dims_length = TRUE
         } else {
-          data_dims <- find_largest_dims_length(
-                         selectors_total_list[[i]], array_of_files_to_load,
-                         selector_indices_save[[i]], dat[[i]], expected_inner_dims[[i]],
-                         synonims, file_dim_reader)
+        ## largest_dims_length = TRUE, or is a number & merge_across_dims is across this dim
+          tmp <- find_largest_dims_length(
+                   selectors_total_list[[i]], array_of_files_to_load,
+                   selector_indices_save[[i]], dat[[i]], expected_inner_dims[[i]],
+                   synonims, file_dim_reader)
+          data_dims <- tmp$largest_data_dims
+          # 'data_dims_each_file' is used when merge_across_dims = TRUE &
+          # the files have different length of inner dim.
+          data_dims_each_file <- tmp$data_dims_all_files
 
           # file_dim_reader returns dimension names as found in the file.
           # Need to translate accoridng to synonims:
@@ -2380,6 +2395,19 @@ Start <- function(..., # dim = indices/selectors,
           # populated instead.
           #////////////////////////////////////////////////////////////////////
 
+          # If the inner dim lengths differ among files,
+          # need to know each length to create the indices for each file later.
+          # Record 'iinner_dim_lengths' here for later usage.
+          inner_dim_lengths <- NULL
+          if (largest_dims_length & !is.null(file_dim)) {
+            inner_dim_lengths <- tryCatch({
+              sapply(data_dims_each_file, '[[', inner_dim)
+            }, error = function(x) {
+              sapply(data_dims_each_file, '[[', 
+                     synonims[[inner_dim]][which(synonims[[inner_dim]] != inner_dim)])
+            })
+          }
+
           fri <- first_round_indices <- NULL
           sri <- second_round_indices <- NULL
           # This variable will keep the indices needed to crop the transformed 
@@ -2397,7 +2425,13 @@ Start <- function(..., # dim = indices/selectors,
               sri <- vector('list', length = chunk_amount)
               dim(sri) <- c(chunk_amount)
               if (selector_array == 'all') {
-                fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim])))
+                if (is.null(inner_dim_lengths) | length(unique(inner_dim_lengths)) <= 1) { #old code
+                  fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim])))
+                } else {  # files have different inner dim length
+                  for (i_chunk in 1:length(fri)) {
+                    fri[[i_chunk]] <- 1:inner_dim_lengths[i_chunk]
+                  }
+                }
                 taken_chunks <- rep(TRUE, chunk_amount)
                 #sri <- NULL
               } else if (selector_array == 'first') {
@@ -3139,11 +3173,17 @@ Start <- function(..., # dim = indices/selectors,
 
                 if (inner_dim %in% names(dim(sub_array_of_selectors))) {
                   if (is.null(var_with_selectors_name)) {
+                    if (!largest_dims_length | (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) {  #old code
+                      maximal_indice <- data_dims[inner_dim] * chunk_amount
+                    } else { # files have different length of inner dim
+                      maximal_indice <- sum(inner_dim_lengths)
+                    }
+
                     if (any(na.omit(unlist(sub_array_of_selectors)) < 1) ||
-                        any(na.omit(unlist(sub_array_of_selectors)) > data_dims[inner_dim] * chunk_amount)) {
-                      stop("Provided indices out of range for dimension '", inner_dim, "' ", 
-                           "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", 
-                           data_dims[inner_dim] * chunk_amount, ").")
+                        any(na.omit(unlist(sub_array_of_selectors)) > maximal_indice)) {
+                      stop("Provided indices out of range for dimension '", inner_dim, "' ",
+                           "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ",
+                           maximal_indice, ").")
                     }
                   } else {
                     if (inner_dim %in% names(dim(sub_array_of_values))) {
@@ -3182,13 +3222,33 @@ Start <- function(..., # dim = indices/selectors,
                     sub_array_is_list <- TRUE
                     sub_array_of_indices <- unlist(sub_array_of_indices)
                   }
-                  if (is.null(var_with_selectors_name)) {
-                    indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1
-                    transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1
-                  } else {
-                    indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1
-                    transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1
+
+                  # "indices_chunk" refers to in which file the 
+                  # sub_array_of_indices is; "transformed_indices" 
+                  # refers to the indices of sub_array_of_indices in each file.
+                  if (!largest_dims_length | 
+                      (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) {
+                    # old code; all the files have the same length of inner_dim
+                    if (is.null(var_with_selectors_name)) {
+                      indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1
+                      transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1
+                    } else {
+                      indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1
+                      transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1
+                    }
+                  } else {  # files have different inner dim length
+                    indices_chunk <- c()
+                    for (item in 1:length(inner_dim_lengths)) {
+                      tmp <- which(sub_array_of_indices <= cumsum(inner_dim_lengths)[item])
+                      indices_chunk <- c(indices_chunk, rep(item, length(tmp) - length(indices_chunk)))
+                    }
+                    sub_array_of_indices_by_file <- split(sub_array_of_indices, indices_chunk)
+                    for (item in 2:length(sub_array_of_indices_by_file)) {
+                      sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[item - 1]
+                    }
+                    transformed_indices <- unlist(sub_array_of_indices_by_file, use.names = FALSE)
                   }
+
                   if (sub_array_is_list) {
                     sub_array_of_indices <- as.list(sub_array_of_indices)
                   }
@@ -3197,14 +3257,15 @@ Start <- function(..., # dim = indices/selectors,
                       print("-> GOING TO ITERATE ALONG CHUNKS.")
                     }
                   }
+
                   for (chunk in 1:chunk_amount) {
                     if (!is.null(names(selector_store_position))) {
                       selector_store_position[file_dim] <- chunk
                     } else {
                       selector_store_position <- chunk
                     }
-                    chunk_selectors <- transformed_indices[which(indices_chunk == chunk)]
-                    sub_array_of_indices <- chunk_selectors
+                    sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)]
+                    sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)]
                     if (with_transform) {
                       # If the provided selectors are expressed in the world
                       # before transformation
@@ -3246,6 +3307,7 @@ Start <- function(..., # dim = indices/selectors,
                         taken_chunks[chunk] <- TRUE
                       }
                     }
+
                     if (!is.null(var_unorder_indices)) {
                       ordered_fri <- sub_array_of_fri
                       sub_array_of_fri <- var_unorder_indices[sub_array_of_fri]
diff --git a/R/zzz.R b/R/zzz.R
index 941ecbe..6b41a69 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -538,7 +538,8 @@ find_largest_dims_length <- function(selectors_total_list, array_of_files_to_loa
     largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk))
   }
   names(largest_data_dims) <- names(data_dims_all_files[[1]])
-  return(largest_data_dims)
+  return(list(largest_data_dims = largest_data_dims,
+              data_dims_all_files = data_dims_all_files))
 }
 
 # Gererate vars_to_transform from picked_vars[[i]] and picked_common_vars
diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R
index 59b73e8..8657985 100644
--- a/tests/testthat/test-Start-largest_dims_length.R
+++ b/tests/testthat/test-Start-largest_dims_length.R
@@ -134,3 +134,125 @@ dat3 <- Start(dataset = repos,
   )
 
 })
+
+test_that("2. inconsistent time length, merge_across_dims = T", {
+
+path <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/EC-Earth3/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/',
+               '$member$/Amon/$var$/gr/v20210309/',
+               '$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc')
+
+suppressWarnings(
+data <- Start(dataset = path,
+              var = 'psl',
+              sdate = '1970',
+              lat = indices(1),
+              lon = indices(1),
+              fyear = c('197011-197012', '197101-197112'),  #2+12
+              fmonth = 'all',
+              member = 'r6i2p1f1',
+              fmonth_across = 'fyear',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = TRUE,
+              largest_dims_length = TRUE, #c(fmonth = 12),
+              synonims = list(fmonth = c('time', 'fmonth'),
+                              lon = c('lon', 'longitude'),
+                              lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dataset', lon = 'dataset',
+                                 fmonth = 'fyear'),
+              retrieve = TRUE)
+)
+expect_equal(
+dim(data),
+c(dataset = 1, var = 1, sdate = 1, lat = 1, lon = 1, fmonth = 14, member = 1)
+)
+expect_equal(
+as.vector(data)[1:5],
+c(101341.03, 100831.62, 99877.38, 101355.11, 101067.74),
+tolerance = 0.001
+)
+
+suppressWarnings(
+data2 <- Start(dataset = path,
+              var = 'psl',
+              sdate = '1970',
+              lat = indices(1),
+              lon = indices(1),
+              fyear = c('197011-197012', '197101-197112'),  #2+12
+              fmonth = indices(1:14),
+              member = 'r6i2p1f1',
+              fmonth_across = 'fyear',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = TRUE,
+              largest_dims_length = TRUE, #c(fmonth = 12),
+              synonims = list(fmonth = c('time', 'fmonth'),
+                              lon = c('lon', 'longitude'),
+                              lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dataset', lon = 'dataset',
+                                 fmonth = 'fyear'),
+              retrieve = TRUE)
+)
+
+expect_equal(
+as.vector(data),
+as.vector(data2)
+)
+
+suppressWarnings(
+data3 <- Start(dataset = path,
+              var = 'psl',
+              sdate = '1970',
+              lat = indices(1),
+              lon = indices(1),
+              fyear = c('197011-197012', '197101-197112'),  #2+12
+              fmonth = 'all',
+              member = 'r6i2p1f1',
+              fmonth_across = 'fyear',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = TRUE,
+              largest_dims_length = c(fmonth = 12),
+              synonims = list(fmonth = c('time', 'fmonth'),
+                              lon = c('lon', 'longitude'),
+                              lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dataset', lon = 'dataset',
+                                 fmonth = 'fyear'),
+              retrieve = TRUE)
+)
+
+expect_equal(
+as.vector(data),
+as.vector(data3)
+)
+
+#--------------------------------
+suppressWarnings(
+data4 <- Start(dataset = path,
+              var = 'psl',
+              sdate = '1970',
+              lat = indices(1),
+              lon = indices(1),
+              fyear = c('197011-197012', '197101-197112'),  #2+12
+              fmonth = indices(c(1:2, 10:14)),
+              member = 'r6i2p1f1',
+              fmonth_across = 'fyear',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = TRUE,
+              largest_dims_length = TRUE,
+              synonims = list(fmonth = c('time', 'fmonth'),
+                              lon = c('lon', 'longitude'),
+                              lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dataset', lon = 'dataset',
+                                 fmonth = 'fyear'),
+              retrieve = TRUE)
+)
+
+expect_equal(
+dim(data4),
+c(dataset = 1, var = 1, sdate = 1, lat = 1, lon = 1, fmonth = 7, member = 1)
+)
+expect_equal(
+as.vector(data4),
+c(101341.03, 100831.62, 101751.00, 101943.87, 100713.17, 101724.04, 99833.31),
+tolerance = 0.001
+)
+
+})
-- 
GitLab


From 290198c637768480b9bab3dffbef633cb2d1931d Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Fri, 28 Jan 2022 18:30:34 +0100
Subject: [PATCH 2/4] Fix the bug of recording inner dim length of each merged
 file dim

---
 R/Start.R | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/R/Start.R b/R/Start.R
index fefd6ae..d3a2b69 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -2400,12 +2400,33 @@ Start <- function(..., # dim = indices/selectors,
           # Record 'iinner_dim_lengths' here for later usage.
           inner_dim_lengths <- NULL
           if (largest_dims_length & !is.null(file_dim)) {
+            # inner_dim_lengths here includes all the files, but we only want
+            # the files of fyear for certain "sdate". We will categorize it later.
             inner_dim_lengths <- tryCatch({
               sapply(data_dims_each_file, '[[', inner_dim)
             }, error = function(x) {
               sapply(data_dims_each_file, '[[', 
                      synonims[[inner_dim]][which(synonims[[inner_dim]] != inner_dim)])
             })
+
+            # Use other file dims as the factors to categorize.
+            other_file_dims <- dim(array_of_files_to_load)[which(file_dims != file_dim)]
+            other_file_dims <- lapply(lapply(other_file_dims, seq, 1), rev)
+            other_file_dims_factor <- expand.grid(other_file_dims)
+            selector_indices_save_subset <-
+              lapply(selector_indices_save[[i]], '[', which(file_dims != file_dim))
+
+            # Put the fyear with the same other file dims (sdate, etc.) together, and find the largest length (in theory all of them should be the same)
+            inner_dim_lengths_cat <- vector('list', dim(other_file_dims_factor)[1])
+            for (i_factor in 1:length(inner_dim_lengths_cat)) {
+
+              inner_dim_lengths_cat[[i_factor]] <- 
+                inner_dim_lengths[which(sapply(lapply(
+                  selector_indices_save_subset, '==', 
+                  other_file_dims_factor[i_factor, ]), all))]
+            }
+            # Find the largest length of each time step
+            inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat)
           }
 
           fri <- first_round_indices <- NULL
-- 
GitLab


From 8566f522e5b05f33cde2f41bab4b37d03625533e Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Fri, 28 Jan 2022 18:38:41 +0100
Subject: [PATCH 3/4] Revise unit test

---
 .../testthat/test-Start-largest_dims_length.R | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R
index 8657985..a30fc24 100644
--- a/tests/testthat/test-Start-largest_dims_length.R
+++ b/tests/testthat/test-Start-largest_dims_length.R
@@ -227,11 +227,12 @@ as.vector(data3)
 suppressWarnings(
 data4 <- Start(dataset = path,
               var = 'psl',
-              sdate = '1970',
+              sdate = c('1970', '1971'),
               lat = indices(1),
               lon = indices(1),
-              fyear = c('197011-197012', '197101-197112'),  #2+12
-              fmonth = indices(c(1:2, 10:14)),
+              fyear = 'all',
+              fmonth = 'all',
+              fyear_depends = 'sdate',
               member = 'r6i2p1f1',
               fmonth_across = 'fyear',
               merge_across_dims = TRUE,
@@ -247,12 +248,18 @@ data4 <- Start(dataset = path,
 
 expect_equal(
 dim(data4),
-c(dataset = 1, var = 1, sdate = 1, lat = 1, lon = 1, fmonth = 7, member = 1)
+c(dataset = 1, var = 1, sdate = 2, lat = 1, lon = 1, fmonth = 122, member = 1)
+)
+expect_equal(
+as.vector(drop(data4)[1,1:5]),
+c(101341.03, 100831.62, 99877.38, 101355.11, 101067.74),
+tolerance = 0.001
 )
 expect_equal(
-as.vector(data4),
-c(101341.03, 100831.62, 101751.00, 101943.87, 100713.17, 101724.04, 99833.31),
+as.vector(drop(data4)[2,10:14]),
+c(100515.2, 101354.9, 100657.0, 100349.4, 100199.6),
 tolerance = 0.001
 )
 
+
 })
-- 
GitLab


From c9d06abe39ab3e39039a1d7b0be55d8071923ad1 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Mon, 31 Jan 2022 11:57:25 +0100
Subject: [PATCH 4/4] Add the unit test for merge_across_dims_narm = F

---
 .../testthat/test-Start-largest_dims_length.R | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R
index a30fc24..b448f89 100644
--- a/tests/testthat/test-Start-largest_dims_length.R
+++ b/tests/testthat/test-Start-largest_dims_length.R
@@ -261,5 +261,40 @@ c(100515.2, 101354.9, 100657.0, 100349.4, 100199.6),
 tolerance = 0.001
 )
 
+#--------------------------------
+suppressWarnings(
+data5 <- Start(dataset = path,
+              var = 'psl',
+              sdate = c('1970', '1971'),
+              lat = indices(1),
+              lon = indices(1),
+              fyear = 'all',
+              fmonth = 'all',
+              fyear_depends = 'sdate',
+              member = 'r6i2p1f1',
+              fmonth_across = 'fyear',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = FALSE, #TRUE,
+              largest_dims_length = TRUE,
+              synonims = list(fmonth = c('time', 'fmonth'),
+                              lon = c('lon', 'longitude'),
+                              lat = c('lat', 'latitude')),
+              return_vars = list(lat = 'dataset', lon = 'dataset',
+                                 fmonth = 'fyear'),
+              retrieve = TRUE)
+)
+
+expect_equal(
+dim(data5),
+c(dataset = 1, var = 1, sdate = 2, lat = 1, lon = 1, fmonth = 132, member = 1)
+)
+expect_equal(
+which(is.na(drop(data5))),
+c(5:24)
+)
+expect_equal(
+as.vector(data4),
+as.vector(data5)[-c(5:24)]
+)
 
 })
-- 
GitLab