From 14e6736106fb69779591f34885eca7a5eb42b017 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 5 May 2021 23:29:17 +0200 Subject: [PATCH 1/4] Reorder chunks when split + merge. A thorough check is still needed --- R/Start.R | 111 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/R/Start.R b/R/Start.R index e8a1d7d..9776ead 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3702,27 +3702,29 @@ Start <- function(..., # dim = indices/selectors, } } #====================================================================== - - if (merge_across_dims_narm) { + if (merge_across_dims) { + # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? - across_file_dim <- names(inner_dims_across_files) #TODO: more than one? # Get the length of each inner_dim ('time') along each file_dim ('file_date') length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) - - if (!split_multiselected_dims) { - final_dims_fake_name <- names(final_dims_fake) - pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) - new_length_inner_dim <- sum(unlist(length_inner_across_dim)) - if (pos_across_inner_dim != length(final_dims_fake)) { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim, - final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) - } else { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim) + + if (merge_across_dims_narm) { + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + + if (!split_multiselected_dims) { + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name } - names(final_dims_fake) <- final_dims_fake_name } } @@ -4047,43 +4049,46 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case.
- if (merge_across_dims_narm) { - - # Get the length of these two dimensions in final_dims - length_inner_across_store_dims <- final_dims[across_inner_dim] - length_file_across_store_dims <- final_dims[across_file_dim] - - # Create a logical array for merge_across_dims - logi_array <- array(rep(FALSE, - length_file_across_store_dims * length_inner_across_store_dims), - dim = c(length_inner_across_store_dims, length_file_across_store_dims)) - for (i in 1:length_file_across_store_dims) { #1:4 - logi_array[1:length_inner_across_dim[[i]], i] <- TRUE - } - - # First, get the data array with final_dims dimension - data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) - - # Change the NA derived from additional spaces to -9999, then remove these -9999 - func_remove_blank <- function(data_array, logi_array) { - # dim(data_array) = [time, file_date] - # dim(logi_array) = [time, file_date] - # Change the blank spaces from NA to -9999 - data_array[which(!logi_array)] <- -9999 - return(data_array) + if (merge_across_dims) { + if (!merge_across_dims_narm) { + data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) + } else { + # Get the length of these two dimensions in final_dims + length_inner_across_store_dims <- final_dims[across_inner_dim] + length_file_across_store_dims <- final_dims[across_file_dim] + + # Create a logical array for merge_across_dims + logi_array <- array(rep(FALSE, + length_file_across_store_dims * length_inner_across_store_dims), + dim = c(length_inner_across_store_dims, length_file_across_store_dims)) + for (i in 1:length_file_across_store_dims) { #1:4 + logi_array[1:length_inner_across_dim[[i]], i] <- TRUE + } + + # First, get the data array with final_dims dimension + data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) + + # Change the NA derived from additional spaces to -9999, then remove these -9999 + func_remove_blank <- function(data_array, logi_array) { + # dim(data_array) = [time, file_date] + # dim(logi_array) = [time, file_date] + # Change the blank spaces from NA to -9999 + data_array[which(!logi_array)] <- -9999 + return(data_array) + } + data_array_final_dims <- multiApply::Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # becomes a vector } - data_array_final_dims <- multiApply::Apply(data_array_final_dims, - target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - ## reorder back to the correct dim - tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- .aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - - #NOTE: When one file contains values for dicrete dimensions, rearrange the - # chunks (i.e., work_piece) is necessary. + + #NOTE: When one file contains values for discrete dimensions, rearranging the + # chunks (i.e., work_piece) is necessary.
if (split_multiselected_dims) { # generate the correct order list from indices_chunk @@ -4091,7 +4096,7 @@ Start <- function(..., # dim = indices/selectors, i <- 1 j <- 1 a <- indices_chunk[i] - while (i < length(indices_chunk)) { + while (i <= length(indices_chunk)) { while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { a <- c(a, indices_chunk[i+1]) i <- i + 1 -- GitLab From 76907ff7c5ad263c34752caf4f320c9aa4d54507 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 13 May 2021 17:50:40 +0200 Subject: [PATCH 2/4] Fix mixed-dimension error when reshape params are used. Create unit tests --- R/Start.R | 97 ++--- inst/doc/usecase/ex1_3_attr_loadin.R | 44 +-- tests/testthat/test-Start-reshape.R | 503 ++++++++++++++++++++++++ tests/testthat/test-Start-split-merge.R | 2 - 4 files changed, 555 insertions(+), 91 deletions(-) create mode 100644 tests/testthat/test-Start-reshape.R diff --git a/R/Start.R b/R/Start.R index 9776ead..33e9fac 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1852,6 +1852,7 @@ Start <- function(..., # dim = indices/selectors, } else { common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim } + tmp <- file_dim_as_selector_array_dim } else if (inner_dim %in% inner_dims_across_files) { #(2) file_dim_name <- names(which(inner_dim == inner_dims_across_files)) if (file_dim_name %in% found_pattern_dim) { @@ -1859,6 +1860,7 @@ Start <- function(..., # dim = indices/selectors, } else { common_return_vars[[inner_dim]] <- file_dim_name } + tmp <- file_dim_name } .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp, "', but '", inner_dim, "' is not in return_vars list or is NULL. ", @@ -4049,7 +4051,7 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. - if (merge_across_dims) { + if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) } else { @@ -4086,6 +4088,11 @@ Start <- function(..., # dim = indices/selectors, data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector } + + if (length(data_array_tmp) != prod(final_dims_fake)) { + stop(paste0("After reshaping, the data do not fit into the expected output dimension. ", + "Check if the reshaping parameters are used correctly.")) + } #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. 
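For reference, the NA-removal logic above reduces to marking the padded positions with a sentinel value and filtering them out afterwards. A minimal standalone sketch of that trick, with toy dimensions and values rather than the actual Start() internals (which operate on a bigmemory-backed array through multiApply::Apply):

dat <- array(c(1, 2, NA, NA, 3, 4, 5, 6), dim = c(time = 4, file_date = 2))
logi <- array(FALSE, dim = dim(dat))
logi[1:2, 1] <- TRUE         # file 1 truly has 2 time steps
logi[1:4, 2] <- TRUE         # file 2 truly has 4 time steps
dat[which(!logi)] <- -9999   # mark only the padding positions
dat[dat != -9999]            # c(1, 2, 3, 4, 5, 6), flattened to a vector

A sentinel is used instead of simply dropping all NAs because a genuine NA read from a file sits at a position where logi is TRUE and must be kept; NA != -9999 evaluates to NA, and logical subsetting preserves it as NA.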
@@ -4110,57 +4117,55 @@ Start <- function(..., # dim = indices/selectors, final_order_list <- lapply(final_order_list, length) if (!all(diff(as.numeric(names(final_order_list))) > 0)) { - - - # shape the vector into the array without split_dims - split_dims_pos <- match(split_dims, final_dims_fake) - new_dims <- c() - if (split_dims_pos[1] > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) - } - new_dims <- c(new_dims, prod(split_dims)) - names(new_dims)[split_dims_pos[1]] <- across_inner_dim - if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) - } - final_dims_fake_no_split <- new_dims - data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) - # seperate 'time' dim into each work_piece length - data_array_seperate <- list() - tmp <- cumsum(unlist(length_inner_across_dim)) - tmp <- c(0, tmp) - for (i in 1:length(length_inner_across_dim)) { - data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, - (tmp[i] + 1):tmp[i + 1]) - } - - # re-build the array: chunk - which_chunk <- as.numeric(names(final_order_list)) - how_many_indices <- unlist(final_order_list) - array_piece <- list() - ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) - for (i in 1:length(final_order_list)) { - array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], - across_inner_dim, - ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) - ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - } + # shape the vector into the array without split_dims + split_dims_pos <- match(split_dims, final_dims_fake) + new_dims <- c() + if (split_dims_pos[1] > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, prod(split_dims)) + names(new_dims)[split_dims_pos[1]] <- across_inner_dim + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake_no_split <- new_dims + data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) + # separate 'time' dim into each work_piece length + data_array_seperate <- list() + tmp <- cumsum(unlist(length_inner_across_dim)) + tmp <- c(0, tmp) + for (i in 1:length(length_inner_across_dim)) { + data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + + # re-build the array: chunk + which_chunk <- as.numeric(names(final_order_list)) + how_many_indices <- unlist(final_order_list) + array_piece <- list() + ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + for (i in 1:length(final_order_list)) { + array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], + across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] + } - # re-build the array: paste - data_array_tmp <- array_piece[[1]] - along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) - if (length(array_piece) > 1) { - for (i in 2:length(array_piece)) { - data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], - along = along_pos) + # 
re-build the array: paste + data_array_tmp <- array_piece[[1]] + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + if (length(array_piece) > 1) { + for (i in 2:length(array_piece)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } } - } - } + } } data_array <- array(data_array_tmp, dim = final_dims_fake) - } else { # merge_across_dims_narm = F (old version) + } else { # ! (merge_across_dims + split_multiselected_dims) (old version) data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) } diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R index d514c30..e2c8211 100644 --- a/inst/doc/usecase/ex1_3_attr_loadin.R +++ b/inst/doc/usecase/ex1_3_attr_loadin.R @@ -77,7 +77,7 @@ # NOTE: 'merge_across_dims_narm = TRUE' is necessary because the observational # data have unequal time length of 30-day and 31-day months. # If the NAs are not removed, unwanted NAs will exist and make the -# values misplaced in the array. See 'bonus' below for more explanation. +# values misplaced in the array. #------- Check erai ----------- dim(erai) @@ -129,45 +129,3 @@ attr(erai, 'Variables')$common$time[2, ] -# //////////////////"BONUS"////////////////////// -# Here is something more to show the usage of parameter 'merge_across_dims_narm'. -# If the last day of 30-day months is NA instead of the first day of the following month, -# NAs are needed to exist in the array. In this case, 'merge_across_dims_narm' -# should be FALSE. - - dates <- attr(system4, 'Variables')$common$time - dates[2, 31] -#[1] "1994-07-01 UTC" - dates[2, 31] <- NA # Jun - dates[5, 31] <- NA # Sep - dates[7, 31] <- NA # Nov - - erai <- Start(dat = repos_obs, - var = 'tas', - file_date = dates_file, - time = values(dates), - latitude = indices(1:10), - longitude = indices(1:10), - time_var = 'time', - time_across = 'file_date', - merge_across_dims = TRUE, - #keep NAs of the last day in 30-day months - merge_across_dims_narm = FALSE, - split_multiselected_dims = TRUE, - return_vars = list(latitude = NULL, - longitude = NULL, - time = 'file_date'), - retrieve = TRUE) - -#------- Check erai ----------- -erai[1, 1, 2, , 1, 1] # June -# [1] 269.9410 269.6855 268.7380 268.5008 270.3236 271.5151 270.5046 270.1686 -# [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856 -#[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104 -#[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409 NA -erai[1, 1, 5, , 1, 1] # Sep -# [1] 270.0656 270.7113 268.4678 271.6489 271.2354 269.7831 269.8045 268.7994 -# [9] 266.3092 262.2734 265.0124 261.8378 265.3950 257.1690 255.8402 264.8826 -#[17] 267.8663 266.6875 262.5502 258.5476 258.9617 263.6396 257.1111 264.8644 -#[25] 261.0085 256.7690 256.5811 256.4331 256.1260 256.4716 NA -#------------------------------ diff --git a/tests/testthat/test-Start-reshape.R b/tests/testthat/test-Start-reshape.R new file mode 100644 index 0000000..793a3b3 --- /dev/null +++ b/tests/testthat/test-Start-reshape.R @@ -0,0 +1,503 @@ +context("Start() reshape parameters check") +# This one is more comprehensive than test-Start-split-merge.R + +path_exp <- '/esarchive/exp/ecmwf/system5c3s/daily_mean/$var$_f6h/$var$_$sdate$.nc' +path_obs <- '/esarchive/recon/ecmwf/era5/daily_mean/$var$_f1h-r360x181/$var$_$date$.nc' +var <- 'tas' +sdate <- paste0(1993:1995, '1201') + +suppressWarnings( +exp <- Start(dat = path_exp, + var = var, + sdate = sdate, + time = indices(1:90), #indices(1:91), 
+ ensemble = indices(1), + lat = indices(1), + lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = FALSE, silent = T) +) +dates <- attr(exp, 'Variables')$common$time + +# easyNCDF +library(easyNCDF) +# obs +easy_sdate <- c('199312', paste0(rep(1994:1995, each = 3), c('01', '02', '12')), + '199601', '199602') +easy_array <- c() + +for (i in 1:length(easy_sdate)) { + easy_file <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + easy_sdate[i], '.nc')) + if (substr(easy_sdate[i], 5, 6) == '02') { + sub_time <- 1:28 + } else { + sub_time <- 1:31 + } + easy_obs <- NcToArray(easy_file, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = sub_time)) + NcClose(easy_file) + easy_array <- c(easy_array, as.vector(easy_obs)) +} +dim(easy_array) <- c(time = 90, sdate = 3) + + + +test_that("1. split + merge + narm", { + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 90, lat = 1, lon = 1) +) +expect_equal( +dim(obs1), +dim(obs2) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[, 3]) +) + +}) + + +test_that("2. 
split + merge", { + +exp <- Start(dat = path_exp, + var = var, + sdate = sdate, + time = indices(1:62), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = FALSE) + +dates <- attr(exp, 'Variables')$common$time + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 62] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 62] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 62, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[1:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[1:62, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[1:62, 3]) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) + +}) + + + +test_that("3. merge", { +# NOTE: The three files are all regarded to have time = 31, despite 199402 only has 28. +# It happens when time = 'all' or time = indices(). It seems reasonable when +# 'merge_across_dims' is not used, but if it is used, it's common to expect 31+31+28. +# See the next test "4. merge + narm". 199402 is still regarded as 31, so NAs are not +# removed. +suppressWarnings( +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +) + + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +}) + + +test_that("4. merge + narm", { + +# (1) Notice that the NAs at the tail of 199402 won't be removed because Start() +# considers all the files have the same length, i.e., 31. +# The NAs in 199402 are regarded as part of the original file. 
+ +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +# (2) It's tricky that 199402 is considered time = 31 because Start() considers +# all the files have the same length. So it won't return an error when +# time = indices(93). +# The first 14 time steps of 199312 will be removed but the NAs at the tail +# of 199402 will be preserved. +obs4 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = indices(15:93), + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs4), +c(dat = 1, var = 1, time = 79, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs4), +c(as.vector(easy_array[15:90, 1]), NA, NA, NA) +) + +# (3) If time is values(), 199402 is considered time = 28, so NAs will be removed. +obs5 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = dates[1, ], + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +expect_equal( +dim(obs5), +c(dat = 1, var = 1, time = 90, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs5), +as.vector(easy_array[1:90, 1]) +) + +}) + +test_that("5. 
split", { + +date_array <- c('199312', '199401', '199412', '199501') +dim(date_array) <- c(month = 2, year = 2) + +# split file dim +obs1 <- Start(dat = path_obs, + var = var, + date = date_array, # [month = 2, year = 2] + time = indices(1:31), + lat = indices(1), + lon = indices(1), +# time_across = 'date', +# merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), + # time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, month = 2, year = 2, time = 31, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, 1, , 1, 1]), +as.vector(easy_array[1:31, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 1, , 1, 1]), +as.vector(easy_array[32:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 1, 2, , 1, 1]), +as.vector(easy_array[1:31, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 2, , 1, 1]), +as.vector(easy_array[32:62, 2]) +) + +# split inner time +## time is indices +time_array <- array(1:62, dim = c(day = 31, month = 2)) +exp1 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = TRUE) + +# easyNCDF +easy_sdate_exp <- '19931201' +easy_file_exp <- NcOpen(paste0('/esarchive/exp/ecmwf/system5c3s/daily_mean/tas_f6h/tas_', + easy_sdate_exp, '.nc')) +easy_exp <- NcToArray(easy_file_exp, vars_to_read = 'tas', + dim_indices = list(longitude = c(1), latitude = c(1), ensemble = c(1), + time = 1:62)) +NcClose(easy_file_exp) + +expect_equal( +dim(exp1), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(easy_exp) +) + +## time is values +time_array <- dates[1, 1:62] +dim(time_array) <- c(day = 31, month = 2) +exp2 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), +# time = 'sdate'), + retrieve = TRUE) +expect_equal( +dim(exp2), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(exp2) +) + + +}) + +test_that("6. 
repetitive values", { + +exp <- Start(dat = path_exp, + var = var, + sdate = c('19931101', '19931201'), + time = indices(1:61), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = F) +dates <- attr(exp, 'Variables')$common$time + +# sorted and unsorted are the same here +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +#unsorted_dates <- unique(format(dates, '%Y%m')) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 2, time = 61] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# easyNCDF +easy_file_199311 <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + '199311', '.nc')) +easy_obs_199311 <- NcToArray(easy_file_199311, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = 1:30)) +NcClose(easy_file_199311) + +expect_equal( +dim(obs2), +c(dat = 1, var = 1, sdate = 2, time = 61, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs2[1, 1, 1, 1:30, 1, 1]), +as.vector(easy_obs_199311) +) +expect_equal( +as.vector(obs2[1, 1, 1, 31:61, 1, 1]), +as.vector(obs2[1, 1, 2, 1:31, 1, 1]) +) +expect_equal( +as.vector(obs2[1, 1, 2, 31:61, 1, 1]), +easy_array[31:61 ,1] +) + +}) diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R index da21f92..9376f9a 100644 --- a/tests/testthat/test-Start-split-merge.R +++ b/tests/testthat/test-Start-split-merge.R @@ -1,5 +1,3 @@ -#if (identical(Sys.getenv("NOT_CRAN"), "")) Sys.setenv(NOT_CRAN='true') - context("Start() split + merge dim and value check") var_name <- 'sfcWind' -- GitLab From 841b3e3ed0e35b64564ae60f6529b09f2fea0408 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:12:02 +0200 Subject: [PATCH 3/4] Change default value of 'merge_across_dims_narm' to TRUE. --- R/Start.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R/Start.R b/R/Start.R index 33e9fac..ff4b978 100644 --- a/R/Start.R +++ b/R/Start.R @@ -632,7 +632,8 @@ #' across another dimension. For example, if the dimension 'time' extends #' across dimension 'chunk', and the time length along the first chunk is 2 #' while along the second chunk is 10. Setting this parameter as TRUE can -#' remove the additional 8 NAs at position 3 to 10. The default value is FALSE. +#' remove the additional 8 NAs at position 3 to 10. The default value is TRUE, +#' but will be automatically turned to FALSE if 'merge_across_dims = FALSE'. #'@param split_multiselected_dims A logical value indicating whether to split a #' dimension that has been selected with a multidimensional array of selectors #' into as many dimensions as present in the selector array. 
The default value @@ -820,7 +821,7 @@ Start <- function(..., # dim = indices/selectors, metadata_dims = NULL, selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, + merge_across_dims_narm = TRUE, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, largest_dims_length = FALSE, @@ -863,9 +864,6 @@ Start <- function(..., # dim = indices/selectors, } if (!merge_across_dims & merge_across_dims_narm) { merge_across_dims_narm <- FALSE - .warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ", - "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'", - " to FALSE.")) } # Leave alone the dimension parameters in the variable dim_params -- GitLab From e8571282cb73b234e2f34283afd7a68a016e22a0 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:12:17 +0200 Subject: [PATCH 4/4] Update .Rd --- DESCRIPTION | 2 +- man/AddStep.Rd | 1 - man/CDORemapper.Rd | 1 - man/Collect.Rd | 1 - man/Compute.Rd | 16 +- man/NcCloser.Rd | 1 - man/NcDataReader.Rd | 10 +- man/NcDimReader.Rd | 10 +- man/NcOpener.Rd | 1 - man/NcVarReader.Rd | 10 +- man/SelectorChecker.Rd | 4 +- man/Sort.Rd | 9 +- man/Start.Rd | 923 +++++++++++++++++++++-------------------- man/Step.Rd | 10 +- man/indices.Rd | 1 - man/values.Rd | 1 - 16 files changed, 516 insertions(+), 485 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a33c3c..2dfda5d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,4 +38,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo ecFlow -RoxygenNote: 5.0.0 +RoxygenNote: 7.0.1 diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 3eece05..0d0ce46 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } - diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd index 4f56baa..763be77 100644 --- a/man/CDORemapper.Rd +++ b/man/CDORemapper.Rd @@ -65,4 +65,3 @@ perform the interpolation, hence CDO is required to be installed. \seealso{ \code{\link[s2dverification]{CDORemap}} } - diff --git a/man/Collect.Rd b/man/Collect.Rd index 44a7dee..97b529b 100644 --- a/man/Collect.Rd +++ b/man/Collect.Rd @@ -83,4 +83,3 @@ of results as one data array when the execution is done. See more details on } } - diff --git a/man/Compute.Rd b/man/Compute.Rd index e07106a..7d6db4d 100644 --- a/man/Compute.Rd +++ b/man/Compute.Rd @@ -4,9 +4,18 @@ \alias{Compute} \title{Specify the execution parameters and trigger the execution} \usage{ -Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1, - cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, - silent = FALSE, debug = FALSE, wait = TRUE) +Compute( + workflow, + chunks = "auto", + threads_load = 1, + threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, + debug = FALSE, + wait = TRUE +) } \arguments{ \item{workflow}{A list of the class 'startR_workflow' returned by function @@ -104,4 +113,3 @@ arrays and additional metadata. 
res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) } - diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd index 65beab8..588f63a 100644 --- a/man/NcCloser.Rd +++ b/man/NcCloser.Rd @@ -32,4 +32,3 @@ NcCloser(connection) \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcDimReader}} \code{\link{NcVarReader}} } - diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd index a6d32c7..9014789 100644 --- a/man/NcDataReader.Rd +++ b/man/NcDataReader.Rd @@ -4,8 +4,13 @@ \alias{NcDataReader} \title{NetCDF file data reader for 'startR'} \usage{ -NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDataReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -61,4 +66,3 @@ in turn uses nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDimReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd index d539ffd..38dd870 100644 --- a/man/NcDimReader.Rd +++ b/man/NcDimReader.Rd @@ -4,8 +4,13 @@ \alias{NcDimReader} \title{NetCDF dimension reader for 'startR'} \usage{ -NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDimReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ This function uses the function NcReadDims() in the package 'easyNCDF'. \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd index e46384c..30885fc 100644 --- a/man/NcOpener.Rd +++ b/man/NcOpener.Rd @@ -34,4 +34,3 @@ NcCloser(connection) \code{\link{NcDimReader}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd index c601907..fb093ae 100644 --- a/man/NcVarReader.Rd +++ b/man/NcVarReader.Rd @@ -4,8 +4,13 @@ \alias{NcVarReader} \title{NetCDF variable reader for 'startR'} \usage{ -NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - var_name = NULL, synonims) +NcVarReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + var_name = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ nc_var_get() in the package 'ncdf4'. 
\code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcDimReader}} } - diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd index ef83575..e1cf112 100644 --- a/man/SelectorChecker.Rd +++ b/man/SelectorChecker.Rd @@ -4,8 +4,7 @@ \alias{SelectorChecker} \title{Translate a set of selectors into a set of numeric indices} \usage{ -SelectorChecker(selectors, var = NULL, return_indices = TRUE, - tolerance = NULL) +SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL) } \arguments{ \item{selectors}{A vector or a list of two of numeric indices or variable @@ -50,4 +49,3 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257] SelectorChecker(sub_array_of_selectors, sub_array_of_values) } - diff --git a/man/Sort.Rd b/man/Sort.Rd index 9ab516e..25a92fe 100644 --- a/man/Sort.Rd +++ b/man/Sort.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/Sort.R \name{Sort} -\alias{CircularSort} \alias{Sort} +\alias{CircularSort} \title{Sort the coordinate variable values in a Start() call} \usage{ Sort(...) @@ -10,12 +10,12 @@ Sort(...) CircularSort(start, end, ...) } \arguments{ +\item{\dots}{Additional parameters to adjust the reorderig. See function +sort() for more details.} + \item{start}{A numeric indicating the lower bound of the circular range.} \item{end}{A numeric indicating the upper bound of the circular range.} - -\item{\dots}{Additional parameters to adjust the reorderig. See function -sort() for more details.} } \value{ A list of 2 containing: @@ -57,4 +57,3 @@ range. This is useful for circular coordinates such as the Earth longitudes. retrieve = FALSE) } - diff --git a/man/Start.Rd b/man/Start.Rd index 680168e..efd258f 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -4,408 +4,36 @@ \alias{Start} \title{Declare, discover, subset and retrieve multidimensional distributed data sets} \usage{ -Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, - file_var_reader = NcVarReader, file_dim_reader = NcDimReader, - file_data_reader = NcDataReader, file_closer = NcCloser, - transform = NULL, transform_params = NULL, transform_vars = NULL, - transform_extra_cells = 2, apply_indices_after_transform = FALSE, - pattern_dims = NULL, metadata_dims = NULL, - selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, largest_dims_length = FALSE, - retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, - silent = FALSE, debug = FALSE) +Start( + ..., + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, + debug = FALSE +) } \arguments{ -\item{return_vars}{A named list where the names are the names of the -variables to be fetched in the files, and the values are vectors of -character strings with the names of the file dimension which to retrieve each -variable for, or NULL if the 
variable has to be retrieved only once -from any (the first) of the involved files.\cr\cr -Apart from retrieving a multidimensional data array, retrieving auxiliary -variables inside the files can also be needed. The parameter -'return_vars' allows for requesting such variables, as long as a -'file_var_reader' function is also specified in the call to -Start() (see documentation on the corresponding parameter). -\cr\cr -In the case of the the item sales example (see documentation on parameter -\code{\dots)}, the store location variable is requested with the parameter\cr -\code{return_vars = list(store_location = NULL)}.\cr This will cause -Start() to fetch once the variable 'store_location' and return it in -the component\cr \code{$Variables$common$store_location},\cr and will be an -array of character strings with the location names, with the dimensions -\code{c('store' = 100)}. Although useless in this example, we could ask -Start() to fetch and return such variable for each file along the -items dimension as follows: \cr -\code{return_vars = list(store_location = c('item'))}.\cr In that case, the -variable will be fetched once from a file of each of the items, and will be -returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}. -\cr\cr -If a variable is requested along a file dimension that contains path pattern -specifications ('source' in the example), the fetched variable values will be -returned in the component\cr \code{$Variables$$}.\cr -For example: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item'))) -\cr # # Checking the structure of the returned variables -\cr # str(found_data$Variables) -\cr # Named list -\cr # ..$common: NULL -\cr # ..$sourceA: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # ..$sourceB: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # # Checking the dimensions of the returned variable -\cr # # for the source A -\cr # dim(found_data$Variables$sourceA) -\cr # item store -\cr # 3 3 -} -\cr\cr -The names of the requested variables do not necessarily have to match the -actual variable names inside the files. A list of alternative names to be -seeked can be specified via the parameter 'synonims'.} - -\item{synonims}{A named list where the names are the requested variable or -dimension names, and the values are vectors of character strings with -alternative names to seek for such dimension or variable.\cr\cr -In some requests, data from different sources may follow different naming -conventions for the dimensions or variables, or even files in the same source -could have varying names. This parameter is in order for Start() to -properly identify the dimensions or variables with different names. -\cr\cr -In the example used in parameter 'return_vars', it may be the case that -the two involved data sources follow slightly different naming conventions. 
-For example, source A uses 'sect' as name for the sections dimension, whereas -source B uses 'section'; source A uses 'store_loc' as variable name for the -store locations, whereas source B uses 'store_location'. This can be taken -into account as follows: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item')), -\cr # synonims = list( -\cr # section = c('sec', 'section'), -\cr # store_location = c('store_loc', -\cr # 'store_location') -\cr # )) -} -\cr} - -\item{file_opener}{A function that receives as a single parameter - 'file_path' a character string with the path to a file to be opened, - and returns an object with an open connection to the file (optionally with - header information) on success, or returns NULL on failure. -\cr\cr -This parameter takes by default NcOpener() (an opener function for NetCDF -files). -\cr\cr -See NcOpener() for a template to build a file opener for your own file -format.} - -\item{file_var_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, - \code{synonims} that returns an array with auxiliary data (i.e. data from a - variable) inside a file. Start() will provide automatically either a - 'file_path' or a 'file_object' to the 'file_var_reader' - function (the function has to be ready to work whichever of these two is - provided). The parameter 'file_selectors' will also be provided - automatically to the variable reader, containing a named list where the - names are the names of the file dimensions of the queried data set (see - documentation on \code{\dots}) and the values are single character strings - with the components used to build the path to the file being read (the one - provided in 'file_path' or 'file_object'). The parameter 'var_name' - will be filled in automatically by Start() also, with the name of one - of the variales to be read. The parameter 'synonims' will be filled in - with exactly the same value as provided in the parameter 'synonims' in - the call to Start(), and has to be used in the code of the variable - reader to check for alternative variable names inside the target file. The - 'file_var_reader' must return a (multi)dimensional array with named - dimensions, and optionally with the attribute 'variales' with other - additional metadata on the retrieved variable. -\cr\cr -Usually, the 'file_var_reader' should be a degenerate case of the -'file_data_reader' (see documentation on the corresponding parameter), -so it is recommended to code the 'file_data_reder' in first place. -\cr\cr -This parameter takes by default NcVarReader() (a variable reader function -for NetCDF files). 
-\cr\cr -See NcVarReader() for a template to build a variale reader for your own -file format.} - -\item{file_dim_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} - that returns a named numeric vector where the names are the names of the - dimensions of the multidimensional data array in the file and the values are - the sizes of such dimensions. Start() will provide automatically - either a 'file_path' or a 'file_object' to the - 'file_dim_reader' function (the function has to be ready to work - whichever of these two is provided). The parameter 'file_selectors' - will also be provided automatically to the dimension reader, containing a - named list where the names are the names of the file dimensions of the - queried data set (see documentation on \code{\dots}) and the values are - single character strings with the components used to build the path to the - file being read (the one provided in 'file_path' or 'file_object'). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and can optionally be used in advanced configurations. -\cr\cr -This parameter takes by default NcDimReader() (a dimension reader -function for NetCDF files). -\cr\cr -See NcDimReader() for (an advanced) template to build a dimension reader -for your own file format.} - -\item{file_data_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, - \code{inner_indices = NULL}, \code{synonims} that returns a subset of the - multidimensional data array inside a file (even if internally it is not an - array). Start() will provide automatically either a 'file_path' - or a 'file_object' to the 'file_data_reader' function (the - function has to be ready to work whichever of these two is provided). The - parameter 'file_selectors' will also be provided automatically to the - data reader, containing a named list where the names are the names of the - file dimensions of the queried data set (see documentation on \code{\dots}) - and the values are single character strings with the components used to - build the path to the file being read (the one provided in 'file_path' or - 'file_object'). The parameter 'inner_indices' will be filled in - automatically by Start() also, with a named list of numeric vectors, - where the names are the names of all the expected inner dimensions in a file - to be read, and the numeric vectors are the indices to be taken from the - corresponding dimension (the indices may not be consecutive nor in order). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and has to be used in the code of the data reader to check for alternative - dimension names inside the target file. The 'file_data_reader' must - return a (multi)dimensional array with named dimensions, and optionally with - the attribute 'variables' with other additional metadata on the retrieved - data. -\cr\cr -Usually, 'file_data_reader' should use 'file_dim_reader' -(see documentation on the corresponding parameter), so it is recommended to -code 'file_dim_reder' in first place. -\cr\cr -This parameter takes by default NcDataReader() (a data reader function -for NetCDF files). 
-\cr\cr -See NcDataReader() for a template to build a data reader for your own -file format.} - -\item{file_closer}{A function that receives as a single parameter - 'file_object' an open connection (as returned by 'file_opener') - to one of the files to be read, optionally with header information, and - closes the open connection. Always returns NULL. -\cr\cr -This parameter takes by default NcCloser() (a closer function for NetCDF -files). -\cr\cr -See NcCloser() for a template to build a file closer for your own file -format.} - -\item{transform}{A function with the header \code{dara_array}, -\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as -input, through the parameter \code{data_array}, a subset of a -multidimensional array (as returned by 'file_data_reader'), applies a -transformation to it and returns it, preserving the amount of dimensions but -potentially modifying their size. This transformation may require data from -other auxiliary variables, automatically provided to 'transform' -through the parameter 'variables', in the form of a named list where -the names are the variable names and the values are (multi)dimensional -arrays. Which variables need to be sent to 'transform' can be specified -with the parameter 'transform_vars' in Start(). The parameter -'file_selectors' will also be provided automatically to -'transform', containing a named list where the names are the names of -the file dimensions of the queried data set (see documentation on -\code{\dots}) and the values are single character strings with the -components used to build the path to the file the subset being processed -belongs to. The parameter \code{\dots} will be filled in with other -additional parameters to adjust the transformation, exactly as provided in -the call to Start() via the parameter 'transform_params'.} - -\item{transform_params}{A named list with additional parameters to be sent to -the 'transform' function (if specified). See documentation on parameter -'transform' for details.} - -\item{transform_vars}{A vector of character strings with the names of -auxiliary variables to be sent to the 'transform' function (if -specified). All the variables to be sent to 'transform' must also -have been requested as return variables in the parameter 'return_vars' -of Start().} - -\item{transform_extra_cells}{An integer of extra indices to retrieve from the -data set, beyond the requested indices in \code{\dots}, in order for -'transform' to dispose of additional information to properly apply -whichever transformation (if needed). As many as -'transform_extra_cells' will be retrieved beyond each of the limits for -each of those inner dimensions associated to a coordinate variable and sent -to 'transform' (i.e. present in 'transform_vars'). After -'transform' has finished, Start() will take again and return a -subset of the result, for the returned data to fall within the specified -bounds in \code{\dots}. The default value is 2.} - -\item{apply_indices_after_transform}{A logical value indicating when a -'transform' is specified in Start() and numeric indices are -provided for any of the inner dimensions that depend on coordinate variables, -these numeric indices can be made effective (retrieved) before applying the -transformation or after. The boolean flag allows to adjust this behaviour. 
-It takes FALSE by default (numeric indices are applied before sending -data to 'transform').} - -\item{pattern_dims}{A character string indicating the name of the dimension -with path pattern specifications (see \code{\dots} for details). If not -specified, Start() assumes the first provided dimension is the pattern -dimension, with a warning.} - -\item{metadata_dims}{A vector of character strings with the names of the file -dimensions which to return metadata for. As noted in 'file_data_reader', -the data reader can optionally return auxiliary data via the attribute -'variables' of the returned array. Start() by default returns the -auxiliary data read for only the first file of each source (or data set) in -the pattern dimension (see \code{\dots} for info on what the pattern -dimension is). However it can be configured to return the metadata for all -the files along any set of file dimensions. The default value is NULL, and -it will be assigned automatically as parameter 'pattern_dims'.} - -\item{selector_checker}{A function used internaly by Start() to -translate a set of selectors (values for a dimension associated to a -coordinate variable) into a set of numeric indices. It takes by default -SelectorChecker() and, in principle, it should not be required to -change it for customized file formats. The option to replace it is left open -for more versatility. See the code of SelectorChecker() for details on -the inputs, functioning and outputs of a selector checker.} - -\item{merge_across_dims}{A logical value indicating whether to merge -dimensions across which another dimension extends (according to the -'_across' parameters). Takes the value FALSE by default. For -example, if the dimension 'time' extends across the dimension 'chunk' and -\code{merge_across_dims = TRUE}, the resulting data array will only contain -only the dimension 'time' as long as all the chunks together.} - -\item{merge_across_dims_narm}{A logical value indicating whether to remove -the additional NAs from data when parameter 'merge_across_dims' is TRUE. -It is helpful when the length of the to-be-merged dimension is different -across another dimension. For example, if the dimension 'time' extends -across dimension 'chunk', and the time length along the first chunk is 2 -while along the second chunk is 10. Setting this parameter as TRUE can -remove the additional 8 NAs at position 3 to 10. The default value is FALSE.} - -\item{split_multiselected_dims}{A logical value indicating whether to split a -dimension that has been selected with a multidimensional array of selectors -into as many dimensions as present in the selector array. The default value -is FALSE.} - -\item{path_glob_permissive}{A logical value or an integer specifying how many - folder levels in the path pattern, beginning from the end, the shell glob - expressions must be preserved and worked out for each file. The default - value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr -When specifying a path pattern for a dataset, it might contain shell glob -experissions. For each dataset, the first file matching the path pattern is -found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. 
However, in -some cases, the values of the shell glob expressions may not be constant for -all files in a dataset, and they need to be worked out for each file -involved.\cr\cr -For example, a path pattern could be as follows: \cr -\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving -\code{path_glob_permissive = FALSE} will trigger automatic seek of the - contents to replace the asterisks (e.g. the first asterisk matches with - \code{'bar'} and the second with \code{'baz'}. The found contents will be - used for all files in the dataset (in the example, the path pattern will be - fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if - any of the files in the dataset have other contents in the position of the - asterisks, Start() will not find them (in the example, a file like \cr - \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be - found). Setting \code{path_glob_permissive = 1} would preserve global - expressions in the latest level (in the example, the fixed path pattern - would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the - problematic file mentioned before would be found), but of course this would - slow down the Start() call if the dataset involves a large number of - files. Setting \code{path_glob_permissive = 2} would leave the original path - pattern with the original glob expressions in the 1st and 2nd levels (in the - example, both asterisks would be preserved, thus would allow Start() - to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr -Note that each glob expression can only represent one possibility (Start() -chooses the first). Because /code{*} is not the tag, which means it cannot -be a dimension of the output array. Therefore, only one possibility can be -adopted. For example, if \cr -\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr -has two matches:\cr -\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr -\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr -only the first found file will be used.} - -\item{largest_dims_length}{A logical value or a named integer vector - indicating if Start() should examine all the files to get the largest - length of the inner dimensions (TRUE) or use the first valid file of each - dataset as the returned dimension length (FALSE). Since examining all the - files could be time-consuming, a vector can be used to explicitly specify - the expected length of the inner dimensions. For those inner dimensions not - specified, the first valid file will be used. The default value is FALSE.\cr\cr - This parameter is useful when the required files don't have consistent - inner dimension. For example, there are 10 required experimental data files - of a series of start dates. The data only contain 25 members for the first - 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, - the returned member dimension length will be 25 only. The 26th to 51st - members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, - the returned member dimension length will be 51. To save the resource, -\code{'largest_dims_length = c(member = 51)'} can also be used.} - -\item{retrieve}{A logical value indicating whether to retrieve the data -defined in the Start() call or to explore only its dimension lengths -and names, and the values for the file and inner dimensions. 
The default
-value is FALSE.}
-
-\item{num_procs}{An integer of number of processes to be created for the
-parallel execution of the retrieval/transformation/arrangement of the
-multiple involved files in a call to Start(). If set to NULL,
-takes the number of available cores (as detected by detectCores() in
-the package 'future'). The default value is 1 (no parallel execution).}
-
-\item{ObjectBigmemory}{a character string to be included as part of the
-bigmemory object name. This parameter is thought to be used internally by the
-chunking capabilities of startR.}
-
-\item{silent}{A logical value of whether to display progress messages (FALSE)
-or not (TRUE). The default value is FALSE.}
-
-\item{debug}{A logical value of whether to return detailed messages on the
-progress and operations in a Start() call (TRUE) or not (FALSE). The
-default value is FALSE.}
-
\item{\dots}{A selection of custemized parameters depending on the data
format. When we retrieve data from one or a collection of data sets,
the involved data can be perceived as belonging to a large multi-dimensional
@@ -620,61 +248,451 @@ as they have been specified in the call. For example, the following call: \cr
# variable = 'all')
}
\cr\cr
-would return an array with the following dimensions:
-\cr
-\command{
-\cr # source month store item section variable
-\cr # 1 24 100 3 2 2
-}
+would return an array with the following dimensions:
+\cr
+\command{
+\cr # source month store item section variable
+\cr # 1 24 100 3 2 2
+}
+\cr\cr
+Next, a more advanced example to retrieve data for only the sales records, for
+the first section ('electronics'), for the 1st and 3rd items and for the
+stores located in Barcelona (assuming the files contain the variable
+'store_location' with the name of the city in which each of the 100 stores is
+located):
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr # '$section$/$item$.data'),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = NULL))
+}
+\cr\cr
+The defined names for the dimensions do not necessarily have to match the
+names of the dimensions inside the file. Lists of alternative names to be
+sought can be defined in the parameter 'synonims'.
+\cr\cr
+If data from multiple sources (not necessarily following the same structure)
+has to be retrieved, it can be done by providing a vector of character strings
+with path pattern specifications, or, in the extended form, by providing a
+list of lists with the components 'name' and 'path', and the name of the
+dataset and path pattern as values, respectively.
For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr # list(name = 'sourceA',
+\cr # path = paste0('/sourceA/$variable$/',
+\cr # '$section$/$item$.data')),
+\cr # list(name = 'sourceB',
+\cr # path = paste0('/sourceB/$section$/',
+\cr # '$variable$/$item$.data'))
+\cr # ),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = NULL))
+}
+\cr}

+\item{return_vars}{A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimensions for which to retrieve
+each variable, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.\cr\cr
+Apart from retrieving a multidimensional data array, retrieving auxiliary
+variables inside the files can also be needed. The parameter
+'return_vars' allows for requesting such variables, as long as a
+'file_var_reader' function is also specified in the call to
+Start() (see documentation on the corresponding parameter).
+\cr\cr
+In the case of the item sales example (see documentation on parameter
+\code{\dots}), the store location variable is requested with the parameter\cr
+\code{return_vars = list(store_location = NULL)}.\cr This will cause
+Start() to fetch the variable 'store_location' once and return it in
+the component\cr \code{$Variables$common$store_location},\cr which will be an
+array of character strings with the location names, with the dimensions
+\code{c('store' = 100)}. Although useless in this example, we could ask
+Start() to fetch and return such a variable for each file along the
+items dimension as follows: \cr
+\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+variable will be fetched once from a file of each of the items, and will be
+returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+\cr\cr
+If a variable is requested along a file dimension that contains path pattern
+specifications ('source' in the example), the fetched variable values will be
+returned in the component\cr \code{$Variables$$}.\cr
+For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr # list(name = 'sourceA',
+\cr # path = paste0('/sourceA/$variable$/',
+\cr # '$section$/$item$.data')),
+\cr # list(name = 'sourceB',
+\cr # path = paste0('/sourceB/$section$/',
+\cr # '$variable$/$item$.data'))
+\cr # ),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = c('source',
+\cr # 'item')))
+\cr # # Checking the structure of the returned variables
+\cr # str(found_data$Variables)
+\cr # Named list
+\cr # ..$common: NULL
+\cr # ..$sourceA: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # ..$sourceB: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # # Checking the dimensions of the returned variable
+\cr # # for the source A
+\cr # dim(found_data$Variables$sourceA$store_location)
+\cr # item store
+\cr # 3 3
+}
+\cr\cr
+The names of the requested variables do not necessarily have to match the
+actual variable names inside the files.
A list of alternative names to be
+sought can be specified via the parameter 'synonims'.}

+\item{synonims}{A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter allows Start() to
+properly identify dimensions or variables that appear under different names.
+\cr\cr
+In the example used in parameter 'return_vars', it may be the case that
+the two involved data sources follow slightly different naming conventions.
+For example, source A uses 'sect' as name for the sections dimension, whereas
+source B uses 'section'; source A uses 'store_loc' as variable name for the
+store locations, whereas source B uses 'store_location'. This can be taken
+into account as follows:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr # list(name = 'sourceA',
+\cr # path = paste0('/sourceA/$variable$/',
+\cr # '$section$/$item$.data')),
+\cr # list(name = 'sourceB',
+\cr # path = paste0('/sourceB/$section$/',
+\cr # '$variable$/$item$.data'))
+\cr # ),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = c('source',
+\cr # 'item')),
+\cr # synonims = list(
+\cr # section = c('sect', 'section'),
+\cr # store_location = c('store_loc',
+\cr # 'store_location')
+\cr # ))
+}
+\cr}

+\item{file_opener}{A function that receives as a single parameter
+ 'file_path' a character string with the path to a file to be opened,
+ and returns an object with an open connection to the file (optionally with
+ header information) on success, or returns NULL on failure.
+\cr\cr
+This parameter takes by default NcOpener() (an opener function for NetCDF
+files).
+\cr\cr
+See NcOpener() for a template to build a file opener for your own file
+format.}

+\item{file_var_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
+ \code{synonims} that returns an array with auxiliary data (i.e. data from a
+ variable) inside a file. Start() will provide automatically either a
+ 'file_path' or a 'file_object' to the 'file_var_reader'
+ function (the function has to be ready to work with whichever of these two
+ is provided). The parameter 'file_selectors' will also be provided
+ automatically to the variable reader, containing a named list where the
+ names are the names of the file dimensions of the queried data set (see
+ documentation on \code{\dots}) and the values are single character strings
+ with the components used to build the path to the file being read (the one
+ provided in 'file_path' or 'file_object'). The parameter 'var_name'
+ will also be filled in automatically by Start(), with the name of one
+ of the variables to be read. The parameter 'synonims' will be filled in
+ with exactly the same value as provided in the parameter 'synonims' in
+ the call to Start(), and has to be used in the code of the variable
+ reader to check for alternative variable names inside the target file.
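+\cr\cr
+For illustration only, a custom variable reader could follow the skeleton
+below (the functions prefixed with 'my_format_' are hypothetical stand-ins
+for the I/O routines of the custom file format):
+\cr
+\command{
+\cr # my_var_reader <- function(file_path = NULL, file_object = NULL,
+\cr #                           file_selectors = NULL, var_name, synonims) {
+\cr #   if (is.null(file_object)) file_object <- my_format_open(file_path)
+\cr #   # try the requested name plus its alternatives from 'synonims'
+\cr #   found_name <- my_format_find_var(file_object,
+\cr #                                    c(var_name, synonims[[var_name]]))
+\cr #   my_format_read_var(file_object, found_name)
+\cr # }
+}
+\cr\cr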
The
+ 'file_var_reader' must return a (multi)dimensional array with named
+ dimensions, and optionally with the attribute 'variables' with other
+ additional metadata on the retrieved variable.
+\cr\cr
+Usually, the 'file_var_reader' should be a degenerate case of the
+'file_data_reader' (see documentation on the corresponding parameter),
+so it is recommended to code the 'file_data_reader' first.
+\cr\cr
+This parameter takes by default NcVarReader() (a variable reader function
+for NetCDF files).
+\cr\cr
+See NcVarReader() for a template to build a variable reader for your own
+file format.}

+\item{file_dim_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+ that returns a named numeric vector where the names are the names of the
+ dimensions of the multidimensional data array in the file and the values are
+ the sizes of such dimensions. Start() will provide automatically
+ either a 'file_path' or a 'file_object' to the
+ 'file_dim_reader' function (the function has to be ready to work
+ with whichever of these two is provided). The parameter 'file_selectors'
+ will also be provided automatically to the dimension reader, containing a
+ named list where the names are the names of the file dimensions of the
+ queried data set (see documentation on \code{\dots}) and the values are
+ single character strings with the components used to build the path to the
+ file being read (the one provided in 'file_path' or 'file_object').
+ The parameter 'synonims' will be filled in with exactly the same value
+ as provided in the parameter 'synonims' in the call to Start(),
+ and can optionally be used in advanced configurations.
+\cr\cr
+This parameter takes by default NcDimReader() (a dimension reader
+function for NetCDF files).
+\cr\cr
+See NcDimReader() for (an advanced) template to build a dimension reader
+for your own file format.}

+\item{file_data_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL},
+ \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
+ multidimensional data array inside a file (even if internally it is not an
+ array). Start() will provide automatically either a 'file_path'
+ or a 'file_object' to the 'file_data_reader' function (the
+ function has to be ready to work with whichever of these two is provided).
+ The parameter 'file_selectors' will also be provided automatically to the
+ data reader, containing a named list where the names are the names of the
+ file dimensions of the queried data set (see documentation on \code{\dots})
+ and the values are single character strings with the components used to
+ build the path to the file being read (the one provided in 'file_path' or
+ 'file_object'). The parameter 'inner_indices' will also be filled in
+ automatically by Start(), with a named list of numeric vectors,
+ where the names are the names of all the expected inner dimensions in a file
+ to be read, and the numeric vectors are the indices to be taken from the
+ corresponding dimension (the indices may be neither consecutive nor in
+ order). The parameter 'synonims' will be filled in with exactly the same
+ value as provided in the parameter 'synonims' in the call to Start(),
+ and has to be used in the code of the data reader to check for alternative
+ dimension names inside the target file.
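+\cr\cr
+As an illustration, Start() ends up invoking the data reader roughly as
+follows (a sketch with hypothetical path and values, reusing the item sales
+example; note that the indices may arrive unordered):
+\cr
+\command{
+\cr # file_data_reader(file_path = '/data/sales/first/item1.data',
+\cr #                  file_selectors = list(variable = 'sales',
+\cr #                                        section = 'first',
+\cr #                                        item = 'item1'),
+\cr #                  inner_indices = list(month = c(2, 1, 3),
+\cr #                                       store = 1:100),
+\cr #                  synonims = list(month = c('month', 'time')))
+}
+\cr\cr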
The 'file_data_reader' must
+ return a (multi)dimensional array with named dimensions, and optionally with
+ the attribute 'variables' with other additional metadata on the retrieved
+ data.
+\cr\cr
+Usually, 'file_data_reader' should use 'file_dim_reader'
+(see documentation on the corresponding parameter), so it is recommended to
+code 'file_dim_reader' first.
\cr\cr
-Next, a more advanced example to retrieve data for only the sales records, for
-the first section ('electronics'), for the 1st and 3rd items and for the
-stores located in Barcelona (assuming the files contain the variable
-'store_location' with the name of the city each of the 100 stores are located
-at):
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr # '$section$/$item$.data'),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = NULL))
-}
\cr\cr
-The defined names for the dimensions do not necessarily have to match the
-names of the dimensions inside the file. Lists of alternative names to be
-seeked can be defined in the parameter 'synonims'.
\cr\cr
-If data from multiple sources (not necessarily following the same structure)
-has to be retrieved, it can be done by providing a vector of character strings
-with path pattern specifications, or, in the extended form, by providing a
-list of lists with the components 'name' and 'path', and the name of the
-dataset and path pattern as values, respectively. For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr # list(name = 'sourceA',
-\cr # path = paste0('/sourceA/$variable$/',
-\cr # '$section$/$item$.data')),
-\cr # list(name = 'sourceB',
-\cr # path = paste0('/sourceB/$section$/',
-\cr # '$variable$/$item$.data'))
-\cr # ),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = NULL))
-}
-\cr}
+This parameter takes by default NcDataReader() (a data reader function
+for NetCDF files).
\cr\cr
+See NcDataReader() for a template to build a data reader for your own
+file format.}

+\item{file_closer}{A function that receives as a single parameter
+ 'file_object' an open connection (as returned by 'file_opener')
+ to one of the files to be read, optionally with header information, and
+ closes the open connection. Always returns NULL.
+\cr\cr
+This parameter takes by default NcCloser() (a closer function for NetCDF
+files).
+\cr\cr
+See NcCloser() for a template to build a file closer for your own file
+format.}

+\item{transform}{A function with the header \code{data_array},
+\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+input, through the parameter \code{data_array}, a subset of a
+multidimensional array (as returned by 'file_data_reader'), applies a
+transformation to it and returns it, preserving the number of dimensions but
+potentially modifying their sizes. This transformation may require data from
+other auxiliary variables, automatically provided to 'transform'
+through the parameter 'variables', in the form of a named list where
+the names are the variable names and the values are (multi)dimensional
+arrays.
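+\cr\cr
+A minimal sketch of a conforming 'transform' (the 'offset' parameter is
+hypothetical and stands in for a real operation such as regridding; it
+would be supplied through 'transform_params'):
+\cr
+\command{
+\cr # my_transform <- function(data_array, variables,
+\cr #                          file_selectors = NULL, offset = 0, ...) {
+\cr #   # 'variables' holds the auxiliary arrays requested through
+\cr #   # 'transform_vars'; a real transformation would make use of them.
+\cr #   # A trivial shift preserves all dimensions and their sizes.
+\cr #   data_array + offset
+\cr # }
+}
+\cr\cr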
Which variables need to be sent to 'transform' can be specified
+with the parameter 'transform_vars' in Start(). The parameter
+'file_selectors' will also be provided automatically to
+'transform', containing a named list where the names are the names of
+the file dimensions of the queried data set (see documentation on
+\code{\dots}) and the values are single character strings with the
+components used to build the path to the file the subset being processed
+belongs to. The parameter \code{\dots} will be filled in with other
+additional parameters to adjust the transformation, exactly as provided in
+the call to Start() via the parameter 'transform_params'.}

+\item{transform_params}{A named list with additional parameters to be sent to
+the 'transform' function (if specified). See documentation on parameter
+'transform' for details.}

+\item{transform_vars}{A vector of character strings with the names of
+auxiliary variables to be sent to the 'transform' function (if
+specified). All the variables to be sent to 'transform' must also
+have been requested as return variables in the parameter 'return_vars'
+of Start().}

+\item{transform_extra_cells}{An integer of extra indices to retrieve from the
+data set, beyond the requested indices in \code{\dots}, so that
+'transform' has additional information with which to properly apply
+whichever transformation is needed. As many as
+'transform_extra_cells' indices will be retrieved beyond each of the limits
+of each of those inner dimensions associated to a coordinate variable and
+sent to 'transform' (i.e. present in 'transform_vars'). After
+'transform' has finished, Start() will again take and return a
+subset of the result, so that the returned data falls within the specified
+bounds in \code{\dots}. The default value is 2.}

+\item{apply_indices_after_transform}{A logical value indicating, when a
+'transform' is specified in Start() and numeric indices are
+provided for any of the inner dimensions that depend on coordinate variables,
+whether these numeric indices are made effective (retrieved) before or after
+applying the transformation.
+It takes FALSE by default (numeric indices are applied before sending
+data to 'transform').}

+\item{pattern_dims}{A character string indicating the name of the dimension
+with path pattern specifications (see \code{\dots} for details). If not
+specified, Start() assumes the first provided dimension is the pattern
+dimension, with a warning.}

+\item{metadata_dims}{A vector of character strings with the names of the file
+dimensions for which to return metadata. As noted in 'file_data_reader',
+the data reader can optionally return auxiliary data via the attribute
+'variables' of the returned array. Start() by default returns the
+auxiliary data read for only the first file of each source (or data set) in
+the pattern dimension (see \code{\dots} for info on what the pattern
+dimension is). However, it can be configured to return the metadata for all
+the files along any set of file dimensions. The default value is NULL, in
+which case it is automatically assigned the value of 'pattern_dims'.}

+\item{selector_checker}{A function used internally by Start() to
+translate a set of selectors (values for a dimension associated to a
+coordinate variable) into a set of numeric indices. It takes by default
+SelectorChecker() and, in principle, it should not be required to
+change it for customized file formats.
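+\cr\cr
+Conceptually (a hypothetical illustration only), a selector checker performs
+translations such as:
+\cr
+\command{
+\cr # # selectors: values(list(30, 40))
+\cr # # coordinate variable: 20, 30, 40, 50
+\cr # # resulting numeric indices: 2 to 3
+}
+\cr\cr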
The option to replace it is left open
+for more versatility. See the code of SelectorChecker() for details on
+the inputs, functioning and outputs of a selector checker.}

+\item{merge_across_dims}{A logical value indicating whether to merge
+dimensions across which another dimension extends (according to the
+'_across' parameters). Takes the value FALSE by default. For
+example, if the dimension 'time' extends across the dimension 'chunk' and
+\code{merge_across_dims = TRUE}, the resulting data array will contain
+only the dimension 'time', as long as all the chunks together.}

+\item{merge_across_dims_narm}{A logical value indicating whether to remove
+the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+It is helpful when the length of the to-be-merged dimension is different
+across another dimension. For example, if the dimension 'time' extends
+across the dimension 'chunk', and the time length along the first chunk is 2
+while it is 10 along the second chunk, setting this parameter to TRUE
+removes the additional 8 NAs at positions 3 to 10. The default value is
+TRUE, but it will be automatically turned to FALSE if
+'merge_across_dims = FALSE'.}

+\item{split_multiselected_dims}{A logical value indicating whether to split a
+dimension that has been selected with a multidimensional array of selectors
+into as many dimensions as present in the selector array. The default value
+is FALSE.}

+\item{path_glob_permissive}{A logical value or an integer specifying how many
+ folder levels in the path pattern, beginning from the end, the shell glob
+ expressions must be preserved and worked out for each file. The default
+ value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
+When specifying a path pattern for a dataset, it might contain shell glob
+expressions. For each dataset, the first file matching the path pattern is
+found, and the found file is used to work out fixed values for the glob
+expressions that will be used for all the files of the dataset. However, in
+some cases, the values of the shell glob expressions may not be constant for
+all files in a dataset, and they need to be worked out for each file
+involved.\cr\cr
+For example, a path pattern could be as follows: \cr
+\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+\code{path_glob_permissive = FALSE} will trigger an automatic search for the
+ contents to replace the asterisks (e.g. the first asterisk matches with
+ \code{'bar'} and the second with \code{'baz'}). The found contents will be
+ used for all files in the dataset (in the example, the path pattern will be
+ fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
+ any of the files in the dataset have other contents in the position of the
+ asterisks, Start() will not find them (in the example, a file like \cr
+ \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+ found). Setting \code{path_glob_permissive = 1} would preserve glob
+ expressions in the last level (in the example, the fixed path pattern
+ would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+ problematic file mentioned before would be found), but of course this would
+ slow down the Start() call if the dataset involves a large number of
+ files.
Setting \code{path_glob_permissive = 2} would leave the original path
+ pattern with the original glob expressions in the 1st and 2nd levels (in the
+ example, both asterisks would be preserved, thus would allow Start()
+ to recognize files such as \cr
+ \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+Note that each glob expression can only represent one possibility (Start()
+chooses the first). This is because \code{*} is not a tag, which means it
+cannot be a dimension of the output array; therefore, only one possibility
+can be adopted. For example, if \cr
+\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+has two matches:\cr
+\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+only the first found file will be used.}

+\item{largest_dims_length}{A logical value or a named integer vector
+ indicating if Start() should examine all the files to get the largest
+ length of the inner dimensions (TRUE) or use the first valid file of each
+ dataset as the returned dimension length (FALSE). Since examining all the
+ files could be time-consuming, a vector can be used to explicitly specify
+ the expected length of the inner dimensions. For those inner dimensions not
+ specified, the first valid file will be used. The default value is FALSE.\cr\cr
+ This parameter is useful when the required files don't have consistent
+ inner dimension lengths. For example, there are 10 required experimental
+ data files of a series of start dates. The data contain only 25 members for
+ the first 2 years but 51 members for the later years. If \code{'largest_dims_length = FALSE'},
+ the returned member dimension length will be 25 only. The 26th to 51st
+ members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+ the returned member dimension length will be 51. To save resources,
+\code{'largest_dims_length = c(member = 51)'} can also be used.}

+\item{retrieve}{A logical value indicating whether to retrieve the data
+defined in the Start() call or to explore only its dimension lengths
+and names, and the values for the file and inner dimensions. The default
+value is FALSE.}

+\item{num_procs}{An integer indicating the number of processes to be created
+for the parallel execution of the retrieval/transformation/arrangement of the
+multiple involved files in a call to Start(). If set to NULL, it
+takes the number of available cores (as detected by detectCores() in
+the package 'future'). The default value is 1 (no parallel execution).}

+\item{ObjectBigmemory}{A character string to be included as part of the
+bigmemory object name. This parameter is meant to be used internally by the
+chunking capabilities of startR.}

+\item{silent}{A logical value indicating whether to display progress messages
+(FALSE) or not (TRUE). The default value is FALSE.}

+\item{debug}{A logical value indicating whether to return detailed messages on
+the progress and operations in a Start() call (TRUE) or not (FALSE). The
+default value is FALSE.}
}
\value{
If \code{retrieve = TRUE} the involved data is loaded into RAM memory
@@ -830,4 +848,3 @@ file format.
retrieve = FALSE) } - diff --git a/man/Step.Rd b/man/Step.Rd index 65f0c72..c473ccb 100644 --- a/man/Step.Rd +++ b/man/Step.Rd @@ -4,8 +4,13 @@ \alias{Step} \title{Define the operation applied on declared data.} \usage{ -Step(fun, target_dims, output_dims, use_libraries = NULL, - use_attributes = NULL) +Step( + fun, + target_dims, + output_dims, + use_libraries = NULL, + use_attributes = NULL +) } \arguments{ \item{fun}{A function in R format defining the operation to be applied to the @@ -70,4 +75,3 @@ to the expected order for this function. wf <- AddStep(data, step) } - diff --git a/man/indices.Rd b/man/indices.Rd index a3d85ea..6233b71 100644 --- a/man/indices.Rd +++ b/man/indices.Rd @@ -39,4 +39,3 @@ original data. See details in the documentation of the parameter \code{\dots} \seealso{ \code{\link{values}} } - diff --git a/man/values.Rd b/man/values.Rd index 3300f19..31ce95a 100644 --- a/man/values.Rd +++ b/man/values.Rd @@ -41,4 +41,3 @@ coordinate variable. See details in the documentation of the parameter \seealso{ \code{\link{indices}} } - -- GitLab