From fd91c1b34d7fef59da15d28837d1fb0b7247dda3 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Thu, 21 May 2020 22:55:44 +0200
Subject: [PATCH 1/4] Add additional data array rearrangement if
 merge_across_dim + split_multiselected_dims. It corrects the displaced values
 when the values in the same file are called more than 1 time.

---
 R/Start.R | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/R/Start.R b/R/Start.R
index 3c544a4..17d116f 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -3165,6 +3165,71 @@ Start <- function(..., # dim = indices/selectors,
       data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
       data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999]  # become a vector
       
+ #NOTE: When one file contains values for dicrete dimensions, rearrange the 
+ #      chunks (i.e., work_piece) is necessary.
+      if (split_multiselected_dims) {
+      # shape the vector into the array without split_dims
+        split_dims_pos <- match(split_dims, final_dims_fake)
+        new_dims <- c()
+        if (split_dims_pos[1] > 1) {
+          new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
+        }
+        new_dims <- c(new_dims,  prod(split_dims))
+        names(new_dims)[split_dims_pos[1]] <- across_inner_dim
+        if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
+          new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
+        }
+        final_dims_fake_no_split <- new_dims
+        data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split)
+      # seperate 'time' dim into each work_piece length
+        data_array_seperate <- list()
+        tmp <- cumsum(unlist(length_inner_across_dim))
+        tmp <- c(0, tmp)
+        for (i in 1:length(length_inner_across_dim)) {
+          data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
+                                             (tmp[i] + 1):tmp[i + 1])
+        }
+      # generate the correct order list from indices_chunk 
+        final_order_list <- list()
+        i <- 1
+        j <- 1
+        a <- indices_chunk[i]
+        while (i < length(indices_chunk)) {
+          while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) {
+            a <- c(a, indices_chunk[i+1])
+            i <- i + 1
+          }
+          final_order_list[[j]] <- a
+          a <- indices_chunk[i+1]
+          i <- i + 1
+          j <- j + 1
+        }
+        names(final_order_list) <- sapply(final_order_list, '[[', 1)
+        final_order_list <- lapply(final_order_list, length)
+
+      # re-build the array: chunk 
+        which_chunk <- as.numeric(names(final_order_list))
+        how_many_indices <- unlist(final_order_list)
+        array_piece <- list()
+        ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
+        for (i in 1:length(final_order_list)) {
+          array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]],
+                                     across_inner_dim,
+                                     ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
+          ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
+        }
+        
+        # re-build the array: paste
+        data_array_tmp <- array_piece[[1]]
+        along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
+        if (length(array_piece) > 1) {
+          for (i in 2:length(array_piece)) {
+          data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
+                                        along = along_pos)
+          }
+        }
+      }        
+        
       data_array <- array(data_array_tmp, dim = final_dims_fake)
       
     } else {  # merge_across_dims_narm = F (old version)
-- 
GitLab


From 90a4c78e9bddbcd490b2b6134d305ab25db9d7c7 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Thu, 21 May 2020 23:55:57 +0200
Subject: [PATCH 2/4] Remove the indices_chunk part to the first, so if the
 array doesn't need reorganization, the new development is skipped.

---
 R/Start.R | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index 17d116f..c04894a 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -3168,6 +3168,28 @@ Start <- function(..., # dim = indices/selectors,
  #NOTE: When one file contains values for dicrete dimensions, rearrange the 
  #      chunks (i.e., work_piece) is necessary.
       if (split_multiselected_dims) {
+
+      # generate the correct order list from indices_chunk 
+        final_order_list <- list()
+        i <- 1
+        j <- 1
+        a <- indices_chunk[i]
+        while (i < length(indices_chunk)) {
+          while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) {
+            a <- c(a, indices_chunk[i+1])
+            i <- i + 1
+          }
+          final_order_list[[j]] <- a
+          a <- indices_chunk[i+1]
+          i <- i + 1
+          j <- j + 1
+        }
+        names(final_order_list) <- sapply(final_order_list, '[[', 1)
+        final_order_list <- lapply(final_order_list, length)
+        
+        if (!all(diff(as.numeric(names(final_order_list))) > 0)) {
+
+
       # shape the vector into the array without split_dims
         split_dims_pos <- match(split_dims, final_dims_fake)
         new_dims <- c()
@@ -3189,23 +3211,6 @@ Start <- function(..., # dim = indices/selectors,
           data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
                                              (tmp[i] + 1):tmp[i + 1])
         }
-      # generate the correct order list from indices_chunk 
-        final_order_list <- list()
-        i <- 1
-        j <- 1
-        a <- indices_chunk[i]
-        while (i < length(indices_chunk)) {
-          while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) {
-            a <- c(a, indices_chunk[i+1])
-            i <- i + 1
-          }
-          final_order_list[[j]] <- a
-          a <- indices_chunk[i+1]
-          i <- i + 1
-          j <- j + 1
-        }
-        names(final_order_list) <- sapply(final_order_list, '[[', 1)
-        final_order_list <- lapply(final_order_list, length)
 
       # re-build the array: chunk 
         which_chunk <- as.numeric(names(final_order_list))
@@ -3229,7 +3234,8 @@ Start <- function(..., # dim = indices/selectors,
           }
         }
       }        
-        
+      }
+  
       data_array <- array(data_array_tmp, dim = final_dims_fake)
       
     } else {  # merge_across_dims_narm = F (old version)
-- 
GitLab


From 54064a5c5f77a33eea1ea1d831b465cf81044f34 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Fri, 22 May 2020 14:36:25 +0200
Subject: [PATCH 3/4] Revise the documents for the new development

---
 NEWS.md                              |  5 +++++
 inst/doc/faq.md                      | 13 +++----------
 inst/doc/usecase/ex1_7_split_merge.R | 17 +++--------------
 3 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 1206070..a153e77 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# startR v1.0.3 (Release date: 2020-)
+- Bugfix for requiring the repetitive values from a single file when using 
+'merge_across_dims' and 'split_multiselected_dims'. The value positions were not 
+correct before.
+  
 # startR v1.0.2 (Release date: 2020-05-11)
 - Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1.  
 - Add one new parameter 'merge_across_dims_narm' in Start(). If it is TRUE,
diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index bcb937a..932ee59 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -627,16 +627,9 @@ print(attr(obs, 'Dimensions'))
 #    1     1     4     3   256   512 
 ```
 
-Notice that when 'split_multiselected_dims' and 'merge_across_dims' are both used, 
-and dimension number of the splitted selector is more than two, a potential 
-problem of mixed dimension might occur. The following script is from part of 
-the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). 
-
-It is important to check if **the order of file_date is in line with dates dimensions order!** 
-Regardless 'time', which is explicitly specified in Start(), the vector should list 'sdate' first, then 'syear'. 
-As you can see below, the first element '199607' is sdate = 1, and the second element '199612' is sdate = 2. If the order is wrong, you will still get a 
-return data but with mixed dimensions. Because 'sdate' and 'syear' are only 
-implied in the given selector vector without any attributes, Start() cannot check if the order of file_date and dates are consistent or not.
+The splited dimension can have more than two dimensions. 
+The following example comes from the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). 
+The 'time' selector has three dimensions 'sdate', 'syear', and 'time'.
 
 ```r
 dates <- attr(hcst, 'Variables')$common$time
diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R
index 86a704d..539a24d 100644
--- a/inst/doc/usecase/ex1_7_split_merge.R
+++ b/inst/doc/usecase/ex1_7_split_merge.R
@@ -48,26 +48,15 @@ dim(dates)
 #-----------------------------------------------------------------------
 
 #-----------------------------------------------------------------------
-# This two lines should NOT be used!! It is an example showing the potential
-# problem when using 'split_multiselected_dims' and 'merge_across_dims'.
-# If 'dates' is reordered to the order that 'syear' ahead of 'sdate', while 
-# 'file_date' below remains the same, the result will have mixed dimension.
-# The order of 'time', which is the name of the inner dimension `time = values(dates)` 
-# below, is not important. We only need to pay attention on 'sdate' and 'syear' 
-# because they are relavent of the required files (i.e., 'file_date' below).
-# See more explanation in FAQ How-to-#17.
-
+# If you need to reorder the dimensions of the 'time' selector, you can use
+# s2dv::Reorder function. These two lines are not used in the following example.
 library(s2dv)
 dates <- Reorder(dates, c('syear', 'sdate', 'time'))
 #-----------------------------------------------------------------------
 
 #-----------------------------------------------------------------------
 # Use 'dates' generated above to get the required files. 
-# It is important to check if the order of file_date is in line with 
-# dates dimensions! Regardless 'time', the vector should list 'sdate' first,
-# then 'syear'. As you can see below, the first element '199607' is sdate = 1,
-# and the second element '199612' is sdate = 2.
-
+# The order of file_date is not important. sort() is not necessary to use.
 file_date <- sort(unique(gsub('-', '', 
                               sapply(as.character(dates), substr, 1, 7))))
 print(file_date)
-- 
GitLab


From f52225f93767302ae11535cdfea8a1149802e9de Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Fri, 22 May 2020 15:17:00 +0200
Subject: [PATCH 4/4] Create unit test for multiple sdates

---
 tests/testthat/test-Start-multiple-sdates.R | 157 ++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 tests/testthat/test-Start-multiple-sdates.R

diff --git a/tests/testthat/test-Start-multiple-sdates.R b/tests/testthat/test-Start-multiple-sdates.R
new file mode 100644
index 0000000..cd5bac8
--- /dev/null
+++ b/tests/testthat/test-Start-multiple-sdates.R
@@ -0,0 +1,157 @@
+context("Start() multiple sdate with split + merge dim")
+
+# When certain values in one observation file are required more than once,
+# and 'merge_across_dims' + 'split_multiselected_dims' are used, the values may be misplaced. 
+# It might happen when reading experimental data with many start dates, 
+# and the corresponding observations are required to have the same data structure.
+
+ecmwf_path_hc <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc')
+obs_path <-paste0('/esarchive/recon/ecmwf/era5/daily/$var$-r240x121/$var$_$file_date$.nc')
+
+var_name <- 'sfcWind'
+var100_name <- 'windagl100'
+
+sdates.seq <- c("20161222","20161229","20170105","20170112")
+
+test_that("1. ", {
+hcst<-Start(dat = ecmwf_path_hc,
+            var = var_name,
+            sdate = sdates.seq,
+            syear =  'all',
+            time = 'all',            
+            latitude = indices(1),
+            longitude = indices(1),
+            ensemble = 'all',
+            syear_depends = 'sdate',
+            return_vars = list(latitude = 'dat',
+                                 longitude = 'dat',
+                                 time = c('sdate','syear')
+                                  ),
+            retrieve = F)
+dates <- attr(hcst, 'Variables')$common$time
+file_date <- unique(sapply(dates, format, '%Y%m'))
+
+obs <- Start(dat = obs_path,
+             var = var100_name,
+             latitude = indices(1),
+             longitude = indices(1),
+             file_date= file_date,
+             time = values(dates),    # 'sdate' 'syear' 'time'
+             time_var = 'time',
+             time_across = 'file_date',
+             merge_across_dims= TRUE,
+             merge_across_dims_narm = TRUE,
+             split_multiselected_dims = TRUE,
+             synonims = list(latitude=c('lat','latitude'),
+                             longitude=c('lon','longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',#
+                                time = c('file_date')),
+             retrieve = T)
+
+  expect_equal(
+  dim(obs),
+  c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47)
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 1, 1, 8:15],
+  c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 2, 1, 1:5],
+  c(10.727190, 21.909714,  3.220845,  7.524321,  7.308463),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 2, 1, 31:38],
+  c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 1, 2, 9:15],
+  c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  mean(obs),
+  8.627518,
+  tolerance = 0.0001
+  )
+  expect_equal(
+  length(obs[which(is.na(obs))]),
+  0
+  )
+})
+
+test_that("2. change the file_date order", {
+  hcst<-Start(dat = ecmwf_path_hc,
+              var = var_name,
+              sdate = sdates.seq,
+              syear =  indices(1:20),
+              time = 'all',
+              latitude = indices(1),
+              longitude = indices(1),
+              ensemble = 'all',
+              syear_depends = 'sdate',
+              return_vars = list(latitude = 'dat',
+                                 longitude = 'dat',
+                                 time = c('sdate','syear')
+                                  ),
+              retrieve = F)
+  dates <- attr(hcst, 'Variables')$common$time
+  file_date <- sort(unique(sapply(dates, format, '%Y%m')))
+
+
+obs <- Start(dat = obs_path,
+             var = var100_name,
+             latitude = indices(1),
+             longitude = indices(1),
+             file_date= file_date,
+             time = values(dates),    # 'sdate' 'syear' 'time'
+             time_var = 'time',
+             time_across = 'file_date',
+             merge_across_dims= TRUE,
+             merge_across_dims_narm = TRUE,
+             split_multiselected_dims = TRUE,
+             synonims = list(latitude=c('lat','latitude'),
+                             longitude=c('lon','longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',#
+                                time = c('file_date')),
+             retrieve = T)
+
+  expect_equal(
+  dim(obs),
+  c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47)
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 1, 1, 8:15],
+  c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 2, 1, 1:5],
+  c(10.727190, 21.909714,  3.220845,  7.524321,  7.308463),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 2, 1, 31:38],
+  c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  obs[1, 1, 1, 1, 1, 2, 9:15],
+  c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  mean(obs),
+  8.627518,
+  tolerance = 0.0001
+  )
+  expect_equal(
+  length(obs[which(is.na(obs))]),
+  0
+  )
+})
-- 
GitLab