From fd91c1b34d7fef59da15d28837d1fb0b7247dda3 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 21 May 2020 22:55:44 +0200 Subject: [PATCH 1/4] Add additional data array rearrangement if merge_across_dim + split_multiselected_dims. It corrects the displaced values when the values in the same file are called more than 1 time. --- R/Start.R | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/R/Start.R b/R/Start.R index 3c544a4..17d116f 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3165,6 +3165,71 @@ Start <- function(..., # dim = indices/selectors, data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector + #NOTE: When one file contains values for dicrete dimensions, rearrange the + # chunks (i.e., work_piece) is necessary. + if (split_multiselected_dims) { + # shape the vector into the array without split_dims + split_dims_pos <- match(split_dims, final_dims_fake) + new_dims <- c() + if (split_dims_pos[1] > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, prod(split_dims)) + names(new_dims)[split_dims_pos[1]] <- across_inner_dim + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake_no_split <- new_dims + data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) + # seperate 'time' dim into each work_piece length + data_array_seperate <- list() + tmp <- cumsum(unlist(length_inner_across_dim)) + tmp <- c(0, tmp) + for (i in 1:length(length_inner_across_dim)) { + data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + # generate the correct order list from indices_chunk + final_order_list <- list() + i <- 1 + j <- 1 + a <- indices_chunk[i] + while (i < length(indices_chunk)) { + while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { + a <- c(a, indices_chunk[i+1]) + i <- i + 1 + } + final_order_list[[j]] <- a + a <- indices_chunk[i+1] + i <- i + 1 + j <- j + 1 + } + names(final_order_list) <- sapply(final_order_list, '[[', 1) + final_order_list <- lapply(final_order_list, length) + + # re-build the array: chunk + which_chunk <- as.numeric(names(final_order_list)) + how_many_indices <- unlist(final_order_list) + array_piece <- list() + ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + for (i in 1:length(final_order_list)) { + array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], + across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] + } + + # re-build the array: paste + data_array_tmp <- array_piece[[1]] + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + if (length(array_piece) > 1) { + for (i in 2:length(array_piece)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } + } + } + data_array <- array(data_array_tmp, dim = final_dims_fake) } else { # merge_across_dims_narm = F (old version) -- GitLab From 90a4c78e9bddbcd490b2b6134d305ab25db9d7c7 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 21 May 2020 23:55:57 +0200 Subject: [PATCH 2/4] Remove the indices_chunk part to the first, so if the array doesn't need reorganization, the new development is skipped. --- R/Start.R | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/R/Start.R b/R/Start.R index 17d116f..c04894a 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3168,6 +3168,28 @@ Start <- function(..., # dim = indices/selectors, #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. if (split_multiselected_dims) { + + # generate the correct order list from indices_chunk + final_order_list <- list() + i <- 1 + j <- 1 + a <- indices_chunk[i] + while (i < length(indices_chunk)) { + while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { + a <- c(a, indices_chunk[i+1]) + i <- i + 1 + } + final_order_list[[j]] <- a + a <- indices_chunk[i+1] + i <- i + 1 + j <- j + 1 + } + names(final_order_list) <- sapply(final_order_list, '[[', 1) + final_order_list <- lapply(final_order_list, length) + + if (!all(diff(as.numeric(names(final_order_list))) > 0)) { + + # shape the vector into the array without split_dims split_dims_pos <- match(split_dims, final_dims_fake) new_dims <- c() @@ -3189,23 +3211,6 @@ Start <- function(..., # dim = indices/selectors, data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, (tmp[i] + 1):tmp[i + 1]) } - # generate the correct order list from indices_chunk - final_order_list <- list() - i <- 1 - j <- 1 - a <- indices_chunk[i] - while (i < length(indices_chunk)) { - while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { - a <- c(a, indices_chunk[i+1]) - i <- i + 1 - } - final_order_list[[j]] <- a - a <- indices_chunk[i+1] - i <- i + 1 - j <- j + 1 - } - names(final_order_list) <- sapply(final_order_list, '[[', 1) - final_order_list <- lapply(final_order_list, length) # re-build the array: chunk which_chunk <- as.numeric(names(final_order_list)) @@ -3229,7 +3234,8 @@ Start <- function(..., # dim = indices/selectors, } } } - + } + data_array <- array(data_array_tmp, dim = final_dims_fake) } else { # merge_across_dims_narm = F (old version) -- GitLab From 54064a5c5f77a33eea1ea1d831b465cf81044f34 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 22 May 2020 14:36:25 +0200 Subject: [PATCH 3/4] Revise the documents for the new development --- NEWS.md | 5 +++++ inst/doc/faq.md | 13 +++---------- inst/doc/usecase/ex1_7_split_merge.R | 17 +++-------------- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1206070..a153e77 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# startR v1.0.3 (Release date: 2020-) +- Bugfix for requiring the repetitive values from a single file when using +'merge_across_dims' and 'split_multiselected_dims'. The value positions were not +correct before. + # startR v1.0.2 (Release date: 2020-05-11) - Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1. - Add one new parameter 'merge_across_dims_narm' in Start(). If it is TRUE, diff --git a/inst/doc/faq.md b/inst/doc/faq.md index bcb937a..932ee59 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -627,16 +627,9 @@ print(attr(obs, 'Dimensions')) # 1 1 4 3 256 512 ``` -Notice that when 'split_multiselected_dims' and 'merge_across_dims' are both used, -and dimension number of the splitted selector is more than two, a potential -problem of mixed dimension might occur. The following script is from part of -the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). - -It is important to check if **the order of file_date is in line with dates dimensions order!** -Regardless 'time', which is explicitly specified in Start(), the vector should list 'sdate' first, then 'syear'. -As you can see below, the first element '199607' is sdate = 1, and the second element '199612' is sdate = 2. If the order is wrong, you will still get a -return data but with mixed dimensions. Because 'sdate' and 'syear' are only -implied in the given selector vector without any attributes, Start() cannot check if the order of file_date and dates are consistent or not. +The splited dimension can have more than two dimensions. +The following example comes from the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). +The 'time' selector has three dimensions 'sdate', 'syear', and 'time'. ```r dates <- attr(hcst, 'Variables')$common$time diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R index 86a704d..539a24d 100644 --- a/inst/doc/usecase/ex1_7_split_merge.R +++ b/inst/doc/usecase/ex1_7_split_merge.R @@ -48,26 +48,15 @@ dim(dates) #----------------------------------------------------------------------- #----------------------------------------------------------------------- -# This two lines should NOT be used!! It is an example showing the potential -# problem when using 'split_multiselected_dims' and 'merge_across_dims'. -# If 'dates' is reordered to the order that 'syear' ahead of 'sdate', while -# 'file_date' below remains the same, the result will have mixed dimension. -# The order of 'time', which is the name of the inner dimension `time = values(dates)` -# below, is not important. We only need to pay attention on 'sdate' and 'syear' -# because they are relavent of the required files (i.e., 'file_date' below). -# See more explanation in FAQ How-to-#17. - +# If you need to reorder the dimensions of the 'time' selector, you can use +# s2dv::Reorder function. These two lines are not used in the following example. library(s2dv) dates <- Reorder(dates, c('syear', 'sdate', 'time')) #----------------------------------------------------------------------- #----------------------------------------------------------------------- # Use 'dates' generated above to get the required files. -# It is important to check if the order of file_date is in line with -# dates dimensions! Regardless 'time', the vector should list 'sdate' first, -# then 'syear'. As you can see below, the first element '199607' is sdate = 1, -# and the second element '199612' is sdate = 2. - +# The order of file_date is not important. sort() is not necessary to use. file_date <- sort(unique(gsub('-', '', sapply(as.character(dates), substr, 1, 7)))) print(file_date) -- GitLab From f52225f93767302ae11535cdfea8a1149802e9de Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 22 May 2020 15:17:00 +0200 Subject: [PATCH 4/4] Create unit test for multiple sdates --- tests/testthat/test-Start-multiple-sdates.R | 157 ++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 tests/testthat/test-Start-multiple-sdates.R diff --git a/tests/testthat/test-Start-multiple-sdates.R b/tests/testthat/test-Start-multiple-sdates.R new file mode 100644 index 0000000..cd5bac8 --- /dev/null +++ b/tests/testthat/test-Start-multiple-sdates.R @@ -0,0 +1,157 @@ +context("Start() multiple sdate with split + merge dim") + +# When certain values in one observation file are required more than once, +# and 'merge_across_dims' + 'split_multiselected_dims' are used, the values may be misplaced. +# It might happen when reading experimental data with many start dates, +# and the corresponding observations are required to have the same data structure. + +ecmwf_path_hc <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc') +obs_path <-paste0('/esarchive/recon/ecmwf/era5/daily/$var$-r240x121/$var$_$file_date$.nc') + +var_name <- 'sfcWind' +var100_name <- 'windagl100' + +sdates.seq <- c("20161222","20161229","20170105","20170112") + +test_that("1. ", { +hcst<-Start(dat = ecmwf_path_hc, + var = var_name, + sdate = sdates.seq, + syear = 'all', + time = 'all', + latitude = indices(1), + longitude = indices(1), + ensemble = 'all', + syear_depends = 'sdate', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate','syear') + ), + retrieve = F) +dates <- attr(hcst, 'Variables')$common$time +file_date <- unique(sapply(dates, format, '%Y%m')) + +obs <- Start(dat = obs_path, + var = var100_name, + latitude = indices(1), + longitude = indices(1), + file_date= file_date, + time = values(dates), # 'sdate' 'syear' 'time' + time_var = 'time', + time_across = 'file_date', + merge_across_dims= TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude=c('lat','latitude'), + longitude=c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat',# + time = c('file_date')), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47) + ) + expect_equal( + obs[1, 1, 1, 1, 1, 1, 8:15], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 1:5], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 31:38], + c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 1, 2, 9:15], + c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 8.627518, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +}) + +test_that("2. change the file_date order", { + hcst<-Start(dat = ecmwf_path_hc, + var = var_name, + sdate = sdates.seq, + syear = indices(1:20), + time = 'all', + latitude = indices(1), + longitude = indices(1), + ensemble = 'all', + syear_depends = 'sdate', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate','syear') + ), + retrieve = F) + dates <- attr(hcst, 'Variables')$common$time + file_date <- sort(unique(sapply(dates, format, '%Y%m'))) + + +obs <- Start(dat = obs_path, + var = var100_name, + latitude = indices(1), + longitude = indices(1), + file_date= file_date, + time = values(dates), # 'sdate' 'syear' 'time' + time_var = 'time', + time_across = 'file_date', + merge_across_dims= TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude=c('lat','latitude'), + longitude=c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat',# + time = c('file_date')), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47) + ) + expect_equal( + obs[1, 1, 1, 1, 1, 1, 8:15], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 1:5], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 31:38], + c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 1, 2, 9:15], + c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 8.627518, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +}) -- GitLab