diff --git a/NEWS.md b/NEWS.md index 1206070fcc2b84384c8bb35de0eb2dcf1652ac72..a153e7745da1063f87d75f160c9cae4d9aec19a0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# startR v1.0.3 (Release date: 2020-) +- Bugfix for requiring the repetitive values from a single file when using +'merge_across_dims' and 'split_multiselected_dims'. The value positions were not +correct before. + # startR v1.0.2 (Release date: 2020-05-11) - Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1. - Add one new parameter 'merge_across_dims_narm' in Start(). If it is TRUE, diff --git a/R/Start.R b/R/Start.R index 3c544a4cc58fb14bc65c777616476cf143b558ae..c04894aa7aeb29ff89e939e4788cacd49d757c11 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3165,6 +3165,77 @@ Start <- function(..., # dim = indices/selectors, data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector + #NOTE: When one file contains values for dicrete dimensions, rearrange the + # chunks (i.e., work_piece) is necessary. + if (split_multiselected_dims) { + + # generate the correct order list from indices_chunk + final_order_list <- list() + i <- 1 + j <- 1 + a <- indices_chunk[i] + while (i < length(indices_chunk)) { + while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { + a <- c(a, indices_chunk[i+1]) + i <- i + 1 + } + final_order_list[[j]] <- a + a <- indices_chunk[i+1] + i <- i + 1 + j <- j + 1 + } + names(final_order_list) <- sapply(final_order_list, '[[', 1) + final_order_list <- lapply(final_order_list, length) + + if (!all(diff(as.numeric(names(final_order_list))) > 0)) { + + + # shape the vector into the array without split_dims + split_dims_pos <- match(split_dims, final_dims_fake) + new_dims <- c() + if (split_dims_pos[1] > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, prod(split_dims)) + names(new_dims)[split_dims_pos[1]] <- across_inner_dim + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake_no_split <- new_dims + data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) + # seperate 'time' dim into each work_piece length + data_array_seperate <- list() + tmp <- cumsum(unlist(length_inner_across_dim)) + tmp <- c(0, tmp) + for (i in 1:length(length_inner_across_dim)) { + data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + + # re-build the array: chunk + which_chunk <- as.numeric(names(final_order_list)) + how_many_indices <- unlist(final_order_list) + array_piece <- list() + ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + for (i in 1:length(final_order_list)) { + array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], + across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] + } + + # re-build the array: paste + data_array_tmp <- array_piece[[1]] + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + if (length(array_piece) > 1) { + for (i in 2:length(array_piece)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } + } + } + } + data_array <- array(data_array_tmp, dim = final_dims_fake) } else { # merge_across_dims_narm = F (old version) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index bcb937adcb7a0865af796e6ca9615d87666fdc6d..932ee598aca8151fb0f4ed295e5d43ba7d5e0e87 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -627,16 +627,9 @@ print(attr(obs, 'Dimensions')) # 1 1 4 3 256 512 ``` -Notice that when 'split_multiselected_dims' and 'merge_across_dims' are both used, -and dimension number of the splitted selector is more than two, a potential -problem of mixed dimension might occur. The following script is from part of -the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). - -It is important to check if **the order of file_date is in line with dates dimensions order!** -Regardless 'time', which is explicitly specified in Start(), the vector should list 'sdate' first, then 'syear'. -As you can see below, the first element '199607' is sdate = 1, and the second element '199612' is sdate = 2. If the order is wrong, you will still get a -return data but with mixed dimensions. Because 'sdate' and 'syear' are only -implied in the given selector vector without any attributes, Start() cannot check if the order of file_date and dates are consistent or not. +The splited dimension can have more than two dimensions. +The following example comes from the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). +The 'time' selector has three dimensions 'sdate', 'syear', and 'time'. ```r dates <- attr(hcst, 'Variables')$common$time diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R index 86a704d62f5b3f8db16a4378715f771cf5b3e309..539a24d8051f796dff317d45b7630c926f8e3ec3 100644 --- a/inst/doc/usecase/ex1_7_split_merge.R +++ b/inst/doc/usecase/ex1_7_split_merge.R @@ -48,26 +48,15 @@ dim(dates) #----------------------------------------------------------------------- #----------------------------------------------------------------------- -# This two lines should NOT be used!! It is an example showing the potential -# problem when using 'split_multiselected_dims' and 'merge_across_dims'. -# If 'dates' is reordered to the order that 'syear' ahead of 'sdate', while -# 'file_date' below remains the same, the result will have mixed dimension. -# The order of 'time', which is the name of the inner dimension `time = values(dates)` -# below, is not important. We only need to pay attention on 'sdate' and 'syear' -# because they are relavent of the required files (i.e., 'file_date' below). -# See more explanation in FAQ How-to-#17. - +# If you need to reorder the dimensions of the 'time' selector, you can use +# s2dv::Reorder function. These two lines are not used in the following example. library(s2dv) dates <- Reorder(dates, c('syear', 'sdate', 'time')) #----------------------------------------------------------------------- #----------------------------------------------------------------------- # Use 'dates' generated above to get the required files. -# It is important to check if the order of file_date is in line with -# dates dimensions! Regardless 'time', the vector should list 'sdate' first, -# then 'syear'. As you can see below, the first element '199607' is sdate = 1, -# and the second element '199612' is sdate = 2. - +# The order of file_date is not important. sort() is not necessary to use. file_date <- sort(unique(gsub('-', '', sapply(as.character(dates), substr, 1, 7)))) print(file_date) diff --git a/tests/testthat/test-Start-multiple-sdates.R b/tests/testthat/test-Start-multiple-sdates.R new file mode 100644 index 0000000000000000000000000000000000000000..cd5bac8f86128d86334c985fabfc389c893b1a62 --- /dev/null +++ b/tests/testthat/test-Start-multiple-sdates.R @@ -0,0 +1,157 @@ +context("Start() multiple sdate with split + merge dim") + +# When certain values in one observation file are required more than once, +# and 'merge_across_dims' + 'split_multiselected_dims' are used, the values may be misplaced. +# It might happen when reading experimental data with many start dates, +# and the corresponding observations are required to have the same data structure. + +ecmwf_path_hc <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc') +obs_path <-paste0('/esarchive/recon/ecmwf/era5/daily/$var$-r240x121/$var$_$file_date$.nc') + +var_name <- 'sfcWind' +var100_name <- 'windagl100' + +sdates.seq <- c("20161222","20161229","20170105","20170112") + +test_that("1. ", { +hcst<-Start(dat = ecmwf_path_hc, + var = var_name, + sdate = sdates.seq, + syear = 'all', + time = 'all', + latitude = indices(1), + longitude = indices(1), + ensemble = 'all', + syear_depends = 'sdate', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate','syear') + ), + retrieve = F) +dates <- attr(hcst, 'Variables')$common$time +file_date <- unique(sapply(dates, format, '%Y%m')) + +obs <- Start(dat = obs_path, + var = var100_name, + latitude = indices(1), + longitude = indices(1), + file_date= file_date, + time = values(dates), # 'sdate' 'syear' 'time' + time_var = 'time', + time_across = 'file_date', + merge_across_dims= TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude=c('lat','latitude'), + longitude=c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat',# + time = c('file_date')), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47) + ) + expect_equal( + obs[1, 1, 1, 1, 1, 1, 8:15], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 1:5], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 31:38], + c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 1, 2, 9:15], + c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 8.627518, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +}) + +test_that("2. change the file_date order", { + hcst<-Start(dat = ecmwf_path_hc, + var = var_name, + sdate = sdates.seq, + syear = indices(1:20), + time = 'all', + latitude = indices(1), + longitude = indices(1), + ensemble = 'all', + syear_depends = 'sdate', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate','syear') + ), + retrieve = F) + dates <- attr(hcst, 'Variables')$common$time + file_date <- sort(unique(sapply(dates, format, '%Y%m'))) + + +obs <- Start(dat = obs_path, + var = var100_name, + latitude = indices(1), + longitude = indices(1), + file_date= file_date, + time = values(dates), # 'sdate' 'syear' 'time' + time_var = 'time', + time_across = 'file_date', + merge_across_dims= TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude=c('lat','latitude'), + longitude=c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat',# + time = c('file_date')), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47) + ) + expect_equal( + obs[1, 1, 1, 1, 1, 1, 8:15], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 1:5], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 31:38], + c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 1, 2, 9:15], + c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 8.627518, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +})