From 4a83aeaa359f788cc6c4031134ebb32d167d19d7 Mon Sep 17 00:00:00 2001
From: aho
Date: Tue, 9 Mar 2021 13:16:19 +0100
Subject: [PATCH 01/18] Fix the time unit attribute retrieval if the format is
 YYYY-M-D. Also, split_multiselected_dims is changed to FALSE if no dimension
 can be split.

---
 R/NcDataReader.R |  7 +++++++
 R/Start.R        | 11 +++++++++++
 2 files changed, 18 insertions(+)

diff --git a/R/NcDataReader.R b/R/NcDataReader.R
index 741ffbb..ebc58fc 100644
--- a/R/NcDataReader.R
+++ b/R/NcDataReader.R
@@ -221,6 +221,13 @@ NcDataReader <- function(file_path = NULL, file_object = NULL,
         # Origin year and month
         ori_year <- as.numeric(substr(parts[2], 1, 4))
         ori_month <- as.numeric(substr(parts[2], 6, 7))
+        if (is.na(ori_month)) {
+          ori_month <- as.numeric(substr(parts[2], 6, 6))
+        }
+        if (!is.numeric(ori_year) | !is.numeric(ori_month)) {
+          stop(paste0("The time unit attribute format is not 'YYYY-MM-DD' or 'YYYY-M-D'. ",
+                      "Check the file or contact the maintainer."))
+        }
 
         if (calendar == 'gregorian') {
           # Find how many years + months
diff --git a/R/Start.R b/R/Start.R
index 80e1c83..d15fdb7 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -3647,6 +3647,9 @@ Start <- function(...,  # dim = indices/selectors,
         }
       }
     }
+    #=========================================================================
+    # Find the dimension to split if split_multiselected_dims = TRUE.
+    # If there is no dimension able to be split, change split_multiselected_dims to FALSE.
     all_split_dims <- NULL
     if (split_multiselected_dims) {
       for (dim_param in 1:length(dim_params)) {
@@ -3678,6 +3681,13 @@ Start <- function(...,  # dim = indices/selectors,
         }
       }
     }
+    if (is.null(all_split_dims)) {
+      split_multiselected_dims <- FALSE
+      .warning(paste0("No dimension able to be split was found. The parameter ",
+                      "'split_multiselected_dims' is changed to FALSE."))
+    }
+    #======================================================================
+
     if (merge_across_dims_narm) {
       # only merge_across_dims -> the 'time' dim length needs to be adjusted
       across_inner_dim <- inner_dims_across_files[[1]]  #TODO: more than one?
@@ -3730,6 +3740,7 @@ Start <- function(...,  # dim = indices/selectors,
     if (split_multiselected_dims & merge_across_dims) {
       # TODO: More than one split?
       inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files)
+      # if inner_dim is not the first, change!
      if (inner_dim_pos_in_split_dims != 1) {
        split_dims <- c(split_dims[inner_dim_pos_in_split_dims],
--
GitLab

From c18c265d7f091c5f7d206c17bc188d0e5766c0d7 Mon Sep 17 00:00:00 2001
From: aho
Date: Tue, 9 Mar 2021 13:31:34 +0100
Subject: [PATCH 02/18] Move the warning to the correct place.

---
 R/Start.R                               | 10 +++++-----
 tests/testthat/test-Start-split-merge.R |  2 ++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index d15fdb7..22eb323 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -3680,11 +3680,11 @@ Start <- function(...,  # dim = indices/selectors,
         }
       }
     }
-    }
-    if (is.null(all_split_dims)) {
-      split_multiselected_dims <- FALSE
-      .warning(paste0("No dimension able to be split was found. The parameter ",
-                      "'split_multiselected_dims' is changed to FALSE."))
+      if (is.null(all_split_dims)) {
+        split_multiselected_dims <- FALSE
+        .warning(paste0("No dimension able to be split was found. The parameter ",
+                        "'split_multiselected_dims' is changed to FALSE."))
+      }
     }
     #======================================================================
 
diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R
index e998844..d73cf1e 100644
--- a/tests/testthat/test-Start-split-merge.R
+++ b/tests/testthat/test-Start-split-merge.R
@@ -5,6 +5,7 @@ context("Start() split + merge dim and value check")
 var_name <- 'sfcWind'
 path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc'
 
+suppressWarnings(
 hcst <- Start(dat = path.exp,
               var = var_name,
               sdate = c('20160704', '20161222'),
@@ -19,6 +20,7 @@ hcst <- Start(dat = path.exp,
                                  time = c('sdate', 'syear')),
               retrieve = F, silent = TRUE)
+)
 
 dates <- attr(hcst, 'Variables')$common$time
 file_date <- sort(unique(gsub('-', '', sapply(as.character(dates), substr, 1, 7))))
--
GitLab

From 3c3783e3b188f79c683eb2e2df96e3f1ea3174a2 Mon Sep 17 00:00:00 2001
From: aho
Date: Wed, 17 Mar 2021 17:25:08 +0100
Subject: [PATCH 03/18] Consider different dim order when finding the largest
 dim length

---
 R/Start.R | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/R/Start.R b/R/Start.R
index 80e1c83..5139ac5 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -2260,6 +2260,17 @@ Start <- function(...,  # dim = indices/selectors,
 
           # Find the longest dimensions from all the files
           largest_data_dims <- rep(0, length(data_dims_all_files[[1]]))
+
+          # The inner dim order may differ among files. Need to align them before
+          # finding out the largest dim length.
+          dim_names_first_file <- names(data_dims_all_files[[1]])
+          same_dim_order <- lapply(lapply(data_dims_all_files, names),
+                                   identical, dim_names_first_file)
+          for (to_fix in which(!unlist(same_dim_order))) {
+            data_dims_all_files[[to_fix]] <- data_dims_all_files[[to_fix]][match(dim_names_first_file,
+                                             names(data_dims_all_files[[to_fix]]))]
+          }
+
           for (kk in 1:length(data_dims_all_files[[1]])) {
             largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk))
           }
--
GitLab

From f53ad8f54236fa0857f14950de829f5550c4042b Mon Sep 17 00:00:00 2001
From: aho
Date: Fri, 19 Mar 2021 12:48:23 +0100
Subject: [PATCH 04/18] Revise the check for synonym return_vars names

---
 R/Start.R | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index 5139ac5..850bc7d 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -1400,32 +1400,27 @@ Start <- function(...,  # dim = indices/selectors,
       }
     }
 
-#/////////////////////////////////////////////////////
+#==================================================
 # Check if return_vars name is inner dim name. If it is synonim, change back to
 # inner dim name and return a warning.
     dim_params_found_file_dims <- dim_params[found_file_dims[[i]]]
-    file_dims_array_dim_names <- unlist(lapply(lapply(dim_params_found_file_dims, dim),
-                                               names))
-    if (any(!names(return_vars) %in% expected_inner_dims[[i]] &
-            !names(return_vars) %in% unlist(var_params) &
-            !names(return_vars) %in% file_dims_array_dim_names)) {
-      not_inner_dimname_ind <- which(!names(return_vars) %in% expected_inner_dims[[i]])
-      for (not_inner_dim in 1:length(not_inner_dimname_ind)) {
-        wrong_name_return_vars <- names(return_vars)[not_inner_dimname_ind[not_inner_dim]]
-        if (!wrong_name_return_vars %in% unlist(synonims)) {
-          stop(paste0("Could not find variable '", wrong_name_return_vars, "' (or its ",
-                      "synonims if specified) in the inner dimension names. The expected ",
The expected ", - "name should be ", paste(paste0("'", expected_inner_dims[[i]], "'"), collapse = ', '), ".")) - } else { - inner_dim_name <- names(unlist(lapply(lapply(synonims, '%in%', wrong_name_return_vars), which))) - names(return_vars)[not_inner_dimname_ind[not_inner_dim]] <- inner_dim_name - .warning(paste0("The name '", wrong_name_return_vars, "' in parameter 'return_vars' ", - "is synonim. Change it back to the inner dimension name, '", - inner_dim_name, "'.")) - } + if (any(names(return_vars) %in% unlist(synonims) & + !names(return_vars) %in% names(synonims))) { + use_syn_names <- which(names(return_vars) %in% unlist(synonims) & + !names(return_vars) %in% names(synonims)) + for (use_syn_name in use_syn_names) { + wrong_name <- names(return_vars)[use_syn_name] + names(return_vars)[use_syn_name] <- names(unlist( + lapply(lapply(synonims, '%in%', + names(return_vars)[use_syn_name]), + which))) + .warning(paste0("The name '", wrong_name, + "' in parameter 'return_vars' in synonim. ", + "Change it back to the inner dimension name, '", + names(return_vars)[use_syn_name], "'.")) } } -#///////////////////////////////////////////////////// +#===================================================== ## (Check the *_var parameters). if (any(!(unlist(var_params) %in% names(return_vars)))) { -- GitLab From 83c8e6dbcc2f428bf6eaff9d4d77c7df3f692636 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 19 Mar 2021 13:09:59 +0100 Subject: [PATCH 05/18] Revise unit tests about implicit inner dim and split file dim --- .../testthat/test-Start-implicit_inner_dim.R | 62 +++---------------- tests/testthat/test-Start-split-merge.R | 56 ++++++++++++++--- 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/tests/testthat/test-Start-implicit_inner_dim.R b/tests/testthat/test-Start-implicit_inner_dim.R index 621fe5b..26e3ce7 100644 --- a/tests/testthat/test-Start-implicit_inner_dim.R +++ b/tests/testthat/test-Start-implicit_inner_dim.R @@ -1,20 +1,15 @@ context("Start() implicit inner dimension") -# The unit test is for the implicitly defined inner dimension. If a file dimension selector -# is an array with named dimensions, and 'split_multiselected_dims' is used, then the file -# dim can be split into multiple dimensions that may contain inner dimensions. - -# merge_across_dims + split_multiselected_dims + implicit inner dim???? - +# The unit test is for the implicit inner dimension. If the inner dimension length is 1, +# startR allows it not to be specified in the call. Users can still define it in +# 'return_vars'. #--------------------------------------------------------------- -test_that("1. Split into inner dimension", { +test_that("1. 
time = 1", { obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc" variable <- "prlr" -dates_file <- c("201311","201312","201411","201412") -dim(dates_file) <- c(time = 2, syear = 2) +dates_file <- c("201311","201312") -# (1) suppressWarnings( obs <- Start(dat = obs.path, var = variable, @@ -25,8 +20,8 @@ obs <- Start(dat = obs.path, longitude_reorder = CircularSort(-180, 180), synonims = list(latitude = c('lat', 'latitude'), longitude = c('lon', 'longitude')), - return_vars = list(latitude = 'dat', - longitude = 'dat', + return_vars = list(latitude = NULL, + longitude = NULL, time = 'file_date'), split_multiselected_dims = TRUE, retrieve = FALSE) @@ -34,11 +29,11 @@ obs <- Start(dat = obs.path, expect_equal( attr(obs, 'Dimensions'), -c(dat = 1, var = 1, time = 2, syear = 2, latitude = 18, longitude = 81) +c(dat = 1, var = 1, file_date = 2, latitude = 18, longitude = 81) ) expect_equal( dim(attr(obs, 'Variables')$common$time), -c(file_date = 4, time = 1) +c(file_date = 2, time = 1) ) expect_equal( attr(obs, 'Variables')$common$time[1, 1], @@ -48,42 +43,3 @@ as.POSIXct('2013-11-15 23:30:00', tz = 'UTC') }) -test_that("2. Split into file dimension", { - -obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc" -variable <- "prlr" -dates_file <- c("201311","201312","201411","201412") -dim(dates_file) <- c(smonth = 2, syear = 2) - -suppressWarnings( -obs <- Start(dat = obs.path, - var = variable, - file_date = dates_file, - time = 'all', - latitude = values(list(35.6, 40)), - latitude_reorder = Sort(decreasing = TRUE), - longitude = values(list(-10, 10)), - longitude_reorder = CircularSort(-180, 180), - synonims = list(latitude = c('lat', 'latitude'), - longitude = c('lon', 'longitude')), - return_vars = list(latitude = 'dat', - longitude = 'dat', - time = 'file_date'), - split_multiselected_dims = TRUE, - retrieve = FALSE) -) - -expect_equal( -attr(obs, 'Dimensions'), -c(dat = 1, var = 1, smonth = 2, syear = 2, time = 1, latitude = 18, longitude = 81) -) -expect_equal( -dim(attr(obs, 'Variables')$common$time), -c(file_date = 4, time = 1) -) -expect_equal( -attr(obs, 'Variables')$common$time[1, 1], -as.POSIXct('2013-11-15 23:30:00', tz = 'UTC') -) - -}) diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R index e998844..10e8ab6 100644 --- a/tests/testthat/test-Start-split-merge.R +++ b/tests/testthat/test-Start-split-merge.R @@ -4,7 +4,7 @@ context("Start() split + merge dim and value check") var_name <- 'sfcWind' path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc' - +suppressWarnings( hcst <- Start(dat = path.exp, var = var_name, sdate = c('20160704', '20161222'), @@ -19,6 +19,7 @@ hcst <- Start(dat = path.exp, time = c('sdate', 'syear')), retrieve = F, silent = TRUE) +) dates <- attr(hcst, 'Variables')$common$time file_date <- sort(unique(gsub('-', '', sapply(as.character(dates), substr, 1, 7)))) @@ -27,7 +28,7 @@ path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc' test_that("1. split + merge + narm", { -skip_on_cran() +suppressWarnings( obs <- Start(dat = path.obs, var = var_name, file_date = file_date, # a vector @@ -44,7 +45,7 @@ obs <- Start(dat = path.obs, longitude = 'dat', time = 'file_date'), retrieve = T) - +) expect_equal( dim(obs), c(dat = 1, var = 1, latitude = 10, longitude = 10, sdate = 2, syear = 3, time = 12) @@ -67,7 +68,7 @@ obs <- Start(dat = path.obs, test_that("2. 
no split + merge + narm", { -skip_on_cran() +suppressWarnings( obs <- Start(dat = path.obs, var = var_name, file_date = file_date, # a vector @@ -84,7 +85,7 @@ obs <- Start(dat = path.obs, longitude = 'dat', time = 'file_date'), retrieve = T) - +) expect_equal( dim(obs), c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 72) @@ -107,7 +108,7 @@ obs <- Start(dat = path.obs, test_that("3. no split + merge + no narm", { -skip_on_cran() +suppressWarnings( obs <- Start(dat = path.obs, var = var_name, file_date = file_date, # a vector @@ -124,7 +125,7 @@ obs <- Start(dat = path.obs, longitude = 'dat', time = 'file_date'), retrieve = T) - +) expect_equal( dim(obs), c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 108) @@ -145,3 +146,44 @@ obs <- Start(dat = path.obs, ) }) +test_that("4. split only", { + +obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc" +variable <- "prlr" +dates_file <- c("201311","201312","201411","201412") +dim(dates_file) <- c(smonth = 2, syear = 2) + +suppressWarnings( +obs <- Start(dat = obs.path, + var = variable, + file_date = dates_file, + time = 'all', + latitude = values(list(35.6, 40)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-10, 10)), + longitude_reorder = CircularSort(-180, 180), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + split_multiselected_dims = TRUE, + retrieve = FALSE) +) + +expect_equal( +attr(obs, 'Dimensions'), +c(dat = 1, var = 1, smonth = 2, syear = 2, time = 1, latitude = 18, longitude = 81) +) +expect_equal( +dim(attr(obs, 'Variables')$common$time), +c(file_date = 4, time = 1) +) +expect_equal( +attr(obs, 'Variables')$common$time[1, 1], +as.POSIXct('2013-11-15 23:30:00', tz = 'UTC') +) + + + +}) -- GitLab From 06950a03cc272e9d1c4f916ebc31c90c0e324bed Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 19 Mar 2021 13:28:23 +0100 Subject: [PATCH 06/18] Revise split_multiselected_dims --- inst/doc/faq.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 24e7acb..3b6d5fa 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -611,7 +611,30 @@ List of 1 The selectors can be not only vectors, but also multidimensional array. For instance, the 'time' dimension can be assigned by a two-dimensional array `[sdate = 12, time = 31]`, which is 31 timesteps for 12 start dates. You may want to have both 'sdate' and 'time' in the output dimension, even though 'sdate' is not explicitly specified in Start(). -The parameter 'split_multiselected_dims' is for this goal. It is common in the case that uses experimental data attributes to get the corresponding observational data. +The parameter 'split_multiselected_dims' is for this goal. It can be used to reshape the +file dimensions, and it is also common in the case that experimental data attributes are +used to define observational data inner dimensions, so we can get the corresponding observational data in the same dimension structure. + +Here is a simple example. By defining the selector of the file dimension 'file_date' as a +two-dimensional array, we can reshape this dimension into 'month' and 'year'. 
+ +```r +obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc" +file_date <- c("201311","201312","201411","201412") +dim(file_date) <- c(month = 2, year = 2) + +obs <- Start(dat = obs.path, + var = 'prlr', + file_date = file_date, + time = 'all', + lat = indices(1:10), + lon = indices(1:10), + return_vars = list(lat = NULL, + lon = NULL, + time = 'file_date'), + split_multiselected_dims = TRUE, + retrieve = TRUE) +``` The following script is part of the use case [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R). The time selector for observational data comes from experimental data above (neglected here). The dimension number of the selector is two. -- GitLab From bf1b0cf090e4bdcc8f8e06ff081e7c364a553f85 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 19 Mar 2021 14:12:36 +0100 Subject: [PATCH 07/18] Stop if merge_across_dims and split_multiselected_dims are both used but this file dim selector is multi-dim array --- R/Start.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index 850bc7d..3d82d57 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3636,6 +3636,7 @@ Start <- function(..., # dim = indices/selectors, # final_dims will be used for collocation of data, whereas final_dims_fake # will be used for shaping the final array to be returned to the user. final_dims_fake <- final_dims + if (merge_across_dims) { if (!is.null(inner_dims_across_files)) { for (file_dim_across in names(inner_dims_across_files)) { @@ -3666,7 +3667,15 @@ Start <- function(..., # dim = indices/selectors, 1:length(split_dims)) } old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param]) - + + # If merge_across_dims and split_multiselected_dims are both used, + # on one file dim, and this file dim is multi-dim, it doesn't work. + if (identical(old_dim_pos, integer(0))) { + stop(paste0("The dimension '", names(dim_params)[dim_param], + "' to be split cannot be found after 'merge_across_dims' ", + "is used. Check if the reshape parameters are used appropriately.")) + } + # NOTE: Three steps to create new dims. # 1st: Put in the dims before split_dim. # 2nd: Replace the old_dim with split_dims. @@ -3684,6 +3693,7 @@ Start <- function(..., # dim = indices/selectors, } } } + if (merge_across_dims_narm) { # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? -- GitLab From 02dcafbdc06eb80f7c0e600725abe6d3242e1fa8 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 29 Apr 2021 16:30:34 +0200 Subject: [PATCH 08/18] Refine How_to_11 --- inst/doc/faq.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 3b6d5fa..714a7ef 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -475,7 +475,7 @@ The code to reproduce this behaviour could be found in the Use Cases section, [e ### 11. Select the longitude/latitude region There are three ways to specify the dimension selectors: special keywords('all', 'first', 'last'), indices, or values (find more details in [pratical guide](inst/doc/practical_guide.md)). -The parameter 'xxx_reorder' is only effective when using **values**. +For now, the parameter 'xxx_reorder' is only effective when using **values**. There are two reorder functions in startR package, **Sort()** for latitude and **CircularSort()** for longitude. 
 Sort() is a wrapper function of base function sort(), rearranging the values from low to high (decreasing = TRUE, default) or
 from high to low (decreasing = FALSE). For example, if you want to sort latitude from 90 to -90, use `latitude_reorder = Sort(decreasing = TRUE)`.
@@ -487,9 +487,16 @@ It requires two input numbers defining the borders of the whole region, which ar
 `longitude_reorder = CircularSort(0, 360)` means that the left border is 0 and the right border is 360, so 360 will be put back to 0,
 361 will be put back to 1, and -1 will become 359. After circulating values, CircularSort() also sorts the values from small to big.
 It may cause the discontinous sub-region, but the problem can be solved by assigning the borders correctly.
+Note that the two points in CircularSort() are regarded as the same point. Hence, if you want to load the global longitude, lonmin/lonmax should be slightly different, e.g., 0/359.9, 0.1/360, -179.9/180, -180/179.9, etc. Otherwise, only one point will be returned.
 
 The following chart helps you to decide how to use CircularSort() to get the desired region.
-The first row represents the longitude border of the requested region, e.g., `values(list(lon.min, lon.max))`.
+The first row represents the longitude border of the requested region, e.g., `values(list(lon.min, lon.max))`,
+and the white part is the returned longitude range corresponding to each CircularSort() setting.
+
+Here are some key points:
+- The original longitude range does not matter. No matter whether the original longitude is [0, 360] or [-180, 180], Start() will return the values shown in the chart according to the lonmin/lonmax you set.
+- The lonmin/lonmax value should be consistent with CircularSort(), so the returned values are continuous. For example, if `lonmin/lonmax = -60/60`, `CircularSort(-180, 180)` should be used.
+- Define the longitude range as the one you want to get, regardless of the original file. For example, if you want the data to be [-180, 180], define `lonmin/lonmax = -179.9/180` and `CircularSort(-180, 180)`, even if the original longitude in the netCDF file is [0, 360].
+
 Note that this chart only provides the idea. The real numbers may slightly differ depending on the original/transform values.
--
GitLab

From 7c5a8dfa2594c80d2eeaed35289153fb4c044026 Mon Sep 17 00:00:00 2001
From: aho
Date: Fri, 30 Apr 2021 09:17:01 +0200
Subject: [PATCH 09/18] Fix link

---
 inst/doc/usecase.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index ac4e4ad..137f473 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -25,7 +25,7 @@ in a comparable structure. It also shows how to use parameters `xxx_tolerance`,
   5. [Use reorder functions to get desired lat/lon region](inst/doc/usecase/ex1_5_latlon_reorder.R)
 This script shows you how to use reorder function (`Sort()`, `CircularSort()`) to
-get the desired longitude and latitude region. See [FAQ How-to-#11] (/inst/doc/faq.md#11-read-latitude-and-longitude-with-the-usage-of-parameter-xxx_reorder)
+get the desired longitude and latitude region. See [FAQ How-to-#11] (inst/doc/faq.md#11-select-the-longitudelatitude-region)
 for more explanation.
 
   6. 
[Loading gridpoint data](inst/doc/usecase/ex1_6_gridpoint_data.R) -- GitLab From 9d2c358a5cc99103b9ecc17f209a9db65dfb0b55 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 30 Apr 2021 09:17:43 +0200 Subject: [PATCH 10/18] Fix Sort(decreasing = T) error --- inst/doc/faq.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 714a7ef..83676f3 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -478,8 +478,8 @@ There are three ways to specify the dimension selectors: special keywords('all', For now, the parameter 'xxx_reorder' is only effective when using **values**. There are two reorder functions in startR package, **Sort()** for latitude and **CircularSort()** for longitude. -Sort() is a wrapper function of base function sort(), rearranging the values from low to high (decreasing = TRUE, default) or -from high to low (decreasing = FALSE). For example, if you want to sort latitude from 90 to -90, use `latitude_reorder = Sort(decreasing = TRUE)`. +Sort() is a wrapper function of base function sort(), rearranging the values from low to high (decreasing = FALSE, default) or +from high to low (decreasing = TRUE). For example, if you want to sort latitude from 90 to -90, use `latitude_reorder = Sort(decreasing = TRUE)`. By this means, the result will always from big to small value no matter how the original order is. On the other hand, the concept of CircularSort() is different. It is used for a circular region, putting the out-of-region values back to the region. -- GitLab From 45cbe5a9aedaad408b21cc6cc93a5b4edc24c4d7 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 30 Apr 2021 09:20:17 +0200 Subject: [PATCH 11/18] Fix typo --- inst/doc/usecase.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index 137f473..06f2c04 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -25,7 +25,7 @@ in a comparable structure. It also shows how to use parameters `xxx_tolerance`, 5. [Use reorder functions to get desired lat/lon region](inst/doc/usecase/ex1_5_latlon_reorder.R) This script shows you how to use reorder function (`Sort()`, `CircularSort()`) to -get the desired longitude and latitude region. See [FAQ How-to-#11] (inst/doc/faq.md#11-select-the-longitudelatitude-region) +get the desired longitude and latitude region. See [FAQ How-to-#11](inst/doc/faq.md#11-select-the-longitudelatitude-region) for more explanation. 6. [Loading gridpoint data](inst/doc/usecase/ex1_6_gridpoint_data.R) -- GitLab From 14e6736106fb69779591f34885eca7a5eb42b017 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 5 May 2021 23:29:17 +0200 Subject: [PATCH 12/18] Reorder chunks when split + merge. Thorough check is further needed --- R/Start.R | 111 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/R/Start.R b/R/Start.R index e8a1d7d..9776ead 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3702,27 +3702,29 @@ Start <- function(..., # dim = indices/selectors, } } #====================================================================== - - if (merge_across_dims_narm) { + if (merge_across_dims) { # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? - across_file_dim <- names(inner_dims_across_files) #TODO: more than one? 
# Get the length of each inner_dim ('time') along each file_dim ('file_date') length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) - - if (!split_multiselected_dims) { - final_dims_fake_name <- names(final_dims_fake) - pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) - new_length_inner_dim <- sum(unlist(length_inner_across_dim)) - if (pos_across_inner_dim != length(final_dims_fake)) { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim, - final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) - } else { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim) + + if (merge_across_dims_narm) { + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + + if (!split_multiselected_dims) { + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name } - names(final_dims_fake) <- final_dims_fake_name } } @@ -4047,43 +4049,46 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. 
- if (merge_across_dims_narm) { - - # Get the length of these two dimensions in final_dims - length_inner_across_store_dims <- final_dims[across_inner_dim] - length_file_across_store_dims <- final_dims[across_file_dim] - - # Create a logical array for merge_across_dims - logi_array <- array(rep(FALSE, - length_file_across_store_dims * length_inner_across_store_dims), - dim = c(length_inner_across_store_dims, length_file_across_store_dims)) - for (i in 1:length_file_across_store_dims) { #1:4 - logi_array[1:length_inner_across_dim[[i]], i] <- TRUE - } - - # First, get the data array with final_dims dimension - data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) - - # Change the NA derived from additional spaces to -9999, then remove these -9999 - func_remove_blank <- function(data_array, logi_array) { - # dim(data_array) = [time, file_date] - # dim(logi_array) = [time, file_date] - # Change the blank spaces from NA to -9999 - data_array[which(!logi_array)] <- -9999 - return(data_array) + if (merge_across_dims) { + if (!merge_across_dims_narm) { + data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) + } else { + # Get the length of these two dimensions in final_dims + length_inner_across_store_dims <- final_dims[across_inner_dim] + length_file_across_store_dims <- final_dims[across_file_dim] + + # Create a logical array for merge_across_dims + logi_array <- array(rep(FALSE, + length_file_across_store_dims * length_inner_across_store_dims), + dim = c(length_inner_across_store_dims, length_file_across_store_dims)) + for (i in 1:length_file_across_store_dims) { #1:4 + logi_array[1:length_inner_across_dim[[i]], i] <- TRUE + } + + # First, get the data array with final_dims dimension + data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) + + # Change the NA derived from additional spaces to -9999, then remove these -9999 + func_remove_blank <- function(data_array, logi_array) { + # dim(data_array) = [time, file_date] + # dim(logi_array) = [time, file_date] + # Change the blank spaces from NA to -9999 + data_array[which(!logi_array)] <- -9999 + return(data_array) + } + data_array_final_dims <- multiApply::Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector } - data_array_final_dims <- multiApply::Apply(data_array_final_dims, - target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - ## reorder back to the correct dim - tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- .aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - - #NOTE: When one file contains values for dicrete dimensions, rearrange the - # chunks (i.e., work_piece) is necessary. + + #NOTE: When one file contains values for dicrete dimensions, rearrange the + # chunks (i.e., work_piece) is necessary. 
if (split_multiselected_dims) { # generate the correct order list from indices_chunk @@ -4091,7 +4096,7 @@ Start <- function(..., # dim = indices/selectors, i <- 1 j <- 1 a <- indices_chunk[i] - while (i < length(indices_chunk)) { + while (i <= length(indices_chunk)) { while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { a <- c(a, indices_chunk[i+1]) i <- i + 1 -- GitLab From 76907ff7c5ad263c34752caf4f320c9aa4d54507 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 13 May 2021 17:50:40 +0200 Subject: [PATCH 13/18] Fix mixed-dimension error when reshape params are used. Create unit tests --- R/Start.R | 97 ++--- inst/doc/usecase/ex1_3_attr_loadin.R | 44 +-- tests/testthat/test-Start-reshape.R | 503 ++++++++++++++++++++++++ tests/testthat/test-Start-split-merge.R | 2 - 4 files changed, 555 insertions(+), 91 deletions(-) create mode 100644 tests/testthat/test-Start-reshape.R diff --git a/R/Start.R b/R/Start.R index 9776ead..33e9fac 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1852,6 +1852,7 @@ Start <- function(..., # dim = indices/selectors, } else { common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim } + tmp <- file_dim_as_selector_array_dim } else if (inner_dim %in% inner_dims_across_files) { #(2) file_dim_name <- names(which(inner_dim == inner_dims_across_files)) if (file_dim_name %in% found_pattern_dim) { @@ -1859,6 +1860,7 @@ Start <- function(..., # dim = indices/selectors, } else { common_return_vars[[inner_dim]] <- file_dim_name } + tmp <- file_dim_name } .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp, "', but '", inner_dim, "' is not in return_vars list or is NULL. ", @@ -4049,7 +4051,7 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. - if (merge_across_dims) { + if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) } else { @@ -4086,6 +4088,11 @@ Start <- function(..., # dim = indices/selectors, data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector } + + if (length(data_array_tmp) != prod(final_dims_fake)) { + stop(paste0("After reshaping, the data do not fit into the expected output dimension. ", + "Check if the reshaping parameters are used correctly.")) + } #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. 
@@ -4110,57 +4117,55 @@ Start <- function(..., # dim = indices/selectors, final_order_list <- lapply(final_order_list, length) if (!all(diff(as.numeric(names(final_order_list))) > 0)) { - - - # shape the vector into the array without split_dims - split_dims_pos <- match(split_dims, final_dims_fake) - new_dims <- c() - if (split_dims_pos[1] > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) - } - new_dims <- c(new_dims, prod(split_dims)) - names(new_dims)[split_dims_pos[1]] <- across_inner_dim - if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) - } - final_dims_fake_no_split <- new_dims - data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) - # seperate 'time' dim into each work_piece length - data_array_seperate <- list() - tmp <- cumsum(unlist(length_inner_across_dim)) - tmp <- c(0, tmp) - for (i in 1:length(length_inner_across_dim)) { - data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, - (tmp[i] + 1):tmp[i + 1]) - } - - # re-build the array: chunk - which_chunk <- as.numeric(names(final_order_list)) - how_many_indices <- unlist(final_order_list) - array_piece <- list() - ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) - for (i in 1:length(final_order_list)) { - array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], - across_inner_dim, - ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) - ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - } + # shape the vector into the array without split_dims + split_dims_pos <- match(split_dims, final_dims_fake) + new_dims <- c() + if (split_dims_pos[1] > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, prod(split_dims)) + names(new_dims)[split_dims_pos[1]] <- across_inner_dim + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake_no_split <- new_dims + data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) + # seperate 'time' dim into each work_piece length + data_array_seperate <- list() + tmp <- cumsum(unlist(length_inner_across_dim)) + tmp <- c(0, tmp) + for (i in 1:length(length_inner_across_dim)) { + data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + + # re-build the array: chunk + which_chunk <- as.numeric(names(final_order_list)) + how_many_indices <- unlist(final_order_list) + array_piece <- list() + ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + for (i in 1:length(final_order_list)) { + array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], + across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] + } - # re-build the array: paste - data_array_tmp <- array_piece[[1]] - along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) - if (length(array_piece) > 1) { - for (i in 2:length(array_piece)) { - data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], - along = along_pos) + # 
re-build the array: paste + data_array_tmp <- array_piece[[1]] + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + if (length(array_piece) > 1) { + for (i in 2:length(array_piece)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } } - } - } + } } data_array <- array(data_array_tmp, dim = final_dims_fake) - } else { # merge_across_dims_narm = F (old version) + } else { # ! (merge_across_dims + split_multiselected_dims) (old version) data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) } diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R index d514c30..e2c8211 100644 --- a/inst/doc/usecase/ex1_3_attr_loadin.R +++ b/inst/doc/usecase/ex1_3_attr_loadin.R @@ -77,7 +77,7 @@ # NOTE: 'merge_across_dims_narm = TRUE' is necessary because the observational # data have unequal time length of 30-day and 31-day months. # If the NAs are not removed, unwanted NAs will exist and make the -# values misplaced in the array. See 'bonus' below for more explanation. +# values misplaced in the array. #------- Check erai ----------- dim(erai) @@ -129,45 +129,3 @@ attr(erai, 'Variables')$common$time[2, ] -# //////////////////"BONUS"////////////////////// -# Here is something more to show the usage of parameter 'merge_across_dims_narm'. -# If the last day of 30-day months is NA instead of the first day of the following month, -# NAs are needed to exist in the array. In this case, 'merge_across_dims_narm' -# should be FALSE. - - dates <- attr(system4, 'Variables')$common$time - dates[2, 31] -#[1] "1994-07-01 UTC" - dates[2, 31] <- NA # Jun - dates[5, 31] <- NA # Sep - dates[7, 31] <- NA # Nov - - erai <- Start(dat = repos_obs, - var = 'tas', - file_date = dates_file, - time = values(dates), - latitude = indices(1:10), - longitude = indices(1:10), - time_var = 'time', - time_across = 'file_date', - merge_across_dims = TRUE, - #keep NAs of the last day in 30-day months - merge_across_dims_narm = FALSE, - split_multiselected_dims = TRUE, - return_vars = list(latitude = NULL, - longitude = NULL, - time = 'file_date'), - retrieve = TRUE) - -#------- Check erai ----------- -erai[1, 1, 2, , 1, 1] # June -# [1] 269.9410 269.6855 268.7380 268.5008 270.3236 271.5151 270.5046 270.1686 -# [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856 -#[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104 -#[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409 NA -erai[1, 1, 5, , 1, 1] # Sep -# [1] 270.0656 270.7113 268.4678 271.6489 271.2354 269.7831 269.8045 268.7994 -# [9] 266.3092 262.2734 265.0124 261.8378 265.3950 257.1690 255.8402 264.8826 -#[17] 267.8663 266.6875 262.5502 258.5476 258.9617 263.6396 257.1111 264.8644 -#[25] 261.0085 256.7690 256.5811 256.4331 256.1260 256.4716 NA -#------------------------------ diff --git a/tests/testthat/test-Start-reshape.R b/tests/testthat/test-Start-reshape.R new file mode 100644 index 0000000..793a3b3 --- /dev/null +++ b/tests/testthat/test-Start-reshape.R @@ -0,0 +1,503 @@ +context("Start() reshape parameters check") +# This one is more comprehensive than test-Start-split-merge.R + +path_exp <- '/esarchive/exp/ecmwf/system5c3s/daily_mean/$var$_f6h/$var$_$sdate$.nc' +path_obs <- '/esarchive/recon/ecmwf/era5/daily_mean/$var$_f1h-r360x181/$var$_$date$.nc' +var <- 'tas' +sdate <- paste0(1993:1995, '1201') + +suppressWarnings( +exp <- Start(dat = path_exp, + var = var, + sdate = sdate, + time = indices(1:90), #indices(1:91), 
+ ensemble = indices(1), + lat = indices(1), + lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = FALSE, silent = T) +) +dates <- attr(exp, 'Variables')$common$time + +# easyNCDF +library(easyNCDF) +# obs +easy_sdate <- c('199312', paste0(rep(1994:1995, each = 3), c('01', '02', '12')), + '199601', '199602') +easy_array <- c() + +for (i in 1:length(easy_sdate)) { + easy_file <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + easy_sdate[i], '.nc')) + if (substr(easy_sdate[i], 5, 6) == '02') { + sub_time <- 1:28 + } else { + sub_time <- 1:31 + } + easy_obs <- NcToArray(easy_file, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = sub_time)) + NcClose(easy_file) + easy_array <- c(easy_array, as.vector(easy_obs)) +} +dim(easy_array) <- c(time = 90, sdate = 3) + + + +test_that("1. split + merge + narm", { + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 90, lat = 1, lon = 1) +) +expect_equal( +dim(obs1), +dim(obs2) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[, 3]) +) + +}) + + +test_that("2. 
split + merge", { + +exp <- Start(dat = path_exp, + var = var, + sdate = sdate, + time = indices(1:62), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = FALSE) + +dates <- attr(exp, 'Variables')$common$time + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 62] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 62] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 62, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[1:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[1:62, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[1:62, 3]) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) + +}) + + + +test_that("3. merge", { +# NOTE: The three files are all regarded to have time = 31, despite 199402 only has 28. +# It happens when time = 'all' or time = indices(). It seems reasonable when +# 'merge_across_dims' is not used, but if it is used, it's common to expect 31+31+28. +# See the next test "4. merge + narm". 199402 is still regarded as 31, so NAs are not +# removed. +suppressWarnings( +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +) + + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +}) + + +test_that("4. merge + narm", { + +# (1) Notice that the NAs at the tail of 199402 won't be removed because Start() +# considers all the files have the same length, i.e., 31. +# The NAs in 199402 are regarded as part of the original file. 
+ +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +# (2) It's tricky that 199402 is considered time = 31 because Start() considers +# all the files have the same length. So it won't return an error when +# time = indices(93). +# The first 14 time steps of 199312 will be removed but the NAs at the tail +# of 199402 will be preserved. +obs4 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = indices(15:93), + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs4), +c(dat = 1, var = 1, time = 79, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs4), +c(as.vector(easy_array[15:90, 1]), NA, NA, NA) +) + +# (3) If time is values(), 199402 is considered time = 28, so NAs will be removed. +obs5 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = dates[1, ], + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +expect_equal( +dim(obs5), +c(dat = 1, var = 1, time = 90, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs5), +as.vector(easy_array[1:90, 1]) +) + +}) + +test_that("5. 
split", { + +date_array <- c('199312', '199401', '199412', '199501') +dim(date_array) <- c(month = 2, year = 2) + +# split file dim +obs1 <- Start(dat = path_obs, + var = var, + date = date_array, # [month = 2, year = 2] + time = indices(1:31), + lat = indices(1), + lon = indices(1), +# time_across = 'date', +# merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), + # time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, month = 2, year = 2, time = 31, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, 1, , 1, 1]), +as.vector(easy_array[1:31, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 1, , 1, 1]), +as.vector(easy_array[32:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 1, 2, , 1, 1]), +as.vector(easy_array[1:31, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 2, , 1, 1]), +as.vector(easy_array[32:62, 2]) +) + +# split inner time +## time is indices +time_array <- array(1:62, dim = c(day = 31, month = 2)) +exp1 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = TRUE) + +# easyNCDF +easy_sdate_exp <- '19931201' +easy_file_exp <- NcOpen(paste0('/esarchive/exp/ecmwf/system5c3s/daily_mean/tas_f6h/tas_', + easy_sdate_exp, '.nc')) +easy_exp <- NcToArray(easy_file_exp, vars_to_read = 'tas', + dim_indices = list(longitude = c(1), latitude = c(1), ensemble = c(1), + time = 1:62)) +NcClose(easy_file_exp) + +expect_equal( +dim(exp1), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(easy_exp) +) + +## time is values +time_array <- dates[1, 1:62] +dim(time_array) <- c(day = 31, month = 2) +exp2 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), +# time = 'sdate'), + retrieve = TRUE) +expect_equal( +dim(exp2), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(exp2) +) + + +}) + +test_that("6. 
repetitive values", { + +exp <- Start(dat = path_exp, + var = var, + sdate = c('19931101', '19931201'), + time = indices(1:61), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = F) +dates <- attr(exp, 'Variables')$common$time + +# sorted and unsorted are the same here +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +#unsorted_dates <- unique(format(dates, '%Y%m')) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 2, time = 61] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# easyNCDF +easy_file_199311 <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + '199311', '.nc')) +easy_obs_199311 <- NcToArray(easy_file_199311, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = 1:30)) +NcClose(easy_file_199311) + +expect_equal( +dim(obs2), +c(dat = 1, var = 1, sdate = 2, time = 61, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs2[1, 1, 1, 1:30, 1, 1]), +as.vector(easy_obs_199311) +) +expect_equal( +as.vector(obs2[1, 1, 1, 31:61, 1, 1]), +as.vector(obs2[1, 1, 2, 1:31, 1, 1]) +) +expect_equal( +as.vector(obs2[1, 1, 2, 31:61, 1, 1]), +easy_array[31:61 ,1] +) + +}) diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R index da21f92..9376f9a 100644 --- a/tests/testthat/test-Start-split-merge.R +++ b/tests/testthat/test-Start-split-merge.R @@ -1,5 +1,3 @@ -#if (identical(Sys.getenv("NOT_CRAN"), "")) Sys.setenv(NOT_CRAN='true') - context("Start() split + merge dim and value check") var_name <- 'sfcWind' -- GitLab From ddfa2387a8222ade842dd33e0b8e6bb2f60bcad0 Mon Sep 17 00:00:00 2001 From: nperez Date: Thu, 13 May 2021 19:41:41 +0200 Subject: [PATCH 14/18] Test for DCPP HadGem3 --- .../testthat/test-Start-DCPP-across-depends.R | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/testthat/test-Start-DCPP-across-depends.R diff --git a/tests/testthat/test-Start-DCPP-across-depends.R b/tests/testthat/test-Start-DCPP-across-depends.R new file mode 100644 index 0000000..40f4268 --- /dev/null +++ b/tests/testthat/test-Start-DCPP-across-depends.R @@ -0,0 +1,26 @@ +context("DCPP successfull retrieved for depends and across parameters.") +test_that("Chunks of DCPP files- Local execution", { + + path <- '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$chunk$.nc' + + sdates <- c('2017', '2018') + dat <- Start(dat = path, + var = 'tos', + sdate = sdates, + chunk = indices(3:5), + chunk_depends = 'sdate', + time = 'all', + i = indices(1:10), + j = indices(1:10), + time_across = 'chunk', + merge_across_dims = TRUE, + retrieve = TRUE, + return_vars = list(time = 'sdate')) + + one <- Start(dat = 
'/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r1i1p1f2_gn_202201-202212.nc', + var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10), + retrieve = TRUE) + + expect_equal(dat[1,1,2,25:36,,], one[1,1,,,]) +}) + -- GitLab From 841b3e3ed0e35b64564ae60f6529b09f2fea0408 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:12:02 +0200 Subject: [PATCH 15/18] Change default value of 'merge_across_dims_narm' to TRUE. --- R/Start.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R/Start.R b/R/Start.R index 33e9fac..ff4b978 100644 --- a/R/Start.R +++ b/R/Start.R @@ -632,7 +632,8 @@ #' across another dimension. For example, if the dimension 'time' extends #' across dimension 'chunk', and the time length along the first chunk is 2 #' while along the second chunk is 10. Setting this parameter as TRUE can -#' remove the additional 8 NAs at position 3 to 10. The default value is FALSE. +#' remove the additional 8 NAs at position 3 to 10. The default value is TRUE, +#' but will be automatically turned to FALSE if 'merge_across_dims = FALSE'. #'@param split_multiselected_dims A logical value indicating whether to split a #' dimension that has been selected with a multidimensional array of selectors #' into as many dimensions as present in the selector array. The default value @@ -820,7 +821,7 @@ Start <- function(..., # dim = indices/selectors, metadata_dims = NULL, selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, + merge_across_dims_narm = TRUE, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, largest_dims_length = FALSE, @@ -863,9 +864,6 @@ Start <- function(..., # dim = indices/selectors, } if (!merge_across_dims & merge_across_dims_narm) { merge_across_dims_narm <- FALSE - .warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ", - "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'", - " to FALSE.")) } # Leave alone the dimension parameters in the variable dim_params -- GitLab From e8571282cb73b234e2f34283afd7a68a016e22a0 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:12:17 +0200 Subject: [PATCH 16/18] Update .Rd --- DESCRIPTION | 2 +- man/AddStep.Rd | 1 - man/CDORemapper.Rd | 1 - man/Collect.Rd | 1 - man/Compute.Rd | 16 +- man/NcCloser.Rd | 1 - man/NcDataReader.Rd | 10 +- man/NcDimReader.Rd | 10 +- man/NcOpener.Rd | 1 - man/NcVarReader.Rd | 10 +- man/SelectorChecker.Rd | 4 +- man/Sort.Rd | 9 +- man/Start.Rd | 923 +++++++++++++++++++++-------------------- man/Step.Rd | 10 +- man/indices.Rd | 1 - man/values.Rd | 1 - 16 files changed, 516 insertions(+), 485 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a33c3c..2dfda5d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,4 +38,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo ecFlow -RoxygenNote: 5.0.0 +RoxygenNote: 7.0.1 diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 3eece05..0d0ce46 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing. 
wf <- AddStep(data, step, pi_val = pi_short) } - diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd index 4f56baa..763be77 100644 --- a/man/CDORemapper.Rd +++ b/man/CDORemapper.Rd @@ -65,4 +65,3 @@ perform the interpolation, hence CDO is required to be installed. \seealso{ \code{\link[s2dverification]{CDORemap}} } - diff --git a/man/Collect.Rd b/man/Collect.Rd index 44a7dee..97b529b 100644 --- a/man/Collect.Rd +++ b/man/Collect.Rd @@ -83,4 +83,3 @@ of results as one data array when the execution is done. See more details on } } - diff --git a/man/Compute.Rd b/man/Compute.Rd index e07106a..7d6db4d 100644 --- a/man/Compute.Rd +++ b/man/Compute.Rd @@ -4,9 +4,18 @@ \alias{Compute} \title{Specify the execution parameters and trigger the execution} \usage{ -Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1, - cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, - silent = FALSE, debug = FALSE, wait = TRUE) +Compute( + workflow, + chunks = "auto", + threads_load = 1, + threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, + debug = FALSE, + wait = TRUE +) } \arguments{ \item{workflow}{A list of the class 'startR_workflow' returned by function @@ -104,4 +113,3 @@ arrays and additional metadata. res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) } - diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd index 65beab8..588f63a 100644 --- a/man/NcCloser.Rd +++ b/man/NcCloser.Rd @@ -32,4 +32,3 @@ NcCloser(connection) \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcDimReader}} \code{\link{NcVarReader}} } - diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd index a6d32c7..9014789 100644 --- a/man/NcDataReader.Rd +++ b/man/NcDataReader.Rd @@ -4,8 +4,13 @@ \alias{NcDataReader} \title{NetCDF file data reader for 'startR'} \usage{ -NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDataReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -61,4 +66,3 @@ in turn uses nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDimReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd index d539ffd..38dd870 100644 --- a/man/NcDimReader.Rd +++ b/man/NcDimReader.Rd @@ -4,8 +4,13 @@ \alias{NcDimReader} \title{NetCDF dimension reader for 'startR'} \usage{ -NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDimReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ This function uses the function NcReadDims() in the package 'easyNCDF'. 
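As a usage sketch for the regenerated Compute() signature in [PATCH 16/18] (hypothetical: 'wf' stands for a workflow built with AddStep(), and the chunked dimensions are assumed not to be target dimensions of the operation):

    res <- Compute(wf,
                   chunks = list(latitude = 2, longitude = 2),  # 2 x 2 = 4 chunks
                   threads_load = 2,     # parallel processes for data retrieval
                   threads_compute = 4)  # parallel processes for the operation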
\code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd index e46384c..30885fc 100644 --- a/man/NcOpener.Rd +++ b/man/NcOpener.Rd @@ -34,4 +34,3 @@ NcCloser(connection) \code{\link{NcDimReader}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd index c601907..fb093ae 100644 --- a/man/NcVarReader.Rd +++ b/man/NcVarReader.Rd @@ -4,8 +4,13 @@ \alias{NcVarReader} \title{NetCDF variable reader for 'startR'} \usage{ -NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - var_name = NULL, synonims) +NcVarReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + var_name = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcDimReader}} } - diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd index ef83575..e1cf112 100644 --- a/man/SelectorChecker.Rd +++ b/man/SelectorChecker.Rd @@ -4,8 +4,7 @@ \alias{SelectorChecker} \title{Translate a set of selectors into a set of numeric indices} \usage{ -SelectorChecker(selectors, var = NULL, return_indices = TRUE, - tolerance = NULL) +SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL) } \arguments{ \item{selectors}{A vector or a list of two of numeric indices or variable @@ -50,4 +49,3 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257] SelectorChecker(sub_array_of_selectors, sub_array_of_values) } - diff --git a/man/Sort.Rd b/man/Sort.Rd index 9ab516e..25a92fe 100644 --- a/man/Sort.Rd +++ b/man/Sort.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/Sort.R \name{Sort} -\alias{CircularSort} \alias{Sort} +\alias{CircularSort} \title{Sort the coordinate variable values in a Start() call} \usage{ Sort(...) @@ -10,12 +10,12 @@ Sort(...) CircularSort(start, end, ...) } \arguments{ +\item{\dots}{Additional parameters to adjust the reorderig. See function +sort() for more details.} + \item{start}{A numeric indicating the lower bound of the circular range.} \item{end}{A numeric indicating the upper bound of the circular range.} - -\item{\dots}{Additional parameters to adjust the reorderig. See function -sort() for more details.} } \value{ A list of 2 containing: @@ -57,4 +57,3 @@ range. This is useful for circular coordinates such as the Earth longitudes. 
retrieve = FALSE) } - diff --git a/man/Start.Rd b/man/Start.Rd index 680168e..efd258f 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -4,408 +4,36 @@ \alias{Start} \title{Declare, discover, subset and retrieve multidimensional distributed data sets} \usage{ -Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, - file_var_reader = NcVarReader, file_dim_reader = NcDimReader, - file_data_reader = NcDataReader, file_closer = NcCloser, - transform = NULL, transform_params = NULL, transform_vars = NULL, - transform_extra_cells = 2, apply_indices_after_transform = FALSE, - pattern_dims = NULL, metadata_dims = NULL, - selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, largest_dims_length = FALSE, - retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, - silent = FALSE, debug = FALSE) +Start( + ..., + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, + debug = FALSE +) } \arguments{ -\item{return_vars}{A named list where the names are the names of the -variables to be fetched in the files, and the values are vectors of -character strings with the names of the file dimension which to retrieve each -variable for, or NULL if the variable has to be retrieved only once -from any (the first) of the involved files.\cr\cr -Apart from retrieving a multidimensional data array, retrieving auxiliary -variables inside the files can also be needed. The parameter -'return_vars' allows for requesting such variables, as long as a -'file_var_reader' function is also specified in the call to -Start() (see documentation on the corresponding parameter). -\cr\cr -In the case of the the item sales example (see documentation on parameter -\code{\dots)}, the store location variable is requested with the parameter\cr -\code{return_vars = list(store_location = NULL)}.\cr This will cause -Start() to fetch once the variable 'store_location' and return it in -the component\cr \code{$Variables$common$store_location},\cr and will be an -array of character strings with the location names, with the dimensions -\code{c('store' = 100)}. Although useless in this example, we could ask -Start() to fetch and return such variable for each file along the -items dimension as follows: \cr -\code{return_vars = list(store_location = c('item'))}.\cr In that case, the -variable will be fetched once from a file of each of the items, and will be -returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}. 
-\cr\cr -If a variable is requested along a file dimension that contains path pattern -specifications ('source' in the example), the fetched variable values will be -returned in the component\cr \code{$Variables$$}.\cr -For example: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item'))) -\cr # # Checking the structure of the returned variables -\cr # str(found_data$Variables) -\cr # Named list -\cr # ..$common: NULL -\cr # ..$sourceA: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # ..$sourceB: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # # Checking the dimensions of the returned variable -\cr # # for the source A -\cr # dim(found_data$Variables$sourceA) -\cr # item store -\cr # 3 3 -} -\cr\cr -The names of the requested variables do not necessarily have to match the -actual variable names inside the files. A list of alternative names to be -seeked can be specified via the parameter 'synonims'.} - -\item{synonims}{A named list where the names are the requested variable or -dimension names, and the values are vectors of character strings with -alternative names to seek for such dimension or variable.\cr\cr -In some requests, data from different sources may follow different naming -conventions for the dimensions or variables, or even files in the same source -could have varying names. This parameter is in order for Start() to -properly identify the dimensions or variables with different names. -\cr\cr -In the example used in parameter 'return_vars', it may be the case that -the two involved data sources follow slightly different naming conventions. -For example, source A uses 'sect' as name for the sections dimension, whereas -source B uses 'section'; source A uses 'store_loc' as variable name for the -store locations, whereas source B uses 'store_location'. This can be taken -into account as follows: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item')), -\cr # synonims = list( -\cr # section = c('sec', 'section'), -\cr # store_location = c('store_loc', -\cr # 'store_location') -\cr # )) -} -\cr} - -\item{file_opener}{A function that receives as a single parameter - 'file_path' a character string with the path to a file to be opened, - and returns an object with an open connection to the file (optionally with - header information) on success, or returns NULL on failure. -\cr\cr -This parameter takes by default NcOpener() (an opener function for NetCDF -files). 
-\cr\cr -See NcOpener() for a template to build a file opener for your own file -format.} - -\item{file_var_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, - \code{synonims} that returns an array with auxiliary data (i.e. data from a - variable) inside a file. Start() will provide automatically either a - 'file_path' or a 'file_object' to the 'file_var_reader' - function (the function has to be ready to work whichever of these two is - provided). The parameter 'file_selectors' will also be provided - automatically to the variable reader, containing a named list where the - names are the names of the file dimensions of the queried data set (see - documentation on \code{\dots}) and the values are single character strings - with the components used to build the path to the file being read (the one - provided in 'file_path' or 'file_object'). The parameter 'var_name' - will be filled in automatically by Start() also, with the name of one - of the variales to be read. The parameter 'synonims' will be filled in - with exactly the same value as provided in the parameter 'synonims' in - the call to Start(), and has to be used in the code of the variable - reader to check for alternative variable names inside the target file. The - 'file_var_reader' must return a (multi)dimensional array with named - dimensions, and optionally with the attribute 'variales' with other - additional metadata on the retrieved variable. -\cr\cr -Usually, the 'file_var_reader' should be a degenerate case of the -'file_data_reader' (see documentation on the corresponding parameter), -so it is recommended to code the 'file_data_reder' in first place. -\cr\cr -This parameter takes by default NcVarReader() (a variable reader function -for NetCDF files). -\cr\cr -See NcVarReader() for a template to build a variale reader for your own -file format.} - -\item{file_dim_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} - that returns a named numeric vector where the names are the names of the - dimensions of the multidimensional data array in the file and the values are - the sizes of such dimensions. Start() will provide automatically - either a 'file_path' or a 'file_object' to the - 'file_dim_reader' function (the function has to be ready to work - whichever of these two is provided). The parameter 'file_selectors' - will also be provided automatically to the dimension reader, containing a - named list where the names are the names of the file dimensions of the - queried data set (see documentation on \code{\dots}) and the values are - single character strings with the components used to build the path to the - file being read (the one provided in 'file_path' or 'file_object'). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and can optionally be used in advanced configurations. -\cr\cr -This parameter takes by default NcDimReader() (a dimension reader -function for NetCDF files). 
-\cr\cr -See NcDimReader() for (an advanced) template to build a dimension reader -for your own file format.} - -\item{file_data_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, - \code{inner_indices = NULL}, \code{synonims} that returns a subset of the - multidimensional data array inside a file (even if internally it is not an - array). Start() will provide automatically either a 'file_path' - or a 'file_object' to the 'file_data_reader' function (the - function has to be ready to work whichever of these two is provided). The - parameter 'file_selectors' will also be provided automatically to the - data reader, containing a named list where the names are the names of the - file dimensions of the queried data set (see documentation on \code{\dots}) - and the values are single character strings with the components used to - build the path to the file being read (the one provided in 'file_path' or - 'file_object'). The parameter 'inner_indices' will be filled in - automatically by Start() also, with a named list of numeric vectors, - where the names are the names of all the expected inner dimensions in a file - to be read, and the numeric vectors are the indices to be taken from the - corresponding dimension (the indices may not be consecutive nor in order). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and has to be used in the code of the data reader to check for alternative - dimension names inside the target file. The 'file_data_reader' must - return a (multi)dimensional array with named dimensions, and optionally with - the attribute 'variables' with other additional metadata on the retrieved - data. -\cr\cr -Usually, 'file_data_reader' should use 'file_dim_reader' -(see documentation on the corresponding parameter), so it is recommended to -code 'file_dim_reder' in first place. -\cr\cr -This parameter takes by default NcDataReader() (a data reader function -for NetCDF files). -\cr\cr -See NcDataReader() for a template to build a data reader for your own -file format.} - -\item{file_closer}{A function that receives as a single parameter - 'file_object' an open connection (as returned by 'file_opener') - to one of the files to be read, optionally with header information, and - closes the open connection. Always returns NULL. -\cr\cr -This parameter takes by default NcCloser() (a closer function for NetCDF -files). -\cr\cr -See NcCloser() for a template to build a file closer for your own file -format.} - -\item{transform}{A function with the header \code{dara_array}, -\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as -input, through the parameter \code{data_array}, a subset of a -multidimensional array (as returned by 'file_data_reader'), applies a -transformation to it and returns it, preserving the amount of dimensions but -potentially modifying their size. This transformation may require data from -other auxiliary variables, automatically provided to 'transform' -through the parameter 'variables', in the form of a named list where -the names are the variable names and the values are (multi)dimensional -arrays. Which variables need to be sent to 'transform' can be specified -with the parameter 'transform_vars' in Start(). 
The parameter -'file_selectors' will also be provided automatically to -'transform', containing a named list where the names are the names of -the file dimensions of the queried data set (see documentation on -\code{\dots}) and the values are single character strings with the -components used to build the path to the file the subset being processed -belongs to. The parameter \code{\dots} will be filled in with other -additional parameters to adjust the transformation, exactly as provided in -the call to Start() via the parameter 'transform_params'.} - -\item{transform_params}{A named list with additional parameters to be sent to -the 'transform' function (if specified). See documentation on parameter -'transform' for details.} - -\item{transform_vars}{A vector of character strings with the names of -auxiliary variables to be sent to the 'transform' function (if -specified). All the variables to be sent to 'transform' must also -have been requested as return variables in the parameter 'return_vars' -of Start().} - -\item{transform_extra_cells}{An integer of extra indices to retrieve from the -data set, beyond the requested indices in \code{\dots}, in order for -'transform' to dispose of additional information to properly apply -whichever transformation (if needed). As many as -'transform_extra_cells' will be retrieved beyond each of the limits for -each of those inner dimensions associated to a coordinate variable and sent -to 'transform' (i.e. present in 'transform_vars'). After -'transform' has finished, Start() will take again and return a -subset of the result, for the returned data to fall within the specified -bounds in \code{\dots}. The default value is 2.} - -\item{apply_indices_after_transform}{A logical value indicating when a -'transform' is specified in Start() and numeric indices are -provided for any of the inner dimensions that depend on coordinate variables, -these numeric indices can be made effective (retrieved) before applying the -transformation or after. The boolean flag allows to adjust this behaviour. -It takes FALSE by default (numeric indices are applied before sending -data to 'transform').} - -\item{pattern_dims}{A character string indicating the name of the dimension -with path pattern specifications (see \code{\dots} for details). If not -specified, Start() assumes the first provided dimension is the pattern -dimension, with a warning.} - -\item{metadata_dims}{A vector of character strings with the names of the file -dimensions which to return metadata for. As noted in 'file_data_reader', -the data reader can optionally return auxiliary data via the attribute -'variables' of the returned array. Start() by default returns the -auxiliary data read for only the first file of each source (or data set) in -the pattern dimension (see \code{\dots} for info on what the pattern -dimension is). However it can be configured to return the metadata for all -the files along any set of file dimensions. The default value is NULL, and -it will be assigned automatically as parameter 'pattern_dims'.} - -\item{selector_checker}{A function used internaly by Start() to -translate a set of selectors (values for a dimension associated to a -coordinate variable) into a set of numeric indices. It takes by default -SelectorChecker() and, in principle, it should not be required to -change it for customized file formats. The option to replace it is left open -for more versatility. 
See the code of SelectorChecker() for details on -the inputs, functioning and outputs of a selector checker.} - -\item{merge_across_dims}{A logical value indicating whether to merge -dimensions across which another dimension extends (according to the -'_across' parameters). Takes the value FALSE by default. For -example, if the dimension 'time' extends across the dimension 'chunk' and -\code{merge_across_dims = TRUE}, the resulting data array will only contain -only the dimension 'time' as long as all the chunks together.} - -\item{merge_across_dims_narm}{A logical value indicating whether to remove -the additional NAs from data when parameter 'merge_across_dims' is TRUE. -It is helpful when the length of the to-be-merged dimension is different -across another dimension. For example, if the dimension 'time' extends -across dimension 'chunk', and the time length along the first chunk is 2 -while along the second chunk is 10. Setting this parameter as TRUE can -remove the additional 8 NAs at position 3 to 10. The default value is FALSE.} - -\item{split_multiselected_dims}{A logical value indicating whether to split a -dimension that has been selected with a multidimensional array of selectors -into as many dimensions as present in the selector array. The default value -is FALSE.} - -\item{path_glob_permissive}{A logical value or an integer specifying how many - folder levels in the path pattern, beginning from the end, the shell glob - expressions must be preserved and worked out for each file. The default - value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr -When specifying a path pattern for a dataset, it might contain shell glob -experissions. For each dataset, the first file matching the path pattern is -found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. However, in -some cases, the values of the shell glob expressions may not be constant for -all files in a dataset, and they need to be worked out for each file -involved.\cr\cr -For example, a path pattern could be as follows: \cr -\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving -\code{path_glob_permissive = FALSE} will trigger automatic seek of the - contents to replace the asterisks (e.g. the first asterisk matches with - \code{'bar'} and the second with \code{'baz'}. The found contents will be - used for all files in the dataset (in the example, the path pattern will be - fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if - any of the files in the dataset have other contents in the position of the - asterisks, Start() will not find them (in the example, a file like \cr - \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be - found). Setting \code{path_glob_permissive = 1} would preserve global - expressions in the latest level (in the example, the fixed path pattern - would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the - problematic file mentioned before would be found), but of course this would - slow down the Start() call if the dataset involves a large number of - files. 
Setting \code{path_glob_permissive = 2} would leave the original path - pattern with the original glob expressions in the 1st and 2nd levels (in the - example, both asterisks would be preserved, thus would allow Start() - to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr -Note that each glob expression can only represent one possibility (Start() -chooses the first). Because /code{*} is not the tag, which means it cannot -be a dimension of the output array. Therefore, only one possibility can be -adopted. For example, if \cr -\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr -has two matches:\cr -\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr -\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr -only the first found file will be used.} - -\item{largest_dims_length}{A logical value or a named integer vector - indicating if Start() should examine all the files to get the largest - length of the inner dimensions (TRUE) or use the first valid file of each - dataset as the returned dimension length (FALSE). Since examining all the - files could be time-consuming, a vector can be used to explicitly specify - the expected length of the inner dimensions. For those inner dimensions not - specified, the first valid file will be used. The default value is FALSE.\cr\cr - This parameter is useful when the required files don't have consistent - inner dimension. For example, there are 10 required experimental data files - of a series of start dates. The data only contain 25 members for the first - 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, - the returned member dimension length will be 25 only. The 26th to 51st - members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, - the returned member dimension length will be 51. To save the resource, -\code{'largest_dims_length = c(member = 51)'} can also be used.} - -\item{retrieve}{A logical value indicating whether to retrieve the data -defined in the Start() call or to explore only its dimension lengths -and names, and the values for the file and inner dimensions. The default -value is FALSE.} - -\item{num_procs}{An integer of number of processes to be created for the -parallel execution of the retrieval/transformation/arrangement of the -multiple involved files in a call to Start(). If set to NULL, -takes the number of available cores (as detected by detectCores() in -the package 'future'). The default value is 1 (no parallel execution).} - -\item{ObjectBigmemory}{a character string to be included as part of the -bigmemory object name. This parameter is thought to be used internally by the -chunking capabilities of startR.} - -\item{silent}{A logical value of whether to display progress messages (FALSE) -or not (TRUE). The default value is FALSE.} - -\item{debug}{A logical value of whether to return detailed messages on the -progress and operations in a Start() call (TRUE) or not (FALSE). The -default value is FALSE.} - \item{\dots}{A selection of custemized parameters depending on the data format. When we retrieve data from one or a collection of data sets, the involved data can be perceived as belonging to a large multi-dimensional @@ -620,61 +248,451 @@ as they have been specified in the call. 
For example, the following call: \cr # variable = 'all') } \cr\cr -would return an array with the following dimensions: -\cr -\command{ -\cr # source month store item section variable -\cr # 1 24 100 3 2 2 -} +would return an array with the following dimensions: +\cr +\command{ +\cr # source month store item section variable +\cr # 1 24 100 3 2 2 +} +\cr\cr +Next, a more advanced example to retrieve data for only the sales records, for +the first section ('electronics'), for the 1st and 3rd items and for the +stores located in Barcelona (assuming the files contain the variable +'store_location' with the name of the city each of the 100 stores are located +at): +\cr +\command{ +\cr # data <- Start(source = paste0('/data/$variable$/', +\cr # '$section$/$item$.data'), +\cr # variable = 'sales', +\cr # section = 'first', +\cr # item = indices(c(1, 3)), +\cr # item_depends = 'section', +\cr # store = 'Barcelona', +\cr # store_var = 'store_location', +\cr # month = 'all', +\cr # return_vars = list(store_location = NULL)) +} +\cr\cr +The defined names for the dimensions do not necessarily have to match the +names of the dimensions inside the file. Lists of alternative names to be +seeked can be defined in the parameter 'synonims'. +\cr\cr +If data from multiple sources (not necessarily following the same structure) +has to be retrieved, it can be done by providing a vector of character strings +with path pattern specifications, or, in the extended form, by providing a +list of lists with the components 'name' and 'path', and the name of the +dataset and path pattern as values, respectively. For example: +\cr +\command{ +\cr # data <- Start(source = list( +\cr # list(name = 'sourceA', +\cr # path = paste0('/sourceA/$variable$/', +\cr # '$section$/$item$.data')), +\cr # list(name = 'sourceB', +\cr # path = paste0('/sourceB/$section$/', +\cr # '$variable$/$item$.data')) +\cr # ), +\cr # variable = 'sales', +\cr # section = 'first', +\cr # item = indices(c(1, 3)), +\cr # item_depends = 'section', +\cr # store = 'Barcelona', +\cr # store_var = 'store_location', +\cr # month = 'all', +\cr # return_vars = list(store_location = NULL)) +} +\cr} + +\item{return_vars}{A named list where the names are the names of the +variables to be fetched in the files, and the values are vectors of +character strings with the names of the file dimension which to retrieve each +variable for, or NULL if the variable has to be retrieved only once +from any (the first) of the involved files.\cr\cr +Apart from retrieving a multidimensional data array, retrieving auxiliary +variables inside the files can also be needed. The parameter +'return_vars' allows for requesting such variables, as long as a +'file_var_reader' function is also specified in the call to +Start() (see documentation on the corresponding parameter). +\cr\cr +In the case of the the item sales example (see documentation on parameter +\code{\dots)}, the store location variable is requested with the parameter\cr +\code{return_vars = list(store_location = NULL)}.\cr This will cause +Start() to fetch once the variable 'store_location' and return it in +the component\cr \code{$Variables$common$store_location},\cr and will be an +array of character strings with the location names, with the dimensions +\code{c('store' = 100)}. 
Although useless in this example, we could ask +Start() to fetch and return such variable for each file along the +items dimension as follows: \cr +\code{return_vars = list(store_location = c('item'))}.\cr In that case, the +variable will be fetched once from a file of each of the items, and will be +returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}. +\cr\cr +If a variable is requested along a file dimension that contains path pattern +specifications ('source' in the example), the fetched variable values will be +returned in the component\cr \code{$Variables$$}.\cr +For example: +\cr +\command{ +\cr # data <- Start(source = list( +\cr # list(name = 'sourceA', +\cr # path = paste0('/sourceA/$variable$/', +\cr # '$section$/$item$.data')), +\cr # list(name = 'sourceB', +\cr # path = paste0('/sourceB/$section$/', +\cr # '$variable$/$item$.data')) +\cr # ), +\cr # variable = 'sales', +\cr # section = 'first', +\cr # item = indices(c(1, 3)), +\cr # item_depends = 'section', +\cr # store = 'Barcelona', +\cr # store_var = 'store_location', +\cr # month = 'all', +\cr # return_vars = list(store_location = c('source', +\cr # 'item'))) +\cr # # Checking the structure of the returned variables +\cr # str(found_data$Variables) +\cr # Named list +\cr # ..$common: NULL +\cr # ..$sourceA: Named list +\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... +\cr # ..$sourceB: Named list +\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... +\cr # # Checking the dimensions of the returned variable +\cr # # for the source A +\cr # dim(found_data$Variables$sourceA) +\cr # item store +\cr # 3 3 +} +\cr\cr +The names of the requested variables do not necessarily have to match the +actual variable names inside the files. A list of alternative names to be +seeked can be specified via the parameter 'synonims'.} + +\item{synonims}{A named list where the names are the requested variable or +dimension names, and the values are vectors of character strings with +alternative names to seek for such dimension or variable.\cr\cr +In some requests, data from different sources may follow different naming +conventions for the dimensions or variables, or even files in the same source +could have varying names. This parameter is in order for Start() to +properly identify the dimensions or variables with different names. +\cr\cr +In the example used in parameter 'return_vars', it may be the case that +the two involved data sources follow slightly different naming conventions. +For example, source A uses 'sect' as name for the sections dimension, whereas +source B uses 'section'; source A uses 'store_loc' as variable name for the +store locations, whereas source B uses 'store_location'. 
This can be taken +into account as follows: +\cr +\command{ +\cr # data <- Start(source = list( +\cr # list(name = 'sourceA', +\cr # path = paste0('/sourceA/$variable$/', +\cr # '$section$/$item$.data')), +\cr # list(name = 'sourceB', +\cr # path = paste0('/sourceB/$section$/', +\cr # '$variable$/$item$.data')) +\cr # ), +\cr # variable = 'sales', +\cr # section = 'first', +\cr # item = indices(c(1, 3)), +\cr # item_depends = 'section', +\cr # store = 'Barcelona', +\cr # store_var = 'store_location', +\cr # month = 'all', +\cr # return_vars = list(store_location = c('source', +\cr # 'item')), +\cr # synonims = list( +\cr # section = c('sec', 'section'), +\cr # store_location = c('store_loc', +\cr # 'store_location') +\cr # )) +} +\cr} + +\item{file_opener}{A function that receives as a single parameter + 'file_path' a character string with the path to a file to be opened, + and returns an object with an open connection to the file (optionally with + header information) on success, or returns NULL on failure. +\cr\cr +This parameter takes by default NcOpener() (an opener function for NetCDF +files). +\cr\cr +See NcOpener() for a template to build a file opener for your own file +format.} + +\item{file_var_reader}{A function with the header \code{file_path = NULL}, + \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, + \code{synonims} that returns an array with auxiliary data (i.e. data from a + variable) inside a file. Start() will provide automatically either a + 'file_path' or a 'file_object' to the 'file_var_reader' + function (the function has to be ready to work whichever of these two is + provided). The parameter 'file_selectors' will also be provided + automatically to the variable reader, containing a named list where the + names are the names of the file dimensions of the queried data set (see + documentation on \code{\dots}) and the values are single character strings + with the components used to build the path to the file being read (the one + provided in 'file_path' or 'file_object'). The parameter 'var_name' + will be filled in automatically by Start() also, with the name of one + of the variales to be read. The parameter 'synonims' will be filled in + with exactly the same value as provided in the parameter 'synonims' in + the call to Start(), and has to be used in the code of the variable + reader to check for alternative variable names inside the target file. The + 'file_var_reader' must return a (multi)dimensional array with named + dimensions, and optionally with the attribute 'variales' with other + additional metadata on the retrieved variable. +\cr\cr +Usually, the 'file_var_reader' should be a degenerate case of the +'file_data_reader' (see documentation on the corresponding parameter), +so it is recommended to code the 'file_data_reder' in first place. +\cr\cr +This parameter takes by default NcVarReader() (a variable reader function +for NetCDF files). +\cr\cr +See NcVarReader() for a template to build a variale reader for your own +file format.} + +\item{file_dim_reader}{A function with the header \code{file_path = NULL}, + \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} + that returns a named numeric vector where the names are the names of the + dimensions of the multidimensional data array in the file and the values are + the sizes of such dimensions. 
Start() will provide automatically + either a 'file_path' or a 'file_object' to the + 'file_dim_reader' function (the function has to be ready to work + whichever of these two is provided). The parameter 'file_selectors' + will also be provided automatically to the dimension reader, containing a + named list where the names are the names of the file dimensions of the + queried data set (see documentation on \code{\dots}) and the values are + single character strings with the components used to build the path to the + file being read (the one provided in 'file_path' or 'file_object'). + The parameter 'synonims' will be filled in with exactly the same value + as provided in the parameter 'synonims' in the call to Start(), + and can optionally be used in advanced configurations. +\cr\cr +This parameter takes by default NcDimReader() (a dimension reader +function for NetCDF files). +\cr\cr +See NcDimReader() for (an advanced) template to build a dimension reader +for your own file format.} + +\item{file_data_reader}{A function with the header \code{file_path = NULL}, + \code{file_object = NULL}, \code{file_selectors = NULL}, + \code{inner_indices = NULL}, \code{synonims} that returns a subset of the + multidimensional data array inside a file (even if internally it is not an + array). Start() will provide automatically either a 'file_path' + or a 'file_object' to the 'file_data_reader' function (the + function has to be ready to work whichever of these two is provided). The + parameter 'file_selectors' will also be provided automatically to the + data reader, containing a named list where the names are the names of the + file dimensions of the queried data set (see documentation on \code{\dots}) + and the values are single character strings with the components used to + build the path to the file being read (the one provided in 'file_path' or + 'file_object'). The parameter 'inner_indices' will be filled in + automatically by Start() also, with a named list of numeric vectors, + where the names are the names of all the expected inner dimensions in a file + to be read, and the numeric vectors are the indices to be taken from the + corresponding dimension (the indices may not be consecutive nor in order). + The parameter 'synonims' will be filled in with exactly the same value + as provided in the parameter 'synonims' in the call to Start(), + and has to be used in the code of the data reader to check for alternative + dimension names inside the target file. The 'file_data_reader' must + return a (multi)dimensional array with named dimensions, and optionally with + the attribute 'variables' with other additional metadata on the retrieved + data. +\cr\cr +Usually, 'file_data_reader' should use 'file_dim_reader' +(see documentation on the corresponding parameter), so it is recommended to +code 'file_dim_reder' in first place. 
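To make the reader contract above concrete, a schematic 'file_data_reader' for a hypothetical one-value-per-line text format (illustrative only; a real reader would also honour 'file_object', 'synonims' and multiple inner dimensions):

    TxtDataReader <- function(file_path = NULL, file_object = NULL,
                              file_selectors = NULL, inner_indices = NULL,
                              synonims) {
      values <- scan(file_path, quiet = TRUE)   # the file holds one 'time' series
      if (!is.null(inner_indices[['time']])) {  # apply the requested indices
        values <- values[inner_indices[['time']]]
      }
      array(values, dim = c(time = length(values)))  # named dimensions, as required
    }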
\cr\cr -Next, a more advanced example to retrieve data for only the sales records, for -the first section ('electronics'), for the 1st and 3rd items and for the -stores located in Barcelona (assuming the files contain the variable -'store_location' with the name of the city each of the 100 stores are located -at): -\cr -\command{ -\cr # data <- Start(source = paste0('/data/$variable$/', -\cr # '$section$/$item$.data'), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = NULL)) -} +This parameter takes by default NcDataReader() (a data reader function +for NetCDF files). \cr\cr -The defined names for the dimensions do not necessarily have to match the -names of the dimensions inside the file. Lists of alternative names to be -seeked can be defined in the parameter 'synonims'. +See NcDataReader() for a template to build a data reader for your own +file format.} + +\item{file_closer}{A function that receives as a single parameter + 'file_object' an open connection (as returned by 'file_opener') + to one of the files to be read, optionally with header information, and + closes the open connection. Always returns NULL. \cr\cr -If data from multiple sources (not necessarily following the same structure) -has to be retrieved, it can be done by providing a vector of character strings -with path pattern specifications, or, in the extended form, by providing a -list of lists with the components 'name' and 'path', and the name of the -dataset and path pattern as values, respectively. For example: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = NULL)) -} -\cr} +This parameter takes by default NcCloser() (a closer function for NetCDF +files). +\cr\cr +See NcCloser() for a template to build a file closer for your own file +format.} + +\item{transform}{A function with the header \code{dara_array}, +\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as +input, through the parameter \code{data_array}, a subset of a +multidimensional array (as returned by 'file_data_reader'), applies a +transformation to it and returns it, preserving the amount of dimensions but +potentially modifying their size. This transformation may require data from +other auxiliary variables, automatically provided to 'transform' +through the parameter 'variables', in the form of a named list where +the names are the variable names and the values are (multi)dimensional +arrays. Which variables need to be sent to 'transform' can be specified +with the parameter 'transform_vars' in Start(). The parameter +'file_selectors' will also be provided automatically to +'transform', containing a named list where the names are the names of +the file dimensions of the queried data set (see documentation on +\code{\dots}) and the values are single character strings with the +components used to build the path to the file the subset being processed +belongs to. 
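Following the header just described, a minimal 'transform' skeleton (hypothetical; 'scale_factor' stands for an entry passed through 'transform_params', and CDORemapper() in this package is the real reference implementation):

    ScaleTransform <- function(data_array, variables, file_selectors = NULL, ...) {
      extra <- list(...)                   # filled in from 'transform_params'
      scale_factor <- if (is.null(extra$scale_factor)) 1 else extra$scale_factor
      data_array * scale_factor            # dimensions preserved, values rescaled
    }

It would be plugged in as 'transform = ScaleTransform, transform_params = list(scale_factor = 0.01)'.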
The parameter \code{\dots} will be filled in with other +additional parameters to adjust the transformation, exactly as provided in +the call to Start() via the parameter 'transform_params'.} + +\item{transform_params}{A named list with additional parameters to be sent to +the 'transform' function (if specified). See documentation on parameter +'transform' for details.} + +\item{transform_vars}{A vector of character strings with the names of +auxiliary variables to be sent to the 'transform' function (if +specified). All the variables to be sent to 'transform' must also +have been requested as return variables in the parameter 'return_vars' +of Start().} + +\item{transform_extra_cells}{An integer of extra indices to retrieve from the +data set, beyond the requested indices in \code{\dots}, in order for +'transform' to dispose of additional information to properly apply +whichever transformation (if needed). As many as +'transform_extra_cells' will be retrieved beyond each of the limits for +each of those inner dimensions associated to a coordinate variable and sent +to 'transform' (i.e. present in 'transform_vars'). After +'transform' has finished, Start() will take again and return a +subset of the result, for the returned data to fall within the specified +bounds in \code{\dots}. The default value is 2.} + +\item{apply_indices_after_transform}{A logical value indicating when a +'transform' is specified in Start() and numeric indices are +provided for any of the inner dimensions that depend on coordinate variables, +these numeric indices can be made effective (retrieved) before applying the +transformation or after. The boolean flag allows to adjust this behaviour. +It takes FALSE by default (numeric indices are applied before sending +data to 'transform').} + +\item{pattern_dims}{A character string indicating the name of the dimension +with path pattern specifications (see \code{\dots} for details). If not +specified, Start() assumes the first provided dimension is the pattern +dimension, with a warning.} + +\item{metadata_dims}{A vector of character strings with the names of the file +dimensions which to return metadata for. As noted in 'file_data_reader', +the data reader can optionally return auxiliary data via the attribute +'variables' of the returned array. Start() by default returns the +auxiliary data read for only the first file of each source (or data set) in +the pattern dimension (see \code{\dots} for info on what the pattern +dimension is). However it can be configured to return the metadata for all +the files along any set of file dimensions. The default value is NULL, and +it will be assigned automatically as parameter 'pattern_dims'.} + +\item{selector_checker}{A function used internaly by Start() to +translate a set of selectors (values for a dimension associated to a +coordinate variable) into a set of numeric indices. It takes by default +SelectorChecker() and, in principle, it should not be required to +change it for customized file formats. The option to replace it is left open +for more versatility. See the code of SelectorChecker() for details on +the inputs, functioning and outputs of a selector checker.} + +\item{merge_across_dims}{A logical value indicating whether to merge +dimensions across which another dimension extends (according to the +'_across' parameters). Takes the value FALSE by default. 
For +example, if the dimension 'time' extends across the dimension 'chunk' and +\code{merge_across_dims = TRUE}, the resulting data array will only contain +only the dimension 'time' as long as all the chunks together.} + +\item{merge_across_dims_narm}{A logical value indicating whether to remove +the additional NAs from data when parameter 'merge_across_dims' is TRUE. +It is helpful when the length of the to-be-merged dimension is different +across another dimension. For example, if the dimension 'time' extends +across dimension 'chunk', and the time length along the first chunk is 2 +while along the second chunk is 10. Setting this parameter as TRUE can +remove the additional 8 NAs at position 3 to 10. The default value is TRUE, +but will be automatically turned to FALSE if 'merge_across_dims = FALSE'.} + +\item{split_multiselected_dims}{A logical value indicating whether to split a +dimension that has been selected with a multidimensional array of selectors +into as many dimensions as present in the selector array. The default value +is FALSE.} + +\item{path_glob_permissive}{A logical value or an integer specifying how many + folder levels in the path pattern, beginning from the end, the shell glob + expressions must be preserved and worked out for each file. The default + value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr +When specifying a path pattern for a dataset, it might contain shell glob +experissions. For each dataset, the first file matching the path pattern is +found, and the found file is used to work out fixed values for the glob +expressions that will be used for all the files of the dataset. However, in +some cases, the values of the shell glob expressions may not be constant for +all files in a dataset, and they need to be worked out for each file +involved.\cr\cr +For example, a path pattern could be as follows: \cr +\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving +\code{path_glob_permissive = FALSE} will trigger automatic seek of the + contents to replace the asterisks (e.g. the first asterisk matches with + \code{'bar'} and the second with \code{'baz'}. The found contents will be + used for all files in the dataset (in the example, the path pattern will be + fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if + any of the files in the dataset have other contents in the position of the + asterisks, Start() will not find them (in the example, a file like \cr + \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be + found). Setting \code{path_glob_permissive = 1} would preserve global + expressions in the latest level (in the example, the fixed path pattern + would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the + problematic file mentioned before would be found), but of course this would + slow down the Start() call if the dataset involves a large number of + files. Setting \code{path_glob_permissive = 2} would leave the original path + pattern with the original glob expressions in the 1st and 2nd levels (in the + example, both asterisks would be preserved, thus would allow Start() + to recognize files such as \cr + \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr +Note that each glob expression can only represent one possibility (Start() +chooses the first). Because /code{*} is not the tag, which means it cannot +be a dimension of the output array. Therefore, only one possibility can be +adopted. 
For example, if \cr +\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr +has two matches:\cr +\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr +\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr +only the first found file will be used.} + +\item{largest_dims_length}{A logical value or a named integer vector + indicating if Start() should examine all the files to get the largest + length of the inner dimensions (TRUE) or use the first valid file of each + dataset as the returned dimension length (FALSE). Since examining all the + files could be time-consuming, a vector can be used to explicitly specify + the expected length of the inner dimensions. For those inner dimensions not + specified, the first valid file will be used. The default value is FALSE.\cr\cr + This parameter is useful when the required files don't have consistent + inner dimension. For example, there are 10 required experimental data files + of a series of start dates. The data only contain 25 members for the first + 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, + the returned member dimension length will be 25 only. The 26th to 51st + members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, + the returned member dimension length will be 51. To save the resource, +\code{'largest_dims_length = c(member = 51)'} can also be used.} + +\item{retrieve}{A logical value indicating whether to retrieve the data +defined in the Start() call or to explore only its dimension lengths +and names, and the values for the file and inner dimensions. The default +value is FALSE.} + +\item{num_procs}{An integer of number of processes to be created for the +parallel execution of the retrieval/transformation/arrangement of the +multiple involved files in a call to Start(). If set to NULL, +takes the number of available cores (as detected by detectCores() in +the package 'future'). The default value is 1 (no parallel execution).} + +\item{ObjectBigmemory}{a character string to be included as part of the +bigmemory object name. This parameter is thought to be used internally by the +chunking capabilities of startR.} + +\item{silent}{A logical value of whether to display progress messages (FALSE) +or not (TRUE). The default value is FALSE.} + +\item{debug}{A logical value of whether to return detailed messages on the +progress and operations in a Start() call (TRUE) or not (FALSE). The +default value is FALSE.} } \value{ If \code{retrieve = TRUE} the involved data is loaded into RAM memory @@ -830,4 +848,3 @@ file format. retrieve = FALSE) } - diff --git a/man/Step.Rd b/man/Step.Rd index 65f0c72..c473ccb 100644 --- a/man/Step.Rd +++ b/man/Step.Rd @@ -4,8 +4,13 @@ \alias{Step} \title{Define the operation applied on declared data.} \usage{ -Step(fun, target_dims, output_dims, use_libraries = NULL, - use_attributes = NULL) +Step( + fun, + target_dims, + output_dims, + use_libraries = NULL, + use_attributes = NULL +) } \arguments{ \item{fun}{A function in R format defining the operation to be applied to the @@ -70,4 +75,3 @@ to the expected order for this function. wf <- AddStep(data, step) } - diff --git a/man/indices.Rd b/man/indices.Rd index a3d85ea..6233b71 100644 --- a/man/indices.Rd +++ b/man/indices.Rd @@ -39,4 +39,3 @@ original data. 
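As a side-by-side sketch of the two selector helpers documented here (the dimension name is illustrative):

    # Positional selection: the 4th to 10th longitudes, whatever their values:
    #   longitude = indices(4:10)
    # Value-based selection: longitudes between 0 and 10 degrees, matched
    # against the coordinate variable read from the files:
    #   longitude = values(list(0, 10))
    indices(4:10)        # tags its input as index-type selectors
    values(list(0, 10))  # tags its input as value-type selectors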
See details in the documentation of the parameter \code{\dots} \seealso{ \code{\link{values}} } - diff --git a/man/values.Rd b/man/values.Rd index 3300f19..31ce95a 100644 --- a/man/values.Rd +++ b/man/values.Rd @@ -41,4 +41,3 @@ coordinate variable. See details in the documentation of the parameter \seealso{ \code{\link{indices}} } - -- GitLab From a12220c8deb6431dc2380612b2ec4538e3828eaf Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:30:19 +0200 Subject: [PATCH 17/18] Add two more tests for middle points. --- .../testthat/test-Start-DCPP-across-depends.R | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-Start-DCPP-across-depends.R b/tests/testthat/test-Start-DCPP-across-depends.R index 40f4268..f5a5dc6 100644 --- a/tests/testthat/test-Start-DCPP-across-depends.R +++ b/tests/testthat/test-Start-DCPP-across-depends.R @@ -17,10 +17,26 @@ test_that("Chunks of DCPP files- Local execution", { retrieve = TRUE, return_vars = list(time = 'sdate')) - one <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r1i1p1f2_gn_202201-202212.nc', +# [sdate = 2, chunk = 3] +dat_2018_chunk3 <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r1i1p1f2_gn_202201-202212.nc', var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10), retrieve = TRUE) - expect_equal(dat[1,1,2,25:36,,], one[1,1,,,]) +expect_equal(dat[1,1,2,25:36,,], dat_2018_chunk3[1,1,,,]) + +# [sdate = 1, chunk = 2] +dat_2017_chunk2 <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2017-r1i1p1f2_gn_202001-202012.nc', + var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10), + retrieve = TRUE) + +expect_equal(dat[1,1,1,13:24,,], dat_2017_chunk2[1,1,,,]) + +# [sdate = 2, chunk = 1] +dat_2018_chunk1 <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r1i1p1f2_gn_202001-202012.nc', + var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10), + retrieve = TRUE) + +expect_equal(dat[1,1,2,1:12,,], dat_2018_chunk1[1,1,,,]) + }) -- GitLab From 0517daaa2f49b25631dd91d8797a3a5d72bba8bc Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 16:34:05 +0200 Subject: [PATCH 18/18] Version bump to 2.1.0-4 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2dfda5d..27d4a77 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: startR Title: Automatically Retrieve Multidimensional Distributed Data Sets -Version: 2.1.0-3 +Version: 2.1.0-4 Authors@R: c( person("BSC-CNS", role = c("aut", "cph")), person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")), -- GitLab
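For reference, the index arithmetic behind the middle-point checks in [PATCH 14/18] and [PATCH 17/18], assuming 12 monthly time steps per chunk as in the files used there: requested chunk k occupies the following positions of the merged 'time' dimension.

    chunk_time_idx <- function(k, steps_per_chunk = 12) {
      ((k - 1) * steps_per_chunk + 1):(k * steps_per_chunk)
    }
    chunk_time_idx(1)  #  1:12 -> checked against the s2018 202001-202012 file
    chunk_time_idx(2)  # 13:24 -> checked against the s2017 202001-202012 file
    chunk_time_idx(3)  # 25:36 -> checked against the s2018 202201-202212 file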