From 14e6736106fb69779591f34885eca7a5eb42b017 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 5 May 2021 23:29:17 +0200 Subject: [PATCH 1/4] Reorder chunks when split + merge. A thorough check is still needed --- R/Start.R | 111 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/R/Start.R b/R/Start.R index e8a1d7d..9776ead 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3702,27 +3702,29 @@ Start <- function(..., # dim = indices/selectors, } } #====================================================================== - - if (merge_across_dims_narm) { + if (merge_across_dims) { + # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? - across_file_dim <- names(inner_dims_across_files) #TODO: more than one? # Get the length of each inner_dim ('time') along each file_dim ('file_date') length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) - - if (!split_multiselected_dims) { - final_dims_fake_name <- names(final_dims_fake) - pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) - new_length_inner_dim <- sum(unlist(length_inner_across_dim)) - if (pos_across_inner_dim != length(final_dims_fake)) { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim, - final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) - } else { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim) + + if (merge_across_dims_narm) { + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + + if (!split_multiselected_dims) { + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name } - names(final_dims_fake) <- final_dims_fake_name } } @@ -4047,43 +4049,46 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case.
- if (merge_across_dims_narm) { - - # Get the length of these two dimensions in final_dims - length_inner_across_store_dims <- final_dims[across_inner_dim] - length_file_across_store_dims <- final_dims[across_file_dim] - - # Create a logical array for merge_across_dims - logi_array <- array(rep(FALSE, - length_file_across_store_dims * length_inner_across_store_dims), - dim = c(length_inner_across_store_dims, length_file_across_store_dims)) - for (i in 1:length_file_across_store_dims) { #1:4 - logi_array[1:length_inner_across_dim[[i]], i] <- TRUE - } - - # First, get the data array with final_dims dimension - data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) - - # Change the NA derived from additional spaces to -9999, then remove these -9999 - func_remove_blank <- function(data_array, logi_array) { - # dim(data_array) = [time, file_date] - # dim(logi_array) = [time, file_date] - # Change the blank spaces from NA to -9999 - data_array[which(!logi_array)] <- -9999 - return(data_array) + if (merge_across_dims) { + if (!merge_across_dims_narm) { + data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) + } else { + # Get the length of these two dimensions in final_dims + length_inner_across_store_dims <- final_dims[across_inner_dim] + length_file_across_store_dims <- final_dims[across_file_dim] + + # Create a logical array for merge_across_dims + logi_array <- array(rep(FALSE, + length_file_across_store_dims * length_inner_across_store_dims), + dim = c(length_inner_across_store_dims, length_file_across_store_dims)) + for (i in 1:length_file_across_store_dims) { #1:4 + logi_array[1:length_inner_across_dim[[i]], i] <- TRUE + } + + # First, get the data array with final_dims dimension + data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) + + # Change the NA derived from additional spaces to -9999, then remove these -9999 + func_remove_blank <- function(data_array, logi_array) { + # dim(data_array) = [time, file_date] + # dim(logi_array) = [time, file_date] + # Change the blank spaces from NA to -9999 + data_array[which(!logi_array)] <- -9999 + return(data_array) + } + data_array_final_dims <- multiApply::Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # becomes a vector } - data_array_final_dims <- multiApply::Apply(data_array_final_dims, - target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - ## reorder back to the correct dim - tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- .aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - - #NOTE: When one file contains values for dicrete dimensions, rearrange the - # chunks (i.e., work_piece) is necessary. + + #NOTE: When one file contains values for discrete dimensions, rearranging the + # chunks (i.e., work_piece) is necessary.
if (split_multiselected_dims) { # generate the correct order list from indices_chunk @@ -4091,7 +4096,7 @@ Start <- function(..., # dim = indices/selectors, i <- 1 j <- 1 a <- indices_chunk[i] - while (i < length(indices_chunk)) { + while (i <= length(indices_chunk)) { while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { a <- c(a, indices_chunk[i+1]) i <- i + 1 -- GitLab From 76907ff7c5ad263c34752caf4f320c9aa4d54507 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 13 May 2021 17:50:40 +0200 Subject: [PATCH 2/4] Fix mixed-dimension error when reshape params are used. Create unit tests --- R/Start.R | 97 ++--- inst/doc/usecase/ex1_3_attr_loadin.R | 44 +-- tests/testthat/test-Start-reshape.R | 503 ++++++++++++++++++++++++ tests/testthat/test-Start-split-merge.R | 2 - 4 files changed, 555 insertions(+), 91 deletions(-) create mode 100644 tests/testthat/test-Start-reshape.R diff --git a/R/Start.R b/R/Start.R index 9776ead..33e9fac 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1852,6 +1852,7 @@ Start <- function(..., # dim = indices/selectors, } else { common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim } + tmp <- file_dim_as_selector_array_dim } else if (inner_dim %in% inner_dims_across_files) { #(2) file_dim_name <- names(which(inner_dim == inner_dims_across_files)) if (file_dim_name %in% found_pattern_dim) { @@ -1859,6 +1860,7 @@ Start <- function(..., # dim = indices/selectors, } else { common_return_vars[[inner_dim]] <- file_dim_name } + tmp <- file_dim_name } .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp, "', but '", inner_dim, "' is not in return_vars list or is NULL. ", @@ -4049,7 +4051,7 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. - if (merge_across_dims) { + if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) } else { @@ -4086,6 +4088,11 @@ Start <- function(..., # dim = indices/selectors, data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector } + + if (length(data_array_tmp) != prod(final_dims_fake)) { + stop(paste0("After reshaping, the data do not fit into the expected output dimension. ", + "Check if the reshaping parameters are used correctly.")) + } #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. 
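For reference, the NA-removal logic above reduces to marking the padded positions with a sentinel value and filtering them out afterwards. A minimal standalone sketch of that trick, with toy dimensions and values rather than the actual Start() internals (which operate on a bigmemory-backed array through multiApply::Apply):

dat <- array(c(1, 2, NA, NA, 3, 4, 5, 6), dim = c(time = 4, file_date = 2))
logi <- array(FALSE, dim = dim(dat))
logi[1:2, 1] <- TRUE         # file 1 truly has 2 time steps
logi[1:4, 2] <- TRUE         # file 2 truly has 4 time steps
dat[which(!logi)] <- -9999   # mark only the padding positions
dat[dat != -9999]            # c(1, 2, 3, 4, 5, 6), flattened to a vector

A sentinel is used instead of simply dropping all NAs because a genuine NA read from a file sits at a position where logi is TRUE and must be kept; NA != -9999 evaluates to NA, and logical subsetting preserves it as NA.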
@@ -4110,57 +4117,55 @@ Start <- function(..., # dim = indices/selectors, final_order_list <- lapply(final_order_list, length) if (!all(diff(as.numeric(names(final_order_list))) > 0)) { - - - # shape the vector into the array without split_dims - split_dims_pos <- match(split_dims, final_dims_fake) - new_dims <- c() - if (split_dims_pos[1] > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) - } - new_dims <- c(new_dims, prod(split_dims)) - names(new_dims)[split_dims_pos[1]] <- across_inner_dim - if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) - } - final_dims_fake_no_split <- new_dims - data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) - # seperate 'time' dim into each work_piece length - data_array_seperate <- list() - tmp <- cumsum(unlist(length_inner_across_dim)) - tmp <- c(0, tmp) - for (i in 1:length(length_inner_across_dim)) { - data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, - (tmp[i] + 1):tmp[i + 1]) - } - - # re-build the array: chunk - which_chunk <- as.numeric(names(final_order_list)) - how_many_indices <- unlist(final_order_list) - array_piece <- list() - ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) - for (i in 1:length(final_order_list)) { - array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], - across_inner_dim, - ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) - ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - } + # shape the vector into the array without split_dims + split_dims_pos <- match(split_dims, final_dims_fake) + new_dims <- c() + if (split_dims_pos[1] > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, prod(split_dims)) + names(new_dims)[split_dims_pos[1]] <- across_inner_dim + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake_no_split <- new_dims + data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) + # separate 'time' dim into each work_piece length + data_array_seperate <- list() + tmp <- cumsum(unlist(length_inner_across_dim)) + tmp <- c(0, tmp) + for (i in 1:length(length_inner_across_dim)) { + data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + + # re-build the array: chunk + which_chunk <- as.numeric(names(final_order_list)) + how_many_indices <- unlist(final_order_list) + array_piece <- list() + ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + for (i in 1:length(final_order_list)) { + array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], + across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] + } - # re-build the array: paste - data_array_tmp <- array_piece[[1]] - along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) - if (length(array_piece) > 1) { - for (i in 2:length(array_piece)) { - data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], - along = along_pos) + # 
re-build the array: paste + data_array_tmp <- array_piece[[1]] + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + if (length(array_piece) > 1) { + for (i in 2:length(array_piece)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } } - } - } + } } data_array <- array(data_array_tmp, dim = final_dims_fake) - } else { # merge_across_dims_narm = F (old version) + } else { # ! (merge_across_dims + split_multiselected_dims) (old version) data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) } diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R index d514c30..e2c8211 100644 --- a/inst/doc/usecase/ex1_3_attr_loadin.R +++ b/inst/doc/usecase/ex1_3_attr_loadin.R @@ -77,7 +77,7 @@ # NOTE: 'merge_across_dims_narm = TRUE' is necessary because the observational # data have unequal time length of 30-day and 31-day months. # If the NAs are not removed, unwanted NAs will exist and make the -# values misplaced in the array. See 'bonus' below for more explanation. +# values misplaced in the array. #------- Check erai ----------- dim(erai) @@ -129,45 +129,3 @@ attr(erai, 'Variables')$common$time[2, ] -# //////////////////"BONUS"////////////////////// -# Here is something more to show the usage of parameter 'merge_across_dims_narm'. -# If the last day of 30-day months is NA instead of the first day of the following month, -# NAs are needed to exist in the array. In this case, 'merge_across_dims_narm' -# should be FALSE. - - dates <- attr(system4, 'Variables')$common$time - dates[2, 31] -#[1] "1994-07-01 UTC" - dates[2, 31] <- NA # Jun - dates[5, 31] <- NA # Sep - dates[7, 31] <- NA # Nov - - erai <- Start(dat = repos_obs, - var = 'tas', - file_date = dates_file, - time = values(dates), - latitude = indices(1:10), - longitude = indices(1:10), - time_var = 'time', - time_across = 'file_date', - merge_across_dims = TRUE, - #keep NAs of the last day in 30-day months - merge_across_dims_narm = FALSE, - split_multiselected_dims = TRUE, - return_vars = list(latitude = NULL, - longitude = NULL, - time = 'file_date'), - retrieve = TRUE) - -#------- Check erai ----------- -erai[1, 1, 2, , 1, 1] # June -# [1] 269.9410 269.6855 268.7380 268.5008 270.3236 271.5151 270.5046 270.1686 -# [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856 -#[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104 -#[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409 NA -erai[1, 1, 5, , 1, 1] # Sep -# [1] 270.0656 270.7113 268.4678 271.6489 271.2354 269.7831 269.8045 268.7994 -# [9] 266.3092 262.2734 265.0124 261.8378 265.3950 257.1690 255.8402 264.8826 -#[17] 267.8663 266.6875 262.5502 258.5476 258.9617 263.6396 257.1111 264.8644 -#[25] 261.0085 256.7690 256.5811 256.4331 256.1260 256.4716 NA -#------------------------------ diff --git a/tests/testthat/test-Start-reshape.R b/tests/testthat/test-Start-reshape.R new file mode 100644 index 0000000..793a3b3 --- /dev/null +++ b/tests/testthat/test-Start-reshape.R @@ -0,0 +1,503 @@ +context("Start() reshape parameters check") +# This one is more comprehensive than test-Start-split-merge.R + +path_exp <- '/esarchive/exp/ecmwf/system5c3s/daily_mean/$var$_f6h/$var$_$sdate$.nc' +path_obs <- '/esarchive/recon/ecmwf/era5/daily_mean/$var$_f1h-r360x181/$var$_$date$.nc' +var <- 'tas' +sdate <- paste0(1993:1995, '1201') + +suppressWarnings( +exp <- Start(dat = path_exp, + var = var, + sdate = sdate, + time = indices(1:90), #indices(1:91), 
+ ensemble = indices(1), + lat = indices(1), + lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = FALSE, silent = T) +) +dates <- attr(exp, 'Variables')$common$time + +# easyNCDF +library(easyNCDF) +# obs +easy_sdate <- c('199312', paste0(rep(1994:1995, each = 3), c('01', '02', '12')), + '199601', '199602') +easy_array <- c() + +for (i in 1:length(easy_sdate)) { + easy_file <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + easy_sdate[i], '.nc')) + if (substr(easy_sdate[i], 5, 6) == '02') { + sub_time <- 1:28 + } else { + sub_time <- 1:31 + } + easy_obs <- NcToArray(easy_file, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = sub_time)) + NcClose(easy_file) + easy_array <- c(easy_array, as.vector(easy_obs)) +} +dim(easy_array) <- c(time = 90, sdate = 3) + + + +test_that("1. split + merge + narm", { + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 90, lat = 1, lon = 1) +) +expect_equal( +dim(obs1), +dim(obs2) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[, 3]) +) + +}) + + +test_that("2. 
split + merge", { + +exp <- Start(dat = path_exp, + var = var, + sdate = sdate, + time = indices(1:62), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = FALSE) + +dates <- attr(exp, 'Variables')$common$time + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 62] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 62] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 62, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[1:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[1:62, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[1:62, 3]) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) + +}) + + + +test_that("3. merge", { +# NOTE: The three files are all regarded to have time = 31, despite 199402 only has 28. +# It happens when time = 'all' or time = indices(). It seems reasonable when +# 'merge_across_dims' is not used, but if it is used, it's common to expect 31+31+28. +# See the next test "4. merge + narm". 199402 is still regarded as 31, so NAs are not +# removed. +suppressWarnings( +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +) + + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +}) + + +test_that("4. merge + narm", { + +# (1) Notice that the NAs at the tail of 199402 won't be removed because Start() +# considers all the files have the same length, i.e., 31. +# The NAs in 199402 are regarded as part of the original file. 
+ +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +# (2) It's tricky that 199402 is considered time = 31 because Start() considers +# all the files have the same length. So it won't return an error when +# time = indices(93). +# The first 14 time steps of 199312 will be removed but the NAs at the tail +# of 199402 will be preserved. +obs4 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = indices(15:93), + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs4), +c(dat = 1, var = 1, time = 79, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs4), +c(as.vector(easy_array[15:90, 1]), NA, NA, NA) +) + +# (3) If time is values(), 199402 is considered time = 28, so NAs will be removed. +obs5 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = dates[1, ], + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +expect_equal( +dim(obs5), +c(dat = 1, var = 1, time = 90, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs5), +as.vector(easy_array[1:90, 1]) +) + +}) + +test_that("5. 
split", { + +date_array <- c('199312', '199401', '199412', '199501') +dim(date_array) <- c(month = 2, year = 2) + +# split file dim +obs1 <- Start(dat = path_obs, + var = var, + date = date_array, # [month = 2, year = 2] + time = indices(1:31), + lat = indices(1), + lon = indices(1), +# time_across = 'date', +# merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), + # time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, month = 2, year = 2, time = 31, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, 1, , 1, 1]), +as.vector(easy_array[1:31, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 1, , 1, 1]), +as.vector(easy_array[32:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 1, 2, , 1, 1]), +as.vector(easy_array[1:31, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 2, , 1, 1]), +as.vector(easy_array[32:62, 2]) +) + +# split inner time +## time is indices +time_array <- array(1:62, dim = c(day = 31, month = 2)) +exp1 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = TRUE) + +# easyNCDF +easy_sdate_exp <- '19931201' +easy_file_exp <- NcOpen(paste0('/esarchive/exp/ecmwf/system5c3s/daily_mean/tas_f6h/tas_', + easy_sdate_exp, '.nc')) +easy_exp <- NcToArray(easy_file_exp, vars_to_read = 'tas', + dim_indices = list(longitude = c(1), latitude = c(1), ensemble = c(1), + time = 1:62)) +NcClose(easy_file_exp) + +expect_equal( +dim(exp1), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(easy_exp) +) + +## time is values +time_array <- dates[1, 1:62] +dim(time_array) <- c(day = 31, month = 2) +exp2 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), +# time = 'sdate'), + retrieve = TRUE) +expect_equal( +dim(exp2), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(exp2) +) + + +}) + +test_that("6. 
repetitive values", { + +exp <- Start(dat = path_exp, + var = var, + sdate = c('19931101', '19931201'), + time = indices(1:61), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = F) +dates <- attr(exp, 'Variables')$common$time + +# sorted and unsorted are the same here +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +#unsorted_dates <- unique(format(dates, '%Y%m')) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 2, time = 61] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# easyNCDF +easy_file_199311 <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + '199311', '.nc')) +easy_obs_199311 <- NcToArray(easy_file_199311, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = 1:30)) +NcClose(easy_file_199311) + +expect_equal( +dim(obs2), +c(dat = 1, var = 1, sdate = 2, time = 61, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs2[1, 1, 1, 1:30, 1, 1]), +as.vector(easy_obs_199311) +) +expect_equal( +as.vector(obs2[1, 1, 1, 31:61, 1, 1]), +as.vector(obs2[1, 1, 2, 1:31, 1, 1]) +) +expect_equal( +as.vector(obs2[1, 1, 2, 31:61, 1, 1]), +easy_array[31:61 ,1] +) + +}) diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R index da21f92..9376f9a 100644 --- a/tests/testthat/test-Start-split-merge.R +++ b/tests/testthat/test-Start-split-merge.R @@ -1,5 +1,3 @@ -#if (identical(Sys.getenv("NOT_CRAN"), "")) Sys.setenv(NOT_CRAN='true') - context("Start() split + merge dim and value check") var_name <- 'sfcWind' -- GitLab From 841b3e3ed0e35b64564ae60f6529b09f2fea0408 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:12:02 +0200 Subject: [PATCH 3/4] Change default value of 'merge_across_dims_narm' to TRUE. --- R/Start.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R/Start.R b/R/Start.R index 33e9fac..ff4b978 100644 --- a/R/Start.R +++ b/R/Start.R @@ -632,7 +632,8 @@ #' across another dimension. For example, if the dimension 'time' extends #' across dimension 'chunk', and the time length along the first chunk is 2 #' while along the second chunk is 10. Setting this parameter as TRUE can -#' remove the additional 8 NAs at position 3 to 10. The default value is FALSE. +#' remove the additional 8 NAs at position 3 to 10. The default value is TRUE, +#' but will be automatically turned to FALSE if 'merge_across_dims = FALSE'. #'@param split_multiselected_dims A logical value indicating whether to split a #' dimension that has been selected with a multidimensional array of selectors #' into as many dimensions as present in the selector array. 
The default value @@ -820,7 +821,7 @@ Start <- function(..., # dim = indices/selectors, metadata_dims = NULL, selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, + merge_across_dims_narm = TRUE, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, largest_dims_length = FALSE, @@ -863,9 +864,6 @@ Start <- function(..., # dim = indices/selectors, } if (!merge_across_dims & merge_across_dims_narm) { merge_across_dims_narm <- FALSE - .warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ", - "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'", - " to FALSE.")) } # Leave alone the dimension parameters in the variable dim_params -- GitLab From e8571282cb73b234e2f34283afd7a68a016e22a0 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 14 May 2021 12:12:17 +0200 Subject: [PATCH 4/4] Update .Rd --- DESCRIPTION | 2 +- man/AddStep.Rd | 1 - man/CDORemapper.Rd | 1 - man/Collect.Rd | 1 - man/Compute.Rd | 16 +- man/NcCloser.Rd | 1 - man/NcDataReader.Rd | 10 +- man/NcDimReader.Rd | 10 +- man/NcOpener.Rd | 1 - man/NcVarReader.Rd | 10 +- man/SelectorChecker.Rd | 4 +- man/Sort.Rd | 9 +- man/Start.Rd | 923 +++++++++++++++++++++-------------------- man/Step.Rd | 10 +- man/indices.Rd | 1 - man/values.Rd | 1 - 16 files changed, 516 insertions(+), 485 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a33c3c..2dfda5d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,4 +38,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues LazyData: true SystemRequirements: cdo ecFlow -RoxygenNote: 5.0.0 +RoxygenNote: 7.0.1 diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 3eece05..0d0ce46 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } - diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd index 4f56baa..763be77 100644 --- a/man/CDORemapper.Rd +++ b/man/CDORemapper.Rd @@ -65,4 +65,3 @@ perform the interpolation, hence CDO is required to be installed. \seealso{ \code{\link[s2dverification]{CDORemap}} } - diff --git a/man/Collect.Rd b/man/Collect.Rd index 44a7dee..97b529b 100644 --- a/man/Collect.Rd +++ b/man/Collect.Rd @@ -83,4 +83,3 @@ of results as one data array when the execution is done. See more details on } } - diff --git a/man/Compute.Rd b/man/Compute.Rd index e07106a..7d6db4d 100644 --- a/man/Compute.Rd +++ b/man/Compute.Rd @@ -4,9 +4,18 @@ \alias{Compute} \title{Specify the execution parameters and trigger the execution} \usage{ -Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1, - cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, - silent = FALSE, debug = FALSE, wait = TRUE) +Compute( + workflow, + chunks = "auto", + threads_load = 1, + threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, + debug = FALSE, + wait = TRUE +) } \arguments{ \item{workflow}{A list of the class 'startR_workflow' returned by function @@ -104,4 +113,3 @@ arrays and additional metadata. 
res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) } - diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd index 65beab8..588f63a 100644 --- a/man/NcCloser.Rd +++ b/man/NcCloser.Rd @@ -32,4 +32,3 @@ NcCloser(connection) \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcDimReader}} \code{\link{NcVarReader}} } - diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd index a6d32c7..9014789 100644 --- a/man/NcDataReader.Rd +++ b/man/NcDataReader.Rd @@ -4,8 +4,13 @@ \alias{NcDataReader} \title{NetCDF file data reader for 'startR'} \usage{ -NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDataReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -61,4 +66,3 @@ in turn uses nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDimReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd index d539ffd..38dd870 100644 --- a/man/NcDimReader.Rd +++ b/man/NcDimReader.Rd @@ -4,8 +4,13 @@ \alias{NcDimReader} \title{NetCDF dimension reader for 'startR'} \usage{ -NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDimReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ This function uses the function NcReadDims() in the package 'easyNCDF'. \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd index e46384c..30885fc 100644 --- a/man/NcOpener.Rd +++ b/man/NcOpener.Rd @@ -34,4 +34,3 @@ NcCloser(connection) \code{\link{NcDimReader}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd index c601907..fb093ae 100644 --- a/man/NcVarReader.Rd +++ b/man/NcVarReader.Rd @@ -4,8 +4,13 @@ \alias{NcVarReader} \title{NetCDF variable reader for 'startR'} \usage{ -NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - var_name = NULL, synonims) +NcVarReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + var_name = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ nc_var_get() in the package 'ncdf4'. 
\code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcDimReader}} } - diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd index ef83575..e1cf112 100644 --- a/man/SelectorChecker.Rd +++ b/man/SelectorChecker.Rd @@ -4,8 +4,7 @@ \alias{SelectorChecker} \title{Translate a set of selectors into a set of numeric indices} \usage{ -SelectorChecker(selectors, var = NULL, return_indices = TRUE, - tolerance = NULL) +SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL) } \arguments{ \item{selectors}{A vector or a list of two of numeric indices or variable @@ -50,4 +49,3 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257] SelectorChecker(sub_array_of_selectors, sub_array_of_values) } - diff --git a/man/Sort.Rd b/man/Sort.Rd index 9ab516e..25a92fe 100644 --- a/man/Sort.Rd +++ b/man/Sort.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/Sort.R \name{Sort} -\alias{CircularSort} \alias{Sort} +\alias{CircularSort} \title{Sort the coordinate variable values in a Start() call} \usage{ Sort(...) @@ -10,12 +10,12 @@ Sort(...) CircularSort(start, end, ...) } \arguments{ +\item{\dots}{Additional parameters to adjust the reorderig. See function +sort() for more details.} + \item{start}{A numeric indicating the lower bound of the circular range.} \item{end}{A numeric indicating the upper bound of the circular range.} - -\item{\dots}{Additional parameters to adjust the reorderig. See function -sort() for more details.} } \value{ A list of 2 containing: @@ -57,4 +57,3 @@ range. This is useful for circular coordinates such as the Earth longitudes. retrieve = FALSE) } - diff --git a/man/Start.Rd b/man/Start.Rd index 680168e..efd258f 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -4,408 +4,36 @@ \alias{Start} \title{Declare, discover, subset and retrieve multidimensional distributed data sets} \usage{ -Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, - file_var_reader = NcVarReader, file_dim_reader = NcDimReader, - file_data_reader = NcDataReader, file_closer = NcCloser, - transform = NULL, transform_params = NULL, transform_vars = NULL, - transform_extra_cells = 2, apply_indices_after_transform = FALSE, - pattern_dims = NULL, metadata_dims = NULL, - selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, largest_dims_length = FALSE, - retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, - silent = FALSE, debug = FALSE) +Start( + ..., + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, + debug = FALSE +) } \arguments{ -\item{return_vars}{A named list where the names are the names of the -variables to be fetched in the files, and the values are vectors of -character strings with the names of the file dimension which to retrieve each -variable for, or NULL if the 
variable has to be retrieved only once -from any (the first) of the involved files.\cr\cr -Apart from retrieving a multidimensional data array, retrieving auxiliary -variables inside the files can also be needed. The parameter -'return_vars' allows for requesting such variables, as long as a -'file_var_reader' function is also specified in the call to -Start() (see documentation on the corresponding parameter). -\cr\cr -In the case of the the item sales example (see documentation on parameter -\code{\dots)}, the store location variable is requested with the parameter\cr -\code{return_vars = list(store_location = NULL)}.\cr This will cause -Start() to fetch once the variable 'store_location' and return it in -the component\cr \code{$Variables$common$store_location},\cr and will be an -array of character strings with the location names, with the dimensions -\code{c('store' = 100)}. Although useless in this example, we could ask -Start() to fetch and return such variable for each file along the -items dimension as follows: \cr -\code{return_vars = list(store_location = c('item'))}.\cr In that case, the -variable will be fetched once from a file of each of the items, and will be -returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}. -\cr\cr -If a variable is requested along a file dimension that contains path pattern -specifications ('source' in the example), the fetched variable values will be -returned in the component\cr \code{$Variables$$}.\cr -For example: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item'))) -\cr # # Checking the structure of the returned variables -\cr # str(found_data$Variables) -\cr # Named list -\cr # ..$common: NULL -\cr # ..$sourceA: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # ..$sourceB: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # # Checking the dimensions of the returned variable -\cr # # for the source A -\cr # dim(found_data$Variables$sourceA) -\cr # item store -\cr # 3 3 -} -\cr\cr -The names of the requested variables do not necessarily have to match the -actual variable names inside the files. A list of alternative names to be -seeked can be specified via the parameter 'synonims'.} - -\item{synonims}{A named list where the names are the requested variable or -dimension names, and the values are vectors of character strings with -alternative names to seek for such dimension or variable.\cr\cr -In some requests, data from different sources may follow different naming -conventions for the dimensions or variables, or even files in the same source -could have varying names. This parameter is in order for Start() to -properly identify the dimensions or variables with different names. -\cr\cr -In the example used in parameter 'return_vars', it may be the case that -the two involved data sources follow slightly different naming conventions. 
-For example, source A uses 'sect' as name for the sections dimension, whereas -source B uses 'section'; source A uses 'store_loc' as variable name for the -store locations, whereas source B uses 'store_location'. This can be taken -into account as follows: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item')), -\cr # synonims = list( -\cr # section = c('sec', 'section'), -\cr # store_location = c('store_loc', -\cr # 'store_location') -\cr # )) -} -\cr} - -\item{file_opener}{A function that receives as a single parameter - 'file_path' a character string with the path to a file to be opened, - and returns an object with an open connection to the file (optionally with - header information) on success, or returns NULL on failure. -\cr\cr -This parameter takes by default NcOpener() (an opener function for NetCDF -files). -\cr\cr -See NcOpener() for a template to build a file opener for your own file -format.} - -\item{file_var_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, - \code{synonims} that returns an array with auxiliary data (i.e. data from a - variable) inside a file. Start() will provide automatically either a - 'file_path' or a 'file_object' to the 'file_var_reader' - function (the function has to be ready to work whichever of these two is - provided). The parameter 'file_selectors' will also be provided - automatically to the variable reader, containing a named list where the - names are the names of the file dimensions of the queried data set (see - documentation on \code{\dots}) and the values are single character strings - with the components used to build the path to the file being read (the one - provided in 'file_path' or 'file_object'). The parameter 'var_name' - will be filled in automatically by Start() also, with the name of one - of the variales to be read. The parameter 'synonims' will be filled in - with exactly the same value as provided in the parameter 'synonims' in - the call to Start(), and has to be used in the code of the variable - reader to check for alternative variable names inside the target file. The - 'file_var_reader' must return a (multi)dimensional array with named - dimensions, and optionally with the attribute 'variales' with other - additional metadata on the retrieved variable. -\cr\cr -Usually, the 'file_var_reader' should be a degenerate case of the -'file_data_reader' (see documentation on the corresponding parameter), -so it is recommended to code the 'file_data_reder' in first place. -\cr\cr -This parameter takes by default NcVarReader() (a variable reader function -for NetCDF files). 
-\cr\cr -See NcVarReader() for a template to build a variale reader for your own -file format.} - -\item{file_dim_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} - that returns a named numeric vector where the names are the names of the - dimensions of the multidimensional data array in the file and the values are - the sizes of such dimensions. Start() will provide automatically - either a 'file_path' or a 'file_object' to the - 'file_dim_reader' function (the function has to be ready to work - whichever of these two is provided). The parameter 'file_selectors' - will also be provided automatically to the dimension reader, containing a - named list where the names are the names of the file dimensions of the - queried data set (see documentation on \code{\dots}) and the values are - single character strings with the components used to build the path to the - file being read (the one provided in 'file_path' or 'file_object'). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and can optionally be used in advanced configurations. -\cr\cr -This parameter takes by default NcDimReader() (a dimension reader -function for NetCDF files). -\cr\cr -See NcDimReader() for (an advanced) template to build a dimension reader -for your own file format.} - -\item{file_data_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, - \code{inner_indices = NULL}, \code{synonims} that returns a subset of the - multidimensional data array inside a file (even if internally it is not an - array). Start() will provide automatically either a 'file_path' - or a 'file_object' to the 'file_data_reader' function (the - function has to be ready to work whichever of these two is provided). The - parameter 'file_selectors' will also be provided automatically to the - data reader, containing a named list where the names are the names of the - file dimensions of the queried data set (see documentation on \code{\dots}) - and the values are single character strings with the components used to - build the path to the file being read (the one provided in 'file_path' or - 'file_object'). The parameter 'inner_indices' will be filled in - automatically by Start() also, with a named list of numeric vectors, - where the names are the names of all the expected inner dimensions in a file - to be read, and the numeric vectors are the indices to be taken from the - corresponding dimension (the indices may not be consecutive nor in order). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and has to be used in the code of the data reader to check for alternative - dimension names inside the target file. The 'file_data_reader' must - return a (multi)dimensional array with named dimensions, and optionally with - the attribute 'variables' with other additional metadata on the retrieved - data. -\cr\cr -Usually, 'file_data_reader' should use 'file_dim_reader' -(see documentation on the corresponding parameter), so it is recommended to -code 'file_dim_reder' in first place. -\cr\cr -This parameter takes by default NcDataReader() (a data reader function -for NetCDF files). 
-\cr\cr -See NcDataReader() for a template to build a data reader for your own -file format.} - -\item{file_closer}{A function that receives as a single parameter - 'file_object' an open connection (as returned by 'file_opener') - to one of the files to be read, optionally with header information, and - closes the open connection. Always returns NULL. -\cr\cr -This parameter takes by default NcCloser() (a closer function for NetCDF -files). -\cr\cr -See NcCloser() for a template to build a file closer for your own file -format.} - -\item{transform}{A function with the header \code{dara_array}, -\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as -input, through the parameter \code{data_array}, a subset of a -multidimensional array (as returned by 'file_data_reader'), applies a -transformation to it and returns it, preserving the amount of dimensions but -potentially modifying their size. This transformation may require data from -other auxiliary variables, automatically provided to 'transform' -through the parameter 'variables', in the form of a named list where -the names are the variable names and the values are (multi)dimensional -arrays. Which variables need to be sent to 'transform' can be specified -with the parameter 'transform_vars' in Start(). The parameter -'file_selectors' will also be provided automatically to -'transform', containing a named list where the names are the names of -the file dimensions of the queried data set (see documentation on -\code{\dots}) and the values are single character strings with the -components used to build the path to the file the subset being processed -belongs to. The parameter \code{\dots} will be filled in with other -additional parameters to adjust the transformation, exactly as provided in -the call to Start() via the parameter 'transform_params'.} - -\item{transform_params}{A named list with additional parameters to be sent to -the 'transform' function (if specified). See documentation on parameter -'transform' for details.} - -\item{transform_vars}{A vector of character strings with the names of -auxiliary variables to be sent to the 'transform' function (if -specified). All the variables to be sent to 'transform' must also -have been requested as return variables in the parameter 'return_vars' -of Start().} - -\item{transform_extra_cells}{An integer of extra indices to retrieve from the -data set, beyond the requested indices in \code{\dots}, in order for -'transform' to dispose of additional information to properly apply -whichever transformation (if needed). As many as -'transform_extra_cells' will be retrieved beyond each of the limits for -each of those inner dimensions associated to a coordinate variable and sent -to 'transform' (i.e. present in 'transform_vars'). After -'transform' has finished, Start() will take again and return a -subset of the result, for the returned data to fall within the specified -bounds in \code{\dots}. The default value is 2.} - -\item{apply_indices_after_transform}{A logical value indicating when a -'transform' is specified in Start() and numeric indices are -provided for any of the inner dimensions that depend on coordinate variables, -these numeric indices can be made effective (retrieved) before applying the -transformation or after. The boolean flag allows to adjust this behaviour. 
-It takes FALSE by default (numeric indices are applied before sending -data to 'transform').} - -\item{pattern_dims}{A character string indicating the name of the dimension -with path pattern specifications (see \code{\dots} for details). If not -specified, Start() assumes the first provided dimension is the pattern -dimension, with a warning.} - -\item{metadata_dims}{A vector of character strings with the names of the file -dimensions which to return metadata for. As noted in 'file_data_reader', -the data reader can optionally return auxiliary data via the attribute -'variables' of the returned array. Start() by default returns the -auxiliary data read for only the first file of each source (or data set) in -the pattern dimension (see \code{\dots} for info on what the pattern -dimension is). However it can be configured to return the metadata for all -the files along any set of file dimensions. The default value is NULL, and -it will be assigned automatically as parameter 'pattern_dims'.} - -\item{selector_checker}{A function used internaly by Start() to -translate a set of selectors (values for a dimension associated to a -coordinate variable) into a set of numeric indices. It takes by default -SelectorChecker() and, in principle, it should not be required to -change it for customized file formats. The option to replace it is left open -for more versatility. See the code of SelectorChecker() for details on -the inputs, functioning and outputs of a selector checker.} - -\item{merge_across_dims}{A logical value indicating whether to merge -dimensions across which another dimension extends (according to the -'_across' parameters). Takes the value FALSE by default. For -example, if the dimension 'time' extends across the dimension 'chunk' and -\code{merge_across_dims = TRUE}, the resulting data array will only contain -only the dimension 'time' as long as all the chunks together.} - -\item{merge_across_dims_narm}{A logical value indicating whether to remove -the additional NAs from data when parameter 'merge_across_dims' is TRUE. -It is helpful when the length of the to-be-merged dimension is different -across another dimension. For example, if the dimension 'time' extends -across dimension 'chunk', and the time length along the first chunk is 2 -while along the second chunk is 10. Setting this parameter as TRUE can -remove the additional 8 NAs at position 3 to 10. The default value is FALSE.} - -\item{split_multiselected_dims}{A logical value indicating whether to split a -dimension that has been selected with a multidimensional array of selectors -into as many dimensions as present in the selector array. The default value -is FALSE.} - -\item{path_glob_permissive}{A logical value or an integer specifying how many - folder levels in the path pattern, beginning from the end, the shell glob - expressions must be preserved and worked out for each file. The default - value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr -When specifying a path pattern for a dataset, it might contain shell glob -experissions. For each dataset, the first file matching the path pattern is -found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. 
However, in -some cases, the values of the shell glob expressions may not be constant for -all files in a dataset, and they need to be worked out for each file -involved.\cr\cr -For example, a path pattern could be as follows: \cr -\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving -\code{path_glob_permissive = FALSE} will trigger automatic seek of the - contents to replace the asterisks (e.g. the first asterisk matches with - \code{'bar'} and the second with \code{'baz'}. The found contents will be - used for all files in the dataset (in the example, the path pattern will be - fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if - any of the files in the dataset have other contents in the position of the - asterisks, Start() will not find them (in the example, a file like \cr - \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be - found). Setting \code{path_glob_permissive = 1} would preserve global - expressions in the latest level (in the example, the fixed path pattern - would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the - problematic file mentioned before would be found), but of course this would - slow down the Start() call if the dataset involves a large number of - files. Setting \code{path_glob_permissive = 2} would leave the original path - pattern with the original glob expressions in the 1st and 2nd levels (in the - example, both asterisks would be preserved, thus would allow Start() - to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr -Note that each glob expression can only represent one possibility (Start() -chooses the first). Because /code{*} is not the tag, which means it cannot -be a dimension of the output array. Therefore, only one possibility can be -adopted. For example, if \cr -\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr -has two matches:\cr -\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr -\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr -only the first found file will be used.} - -\item{largest_dims_length}{A logical value or a named integer vector - indicating if Start() should examine all the files to get the largest - length of the inner dimensions (TRUE) or use the first valid file of each - dataset as the returned dimension length (FALSE). Since examining all the - files could be time-consuming, a vector can be used to explicitly specify - the expected length of the inner dimensions. For those inner dimensions not - specified, the first valid file will be used. The default value is FALSE.\cr\cr - This parameter is useful when the required files don't have consistent - inner dimension. For example, there are 10 required experimental data files - of a series of start dates. The data only contain 25 members for the first - 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, - the returned member dimension length will be 25 only. The 26th to 51st - members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, - the returned member dimension length will be 51. To save the resource, -\code{'largest_dims_length = c(member = 51)'} can also be used.} - -\item{retrieve}{A logical value indicating whether to retrieve the data -defined in the Start() call or to explore only its dimension lengths -and names, and the values for the file and inner dimensions. 
The default
-value is FALSE.}
-
-\item{num_procs}{An integer of number of processes to be created for the
-parallel execution of the retrieval/transformation/arrangement of the
-multiple involved files in a call to Start(). If set to NULL,
-takes the number of available cores (as detected by detectCores() in
-the package 'future'). The default value is 1 (no parallel execution).}
-
-\item{ObjectBigmemory}{a character string to be included as part of the
-bigmemory object name. This parameter is thought to be used internally by the
-chunking capabilities of startR.}
-
-\item{silent}{A logical value of whether to display progress messages (FALSE)
-or not (TRUE). The default value is FALSE.}
-
-\item{debug}{A logical value of whether to return detailed messages on the
-progress and operations in a Start() call (TRUE) or not (FALSE). The
-default value is FALSE.}
-
\item{\dots}{A selection of custemized parameters depending on the data
format. When we retrieve data from one or a collection of data sets,
the involved data can be perceived as belonging to a large multi-dimensional
@@ -620,61 +248,451 @@ as they have been specified in the call. For example, the following call: \cr
# variable = 'all')
}
\cr\cr
-would return an array with the following dimensions:
-\cr
-\command{
-\cr # source month store item section variable
-\cr # 1 24 100 3 2 2
-}
+would return an array with the following dimensions:
+\cr
+\command{
+\cr # source month store item section variable
+\cr # 1 24 100 3 2 2
+}
+\cr\cr
+Next, a more advanced example to retrieve data for only the sales records, for
+the first section ('electronics'), for the 1st and 3rd items and for the
+stores located in Barcelona (assuming the files contain the variable
+'store_location' with the name of the city in which each of the 100 stores is
+located):
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr # '$section$/$item$.data'),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = NULL))
+}
+\cr\cr
+The defined names for the dimensions do not necessarily have to match the
+names of the dimensions inside the file. Lists of alternative names to be
+sought can be defined in the parameter 'synonims'.
+\cr\cr
+If data from multiple sources (not necessarily following the same structure)
+has to be retrieved, it can be done by providing a vector of character strings
+with path pattern specifications, or, in the extended form, by providing a
+list of lists with the components 'name' and 'path', and the name of the
+dataset and path pattern as values, respectively.
For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr # list(name = 'sourceA',
+\cr # path = paste0('/sourceA/$variable$/',
+\cr # '$section$/$item$.data')),
+\cr # list(name = 'sourceB',
+\cr # path = paste0('/sourceB/$section$/',
+\cr # '$variable$/$item$.data'))
+\cr # ),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = NULL))
+}
+\cr}

+\item{return_vars}{A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimensions for which to retrieve
+each variable, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.\cr\cr
+Apart from retrieving a multidimensional data array, retrieving auxiliary
+variables inside the files can also be needed. The parameter
+'return_vars' allows for requesting such variables, as long as a
+'file_var_reader' function is also specified in the call to
+Start() (see documentation on the corresponding parameter).
+\cr\cr
+In the case of the item sales example (see documentation on parameter
+\code{\dots}), the store location variable is requested with the parameter\cr
+\code{return_vars = list(store_location = NULL)}.\cr This will cause
+Start() to fetch the variable 'store_location' once and return it in
+the component\cr \code{$Variables$common$store_location},\cr which will be an
+array of character strings with the location names, with the dimensions
+\code{c('store' = 100)}. Although useless in this example, we could ask
+Start() to fetch and return such a variable for each file along the
+items dimension as follows: \cr
+\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+variable will be fetched once from a file of each of the items, and will be
+returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+\cr\cr
+If a variable is requested along a file dimension that contains path pattern
+specifications ('source' in the example), the fetched variable values will be
+returned in the component\cr \code{$Variables$$}.\cr
+For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr # list(name = 'sourceA',
+\cr # path = paste0('/sourceA/$variable$/',
+\cr # '$section$/$item$.data')),
+\cr # list(name = 'sourceB',
+\cr # path = paste0('/sourceB/$section$/',
+\cr # '$variable$/$item$.data'))
+\cr # ),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = c('source',
+\cr # 'item')))
+\cr # # Checking the structure of the returned variables
+\cr # str(found_data$Variables)
+\cr # Named list
+\cr # ..$common: NULL
+\cr # ..$sourceA: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # ..$sourceB: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # # Checking the dimensions of the returned variable
+\cr # # for the source A
+\cr # dim(found_data$Variables$sourceA$store_location)
+\cr # item store
+\cr # 3 3
+}
+\cr\cr
+The names of the requested variables do not necessarily have to match the
+actual variable names inside the files.
A list of alternative names to be
+sought can be specified via the parameter 'synonims'.}

+\item{synonims}{A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter allows Start() to
+properly identify dimensions or variables that appear under different names.
+\cr\cr
+In the example used in parameter 'return_vars', it may be the case that
+the two involved data sources follow slightly different naming conventions.
+For example, source A uses 'sect' as name for the sections dimension, whereas
+source B uses 'section'; source A uses 'store_loc' as variable name for the
+store locations, whereas source B uses 'store_location'. This can be taken
+into account as follows:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr # list(name = 'sourceA',
+\cr # path = paste0('/sourceA/$variable$/',
+\cr # '$section$/$item$.data')),
+\cr # list(name = 'sourceB',
+\cr # path = paste0('/sourceB/$section$/',
+\cr # '$variable$/$item$.data'))
+\cr # ),
+\cr # variable = 'sales',
+\cr # section = 'first',
+\cr # item = indices(c(1, 3)),
+\cr # item_depends = 'section',
+\cr # store = 'Barcelona',
+\cr # store_var = 'store_location',
+\cr # month = 'all',
+\cr # return_vars = list(store_location = c('source',
+\cr # 'item')),
+\cr # synonims = list(
+\cr # section = c('sect', 'section'),
+\cr # store_location = c('store_loc',
+\cr # 'store_location')
+\cr # ))
+}
+\cr}

+\item{file_opener}{A function that receives as a single parameter
+ 'file_path' a character string with the path to a file to be opened,
+ and returns an object with an open connection to the file (optionally with
+ header information) on success, or returns NULL on failure.
+\cr\cr
+This parameter takes by default NcOpener() (an opener function for NetCDF
+files).
+\cr\cr
+See NcOpener() for a template to build a file opener for your own file
+format.}

+\item{file_var_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
+ \code{synonims} that returns an array with auxiliary data (i.e. data from a
+ variable) inside a file. Start() will provide automatically either a
+ 'file_path' or a 'file_object' to the 'file_var_reader'
+ function (the function has to be ready to work with whichever of these two
+ is provided). The parameter 'file_selectors' will also be provided
+ automatically to the variable reader, containing a named list where the
+ names are the names of the file dimensions of the queried data set (see
+ documentation on \code{\dots}) and the values are single character strings
+ with the components used to build the path to the file being read (the one
+ provided in 'file_path' or 'file_object'). The parameter 'var_name'
+ will also be filled in automatically by Start(), with the name of one
+ of the variables to be read. The parameter 'synonims' will be filled in
+ with exactly the same value as provided in the parameter 'synonims' in
+ the call to Start(), and has to be used in the code of the variable
+ reader to check for alternative variable names inside the target file.
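+\cr\cr
+For illustration only, a custom variable reader could follow the skeleton
+below (the functions prefixed with 'my_format_' are hypothetical stand-ins
+for the I/O routines of the custom file format):
+\cr
+\command{
+\cr # my_var_reader <- function(file_path = NULL, file_object = NULL,
+\cr #                           file_selectors = NULL, var_name, synonims) {
+\cr #   if (is.null(file_object)) file_object <- my_format_open(file_path)
+\cr #   # try the requested name plus its alternatives from 'synonims'
+\cr #   found_name <- my_format_find_var(file_object,
+\cr #                                    c(var_name, synonims[[var_name]]))
+\cr #   my_format_read_var(file_object, found_name)
+\cr # }
+}
+\cr\cr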
The
+ 'file_var_reader' must return a (multi)dimensional array with named
+ dimensions, and optionally with the attribute 'variables' with other
+ additional metadata on the retrieved variable.
+\cr\cr
+Usually, the 'file_var_reader' should be a degenerate case of the
+'file_data_reader' (see documentation on the corresponding parameter),
+so it is recommended to code the 'file_data_reader' first.
+\cr\cr
+This parameter takes by default NcVarReader() (a variable reader function
+for NetCDF files).
+\cr\cr
+See NcVarReader() for a template to build a variable reader for your own
+file format.}

+\item{file_dim_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+ that returns a named numeric vector where the names are the names of the
+ dimensions of the multidimensional data array in the file and the values are
+ the sizes of such dimensions. Start() will provide automatically
+ either a 'file_path' or a 'file_object' to the
+ 'file_dim_reader' function (the function has to be ready to work
+ with whichever of these two is provided). The parameter 'file_selectors'
+ will also be provided automatically to the dimension reader, containing a
+ named list where the names are the names of the file dimensions of the
+ queried data set (see documentation on \code{\dots}) and the values are
+ single character strings with the components used to build the path to the
+ file being read (the one provided in 'file_path' or 'file_object').
+ The parameter 'synonims' will be filled in with exactly the same value
+ as provided in the parameter 'synonims' in the call to Start(),
+ and can optionally be used in advanced configurations.
+\cr\cr
+This parameter takes by default NcDimReader() (a dimension reader
+function for NetCDF files).
+\cr\cr
+See NcDimReader() for (an advanced) template to build a dimension reader
+for your own file format.}

+\item{file_data_reader}{A function with the header \code{file_path = NULL},
+ \code{file_object = NULL}, \code{file_selectors = NULL},
+ \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
+ multidimensional data array inside a file (even if internally it is not an
+ array). Start() will provide automatically either a 'file_path'
+ or a 'file_object' to the 'file_data_reader' function (the
+ function has to be ready to work with whichever of these two is provided).
+ The parameter 'file_selectors' will also be provided automatically to the
+ data reader, containing a named list where the names are the names of the
+ file dimensions of the queried data set (see documentation on \code{\dots})
+ and the values are single character strings with the components used to
+ build the path to the file being read (the one provided in 'file_path' or
+ 'file_object'). The parameter 'inner_indices' will also be filled in
+ automatically by Start(), with a named list of numeric vectors,
+ where the names are the names of all the expected inner dimensions in a file
+ to be read, and the numeric vectors are the indices to be taken from the
+ corresponding dimension (the indices may be neither consecutive nor in
+ order). The parameter 'synonims' will be filled in with exactly the same
+ value as provided in the parameter 'synonims' in the call to Start(),
+ and has to be used in the code of the data reader to check for alternative
+ dimension names inside the target file.
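+\cr\cr
+As an illustration, Start() ends up invoking the data reader roughly as
+follows (a sketch with hypothetical path and values, reusing the item sales
+example; note that the indices may arrive unordered):
+\cr
+\command{
+\cr # file_data_reader(file_path = '/data/sales/first/item1.data',
+\cr #                  file_selectors = list(variable = 'sales',
+\cr #                                        section = 'first',
+\cr #                                        item = 'item1'),
+\cr #                  inner_indices = list(month = c(2, 1, 3),
+\cr #                                       store = 1:100),
+\cr #                  synonims = list(month = c('month', 'time')))
+}
+\cr\cr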
The 'file_data_reader' must
+ return a (multi)dimensional array with named dimensions, and optionally with
+ the attribute 'variables' with other additional metadata on the retrieved
+ data.
+\cr\cr
+Usually, 'file_data_reader' should use 'file_dim_reader'
+(see documentation on the corresponding parameter), so it is recommended to
+code 'file_dim_reader' first.
\cr\cr
-Next, a more advanced example to retrieve data for only the sales records, for
-the first section ('electronics'), for the 1st and 3rd items and for the
-stores located in Barcelona (assuming the files contain the variable
-'store_location' with the name of the city each of the 100 stores are located
-at):
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr # '$section$/$item$.data'),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = NULL))
-}
\cr\cr
-The defined names for the dimensions do not necessarily have to match the
-names of the dimensions inside the file. Lists of alternative names to be
-seeked can be defined in the parameter 'synonims'.
\cr\cr
-If data from multiple sources (not necessarily following the same structure)
-has to be retrieved, it can be done by providing a vector of character strings
-with path pattern specifications, or, in the extended form, by providing a
-list of lists with the components 'name' and 'path', and the name of the
-dataset and path pattern as values, respectively. For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr # list(name = 'sourceA',
-\cr # path = paste0('/sourceA/$variable$/',
-\cr # '$section$/$item$.data')),
-\cr # list(name = 'sourceB',
-\cr # path = paste0('/sourceB/$section$/',
-\cr # '$variable$/$item$.data'))
-\cr # ),
-\cr # variable = 'sales',
-\cr # section = 'first',
-\cr # item = indices(c(1, 3)),
-\cr # item_depends = 'section',
-\cr # store = 'Barcelona',
-\cr # store_var = 'store_location',
-\cr # month = 'all',
-\cr # return_vars = list(store_location = NULL))
-}
-\cr}
+This parameter takes by default NcDataReader() (a data reader function
+for NetCDF files).
\cr\cr
+See NcDataReader() for a template to build a data reader for your own
+file format.}

+\item{file_closer}{A function that receives as a single parameter
+ 'file_object' an open connection (as returned by 'file_opener')
+ to one of the files to be read, optionally with header information, and
+ closes the open connection. Always returns NULL.
+\cr\cr
+This parameter takes by default NcCloser() (a closer function for NetCDF
+files).
+\cr\cr
+See NcCloser() for a template to build a file closer for your own file
+format.}

+\item{transform}{A function with the header \code{data_array},
+\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+input, through the parameter \code{data_array}, a subset of a
+multidimensional array (as returned by 'file_data_reader'), applies a
+transformation to it and returns it, preserving the number of dimensions but
+potentially modifying their sizes. This transformation may require data from
+other auxiliary variables, automatically provided to 'transform'
+through the parameter 'variables', in the form of a named list where
+the names are the variable names and the values are (multi)dimensional
+arrays.
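+\cr\cr
+A minimal sketch of a conforming 'transform' (the 'offset' parameter is
+hypothetical and stands in for a real operation such as regridding; it
+would be supplied through 'transform_params'):
+\cr
+\command{
+\cr # my_transform <- function(data_array, variables,
+\cr #                          file_selectors = NULL, offset = 0, ...) {
+\cr #   # 'variables' holds the auxiliary arrays requested through
+\cr #   # 'transform_vars'; a real transformation would make use of them.
+\cr #   # A trivial shift preserves all dimensions and their sizes.
+\cr #   data_array + offset
+\cr # }
+}
+\cr\cr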
Which variables need to be sent to 'transform' can be specified
+with the parameter 'transform_vars' in Start(). The parameter
+'file_selectors' will also be provided automatically to
+'transform', containing a named list where the names are the names of
+the file dimensions of the queried data set (see documentation on
+\code{\dots}) and the values are single character strings with the
+components used to build the path to the file the subset being processed
+belongs to. The parameter \code{\dots} will be filled in with other
+additional parameters to adjust the transformation, exactly as provided in
+the call to Start() via the parameter 'transform_params'.}

+\item{transform_params}{A named list with additional parameters to be sent to
+the 'transform' function (if specified). See documentation on parameter
+'transform' for details.}

+\item{transform_vars}{A vector of character strings with the names of
+auxiliary variables to be sent to the 'transform' function (if
+specified). All the variables to be sent to 'transform' must also
+have been requested as return variables in the parameter 'return_vars'
+of Start().}

+\item{transform_extra_cells}{An integer of extra indices to retrieve from the
+data set, beyond the requested indices in \code{\dots}, so that
+'transform' has additional information with which to properly apply
+whichever transformation is needed. As many as
+'transform_extra_cells' indices will be retrieved beyond each of the limits
+of each of those inner dimensions associated to a coordinate variable and
+sent to 'transform' (i.e. present in 'transform_vars'). After
+'transform' has finished, Start() will again take and return a
+subset of the result, so that the returned data falls within the specified
+bounds in \code{\dots}. The default value is 2.}

+\item{apply_indices_after_transform}{A logical value indicating, when a
+'transform' is specified in Start() and numeric indices are
+provided for any of the inner dimensions that depend on coordinate variables,
+whether these numeric indices are made effective (retrieved) before or after
+applying the transformation.
+It takes FALSE by default (numeric indices are applied before sending
+data to 'transform').}

+\item{pattern_dims}{A character string indicating the name of the dimension
+with path pattern specifications (see \code{\dots} for details). If not
+specified, Start() assumes the first provided dimension is the pattern
+dimension, with a warning.}

+\item{metadata_dims}{A vector of character strings with the names of the file
+dimensions for which to return metadata. As noted in 'file_data_reader',
+the data reader can optionally return auxiliary data via the attribute
+'variables' of the returned array. Start() by default returns the
+auxiliary data read for only the first file of each source (or data set) in
+the pattern dimension (see \code{\dots} for info on what the pattern
+dimension is). However, it can be configured to return the metadata for all
+the files along any set of file dimensions. The default value is NULL, in
+which case it is automatically assigned the value of 'pattern_dims'.}

+\item{selector_checker}{A function used internally by Start() to
+translate a set of selectors (values for a dimension associated to a
+coordinate variable) into a set of numeric indices. It takes by default
+SelectorChecker() and, in principle, it should not be required to
+change it for customized file formats.
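+\cr\cr
+Conceptually (a hypothetical illustration only), a selector checker performs
+translations such as:
+\cr
+\command{
+\cr # # selectors: values(list(30, 40))
+\cr # # coordinate variable: 20, 30, 40, 50
+\cr # # resulting numeric indices: 2 to 3
+}
+\cr\cr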
The option to replace it is left open
+for more versatility. See the code of SelectorChecker() for details on
+the inputs, functioning and outputs of a selector checker.}

+\item{merge_across_dims}{A logical value indicating whether to merge
+dimensions across which another dimension extends (according to the
+'_across' parameters). Takes the value FALSE by default. For
+example, if the dimension 'time' extends across the dimension 'chunk' and
+\code{merge_across_dims = TRUE}, the resulting data array will contain
+only the dimension 'time', as long as all the chunks together.}

+\item{merge_across_dims_narm}{A logical value indicating whether to remove
+the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+It is helpful when the length of the to-be-merged dimension is different
+across another dimension. For example, if the dimension 'time' extends
+across the dimension 'chunk', and the time length along the first chunk is 2
+while it is 10 along the second chunk, setting this parameter to TRUE
+removes the additional 8 NAs at positions 3 to 10. The default value is
+TRUE, but it will be automatically turned to FALSE if
+'merge_across_dims = FALSE'.}

+\item{split_multiselected_dims}{A logical value indicating whether to split a
+dimension that has been selected with a multidimensional array of selectors
+into as many dimensions as present in the selector array. The default value
+is FALSE.}

+\item{path_glob_permissive}{A logical value or an integer specifying how many
+ folder levels in the path pattern, beginning from the end, the shell glob
+ expressions must be preserved and worked out for each file. The default
+ value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
+When specifying a path pattern for a dataset, it might contain shell glob
+expressions. For each dataset, the first file matching the path pattern is
+found, and the found file is used to work out fixed values for the glob
+expressions that will be used for all the files of the dataset. However, in
+some cases, the values of the shell glob expressions may not be constant for
+all files in a dataset, and they need to be worked out for each file
+involved.\cr\cr
+For example, a path pattern could be as follows: \cr
+\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+\code{path_glob_permissive = FALSE} will trigger an automatic search for the
+ contents to replace the asterisks (e.g. the first asterisk matches with
+ \code{'bar'} and the second with \code{'baz'}). The found contents will be
+ used for all files in the dataset (in the example, the path pattern will be
+ fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
+ any of the files in the dataset have other contents in the position of the
+ asterisks, Start() will not find them (in the example, a file like \cr
+ \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+ found). Setting \code{path_glob_permissive = 1} would preserve glob
+ expressions in the last level (in the example, the fixed path pattern
+ would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+ problematic file mentioned before would be found), but of course this would
+ slow down the Start() call if the dataset involves a large number of
+ files.
Setting \code{path_glob_permissive = 2} would leave the original path
+ pattern with the original glob expressions in the 1st and 2nd levels (in the
+ example, both asterisks would be preserved, thus would allow Start()
+ to recognize files such as \cr
+ \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+Note that each glob expression can only represent one possibility (Start()
+chooses the first). This is because \code{*} is not a tag, which means it
+cannot be a dimension of the output array; therefore, only one possibility
+can be adopted. For example, if \cr
+\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+has two matches:\cr
+\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+only the first found file will be used.}

+\item{largest_dims_length}{A logical value or a named integer vector
+ indicating if Start() should examine all the files to get the largest
+ length of the inner dimensions (TRUE) or use the first valid file of each
+ dataset as the returned dimension length (FALSE). Since examining all the
+ files could be time-consuming, a vector can be used to explicitly specify
+ the expected length of the inner dimensions. For those inner dimensions not
+ specified, the first valid file will be used. The default value is FALSE.\cr\cr
+ This parameter is useful when the required files don't have consistent
+ inner dimension lengths. For example, there are 10 required experimental
+ data files of a series of start dates. The data contain only 25 members for
+ the first 2 years but 51 members for the later years. If \code{'largest_dims_length = FALSE'},
+ the returned member dimension length will be 25 only. The 26th to 51st
+ members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+ the returned member dimension length will be 51. To save resources,
+\code{'largest_dims_length = c(member = 51)'} can also be used.}

+\item{retrieve}{A logical value indicating whether to retrieve the data
+defined in the Start() call or to explore only its dimension lengths
+and names, and the values for the file and inner dimensions. The default
+value is FALSE.}

+\item{num_procs}{An integer indicating the number of processes to be created
+for the parallel execution of the retrieval/transformation/arrangement of the
+multiple involved files in a call to Start(). If set to NULL, it
+takes the number of available cores (as detected by detectCores() in
+the package 'future'). The default value is 1 (no parallel execution).}

+\item{ObjectBigmemory}{A character string to be included as part of the
+bigmemory object name. This parameter is meant to be used internally by the
+chunking capabilities of startR.}

+\item{silent}{A logical value indicating whether to display progress messages
+(FALSE) or not (TRUE). The default value is FALSE.}

+\item{debug}{A logical value indicating whether to return detailed messages on
+the progress and operations in a Start() call (TRUE) or not (FALSE). The
+default value is FALSE.}
}
\value{
If \code{retrieve = TRUE} the involved data is loaded into RAM memory
@@ -830,4 +848,3 @@ file format.
retrieve = FALSE) } - diff --git a/man/Step.Rd b/man/Step.Rd index 65f0c72..c473ccb 100644 --- a/man/Step.Rd +++ b/man/Step.Rd @@ -4,8 +4,13 @@ \alias{Step} \title{Define the operation applied on declared data.} \usage{ -Step(fun, target_dims, output_dims, use_libraries = NULL, - use_attributes = NULL) +Step( + fun, + target_dims, + output_dims, + use_libraries = NULL, + use_attributes = NULL +) } \arguments{ \item{fun}{A function in R format defining the operation to be applied to the @@ -70,4 +75,3 @@ to the expected order for this function. wf <- AddStep(data, step) } - diff --git a/man/indices.Rd b/man/indices.Rd index a3d85ea..6233b71 100644 --- a/man/indices.Rd +++ b/man/indices.Rd @@ -39,4 +39,3 @@ original data. See details in the documentation of the parameter \code{\dots} \seealso{ \code{\link{values}} } - diff --git a/man/values.Rd b/man/values.Rd index 3300f19..31ce95a 100644 --- a/man/values.Rd +++ b/man/values.Rd @@ -41,4 +41,3 @@ coordinate variable. See details in the documentation of the parameter \seealso{ \code{\link{indices}} } - -- GitLab