diff --git a/DESCRIPTION b/DESCRIPTION
index 5a33c3cf0b6cd3569354ef2356e7402c92c3a85c..27d4a775592e188fc7353cc232e428e3c33edf61 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: startR
 Title: Automatically Retrieve Multidimensional Distributed Data Sets
-Version: 2.1.0-3
+Version: 2.1.0-4
 Authors@R: c(
     person("BSC-CNS", role = c("aut", "cph")),
     person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")),
@@ -38,4 +38,4 @@ URL: https://earth.bsc.es/gitlab/es/startR/
 BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues
 LazyData: true
 SystemRequirements: cdo ecFlow
-RoxygenNote: 5.0.0
+RoxygenNote: 7.0.1
diff --git a/R/NcDataReader.R b/R/NcDataReader.R
index 741ffbb6d1ca54f4e1e224df04feddefef9682fb..ebc58fc27c6a6c5a379e9773de1ffe27bed2ecc3 100644
--- a/R/NcDataReader.R
+++ b/R/NcDataReader.R
@@ -221,6 +221,13 @@ NcDataReader <- function(file_path = NULL, file_object = NULL,
           # Origin year and month
           ori_year <- as.numeric(substr(parts[2], 1, 4))
           ori_month <- as.numeric(substr(parts[2], 6, 7))
+          if (is.na(ori_month)) {
+            ori_month <- as.numeric(substr(parts[2], 6, 6))
+          }
+          if (is.na(ori_year) | is.na(ori_month)) {
+            stop(paste0("The time unit attribute format is not 'YYYY-MM-DD' or 'YYYY-M-D'. ",
+                        "Check the file or contact the maintainer."))
+          }
           if (calendar == 'gregorian') {
             # Find how many years + months
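For reference, here is a minimal standalone sketch of the parsing fallback added above. Note that `as.numeric()` returns `NA` (not a non-numeric object) on a malformed string, which is why the check must use `is.na()`. The unit string below is a hypothetical example; NcDataReader() reads the real one from the time variable's units attribute:

```r
# Hypothetical CF-style time unit using the 'YYYY-M-D' form
units <- "days since 1999-1-1"
parts <- strsplit(units, " since ")[[1]]
ori_year <- as.numeric(substr(parts[2], 1, 4))   # "1999" -> 1999
ori_month <- as.numeric(substr(parts[2], 6, 7))  # "1-"   -> NA (coercion warning)
if (is.na(ori_month)) {
  ori_month <- as.numeric(substr(parts[2], 6, 6))  # retry with a single digit
}
if (is.na(ori_year) | is.na(ori_month)) {
  stop("The time unit attribute format is not 'YYYY-MM-DD' or 'YYYY-M-D'.")
}
```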
diff --git a/R/Start.R b/R/Start.R
index 80e1c83d869f45cfc6228078329d87f1816d7bb7..ff4b97831b5c7337cdab1da166b9016895ac92b3 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -632,7 +632,8 @@
 #'  across another dimension. For example, if the dimension 'time' extends
 #'  across dimension 'chunk', and the time length along the first chunk is 2
 #'  while along the second chunk is 10. Setting this parameter as TRUE can
-#'  remove the additional 8 NAs at position 3 to 10. The default value is FALSE.
+#'  remove the additional 8 NAs at position 3 to 10. The default value is TRUE,
+#'  but it is automatically changed to FALSE if 'merge_across_dims = FALSE'.
 #'@param split_multiselected_dims A logical value indicating whether to split a
 #'  dimension that has been selected with a multidimensional array of selectors
 #'  into as many dimensions as present in the selector array. The default value
@@ -820,7 +821,7 @@ Start <- function(..., # dim = indices/selectors,
                   metadata_dims = NULL,
                   selector_checker = SelectorChecker,
                   merge_across_dims = FALSE,
-                  merge_across_dims_narm = FALSE,
+                  merge_across_dims_narm = TRUE,
                   split_multiselected_dims = FALSE,
                   path_glob_permissive = FALSE,
                   largest_dims_length = FALSE,
@@ -863,9 +864,6 @@ Start <- function(..., # dim = indices/selectors,
   }
   if (!merge_across_dims & merge_across_dims_narm) {
     merge_across_dims_narm <- FALSE
-    .warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ",
-                    "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'",
-                    " to FALSE."))
   }
 
   # Leave alone the dimension parameters in the variable dim_params
@@ -1400,32 +1398,27 @@ Start <- function(..., # dim = indices/selectors,
       }
     }
 
-#/////////////////////////////////////////////////////
+#==================================================
# Check if return_vars name is an inner dim name. If it is a synonim, change it back to
# the inner dim name and return a warning.
     dim_params_found_file_dims <- dim_params[found_file_dims[[i]]]
-      file_dims_array_dim_names <- unlist(lapply(lapply(dim_params_found_file_dims, dim),
-                                                 names))
-      if (any(!names(return_vars) %in% expected_inner_dims[[i]] &
-              !names(return_vars) %in% unlist(var_params) &
-              !names(return_vars) %in% file_dims_array_dim_names)) {
-        not_inner_dimname_ind <- which(!names(return_vars) %in% expected_inner_dims[[i]])
-        for (not_inner_dim in 1:length(not_inner_dimname_ind)) {
-          wrong_name_return_vars <- names(return_vars)[not_inner_dimname_ind[not_inner_dim]]
-          if (!wrong_name_return_vars %in% unlist(synonims)) {
-            stop(paste0("Could not find variable '", wrong_name_return_vars, "' (or its ",
-                        "synonims if specified) in the inner dimension names. The expected ",
-                        "name should be ", paste(paste0("'", expected_inner_dims[[i]], "'"), collapse = ', '), "."))
-          } else {
-            inner_dim_name <- names(unlist(lapply(lapply(synonims, '%in%', wrong_name_return_vars), which)))
-            names(return_vars)[not_inner_dimname_ind[not_inner_dim]] <- inner_dim_name
-            .warning(paste0("The name '", wrong_name_return_vars, "' in parameter 'return_vars' ",
-                            "is synonim. Change it back to the inner dimension name, '",
-                            inner_dim_name, "'."))
-          }
-        }
+    if (any(names(return_vars) %in% unlist(synonims) &
+            !names(return_vars) %in% names(synonims))) {
+      use_syn_names <- which(names(return_vars) %in% unlist(synonims) &
+                             !names(return_vars) %in% names(synonims))
+      for (use_syn_name in use_syn_names) {
+        wrong_name <- names(return_vars)[use_syn_name]
+        names(return_vars)[use_syn_name] <- names(unlist(
+                                              lapply(lapply(synonims, '%in%',
+                                                            names(return_vars)[use_syn_name]),
+                                                     which)))
+        .warning(paste0("The name '", wrong_name,
+                        "' in parameter 'return_vars' is a synonim. ",
+                        "Change it back to the inner dimension name, '",
+                        names(return_vars)[use_syn_name], "'."))
       }
     }
-#/////////////////////////////////////////////////////
+#=====================================================
 
     ## (Check the *_var parameters).
     if (any(!(unlist(var_params) %in% names(return_vars)))) {
@@ -1857,6 +1850,7 @@ Start <- function(..., # dim = indices/selectors,
       } else {
         common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim
       }
+      tmp <- file_dim_as_selector_array_dim
     } else if (inner_dim %in% inner_dims_across_files) {  #(2)
       file_dim_name <- names(which(inner_dim == inner_dims_across_files))
       if (file_dim_name %in% found_pattern_dim) {
@@ -1864,6 +1858,7 @@ Start <- function(..., # dim = indices/selectors,
       } else {
         common_return_vars[[inner_dim]] <- file_dim_name
       }
+      tmp <- file_dim_name
     }
     .warning(paste0("Found ", inner_dim, " dependency on file dimension '", tmp,
                     "', but '", inner_dim, "' is not in return_vars list or is NULL. ",
@@ -2260,6 +2255,17 @@ Start <- function(..., # dim = indices/selectors,
 
   # Find the longest dimensions from all the files
   largest_data_dims <- rep(0, length(data_dims_all_files[[1]]))
+
+  # The inner dim order may differ among files. Need to align them before
+  # finding out the largest dim length.
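+  # (For example, one file may report c(time = 8, lat = 10) while another
+  # reports c(lat = 10, time = 10); matching dimensions by name avoids comparing
+  # the 'time' length against the 'lat' length. These sizes are hypothetical.)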
+  dim_names_first_file <- names(data_dims_all_files[[1]])
+  same_dim_order <- lapply(lapply(data_dims_all_files, names),
+                           identical, dim_names_first_file)
+  for (to_fix in which(!unlist(same_dim_order))) {
+    data_dims_all_files[[to_fix]] <- data_dims_all_files[[to_fix]][match(dim_names_first_file,
+                                       names(data_dims_all_files[[to_fix]]))]
+  }
+
   for (kk in 1:length(data_dims_all_files[[1]])) {
     largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk))
   }
@@ -3630,6 +3636,7 @@ Start <- function(..., # dim = indices/selectors,
   # final_dims will be used for collocation of data, whereas final_dims_fake
   # will be used for shaping the final array to be returned to the user.
   final_dims_fake <- final_dims
+
   if (merge_across_dims) {
     if (!is.null(inner_dims_across_files)) {
       for (file_dim_across in names(inner_dims_across_files)) {
@@ -3647,6 +3654,9 @@ Start <- function(..., # dim = indices/selectors,
       }
     }
   }
+  #=========================================================================
+  # Find the dimension to split if split_multiselected_dims = TRUE.
+  # If there is no dimension able to be split, change split_multiselected_dims to FALSE.
   all_split_dims <- NULL
   if (split_multiselected_dims) {
     for (dim_param in 1:length(dim_params)) {
@@ -3660,7 +3670,15 @@ Start <- function(..., # dim = indices/selectors,
                                           1:length(split_dims))
         }
         old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param])
-
+
+        # If 'merge_across_dims' and 'split_multiselected_dims' are both used on
+        # one file dim, and this file dim is multi-dimensional, it doesn't work.
+        if (identical(old_dim_pos, integer(0))) {
+          stop(paste0("The dimension '", names(dim_params)[dim_param],
+                      "' to be split cannot be found after 'merge_across_dims' ",
+                      "is used. Check if the reshape parameters are used appropriately."))
+        }
+
         # NOTE: Three steps to create new dims.
         # 1st: Put in the dims before split_dim.
         # 2nd: Replace the old_dim with split_dims.
@@ -3677,27 +3695,36 @@ Start <- function(..., # dim = indices/selectors,
       }
     }
   }
+  if (is.null(all_split_dims)) {
+    split_multiselected_dims <- FALSE
+    .warning(paste0("Did not find any dimension able to be split. The parameter ",
+                    "'split_multiselected_dims' is changed to FALSE."))
+  }
 }
 
-  if (merge_across_dims_narm) {
+  #======================================================================
+  if (merge_across_dims) {  # only merge_across_dims -> the 'time' dim length needs to be adjusted
     across_inner_dim <- inner_dims_across_files[[1]]  #TODO: more than one?
-    across_file_dim <- names(inner_dims_across_files)  #TODO: more than one?
     # Get the length of each inner_dim ('time') along each file_dim ('file_date')
     length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length)
-
-    if (!split_multiselected_dims) {
-      final_dims_fake_name <- names(final_dims_fake)
-      pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim)
-      new_length_inner_dim <- sum(unlist(length_inner_across_dim))
-      if (pos_across_inner_dim != length(final_dims_fake)) {
-        final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
-                             new_length_inner_dim,
-                             final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)])
-      } else {
-        final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
-                             new_length_inner_dim)
+
+    if (merge_across_dims_narm) {
+      across_file_dim <- names(inner_dims_across_files)  #TODO: more than one?
+
+      if (!split_multiselected_dims) {
+        final_dims_fake_name <- names(final_dims_fake)
+        pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim)
+        new_length_inner_dim <- sum(unlist(length_inner_across_dim))
+        if (pos_across_inner_dim != length(final_dims_fake)) {
+          final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
+                               new_length_inner_dim,
+                               final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)])
+        } else {
+          final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
+                               new_length_inner_dim)
+        }
+        names(final_dims_fake) <- final_dims_fake_name
       }
-      names(final_dims_fake) <- final_dims_fake_name
     }
   }
 
@@ -3730,6 +3757,7 @@ Start <- function(..., # dim = indices/selectors,
       if (split_multiselected_dims & merge_across_dims) {
         # TODO: More than one split?
         inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files)
+
         # If inner_dim is not the first among split_dims, reorder split_dims to put it first
         if (inner_dim_pos_in_split_dims != 1) {
           split_dims <- c(split_dims[inner_dim_pos_in_split_dims],
@@ -4021,43 +4049,51 @@ Start <- function(..., # dim = indices/selectors,
   # unequal inner_dim ('time') length across file_dim ('file_date').
   # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs.
   # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case.
-  if (merge_across_dims_narm) {
-
-    # Get the length of these two dimensions in final_dims
-    length_inner_across_store_dims <- final_dims[across_inner_dim]
-    length_file_across_store_dims <- final_dims[across_file_dim]
-
-    # Create a logical array for merge_across_dims
-    logi_array <- array(rep(FALSE,
-                            length_file_across_store_dims * length_inner_across_store_dims),
-                        dim = c(length_inner_across_store_dims, length_file_across_store_dims))
-    for (i in 1:length_file_across_store_dims) {  #1:4
-      logi_array[1:length_inner_across_dim[[i]], i] <- TRUE
+  if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) {
+    if (!merge_across_dims_narm) {
+      data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims)
+    } else {
+      # Get the length of these two dimensions in final_dims
+      length_inner_across_store_dims <- final_dims[across_inner_dim]
+      length_file_across_store_dims <- final_dims[across_file_dim]
+
+      # Create a logical array for merge_across_dims
+      logi_array <- array(rep(FALSE,
+                              length_file_across_store_dims * length_inner_across_store_dims),
+                          dim = c(length_inner_across_store_dims, length_file_across_store_dims))
+      for (i in 1:length_file_across_store_dims) {  #1:4
+        logi_array[1:length_inner_across_dim[[i]], i] <- TRUE
+      }
+
+      # First, get the data array with final_dims dimension
+      data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims)
+
+      # Change the NA derived from additional spaces to -9999, then remove these -9999
+      func_remove_blank <- function(data_array, logi_array) {
+        # dim(data_array) = [time, file_date]
+        # dim(logi_array) = [time, file_date]
+        # Change the blank spaces from NA to -9999
+        data_array[which(!logi_array)] <- -9999
+        return(data_array)
+      }
+      data_array_final_dims <- multiApply::Apply(data_array_final_dims,
+                                                 target_dims = c(across_inner_dim, across_file_dim),  #c('time', 'file_date')
+                                                 output_dims = c(across_inner_dim, across_file_dim),
+                                                 fun = func_remove_blank,
+                                                 logi_array = logi_array)$output1
+      ## reorder back to the correct dim
+      tmp <- match(names(final_dims), names(dim(data_array_final_dims)))
+      data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
+      data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999]  # becomes a vector
     }
-
-    # First, get the data array with final_dims dimension
-    data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims)
-
-    # Change the NA derived from additional spaces to -9999, then remove these -9999
-    func_remove_blank <- function(data_array, logi_array) {
-      # dim(data_array) = [time, file_date]
-      # dim(logi_array) = [time, file_date]
-      # Change the blank spaces from NA to -9999
-      data_array[which(!logi_array)] <- -9999
-      return(data_array)
+
+    if (length(data_array_tmp) != prod(final_dims_fake)) {
+      stop(paste0("After reshaping, the data do not fit into the expected output dimension. ",
+                  "Check if the reshaping parameters are used correctly."))
     }
-    data_array_final_dims <- multiApply::Apply(data_array_final_dims,
-                                               target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date')
-                                               output_dims = c(across_inner_dim, across_file_dim),
-                                               fun = func_remove_blank,
-                                               logi_array = logi_array)$output1
-    ## reorder back to the correct dim
-    tmp <- match(names(final_dims), names(dim(data_array_final_dims)))
-    data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
-    data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector
-
-    #NOTE: When one file contains values for dicrete dimensions, rearrange the
-    # chunks (i.e., work_piece) is necessary.
+
+    #NOTE: When one file contains values for discrete dimensions, rearranging the
+    # chunks (i.e., work_piece) is necessary.
 
     if (split_multiselected_dims) {
       # generate the correct order list from indices_chunk
@@ -4065,7 +4101,7 @@ Start <- function(..., # dim = indices/selectors,
       i <- 1
       j <- 1
       a <- indices_chunk[i]
-      while (i < length(indices_chunk)) {
+      while (i <= length(indices_chunk)) {
        while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) {
          a <- c(a, indices_chunk[i+1])
          i <- i + 1
@@ -4079,57 +4115,55 @@ Start <- function(..., # dim = indices/selectors,
       final_order_list <- lapply(final_order_list, length)
 
       if (!all(diff(as.numeric(names(final_order_list))) > 0)) {
-
-
-        # shape the vector into the array without split_dims
-        split_dims_pos <- match(split_dims, final_dims_fake)
-        new_dims <- c()
-        if (split_dims_pos[1] > 1) {
-          new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
-        }
-        new_dims <- c(new_dims, prod(split_dims))
-        names(new_dims)[split_dims_pos[1]] <- across_inner_dim
-        if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
-          new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
-        }
-        final_dims_fake_no_split <- new_dims
-        data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split)
-        # seperate 'time' dim into each work_piece length
-        data_array_seperate <- list()
-        tmp <- cumsum(unlist(length_inner_across_dim))
-        tmp <- c(0, tmp)
-        for (i in 1:length(length_inner_across_dim)) {
-          data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
-                                             (tmp[i] + 1):tmp[i + 1])
-        }
-
-        # re-build the array: chunk
-        which_chunk <- as.numeric(names(final_order_list))
-        how_many_indices <- unlist(final_order_list)
-        array_piece <- list()
-        ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
-        for (i in 1:length(final_order_list)) {
-          array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]],
-                                     across_inner_dim,
-                                     ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
-          ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
-        }
+          # shape the vector into the array without split_dims
+          split_dims_pos <- match(split_dims, final_dims_fake)
+          new_dims <- c()
+          if (split_dims_pos[1] > 1) {
+            new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
+          }
+          new_dims <- c(new_dims, prod(split_dims))
+          names(new_dims)[split_dims_pos[1]] <- across_inner_dim
+          if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
+            new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
+          }
+          final_dims_fake_no_split <- new_dims
+          data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split)
+          # separate 'time' dim into each work_piece length
+          data_array_seperate <- list()
+          tmp <- cumsum(unlist(length_inner_across_dim))
+          tmp <- c(0, tmp)
+          for (i in 1:length(length_inner_across_dim)) {
+            data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
+                                               (tmp[i] + 1):tmp[i + 1])
+          }
+
+          # re-build the array: chunk
+          which_chunk <- as.numeric(names(final_order_list))
+          how_many_indices <- unlist(final_order_list)
+          array_piece <- list()
+          ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
+          for (i in 1:length(final_order_list)) {
+            array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]],
+                                       across_inner_dim,
+                                       ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
+            ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
+          }
 
-        # re-build the array: paste
-        data_array_tmp <- array_piece[[1]]
-        along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
-        if (length(array_piece) > 1) {
-          for (i in 2:length(array_piece)) {
-            data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
-                                           along = along_pos)
+          # re-build the array: paste
+          data_array_tmp <- array_piece[[1]]
+          along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
+          if (length(array_piece) > 1) {
+            for (i in 2:length(array_piece)) {
+              data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
+                                             along = along_pos)
+            }
           }
-        }
-      }
+        }
       }
     }
 
     data_array <- array(data_array_tmp, dim = final_dims_fake)
 
-  } else {  # merge_across_dims_narm = F (old version)
+  } else {  # !(merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) (old version)
     data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
   }
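Since 'merge_across_dims_narm' now defaults to TRUE, a merging call no longer needs to set it explicitly. A minimal sketch of such a call, with a hypothetical path and selectors:

```r
# 'time' is merged across 'file_date'; the NAs that used to pad the shorter
# months are now removed by default (merge_across_dims_narm = TRUE).
dat <- Start(dat = '/path/to/obs/$var$_$file_date$.nc',  # hypothetical path
             var = 'tas',
             file_date = c('201311', '201312'),  # months of unequal length
             time = 'all',
             time_across = 'file_date',
             merge_across_dims = TRUE,
             return_vars = list(time = 'file_date'),
             retrieve = TRUE)
```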
diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index 24e7acb0dd0fb0cc2b0447819a040117c027b448..83676f37b3c0cc0c2f810bc2614ee2c0af0f2a3e 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -475,11 +475,11 @@ The code to reproduce this behaviour could be found in the Use Cases section, [e
 ### 11. Select the longitude/latitude region
 
 There are three ways to specify the dimension selectors: special keywords ('all', 'first', 'last'), indices, or values (find more details in [practical guide](inst/doc/practical_guide.md)).
-The parameter 'xxx_reorder' is only effective when using **values**.
+For now, the parameter 'xxx_reorder' is only effective when using **values**.
 There are two reorder functions in the startR package, **Sort()** for latitude and **CircularSort()** for longitude.
-Sort() is a wrapper function of base function sort(), rearranging the values from low to high (decreasing = TRUE, default) or
-from high to low (decreasing = FALSE). For example, if you want to sort latitude from 90 to -90, use `latitude_reorder = Sort(decreasing = TRUE)`.
+Sort() is a wrapper function of the base function sort(), rearranging the values from low to high (decreasing = FALSE, default) or
+from high to low (decreasing = TRUE). For example, if you want to sort latitude from 90 to -90, use `latitude_reorder = Sort(decreasing = TRUE)`.
 By this means, the result will always go from big to small values no matter what the original order is.
 
 On the other hand, the concept of CircularSort() is different. It is used for a circular region, putting the out-of-region values back into the region.
@@ -487,9 +487,16 @@ It requires two input numbers defining the borders of the whole region, which are generally 0 & 360 or -180 & 180.
 `longitude_reorder = CircularSort(0, 360)` means that the left border is 0 and the right border is 360, so 360 will be put back to 0, 361 will be put back to 1, and -1 will become 359.
 After circulating values, CircularSort() also sorts the values from small to big. It may cause a discontinuous sub-region, but the problem can be solved by assigning the borders correctly.
 
+Note that the two border values in CircularSort() are regarded as the same point. Hence, if you want to load the global longitude, lonmin/lonmax should be slightly different, e.g., 0/359.9, 0.1/360, -179.9/180, -180/179.9, etc. Otherwise, only one point will be returned.
 The following chart helps you to decide how to use CircularSort() to get the desired region.
-The first row represents the longitude border of the requested region, e.g., `values(list(lon.min, lon.max))`.
+The first row represents the longitude border of the requested region, e.g., `values(list(lon.min, lon.max))`,
+and the white part is the returned longitude range corresponding to each CircularSort() setting.
+In summary:
+- The original longitude range does not matter. Whether the original longitude is [0, 360] or [-180, 180], Start() will return the values shown in the chart according to the lonmin/lonmax you set.
+- The lonmin/lonmax values should be consistent with CircularSort(), so the returned values are continuous. For example, if `lonmin/lonmax = -60/60`, `CircularSort(-180, 180)` should be used.
+- Define the longitude range as the one you want to get, regardless of the original file. For example, if you want the data to be [-180, 180], define `lonmin/lonmax = -179.9/180` and `CircularSort(-180, 180)`, even if the original longitude in the netCDF file is [0, 360].
+
 Note that this chart only illustrates the idea. The real numbers may slightly differ depending on the original/transform values.
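Putting the summary above into practice, here is a minimal sketch of requesting the region [-60, 60] with a consistent reorder setting (the path, variable, and other selectors are hypothetical):

```r
data <- Start(dat = '/path/to/dataset/$var$_$sdate$.nc',  # hypothetical path
              var = 'tas',
              sdate = '19900101',
              time = 'all',
              latitude = values(list(-30, 30)),
              latitude_reorder = Sort(),
              longitude = values(list(-60, 60)),
              # consistent with lonmin/lonmax = -60/60, so the returned
              # values are continuous
              longitude_reorder = CircularSort(-180, 180),
              return_vars = list(latitude = NULL, longitude = NULL,
                                 time = 'sdate'),
              retrieve = FALSE)
```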
@@ -611,7 +618,30 @@ List of 1
 
 The selectors can be not only vectors, but also multidimensional arrays. For instance, the 'time' dimension can be assigned by a two-dimensional array `[sdate = 12, time = 31]`, which is 31 timesteps for 12 start dates.
 You may want to have both 'sdate' and 'time' in the output dimension, even though 'sdate' is not explicitly specified in Start().
-The parameter 'split_multiselected_dims' is for this goal. It is common in the case that uses experimental data attributes to get the corresponding observational data.
+The parameter 'split_multiselected_dims' is for this goal. It can be used to reshape the
+file dimensions, and it is also common in cases where experimental data attributes are
+used to define observational data inner dimensions, so the corresponding observational data can be retrieved in the same dimension structure.
+
+Here is a simple example. By defining the selector of the file dimension 'file_date' as a
+two-dimensional array, we can reshape this dimension into 'month' and 'year'.
+
+```r
+obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc"
+file_date <- c("201311","201312","201411","201412")
+dim(file_date) <- c(month = 2, year = 2)
+
+obs <- Start(dat = obs.path,
+             var = 'prlr',
+             file_date = file_date,
+             time = 'all',
+             lat = indices(1:10),
+             lon = indices(1:10),
+             return_vars = list(lat = NULL,
+                                lon = NULL,
+                                time = 'file_date'),
+             split_multiselected_dims = TRUE,
+             retrieve = TRUE)
+```
 
The following script is part of the use case [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R).
The time selector for observational data comes from the experimental data above (omitted here). The selector array has two dimensions.
diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index ac4e4adde9ec1a7188a785dd3ced3e2e13f28794..06f2c044eb64673cedaf1141a00c0cc69bc2b5b5 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -25,7 +25,7 @@ in a comparable structure. It also shows how to use parameters `xxx_tolerance`,
   5. [Use reorder functions to get desired lat/lon region](inst/doc/usecase/ex1_5_latlon_reorder.R)
      This script shows you how to use reorder function (`Sort()`, `CircularSort()`) to
-get the desired longitude and latitude region. See [FAQ How-to-#11] (/inst/doc/faq.md#11-read-latitude-and-longitude-with-the-usage-of-parameter-xxx_reorder)
+get the desired longitude and latitude region. See [FAQ How-to-#11](inst/doc/faq.md#11-select-the-longitudelatitude-region)
 for more explanation.
 
   6. [Loading gridpoint data](inst/doc/usecase/ex1_6_gridpoint_data.R)
diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R
index d514c3061745a7aa926c31e094727806b3219238..e2c821117a82612e842d29d3a90cdf3869f5c2cd 100644
--- a/inst/doc/usecase/ex1_3_attr_loadin.R
+++ b/inst/doc/usecase/ex1_3_attr_loadin.R
@@ -77,7 +77,7 @@
 # NOTE: 'merge_across_dims_narm = TRUE' is necessary because the observational
 #       data have unequal time length of 30-day and 31-day months.
 #       If the NAs are not removed, unwanted NAs will exist and make the
-#       values misplaced in the array. See 'bonus' below for more explanation.
+#       values misplaced in the array.
 
 #------- Check erai -----------
 dim(erai)
@@ -129,45 +129,3 @@
 attr(erai, 'Variables')$common$time[2, ]
 
-# //////////////////"BONUS"//////////////////////
-# Here is something more to show the usage of parameter 'merge_across_dims_narm'.
-# If the last day of 30-day months is NA instead of the first day of the following month,
-# NAs are needed to exist in the array. In this case, 'merge_across_dims_narm'
-# should be FALSE.
- - dates <- attr(system4, 'Variables')$common$time - dates[2, 31] -#[1] "1994-07-01 UTC" - dates[2, 31] <- NA # Jun - dates[5, 31] <- NA # Sep - dates[7, 31] <- NA # Nov - - erai <- Start(dat = repos_obs, - var = 'tas', - file_date = dates_file, - time = values(dates), - latitude = indices(1:10), - longitude = indices(1:10), - time_var = 'time', - time_across = 'file_date', - merge_across_dims = TRUE, - #keep NAs of the last day in 30-day months - merge_across_dims_narm = FALSE, - split_multiselected_dims = TRUE, - return_vars = list(latitude = NULL, - longitude = NULL, - time = 'file_date'), - retrieve = TRUE) - -#------- Check erai ----------- -erai[1, 1, 2, , 1, 1] # June -# [1] 269.9410 269.6855 268.7380 268.5008 270.3236 271.5151 270.5046 270.1686 -# [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856 -#[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104 -#[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409 NA -erai[1, 1, 5, , 1, 1] # Sep -# [1] 270.0656 270.7113 268.4678 271.6489 271.2354 269.7831 269.8045 268.7994 -# [9] 266.3092 262.2734 265.0124 261.8378 265.3950 257.1690 255.8402 264.8826 -#[17] 267.8663 266.6875 262.5502 258.5476 258.9617 263.6396 257.1111 264.8644 -#[25] 261.0085 256.7690 256.5811 256.4331 256.1260 256.4716 NA -#------------------------------ diff --git a/man/AddStep.Rd b/man/AddStep.Rd index 3eece051f27640e918c98484ea0d66964a7b2304..0d0ce4697cf8bf6677f543d4daf52ada265a86ba 100644 --- a/man/AddStep.Rd +++ b/man/AddStep.Rd @@ -54,4 +54,3 @@ create the complete workflow. It is the final step before data processing. wf <- AddStep(data, step, pi_val = pi_short) } - diff --git a/man/CDORemapper.Rd b/man/CDORemapper.Rd index 4f56baaa4dda00233912c1c2c1cccb5b5945b3aa..763be7708c73e0be54b1edcc7e7d8b9d1aeebda4 100644 --- a/man/CDORemapper.Rd +++ b/man/CDORemapper.Rd @@ -65,4 +65,3 @@ perform the interpolation, hence CDO is required to be installed. \seealso{ \code{\link[s2dverification]{CDORemap}} } - diff --git a/man/Collect.Rd b/man/Collect.Rd index 44a7dee1d3c613d0a0c40663c8b87870b33eed65..97b529b07b001dd96d68c7de8db83802e8503ce1 100644 --- a/man/Collect.Rd +++ b/man/Collect.Rd @@ -83,4 +83,3 @@ of results as one data array when the execution is done. See more details on } } - diff --git a/man/Compute.Rd b/man/Compute.Rd index e07106a84abe08b157ca74f94966c589f23c6784..7d6db4da4d82cf462cb8cb83d1dddfa5ab516a9f 100644 --- a/man/Compute.Rd +++ b/man/Compute.Rd @@ -4,9 +4,18 @@ \alias{Compute} \title{Specify the execution parameters and trigger the execution} \usage{ -Compute(workflow, chunks = "auto", threads_load = 1, threads_compute = 1, - cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, - silent = FALSE, debug = FALSE, wait = TRUE) +Compute( + workflow, + chunks = "auto", + threads_load = 1, + threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, + debug = FALSE, + wait = TRUE +) } \arguments{ \item{workflow}{A list of the class 'startR_workflow' returned by function @@ -104,4 +113,3 @@ arrays and additional metadata. 
res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) } - diff --git a/man/NcCloser.Rd b/man/NcCloser.Rd index 65beab8542da4d3770fe362902ac5efe92b69722..588f63a9e5fe2e17baac471c6fa7edd7dafdf2cf 100644 --- a/man/NcCloser.Rd +++ b/man/NcCloser.Rd @@ -32,4 +32,3 @@ NcCloser(connection) \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcDimReader}} \code{\link{NcVarReader}} } - diff --git a/man/NcDataReader.Rd b/man/NcDataReader.Rd index a6d32c7b2235fa615a621f0c9f9cd4af13364349..90147896788a84a84c7913997888bdc59b0c6fa8 100644 --- a/man/NcDataReader.Rd +++ b/man/NcDataReader.Rd @@ -4,8 +4,13 @@ \alias{NcDataReader} \title{NetCDF file data reader for 'startR'} \usage{ -NcDataReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDataReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -61,4 +66,3 @@ in turn uses nc_var_get() in the package 'ncdf4'. \code{\link{NcOpener}} \code{\link{NcDimReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcDimReader.Rd b/man/NcDimReader.Rd index d539ffdc9eac1869c0e9e60e623ae9688b204e35..38dd870d97062db858c068173362164884a0aec1 100644 --- a/man/NcDimReader.Rd +++ b/man/NcDimReader.Rd @@ -4,8 +4,13 @@ \alias{NcDimReader} \title{NetCDF dimension reader for 'startR'} \usage{ -NcDimReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - inner_indices = NULL, synonims) +NcDimReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + inner_indices = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ This function uses the function NcReadDims() in the package 'easyNCDF'. \code{\link{NcOpener}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcOpener.Rd b/man/NcOpener.Rd index e46384ca2a1b4b7ce281c4b74fc2a29e5ebf52f4..30885fcd3719e31b5e31201988d73c136388a0b5 100644 --- a/man/NcOpener.Rd +++ b/man/NcOpener.Rd @@ -34,4 +34,3 @@ NcCloser(connection) \code{\link{NcDimReader}} \code{\link{NcDataReader}} \code{\link{NcCloser}} \code{\link{NcVarReader}} } - diff --git a/man/NcVarReader.Rd b/man/NcVarReader.Rd index c601907304c7c17639742da81750015804c2ebbd..fb093aefbe19b6c0b327deca83f4e571d36d09f2 100644 --- a/man/NcVarReader.Rd +++ b/man/NcVarReader.Rd @@ -4,8 +4,13 @@ \alias{NcVarReader} \title{NetCDF variable reader for 'startR'} \usage{ -NcVarReader(file_path = NULL, file_object = NULL, file_selectors = NULL, - var_name = NULL, synonims) +NcVarReader( + file_path = NULL, + file_object = NULL, + file_selectors = NULL, + var_name = NULL, + synonims +) } \arguments{ \item{file_path}{A character string indicating the path to the data file to @@ -58,4 +63,3 @@ nc_var_get() in the package 'ncdf4'. 
\code{\link{NcOpener}} \code{\link{NcDataReader}}
\code{\link{NcCloser}} \code{\link{NcDimReader}}
}
-
diff --git a/man/SelectorChecker.Rd b/man/SelectorChecker.Rd
index ef83575bab49fd5bca62d7a86d7df9f5840dae3c..e1cf112630b9722fbdb2d3e9ce77df0c8b52788d 100644
--- a/man/SelectorChecker.Rd
+++ b/man/SelectorChecker.Rd
@@ -4,8 +4,7 @@
 \alias{SelectorChecker}
 \title{Translate a set of selectors into a set of numeric indices}
 \usage{
-SelectorChecker(selectors, var = NULL, return_indices = TRUE,
-  tolerance = NULL)
+SelectorChecker(selectors, var = NULL, return_indices = TRUE, tolerance = NULL)
 }
 \arguments{
 \item{selectors}{A vector or a list of two of numeric indices or variable
@@ -50,4 +49,3 @@ sub_array_of_values <- seq(90, -90, length.out = 258)[2:257]
 SelectorChecker(sub_array_of_selectors, sub_array_of_values)
 
 }
-
diff --git a/man/Sort.Rd b/man/Sort.Rd
index 9ab516ed187f4bdb853eba58effd4049346a0d95..25a92fe62871bb6408648b675320c9f169f95330 100644
--- a/man/Sort.Rd
+++ b/man/Sort.Rd
@@ -1,8 +1,8 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/Sort.R
 \name{Sort}
-\alias{CircularSort}
 \alias{Sort}
+\alias{CircularSort}
 \title{Sort the coordinate variable values in a Start() call}
 \usage{
 Sort(...)
@@ -10,12 +10,12 @@ Sort(...)
 CircularSort(start, end, ...)
 }
 \arguments{
+\item{\dots}{Additional parameters to adjust the reordering. See function
+sort() for more details.}
+
 \item{start}{A numeric indicating the lower bound of the circular range.}
 
 \item{end}{A numeric indicating the upper bound of the circular range.}
-
-\item{\dots}{Additional parameters to adjust the reorderig. See function
-sort() for more details.}
 }
 \value{
 A list of 2 containing:
@@ -57,4 +57,3 @@ range. This is useful for circular coordinates such as the Earth longitudes.
retrieve = FALSE) } - diff --git a/man/Start.Rd b/man/Start.Rd index 680168e44903df0b1ae68c2294096bd0c5ca99c4..efd258f3079af64f00b94262fcbd10da4f4cb0b9 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -4,408 +4,36 @@ \alias{Start} \title{Declare, discover, subset and retrieve multidimensional distributed data sets} \usage{ -Start(..., return_vars = NULL, synonims = NULL, file_opener = NcOpener, - file_var_reader = NcVarReader, file_dim_reader = NcDimReader, - file_data_reader = NcDataReader, file_closer = NcCloser, - transform = NULL, transform_params = NULL, transform_vars = NULL, - transform_extra_cells = 2, apply_indices_after_transform = FALSE, - pattern_dims = NULL, metadata_dims = NULL, - selector_checker = SelectorChecker, merge_across_dims = FALSE, - merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, - path_glob_permissive = FALSE, largest_dims_length = FALSE, - retrieve = FALSE, num_procs = 1, ObjectBigmemory = NULL, - silent = FALSE, debug = FALSE) +Start( + ..., + return_vars = NULL, + synonims = NULL, + file_opener = NcOpener, + file_var_reader = NcVarReader, + file_dim_reader = NcDimReader, + file_data_reader = NcDataReader, + file_closer = NcCloser, + transform = NULL, + transform_params = NULL, + transform_vars = NULL, + transform_extra_cells = 2, + apply_indices_after_transform = FALSE, + pattern_dims = NULL, + metadata_dims = NULL, + selector_checker = SelectorChecker, + merge_across_dims = FALSE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = FALSE, + path_glob_permissive = FALSE, + largest_dims_length = FALSE, + retrieve = FALSE, + num_procs = 1, + ObjectBigmemory = NULL, + silent = FALSE, + debug = FALSE +) } \arguments{ -\item{return_vars}{A named list where the names are the names of the -variables to be fetched in the files, and the values are vectors of -character strings with the names of the file dimension which to retrieve each -variable for, or NULL if the variable has to be retrieved only once -from any (the first) of the involved files.\cr\cr -Apart from retrieving a multidimensional data array, retrieving auxiliary -variables inside the files can also be needed. The parameter -'return_vars' allows for requesting such variables, as long as a -'file_var_reader' function is also specified in the call to -Start() (see documentation on the corresponding parameter). -\cr\cr -In the case of the the item sales example (see documentation on parameter -\code{\dots)}, the store location variable is requested with the parameter\cr -\code{return_vars = list(store_location = NULL)}.\cr This will cause -Start() to fetch once the variable 'store_location' and return it in -the component\cr \code{$Variables$common$store_location},\cr and will be an -array of character strings with the location names, with the dimensions -\code{c('store' = 100)}. Although useless in this example, we could ask -Start() to fetch and return such variable for each file along the -items dimension as follows: \cr -\code{return_vars = list(store_location = c('item'))}.\cr In that case, the -variable will be fetched once from a file of each of the items, and will be -returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}. 
-\cr\cr -If a variable is requested along a file dimension that contains path pattern -specifications ('source' in the example), the fetched variable values will be -returned in the component\cr \code{$Variables$$}.\cr -For example: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item'))) -\cr # # Checking the structure of the returned variables -\cr # str(found_data$Variables) -\cr # Named list -\cr # ..$common: NULL -\cr # ..$sourceA: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # ..$sourceB: Named list -\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ... -\cr # # Checking the dimensions of the returned variable -\cr # # for the source A -\cr # dim(found_data$Variables$sourceA) -\cr # item store -\cr # 3 3 -} -\cr\cr -The names of the requested variables do not necessarily have to match the -actual variable names inside the files. A list of alternative names to be -seeked can be specified via the parameter 'synonims'.} - -\item{synonims}{A named list where the names are the requested variable or -dimension names, and the values are vectors of character strings with -alternative names to seek for such dimension or variable.\cr\cr -In some requests, data from different sources may follow different naming -conventions for the dimensions or variables, or even files in the same source -could have varying names. This parameter is in order for Start() to -properly identify the dimensions or variables with different names. -\cr\cr -In the example used in parameter 'return_vars', it may be the case that -the two involved data sources follow slightly different naming conventions. -For example, source A uses 'sect' as name for the sections dimension, whereas -source B uses 'section'; source A uses 'store_loc' as variable name for the -store locations, whereas source B uses 'store_location'. This can be taken -into account as follows: -\cr -\command{ -\cr # data <- Start(source = list( -\cr # list(name = 'sourceA', -\cr # path = paste0('/sourceA/$variable$/', -\cr # '$section$/$item$.data')), -\cr # list(name = 'sourceB', -\cr # path = paste0('/sourceB/$section$/', -\cr # '$variable$/$item$.data')) -\cr # ), -\cr # variable = 'sales', -\cr # section = 'first', -\cr # item = indices(c(1, 3)), -\cr # item_depends = 'section', -\cr # store = 'Barcelona', -\cr # store_var = 'store_location', -\cr # month = 'all', -\cr # return_vars = list(store_location = c('source', -\cr # 'item')), -\cr # synonims = list( -\cr # section = c('sec', 'section'), -\cr # store_location = c('store_loc', -\cr # 'store_location') -\cr # )) -} -\cr} - -\item{file_opener}{A function that receives as a single parameter - 'file_path' a character string with the path to a file to be opened, - and returns an object with an open connection to the file (optionally with - header information) on success, or returns NULL on failure. -\cr\cr -This parameter takes by default NcOpener() (an opener function for NetCDF -files). 
-\cr\cr -See NcOpener() for a template to build a file opener for your own file -format.} - -\item{file_var_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name}, - \code{synonims} that returns an array with auxiliary data (i.e. data from a - variable) inside a file. Start() will provide automatically either a - 'file_path' or a 'file_object' to the 'file_var_reader' - function (the function has to be ready to work whichever of these two is - provided). The parameter 'file_selectors' will also be provided - automatically to the variable reader, containing a named list where the - names are the names of the file dimensions of the queried data set (see - documentation on \code{\dots}) and the values are single character strings - with the components used to build the path to the file being read (the one - provided in 'file_path' or 'file_object'). The parameter 'var_name' - will be filled in automatically by Start() also, with the name of one - of the variales to be read. The parameter 'synonims' will be filled in - with exactly the same value as provided in the parameter 'synonims' in - the call to Start(), and has to be used in the code of the variable - reader to check for alternative variable names inside the target file. The - 'file_var_reader' must return a (multi)dimensional array with named - dimensions, and optionally with the attribute 'variales' with other - additional metadata on the retrieved variable. -\cr\cr -Usually, the 'file_var_reader' should be a degenerate case of the -'file_data_reader' (see documentation on the corresponding parameter), -so it is recommended to code the 'file_data_reder' in first place. -\cr\cr -This parameter takes by default NcVarReader() (a variable reader function -for NetCDF files). -\cr\cr -See NcVarReader() for a template to build a variale reader for your own -file format.} - -\item{file_dim_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims} - that returns a named numeric vector where the names are the names of the - dimensions of the multidimensional data array in the file and the values are - the sizes of such dimensions. Start() will provide automatically - either a 'file_path' or a 'file_object' to the - 'file_dim_reader' function (the function has to be ready to work - whichever of these two is provided). The parameter 'file_selectors' - will also be provided automatically to the dimension reader, containing a - named list where the names are the names of the file dimensions of the - queried data set (see documentation on \code{\dots}) and the values are - single character strings with the components used to build the path to the - file being read (the one provided in 'file_path' or 'file_object'). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and can optionally be used in advanced configurations. -\cr\cr -This parameter takes by default NcDimReader() (a dimension reader -function for NetCDF files). 
-\cr\cr -See NcDimReader() for (an advanced) template to build a dimension reader -for your own file format.} - -\item{file_data_reader}{A function with the header \code{file_path = NULL}, - \code{file_object = NULL}, \code{file_selectors = NULL}, - \code{inner_indices = NULL}, \code{synonims} that returns a subset of the - multidimensional data array inside a file (even if internally it is not an - array). Start() will provide automatically either a 'file_path' - or a 'file_object' to the 'file_data_reader' function (the - function has to be ready to work whichever of these two is provided). The - parameter 'file_selectors' will also be provided automatically to the - data reader, containing a named list where the names are the names of the - file dimensions of the queried data set (see documentation on \code{\dots}) - and the values are single character strings with the components used to - build the path to the file being read (the one provided in 'file_path' or - 'file_object'). The parameter 'inner_indices' will be filled in - automatically by Start() also, with a named list of numeric vectors, - where the names are the names of all the expected inner dimensions in a file - to be read, and the numeric vectors are the indices to be taken from the - corresponding dimension (the indices may not be consecutive nor in order). - The parameter 'synonims' will be filled in with exactly the same value - as provided in the parameter 'synonims' in the call to Start(), - and has to be used in the code of the data reader to check for alternative - dimension names inside the target file. The 'file_data_reader' must - return a (multi)dimensional array with named dimensions, and optionally with - the attribute 'variables' with other additional metadata on the retrieved - data. -\cr\cr -Usually, 'file_data_reader' should use 'file_dim_reader' -(see documentation on the corresponding parameter), so it is recommended to -code 'file_dim_reder' in first place. -\cr\cr -This parameter takes by default NcDataReader() (a data reader function -for NetCDF files). -\cr\cr -See NcDataReader() for a template to build a data reader for your own -file format.} - -\item{file_closer}{A function that receives as a single parameter - 'file_object' an open connection (as returned by 'file_opener') - to one of the files to be read, optionally with header information, and - closes the open connection. Always returns NULL. -\cr\cr -This parameter takes by default NcCloser() (a closer function for NetCDF -files). -\cr\cr -See NcCloser() for a template to build a file closer for your own file -format.} - -\item{transform}{A function with the header \code{dara_array}, -\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as -input, through the parameter \code{data_array}, a subset of a -multidimensional array (as returned by 'file_data_reader'), applies a -transformation to it and returns it, preserving the amount of dimensions but -potentially modifying their size. This transformation may require data from -other auxiliary variables, automatically provided to 'transform' -through the parameter 'variables', in the form of a named list where -the names are the variable names and the values are (multi)dimensional -arrays. Which variables need to be sent to 'transform' can be specified -with the parameter 'transform_vars' in Start(). 
The parameter -'file_selectors' will also be provided automatically to -'transform', containing a named list where the names are the names of -the file dimensions of the queried data set (see documentation on -\code{\dots}) and the values are single character strings with the -components used to build the path to the file the subset being processed -belongs to. The parameter \code{\dots} will be filled in with other -additional parameters to adjust the transformation, exactly as provided in -the call to Start() via the parameter 'transform_params'.} - -\item{transform_params}{A named list with additional parameters to be sent to -the 'transform' function (if specified). See documentation on parameter -'transform' for details.} - -\item{transform_vars}{A vector of character strings with the names of -auxiliary variables to be sent to the 'transform' function (if -specified). All the variables to be sent to 'transform' must also -have been requested as return variables in the parameter 'return_vars' -of Start().} - -\item{transform_extra_cells}{An integer of extra indices to retrieve from the -data set, beyond the requested indices in \code{\dots}, in order for -'transform' to dispose of additional information to properly apply -whichever transformation (if needed). As many as -'transform_extra_cells' will be retrieved beyond each of the limits for -each of those inner dimensions associated to a coordinate variable and sent -to 'transform' (i.e. present in 'transform_vars'). After -'transform' has finished, Start() will take again and return a -subset of the result, for the returned data to fall within the specified -bounds in \code{\dots}. The default value is 2.} - -\item{apply_indices_after_transform}{A logical value indicating when a -'transform' is specified in Start() and numeric indices are -provided for any of the inner dimensions that depend on coordinate variables, -these numeric indices can be made effective (retrieved) before applying the -transformation or after. The boolean flag allows to adjust this behaviour. -It takes FALSE by default (numeric indices are applied before sending -data to 'transform').} - -\item{pattern_dims}{A character string indicating the name of the dimension -with path pattern specifications (see \code{\dots} for details). If not -specified, Start() assumes the first provided dimension is the pattern -dimension, with a warning.} - -\item{metadata_dims}{A vector of character strings with the names of the file -dimensions which to return metadata for. As noted in 'file_data_reader', -the data reader can optionally return auxiliary data via the attribute -'variables' of the returned array. Start() by default returns the -auxiliary data read for only the first file of each source (or data set) in -the pattern dimension (see \code{\dots} for info on what the pattern -dimension is). However it can be configured to return the metadata for all -the files along any set of file dimensions. The default value is NULL, and -it will be assigned automatically as parameter 'pattern_dims'.} - -\item{selector_checker}{A function used internaly by Start() to -translate a set of selectors (values for a dimension associated to a -coordinate variable) into a set of numeric indices. It takes by default -SelectorChecker() and, in principle, it should not be required to -change it for customized file formats. The option to replace it is left open -for more versatility. 
See the code of SelectorChecker() for details on -the inputs, functioning and outputs of a selector checker.} - -\item{merge_across_dims}{A logical value indicating whether to merge -dimensions across which another dimension extends (according to the -'_across' parameters). Takes the value FALSE by default. For -example, if the dimension 'time' extends across the dimension 'chunk' and -\code{merge_across_dims = TRUE}, the resulting data array will only contain -only the dimension 'time' as long as all the chunks together.} - -\item{merge_across_dims_narm}{A logical value indicating whether to remove -the additional NAs from data when parameter 'merge_across_dims' is TRUE. -It is helpful when the length of the to-be-merged dimension is different -across another dimension. For example, if the dimension 'time' extends -across dimension 'chunk', and the time length along the first chunk is 2 -while along the second chunk is 10. Setting this parameter as TRUE can -remove the additional 8 NAs at position 3 to 10. The default value is FALSE.} - -\item{split_multiselected_dims}{A logical value indicating whether to split a -dimension that has been selected with a multidimensional array of selectors -into as many dimensions as present in the selector array. The default value -is FALSE.} - -\item{path_glob_permissive}{A logical value or an integer specifying how many - folder levels in the path pattern, beginning from the end, the shell glob - expressions must be preserved and worked out for each file. The default - value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr -When specifying a path pattern for a dataset, it might contain shell glob -experissions. For each dataset, the first file matching the path pattern is -found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. However, in -some cases, the values of the shell glob expressions may not be constant for -all files in a dataset, and they need to be worked out for each file -involved.\cr\cr -For example, a path pattern could be as follows: \cr -\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving -\code{path_glob_permissive = FALSE} will trigger automatic seek of the - contents to replace the asterisks (e.g. the first asterisk matches with - \code{'bar'} and the second with \code{'baz'}. The found contents will be - used for all files in the dataset (in the example, the path pattern will be - fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}. However, if - any of the files in the dataset have other contents in the position of the - asterisks, Start() will not find them (in the example, a file like \cr - \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be - found). Setting \code{path_glob_permissive = 1} would preserve global - expressions in the latest level (in the example, the fixed path pattern - would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the - problematic file mentioned before would be found), but of course this would - slow down the Start() call if the dataset involves a large number of - files. 
Setting \code{path_glob_permissive = 2} would leave the original path - pattern with the original glob expressions in the 1st and 2nd levels (in the - example, both asterisks would be preserved, thus would allow Start() - to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr -Note that each glob expression can only represent one possibility (Start() -chooses the first). Because /code{*} is not the tag, which means it cannot -be a dimension of the output array. Therefore, only one possibility can be -adopted. For example, if \cr -\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr -has two matches:\cr -\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr -\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr -only the first found file will be used.} - -\item{largest_dims_length}{A logical value or a named integer vector - indicating if Start() should examine all the files to get the largest - length of the inner dimensions (TRUE) or use the first valid file of each - dataset as the returned dimension length (FALSE). Since examining all the - files could be time-consuming, a vector can be used to explicitly specify - the expected length of the inner dimensions. For those inner dimensions not - specified, the first valid file will be used. The default value is FALSE.\cr\cr - This parameter is useful when the required files don't have consistent - inner dimension. For example, there are 10 required experimental data files - of a series of start dates. The data only contain 25 members for the first - 2 years while 51 members for the later years. If \code{'largest_dims_length = FALSE'}, - the returned member dimension length will be 25 only. The 26th to 51st - members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'}, - the returned member dimension length will be 51. To save the resource, -\code{'largest_dims_length = c(member = 51)'} can also be used.} - -\item{retrieve}{A logical value indicating whether to retrieve the data -defined in the Start() call or to explore only its dimension lengths -and names, and the values for the file and inner dimensions. The default -value is FALSE.} - -\item{num_procs}{An integer of number of processes to be created for the -parallel execution of the retrieval/transformation/arrangement of the -multiple involved files in a call to Start(). If set to NULL, -takes the number of available cores (as detected by detectCores() in -the package 'future'). The default value is 1 (no parallel execution).} - -\item{ObjectBigmemory}{a character string to be included as part of the -bigmemory object name. This parameter is thought to be used internally by the -chunking capabilities of startR.} - -\item{silent}{A logical value of whether to display progress messages (FALSE) -or not (TRUE). The default value is FALSE.} - -\item{debug}{A logical value of whether to return detailed messages on the -progress and operations in a Start() call (TRUE) or not (FALSE). The -default value is FALSE.} - \item{\dots}{A selection of custemized parameters depending on the data format. When we retrieve data from one or a collection of data sets, the involved data can be perceived as belonging to a large multi-dimensional @@ -620,61 +248,451 @@ as they have been specified in the call. 
For example, the following call: \cr
# variable = 'all')
}
\cr\cr
-would return an array with the following dimensions:
-\cr
-\command{
-\cr #  source  month  store  item  section  variable
-\cr #       1     24    100      3       2         2
-}
+would return an array with the following dimensions:
+\cr
+\command{
+\cr #  source  month  store  item  section  variable
+\cr #       1     24    100      3       2         2
+}
+\cr\cr
+Next, a more advanced example to retrieve data for only the sales records, for
+the first section ('electronics'), for the 1st and 3rd items and for the
+stores located in Barcelona (assuming the files contain the variable
+'store_location' with the name of the city each of the 100 stores are located
+at):
+\cr
+\command{
+\cr # data <- Start(source = paste0('/data/$variable$/',
+\cr #                               '$section$/$item$.data'),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = NULL))
+}
+\cr\cr
+The defined names for the dimensions do not necessarily have to match the
+names of the dimensions inside the file. Lists of alternative names to be
+sought can be defined in the parameter 'synonims'.
+\cr\cr
+If data from multiple sources (not necessarily following the same structure)
+has to be retrieved, it can be done by providing a vector of character strings
+with path pattern specifications, or, in the extended form, by providing a
+list of lists with the components 'name' and 'path', and the name of the
+dataset and path pattern as values, respectively. For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = NULL))
+}
+\cr}
+
+\item{return_vars}{A named list where the names are the names of the
+variables to be fetched in the files, and the values are vectors of
+character strings with the names of the file dimensions for which to retrieve
+each variable, or NULL if the variable has to be retrieved only once
+from any (the first) of the involved files.\cr\cr
+Apart from retrieving a multidimensional data array, retrieving auxiliary
+variables inside the files can also be needed. The parameter
+'return_vars' allows for requesting such variables, as long as a
+'file_var_reader' function is also specified in the call to
+Start() (see documentation on the corresponding parameter).
+\cr\cr
+In the case of the item sales example (see documentation on parameter
+\code{\dots}), the store location variable is requested with the parameter\cr
+\code{return_vars = list(store_location = NULL)}.\cr This will cause
+Start() to fetch once the variable 'store_location' and return it in
+the component\cr \code{$Variables$common$store_location},\cr which will be an
+array of character strings with the location names, with the dimensions
+\code{c('store' = 100)}. 
Although useless in this example, we could ask
+Start() to fetch and return such a variable for each file along the
+items dimension as follows: \cr
+\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+variable will be fetched once from a file of each of the items, and will be
+returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+\cr\cr
+If a variable is requested along a file dimension that contains path pattern
+specifications ('source' in the example), the fetched variable values will be
+returned in the component\cr \code{$Variables$$}.\cr
+For example:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = c('source',
+\cr #                                                     'item')))
+\cr # # Checking the structure of the returned variables
+\cr # str(data$Variables)
+\cr # Named list
+\cr # ..$common: NULL
+\cr # ..$sourceA: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # ..$sourceB: Named list
+\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+\cr # # Checking the dimensions of the returned variable
+\cr # # for the source A
+\cr # dim(data$Variables$sourceA$store_location)
+\cr #   item  store
+\cr #      3      3
+}
+\cr\cr
+The names of the requested variables do not necessarily have to match the
+actual variable names inside the files. A list of alternative names to be
+sought can be specified via the parameter 'synonims'.}
+
+\item{synonims}{A named list where the names are the requested variable or
+dimension names, and the values are vectors of character strings with
+alternative names to seek for such dimension or variable.\cr\cr
+In some requests, data from different sources may follow different naming
+conventions for the dimensions or variables, or even files in the same source
+could have varying names. This parameter allows Start() to
+properly identify the dimensions or variables under their different names.
+\cr\cr
+In the example used in parameter 'return_vars', it may be the case that
+the two involved data sources follow slightly different naming conventions.
+For example, source A uses 'sect' as the name for the sections dimension,
+whereas source B uses 'section'; source A uses 'store_loc' as the variable
+name for the store locations, whereas source B uses 'store_location'. 
This can be taken
+into account as follows:
+\cr
+\command{
+\cr # data <- Start(source = list(
+\cr #                list(name = 'sourceA',
+\cr #                     path = paste0('/sourceA/$variable$/',
+\cr #                                   '$section$/$item$.data')),
+\cr #                list(name = 'sourceB',
+\cr #                     path = paste0('/sourceB/$section$/',
+\cr #                                   '$variable$/$item$.data'))
+\cr #               ),
+\cr #               variable = 'sales',
+\cr #               section = 'first',
+\cr #               item = indices(c(1, 3)),
+\cr #               item_depends = 'section',
+\cr #               store = 'Barcelona',
+\cr #               store_var = 'store_location',
+\cr #               month = 'all',
+\cr #               return_vars = list(store_location = c('source',
+\cr #                                                     'item')),
+\cr #               synonims = list(
+\cr #                  section = c('sect', 'section'),
+\cr #                  store_location = c('store_loc',
+\cr #                                     'store_location')
+\cr #               ))
+}
+\cr}
+
+\item{file_opener}{A function that receives as a single parameter
+  'file_path' a character string with the path to a file to be opened,
+  and returns an object with an open connection to the file (optionally with
+  header information) on success, or returns NULL on failure.
+\cr\cr
+This parameter takes by default NcOpener() (an opener function for NetCDF
+files).
+\cr\cr
+See NcOpener() for a template to build a file opener for your own file
+format.}
+
+\item{file_var_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
+  \code{synonims} that returns an array with auxiliary data (i.e. data from a
+  variable) inside a file. Start() will provide automatically either a
+  'file_path' or a 'file_object' to the 'file_var_reader'
+  function (the function has to be ready to work whichever of these two is
+  provided). The parameter 'file_selectors' will also be provided
+  automatically to the variable reader, containing a named list where the
+  names are the names of the file dimensions of the queried data set (see
+  documentation on \code{\dots}) and the values are single character strings
+  with the components used to build the path to the file being read (the one
+  provided in 'file_path' or 'file_object'). The parameter 'var_name'
+  will be filled in automatically by Start() also, with the name of one
+  of the variables to be read. The parameter 'synonims' will be filled in
+  with exactly the same value as provided in the parameter 'synonims' in
+  the call to Start(), and has to be used in the code of the variable
+  reader to check for alternative variable names inside the target file. The
+  'file_var_reader' must return a (multi)dimensional array with named
+  dimensions, and optionally with the attribute 'variables' with other
+  additional metadata on the retrieved variable.
+\cr\cr
+Usually, the 'file_var_reader' should be a degenerate case of the
+'file_data_reader' (see documentation on the corresponding parameter),
+so it is recommended to code the 'file_data_reader' first.
+\cr\cr
+This parameter takes by default NcVarReader() (a variable reader function
+for NetCDF files).
+\cr\cr
+See NcVarReader() for a template to build a variable reader for your own
+file format.}
+
+\item{file_dim_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+  that returns a named numeric vector where the names are the names of the
+  dimensions of the multidimensional data array in the file and the values are
+  the sizes of such dimensions. 
Start() will provide automatically
+  either a 'file_path' or a 'file_object' to the
+  'file_dim_reader' function (the function has to be ready to work
+  whichever of these two is provided). The parameter 'file_selectors'
+  will also be provided automatically to the dimension reader, containing a
+  named list where the names are the names of the file dimensions of the
+  queried data set (see documentation on \code{\dots}) and the values are
+  single character strings with the components used to build the path to the
+  file being read (the one provided in 'file_path' or 'file_object').
+  The parameter 'synonims' will be filled in with exactly the same value
+  as provided in the parameter 'synonims' in the call to Start(),
+  and can optionally be used in advanced configurations.
+\cr\cr
+This parameter takes by default NcDimReader() (a dimension reader
+function for NetCDF files).
+\cr\cr
+See NcDimReader() for an (advanced) template to build a dimension reader
+for your own file format.}
+
+\item{file_data_reader}{A function with the header \code{file_path = NULL},
+  \code{file_object = NULL}, \code{file_selectors = NULL},
+  \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
+  multidimensional data array inside a file (even if internally it is not an
+  array). Start() will provide automatically either a 'file_path'
+  or a 'file_object' to the 'file_data_reader' function (the
+  function has to be ready to work whichever of these two is provided). The
+  parameter 'file_selectors' will also be provided automatically to the
+  data reader, containing a named list where the names are the names of the
+  file dimensions of the queried data set (see documentation on \code{\dots})
+  and the values are single character strings with the components used to
+  build the path to the file being read (the one provided in 'file_path' or
+  'file_object'). The parameter 'inner_indices' will be filled in
+  automatically by Start() also, with a named list of numeric vectors,
+  where the names are the names of all the expected inner dimensions in a file
+  to be read, and the numeric vectors are the indices to be taken from the
+  corresponding dimension (the indices may not be consecutive nor in order).
+  The parameter 'synonims' will be filled in with exactly the same value
+  as provided in the parameter 'synonims' in the call to Start(),
+  and has to be used in the code of the data reader to check for alternative
+  dimension names inside the target file. The 'file_data_reader' must
+  return a (multi)dimensional array with named dimensions, and optionally with
+  the attribute 'variables' with other additional metadata on the retrieved
+  data.
+\cr\cr
+Usually, 'file_data_reader' should use 'file_dim_reader'
+(see documentation on the corresponding parameter), so it is recommended to
+code 'file_dim_reader' first. 
\cr\cr
-Next, a more advanced example to retrieve data for only the sales records, for
-the first section ('electronics'), for the 1st and 3rd items and for the
-stores located in Barcelona (assuming the files contain the variable
-'store_location' with the name of the city each of the 100 stores are located
-at):
-\cr
-\command{
-\cr # data <- Start(source = paste0('/data/$variable$/',
-\cr #                               '$section$/$item$.data'),
-\cr #               variable = 'sales',
-\cr #               section = 'first',
-\cr #               item = indices(c(1, 3)),
-\cr #               item_depends = 'section',
-\cr #               store = 'Barcelona',
-\cr #               store_var = 'store_location',
-\cr #               month = 'all',
-\cr #               return_vars = list(store_location = NULL))
-}
+This parameter takes by default NcDataReader() (a data reader function
+for NetCDF files).
 \cr\cr
-The defined names for the dimensions do not necessarily have to match the
-names of the dimensions inside the file. Lists of alternative names to be
-seeked can be defined in the parameter 'synonims'.
+See NcDataReader() for a template to build a data reader for your own
+file format.}
+
+\item{file_closer}{A function that receives as a single parameter
+  'file_object' an open connection (as returned by 'file_opener')
+  to one of the files to be read, optionally with header information, and
+  closes the open connection. Always returns NULL.
 \cr\cr
-If data from multiple sources (not necessarily following the same structure)
-has to be retrieved, it can be done by providing a vector of character strings
-with path pattern specifications, or, in the extended form, by providing a
-list of lists with the components 'name' and 'path', and the name of the
-dataset and path pattern as values, respectively. For example:
-\cr
-\command{
-\cr # data <- Start(source = list(
-\cr #                list(name = 'sourceA',
-\cr #                     path = paste0('/sourceA/$variable$/',
-\cr #                                   '$section$/$item$.data')),
-\cr #                list(name = 'sourceB',
-\cr #                     path = paste0('/sourceB/$section$/',
-\cr #                                   '$variable$/$item$.data'))
-\cr #               ),
-\cr #               variable = 'sales',
-\cr #               section = 'first',
-\cr #               item = indices(c(1, 3)),
-\cr #               item_depends = 'section',
-\cr #               store = 'Barcelona',
-\cr #               store_var = 'store_location',
-\cr #               month = 'all',
-\cr #               return_vars = list(store_location = NULL))
-}
-\cr}
+This parameter takes by default NcCloser() (a closer function for NetCDF
+files).
+\cr\cr
+See NcCloser() for a template to build a file closer for your own file
+format.}
+
+\item{transform}{A function with the header \code{data_array},
+\code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+input, through the parameter \code{data_array}, a subset of a
+multidimensional array (as returned by 'file_data_reader'), applies a
+transformation to it and returns it, preserving the amount of dimensions but
+potentially modifying their size. This transformation may require data from
+other auxiliary variables, automatically provided to 'transform'
+through the parameter 'variables', in the form of a named list where
+the names are the variable names and the values are (multi)dimensional
+arrays. Which variables need to be sent to 'transform' can be specified
+with the parameter 'transform_vars' in Start(). The parameter
+'file_selectors' will also be provided automatically to
+'transform', containing a named list where the names are the names of
+the file dimensions of the queried data set (see documentation on
+\code{\dots}) and the values are single character strings with the
+components used to build the path to the file the subset being processed
+belongs to. 
The parameter \code{\dots} will be filled in with other
+additional parameters to adjust the transformation, exactly as provided in
+the call to Start() via the parameter 'transform_params'.}
+
+\item{transform_params}{A named list with additional parameters to be sent to
+the 'transform' function (if specified). See documentation on parameter
+'transform' for details.}
+
+\item{transform_vars}{A vector of character strings with the names of
+auxiliary variables to be sent to the 'transform' function (if
+specified). All the variables to be sent to 'transform' must also
+have been requested as return variables in the parameter 'return_vars'
+of Start().}
+
+\item{transform_extra_cells}{An integer indicating the number of extra indices
+to retrieve from the data set, beyond the requested indices in \code{\dots},
+in order for 'transform' to have additional information available to
+properly apply whichever transformation (if needed). As many as
+'transform_extra_cells' will be retrieved beyond each of the limits for
+each of those inner dimensions associated to a coordinate variable and sent
+to 'transform' (i.e. present in 'transform_vars'). After
+'transform' has finished, Start() will take a subset of the result
+and return it, so that the returned data falls within the specified
+bounds in \code{\dots}. The default value is 2.}
+
+\item{apply_indices_after_transform}{A logical value indicating, when a
+'transform' is specified in Start() and numeric indices are
+provided for any of the inner dimensions that depend on coordinate variables,
+whether these numeric indices should be made effective (retrieved) before
+applying the transformation or after. It takes FALSE by default (numeric
+indices are applied before sending data to 'transform').}
+
+\item{pattern_dims}{A character string indicating the name of the dimension
+with path pattern specifications (see \code{\dots} for details). If not
+specified, Start() assumes the first provided dimension is the pattern
+dimension, with a warning.}
+
+\item{metadata_dims}{A vector of character strings with the names of the file
+dimensions for which to return metadata. As noted in 'file_data_reader',
+the data reader can optionally return auxiliary data via the attribute
+'variables' of the returned array. Start() by default returns the
+auxiliary data read for only the first file of each source (or data set) in
+the pattern dimension (see \code{\dots} for info on what the pattern
+dimension is). However, it can be configured to return the metadata for all
+the files along any set of file dimensions. The default value is NULL, and
+it will be assigned automatically as parameter 'pattern_dims'.}
+
+\item{selector_checker}{A function used internally by Start() to
+translate a set of selectors (values for a dimension associated to a
+coordinate variable) into a set of numeric indices. It takes by default
+SelectorChecker() and, in principle, it should not be required to
+change it for customized file formats. The option to replace it is left open
+for more versatility. See the code of SelectorChecker() for details on
+the inputs, functioning and outputs of a selector checker.}
+
+\item{merge_across_dims}{A logical value indicating whether to merge
+dimensions across which another dimension extends (according to the
+'_across' parameters). Takes the value FALSE by default. 
For
+example, if the dimension 'time' extends across the dimension 'chunk' and
+\code{merge_across_dims = TRUE}, the resulting data array will contain
+only the dimension 'time', as long as all the chunks together.}
+
+\item{merge_across_dims_narm}{A logical value indicating whether to remove
+the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+It is helpful when the length of the to-be-merged dimension is different
+across another dimension. For example, if the dimension 'time' extends
+across the dimension 'chunk', and the time length along the first chunk is 2
+while along the second chunk it is 10, setting this parameter to TRUE
+removes the additional 8 NAs at positions 3 to 10. The default value is TRUE,
+but will be automatically turned to FALSE if 'merge_across_dims = FALSE'.}
+
+\item{split_multiselected_dims}{A logical value indicating whether to split a
+dimension that has been selected with a multidimensional array of selectors
+into as many dimensions as present in the selector array. The default value
+is FALSE.}
+
+\item{path_glob_permissive}{A logical value or an integer specifying how many
+  folder levels in the path pattern, beginning from the end, the shell glob
+  expressions must be preserved and worked out for each file. The default
+  value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
+When specifying a path pattern for a dataset, it might contain shell glob
+expressions. For each dataset, the first file matching the path pattern is
+found, and the found file is used to work out fixed values for the glob
+expressions that will be used for all the files of the dataset. However, in
+some cases, the values of the shell glob expressions may not be constant for
+all files in a dataset, and they need to be worked out for each file
+involved.\cr\cr
+For example, a path pattern could be as follows: \cr
+\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+\code{path_glob_permissive = FALSE} will trigger automatic seek of the
+  contents to replace the asterisks (e.g. the first asterisk matches with
+  \code{'bar'} and the second with \code{'baz'}). The found contents will be
+  used for all files in the dataset (in the example, the path pattern will be
+  fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
+  any of the files in the dataset have other contents in the position of the
+  asterisks, Start() will not find them (in the example, a file like \cr
+  \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+  found). Setting \code{path_glob_permissive = 1} would preserve glob
+  expressions in the last level (in the example, the fixed path pattern
+  would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+  problematic file mentioned before would be found), but of course this would
+  slow down the Start() call if the dataset involves a large number of
+  files. Setting \code{path_glob_permissive = 2} would leave the original path
+  pattern with the original glob expressions in the 1st and 2nd levels (in the
+  example, both asterisks would be preserved, thus would allow Start()
+  to recognize files such as \cr
+  \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+Note that each glob expression can only represent one possibility (Start()
+chooses the first). Since \code{*} is not a tag, it cannot
+be a dimension of the output array; therefore, only one possibility can be
+adopted. 
For example, if \cr
+\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+has two matches:\cr
+\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+only the first found file will be used.}
+
+\item{largest_dims_length}{A logical value or a named integer vector
+  indicating if Start() should examine all the files to get the largest
+  length of the inner dimensions (TRUE) or use the first valid file of each
+  dataset as the returned dimension length (FALSE). Since examining all the
+  files could be time-consuming, a vector can be used to explicitly specify
+  the expected length of the inner dimensions. For those inner dimensions not
+  specified, the first valid file will be used. The default value is FALSE.\cr\cr
+  This parameter is useful when the required files don't have consistent
+  inner dimension lengths. For example, suppose 10 experimental data files
+  from a series of start dates are required, and the data contain only 25
+  members for the first 2 years but 51 members for the later years. If
+  \code{'largest_dims_length = FALSE'},
+  the returned member dimension length will be only 25. The 26th to 51st
+  members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+  the returned member dimension length will be 51. To save resources,
+\code{'largest_dims_length = c(member = 51)'} can also be used.}
+
+\item{retrieve}{A logical value indicating whether to retrieve the data
+defined in the Start() call or to explore only its dimension lengths
+and names, and the values for the file and inner dimensions. The default
+value is FALSE.}
+
+\item{num_procs}{An integer indicating the number of processes to be created
+for the parallel execution of the retrieval/transformation/arrangement of the
+multiple involved files in a call to Start(). If set to NULL, it
+takes the number of available cores (as detected by detectCores() in
+the package 'future'). The default value is 1 (no parallel execution).}
+
+\item{ObjectBigmemory}{A character string to be included as part of the
+bigmemory object name. This parameter is thought to be used internally by the
+chunking capabilities of startR.}
+
+\item{silent}{A logical value indicating whether to display progress messages
+(FALSE) or not (TRUE). The default value is FALSE.}
+
+\item{debug}{A logical value indicating whether to return detailed messages on
+the progress and operations in a Start() call (TRUE) or not (FALSE). The
+default value is FALSE.}
 }
 \value{
 If \code{retrieve = TRUE} the involved data is loaded into RAM memory
@@ -830,4 +848,3 @@ file format.
   retrieve = FALSE)
 }
-
diff --git a/man/Step.Rd b/man/Step.Rd
index 65f0c727eef1d8b9ccf24b6f992304bd1a99e63a..c473ccbd3229176cf3cf9c321028ae6e2f61aa54 100644
--- a/man/Step.Rd
+++ b/man/Step.Rd
@@ -4,8 +4,13 @@
 \alias{Step}
 \title{Define the operation applied on declared data.}
 \usage{
-Step(fun, target_dims, output_dims, use_libraries = NULL,
-  use_attributes = NULL)
+Step(
+  fun,
+  target_dims,
+  output_dims,
+  use_libraries = NULL,
+  use_attributes = NULL
+)
 }
 \arguments{
 \item{fun}{A function in R format defining the operation to be applied to the
@@ -70,4 +75,3 @@ to the expected order for this function.
 wf <- AddStep(data, step)
 }
-
diff --git a/man/indices.Rd b/man/indices.Rd
index a3d85ea1e9c0c6f6b1168c532872a769f85cc15e..6233b71963d0e801c4ce5c882aa76a3e228df2a6 100644
--- a/man/indices.Rd
+++ b/man/indices.Rd
@@ -39,4 +39,3 @@ original data. 
See details in the documentation of the parameter \code{\dots}
 \seealso{
 \code{\link{values}}
 }
-
diff --git a/man/values.Rd b/man/values.Rd
index 3300f19dc730494b23723c734d4372f13366e7b1..31ce95af586070b3fb6a47c472b7a0bf30b5ff6c 100644
--- a/man/values.Rd
+++ b/man/values.Rd
@@ -41,4 +41,3 @@ coordinate variable. See details in the documentation of the parameter
 \seealso{
 \code{\link{indices}}
 }
-
diff --git a/tests/testthat/test-Start-DCPP-across-depends.R b/tests/testthat/test-Start-DCPP-across-depends.R
new file mode 100644
index 0000000000000000000000000000000000000000..f5a5dc6f7ee8dbbc535dabf813825b3cbada3747
--- /dev/null
+++ b/tests/testthat/test-Start-DCPP-across-depends.R
@@ -0,0 +1,42 @@
+context("DCPP successfully retrieved with 'depends' and 'across' parameters.")
+test_that("Chunks of DCPP files - local execution", {
+
+ path <- '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$chunk$.nc'
+
+ sdates <- c('2017', '2018')
+ dat <- Start(dat = path,
+              var = 'tos',
+              sdate = sdates,
+              chunk = indices(3:5),
+              chunk_depends = 'sdate',
+              time = 'all',
+              i = indices(1:10),
+              j = indices(1:10),
+              time_across = 'chunk',
+              merge_across_dims = TRUE,
+              retrieve = TRUE,
+              return_vars = list(time = 'sdate'))
+
+# [sdate = 2, chunk = 3]
+dat_2018_chunk3 <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r1i1p1f2_gn_202201-202212.nc',
+                         var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10),
+                         retrieve = TRUE)
+
+expect_equal(dat[1,1,2,25:36,,], dat_2018_chunk3[1,1,,,])
+
+# [sdate = 1, chunk = 2]
+dat_2017_chunk2 <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2017-r1i1p1f2_gn_202001-202012.nc',
+                         var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10),
+                         retrieve = TRUE)
+
+expect_equal(dat[1,1,1,13:24,,], dat_2017_chunk2[1,1,,,])
+
+# [sdate = 2, chunk = 1]
+dat_2018_chunk1 <- Start(dat = '/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Omon/tos/gn/v20200417/$var$_Omon_HadGEM3-GC31-MM_dcppA-hindcast_s2018-r1i1p1f2_gn_202001-202012.nc',
+                         var = 'tos', time = 'all', i = indices(1:10), j = indices(1:10),
+                         retrieve = TRUE)
+
+expect_equal(dat[1,1,2,1:12,,], dat_2018_chunk1[1,1,,,])
+
+})
+
diff --git a/tests/testthat/test-Start-implicit_inner_dim.R b/tests/testthat/test-Start-implicit_inner_dim.R
index 621fe5b703e3ea348e90200fdaba10569c60cdaf..26e3ce71796aebabea10a0da9220444e340e815f 100644
--- a/tests/testthat/test-Start-implicit_inner_dim.R
+++ b/tests/testthat/test-Start-implicit_inner_dim.R
@@ -1,20 +1,15 @@
 context("Start() implicit inner dimension")
-# The unit test is for the implicitly defined inner dimension. If a file dimension selector
-# is an array with named dimensions, and 'split_multiselected_dims' is used, then the file
-# dim can be split into multiple dimensions that may contain inner dimensions.
-
-# merge_across_dims + split_multiselected_dims + implicit inner dim????
-
+# The unit test is for the implicit inner dimension. If the inner dimension length is 1,
+# startR allows it not to be specified in the call. Users can still define it in
+# 'return_vars'.
 #---------------------------------------------------------------
-test_that("1. Split into inner dimension", {
+test_that("1. time = 1", {
 
 obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc"
 variable <- "prlr"
-dates_file <- c("201311","201312","201411","201412")
-dim(dates_file) <- c(time = 2, syear = 2)
+dates_file <- c("201311","201312")
 
-# (1)
 suppressWarnings(
 obs <- Start(dat = obs.path,
              var = variable,
@@ -25,8 +20,8 @@ obs <- Start(dat = obs.path,
              longitude_reorder = CircularSort(-180, 180),
             synonims = list(latitude = c('lat', 'latitude'),
                             longitude = c('lon', 'longitude')),
-             return_vars = list(latitude = 'dat',
-                                longitude = 'dat',
+             return_vars = list(latitude = NULL,
+                                longitude = NULL,
                                 time = 'file_date'),
             split_multiselected_dims = TRUE,
             retrieve = FALSE)
@@ -34,11 +29,11 @@ obs <- Start(dat = obs.path,
 
 expect_equal(
 attr(obs, 'Dimensions'),
-c(dat = 1, var = 1, time = 2, syear = 2, latitude = 18, longitude = 81)
+c(dat = 1, var = 1, file_date = 2, latitude = 18, longitude = 81)
 )
 expect_equal(
 dim(attr(obs, 'Variables')$common$time),
-c(file_date = 4, time = 1)
+c(file_date = 2, time = 1)
 )
 expect_equal(
 attr(obs, 'Variables')$common$time[1, 1],
@@ -48,42 +43,3 @@ as.POSIXct('2013-11-15 23:30:00', tz = 'UTC')
 
 })
 
-test_that("2. Split into file dimension", {
-
-obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc"
-variable <- "prlr"
-dates_file <- c("201311","201312","201411","201412")
-dim(dates_file) <- c(smonth = 2, syear = 2)
-
-suppressWarnings(
-obs <- Start(dat = obs.path,
-             var = variable,
-             file_date = dates_file,
-             time = 'all',
-             latitude = values(list(35.6, 40)),
-             latitude_reorder = Sort(decreasing = TRUE),
-             longitude = values(list(-10, 10)),
-             longitude_reorder = CircularSort(-180, 180),
-             synonims = list(latitude = c('lat', 'latitude'),
-                             longitude = c('lon', 'longitude')),
-             return_vars = list(latitude = 'dat',
-                                longitude = 'dat',
-                                time = 'file_date'),
-             split_multiselected_dims = TRUE,
-             retrieve = FALSE)
-)
-
-expect_equal(
-attr(obs, 'Dimensions'),
-c(dat = 1, var = 1, smonth = 2, syear = 2, time = 1, latitude = 18, longitude = 81)
-)
-expect_equal(
-dim(attr(obs, 'Variables')$common$time),
-c(file_date = 4, time = 1)
-)
-expect_equal(
-attr(obs, 'Variables')$common$time[1, 1],
-as.POSIXct('2013-11-15 23:30:00', tz = 'UTC')
-)
-
-})
diff --git a/tests/testthat/test-Start-reshape.R b/tests/testthat/test-Start-reshape.R
new file mode 100644
index 0000000000000000000000000000000000000000..793a3b3cbcf0a28a07314af001d866812fc3ee19
--- /dev/null
+++ b/tests/testthat/test-Start-reshape.R
@@ -0,0 +1,503 @@
+context("Start() reshape parameters check")
+# This one is more comprehensive than test-Start-split-merge.R
+
+path_exp <- '/esarchive/exp/ecmwf/system5c3s/daily_mean/$var$_f6h/$var$_$sdate$.nc'
+path_obs <- '/esarchive/recon/ecmwf/era5/daily_mean/$var$_f1h-r360x181/$var$_$date$.nc'
+var <- 'tas'
+sdate <- paste0(1993:1995, '1201')
+
+suppressWarnings(
+exp <- Start(dat = path_exp,
+             var = var,
+             sdate = sdate,
+             time = indices(1:90), #indices(1:91),
+             ensemble = indices(1),
+             lat = indices(1),
+             lon = indices(1),
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'sdate'),
+             retrieve = FALSE, silent = TRUE)
+)
+dates <- attr(exp, 'Variables')$common$time
+
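+# Quick sanity check on the metadata-only call above (a sketch; the real
+# assertions are in the test_that() blocks below): with retrieve = FALSE,
+# Start() returns only dimension and coordinate metadata, and 'dates' is
+# expected to be a POSIXct array of dimensions [sdate = 3, time = 90],
+# matching the sdate and time selectors requested above.
+stopifnot(all(dim(dates) == c(3, 90)))
+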
+# easyNCDF +library(easyNCDF) +# obs +easy_sdate <- c('199312', paste0(rep(1994:1995, each = 3), c('01', '02', '12')), + '199601', '199602') +easy_array <- c() + +for (i in 1:length(easy_sdate)) { + easy_file <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + easy_sdate[i], '.nc')) + if (substr(easy_sdate[i], 5, 6) == '02') { + sub_time <- 1:28 + } else { + sub_time <- 1:31 + } + easy_obs <- NcToArray(easy_file, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = sub_time)) + NcClose(easy_file) + easy_array <- c(easy_array, as.vector(easy_obs)) +} +dim(easy_array) <- c(time = 90, sdate = 3) + + + +test_that("1. split + merge + narm", { + +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +unsorted_dates <- unique(format(dates, '%Y%m')) + +# unsorted dates +obs1 <- Start(dat = path_obs, + var = var, + date = unsorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 3, time = 90] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, sdate = 3, time = 90, lat = 1, lon = 1) +) +expect_equal( +dim(obs1), +dim(obs2) +) +expect_equal( +as.vector(obs1), +as.vector(obs2) +) +expect_equal( +as.vector(obs1[1, 1, 1, , 1, 1]), +as.vector(easy_array[, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, , 1, 1]), +as.vector(easy_array[, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 3, , 1, 1]), +as.vector(easy_array[, 3]) +) + +}) + + +test_that("2. 
split + merge", {

+exp <- Start(dat = path_exp,
+             var = var,
+             sdate = sdate,
+             time = indices(1:62),
+             ensemble = indices(1),
+             lat = indices(1),
+             lon = indices(1),
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'sdate'),
+             retrieve = FALSE)
+
+dates <- attr(exp, 'Variables')$common$time
+
+sorted_dates <- sort(unique(format(dates, '%Y%m')))
+unsorted_dates <- unique(format(dates, '%Y%m'))
+
+# unsorted dates
+obs1 <- Start(dat = path_obs,
+              var = var,
+              date = unsorted_dates,
+              time = values(dates), #dim: [sdate = 3, time = 62]
+              lat = indices(1),
+              lon = indices(1),
+              time_across = 'date',
+              merge_across_dims = TRUE,
+#              merge_across_dims_narm = TRUE,
+              split_multiselected_dims = TRUE,
+              synonims = list(lat = c('lat', 'latitude'),
+                              lon = c('lon', 'longitude')),
+              return_vars = list(lon = NULL,
+                                 lat = NULL,
+                                 time = 'date'),
+              retrieve = TRUE)
+
+# sorted dates
+obs2 <- Start(dat = path_obs,
+              var = var,
+              date = sorted_dates,
+              time = values(dates), #dim: [sdate = 3, time = 62]
+              lat = indices(1),
+              lon = indices(1),
+              time_across = 'date',
+              merge_across_dims = TRUE,
+#              merge_across_dims_narm = TRUE,
+              split_multiselected_dims = TRUE,
+              synonims = list(lat = c('lat', 'latitude'),
+                              lon = c('lon', 'longitude')),
+              return_vars = list(lon = NULL,
+                                 lat = NULL,
+                                 time = 'date'),
+              retrieve = TRUE)
+
+expect_equal(
+dim(obs1),
+c(dat = 1, var = 1, sdate = 3, time = 62, lat = 1, lon = 1)
+)
+expect_equal(
+as.vector(obs1[1, 1, 1, , 1, 1]),
+as.vector(easy_array[1:62, 1])
+)
+expect_equal(
+as.vector(obs1[1, 1, 2, , 1, 1]),
+as.vector(easy_array[1:62, 2])
+)
+expect_equal(
+as.vector(obs1[1, 1, 3, , 1, 1]),
+as.vector(easy_array[1:62, 3])
+)
+expect_equal(
+as.vector(obs1),
+as.vector(obs2)
+)
+
+})
+
+
+
+test_that("3. merge", {
+# NOTE: The three files are all regarded as having time = 31, even though 199402
+# only has 28. This happens when time = 'all' or time = indices(). It seems
+# reasonable when 'merge_across_dims' is not used, but if it is used, it's
+# common to expect 31+31+28.
+# See the next test "4. merge + narm". 199402 is still regarded as 31, so NAs are not
+# removed.
+suppressWarnings(
+obs3 <- Start(dat = path_obs,
+              var = var,
+              date = c('199312', '199401', '199402'),
+              time = 'all',
+              lat = indices(1),
+              lon = indices(1),
+              time_across = 'date',
+              merge_across_dims = TRUE,
+#              merge_across_dims_narm = TRUE,
+#              split_multiselected_dims = TRUE,
+              synonims = list(lat = c('lat', 'latitude'),
+                              lon = c('lon', 'longitude')),
+              return_vars = list(lon = NULL,
+                                 lat = NULL,
+                                 time = 'date'),
+              retrieve = TRUE)
+)
+
+
+expect_equal(
+dim(obs3),
+c(dat = 1, var = 1, time = 93, lat = 1, lon = 1)
+)
+expect_equal(
+as.vector(obs3),
+c(as.vector(easy_array[, 1]), NA, NA, NA)
+)
+
+})
+
+
+test_that("4. merge + narm", {
+
+# (1) Notice that the NAs at the tail of 199402 won't be removed because Start()
+# considers that all the files have the same length, i.e., 31.
+# The NAs in 199402 are regarded as part of the original file.
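+# A sketch of the merged layout this behaviour produces (illustrative only,
+# not asserted here): each file is assumed to contribute a fixed block of 31
+# time steps, so the merged 'time' dimension is laid out as
+#   1:31 -> 199312, 32:62 -> 199401, 63:93 -> 199402 (positions 91:93 are NA).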
+ +obs3 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = 'all', + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs3), +c(dat = 1, var = 1, time = 93, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs3), +c(as.vector(easy_array[, 1]), NA, NA, NA) +) + +# (2) It's tricky that 199402 is considered time = 31 because Start() considers +# all the files have the same length. So it won't return an error when +# time = indices(93). +# The first 14 time steps of 199312 will be removed but the NAs at the tail +# of 199402 will be preserved. +obs4 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = indices(15:93), + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs4), +c(dat = 1, var = 1, time = 79, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs4), +c(as.vector(easy_array[15:90, 1]), NA, NA, NA) +) + +# (3) If time is values(), 199402 is considered time = 28, so NAs will be removed. +obs5 <- Start(dat = path_obs, + var = var, + date = c('199312', '199401', '199402'), + time = dates[1, ], + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, +# split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) +expect_equal( +dim(obs5), +c(dat = 1, var = 1, time = 90, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs5), +as.vector(easy_array[1:90, 1]) +) + +}) + +test_that("5. 
split", { + +date_array <- c('199312', '199401', '199412', '199501') +dim(date_array) <- c(month = 2, year = 2) + +# split file dim +obs1 <- Start(dat = path_obs, + var = var, + date = date_array, # [month = 2, year = 2] + time = indices(1:31), + lat = indices(1), + lon = indices(1), +# time_across = 'date', +# merge_across_dims = TRUE, +# merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), + # time = 'date'), + retrieve = TRUE) + +expect_equal( +dim(obs1), +c(dat = 1, var = 1, month = 2, year = 2, time = 31, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs1[1, 1, 1, 1, , 1, 1]), +as.vector(easy_array[1:31, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 1, , 1, 1]), +as.vector(easy_array[32:62, 1]) +) +expect_equal( +as.vector(obs1[1, 1, 1, 2, , 1, 1]), +as.vector(easy_array[1:31, 2]) +) +expect_equal( +as.vector(obs1[1, 1, 2, 2, , 1, 1]), +as.vector(easy_array[32:62, 2]) +) + +# split inner time +## time is indices +time_array <- array(1:62, dim = c(day = 31, month = 2)) +exp1 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = TRUE) + +# easyNCDF +easy_sdate_exp <- '19931201' +easy_file_exp <- NcOpen(paste0('/esarchive/exp/ecmwf/system5c3s/daily_mean/tas_f6h/tas_', + easy_sdate_exp, '.nc')) +easy_exp <- NcToArray(easy_file_exp, vars_to_read = 'tas', + dim_indices = list(longitude = c(1), latitude = c(1), ensemble = c(1), + time = 1:62)) +NcClose(easy_file_exp) + +expect_equal( +dim(exp1), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(easy_exp) +) + +## time is values +time_array <- dates[1, 1:62] +dim(time_array) <- c(day = 31, month = 2) +exp2 <- Start(dat = path_exp, + var = var, + sdate = sdate[1], + time = time_array, + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL), +# time = 'sdate'), + retrieve = TRUE) +expect_equal( +dim(exp2), +c(dat = 1, var = 1, sdate = 1, day = 31, month = 2, ensemble = 1, lat = 1, lon = 1) +) +expect_equal( +as.vector(exp1), +as.vector(exp2) +) + + +}) + +test_that("6. 
repetitive values", { + +exp <- Start(dat = path_exp, + var = var, + sdate = c('19931101', '19931201'), + time = indices(1:61), + ensemble = indices(1), + lat = indices(1), + lon = indices(1), + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'sdate'), + retrieve = F) +dates <- attr(exp, 'Variables')$common$time + +# sorted and unsorted are the same here +sorted_dates <- sort(unique(format(dates, '%Y%m'))) +#unsorted_dates <- unique(format(dates, '%Y%m')) + +# sorted_dates +obs2 <- Start(dat = path_obs, + var = var, + date = sorted_dates, + time = values(dates), #dim: [sdate = 2, time = 61] + lat = indices(1), + lon = indices(1), + time_across = 'date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = TRUE) + +# easyNCDF +easy_file_199311 <- NcOpen(paste0('/esarchive/recon/ecmwf/era5/daily_mean/tas_f1h-r360x181/tas_', + '199311', '.nc')) +easy_obs_199311 <- NcToArray(easy_file_199311, vars_to_read = 'tas', + dim_indices = list(lon = c(1), lat = c(1), time = 1:30)) +NcClose(easy_file_199311) + +expect_equal( +dim(obs2), +c(dat = 1, var = 1, sdate = 2, time = 61, lat = 1, lon = 1) +) +expect_equal( +as.vector(obs2[1, 1, 1, 1:30, 1, 1]), +as.vector(easy_obs_199311) +) +expect_equal( +as.vector(obs2[1, 1, 1, 31:61, 1, 1]), +as.vector(obs2[1, 1, 2, 1:31, 1, 1]) +) +expect_equal( +as.vector(obs2[1, 1, 2, 31:61, 1, 1]), +easy_array[31:61 ,1] +) + +}) diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R index e998844b3489d728a86f56d7c32af0a13144d05f..9376f9afc895f1ad714046ddbe54e6e9793ccf27 100644 --- a/tests/testthat/test-Start-split-merge.R +++ b/tests/testthat/test-Start-split-merge.R @@ -1,10 +1,9 @@ -#if (identical(Sys.getenv("NOT_CRAN"), "")) Sys.setenv(NOT_CRAN='true') - context("Start() split + merge dim and value check") var_name <- 'sfcWind' path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc' +suppressWarnings( hcst <- Start(dat = path.exp, var = var_name, sdate = c('20160704', '20161222'), @@ -19,6 +18,7 @@ hcst <- Start(dat = path.exp, time = c('sdate', 'syear')), retrieve = F, silent = TRUE) +) dates <- attr(hcst, 'Variables')$common$time file_date <- sort(unique(gsub('-', '', sapply(as.character(dates), substr, 1, 7)))) @@ -27,7 +27,7 @@ path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc' test_that("1. split + merge + narm", { -skip_on_cran() +suppressWarnings( obs <- Start(dat = path.obs, var = var_name, file_date = file_date, # a vector @@ -44,7 +44,7 @@ obs <- Start(dat = path.obs, longitude = 'dat', time = 'file_date'), retrieve = T) - +) expect_equal( dim(obs), c(dat = 1, var = 1, latitude = 10, longitude = 10, sdate = 2, syear = 3, time = 12) @@ -67,7 +67,7 @@ obs <- Start(dat = path.obs, test_that("2. no split + merge + narm", { -skip_on_cran() +suppressWarnings( obs <- Start(dat = path.obs, var = var_name, file_date = file_date, # a vector @@ -84,7 +84,7 @@ obs <- Start(dat = path.obs, longitude = 'dat', time = 'file_date'), retrieve = T) - +) expect_equal( dim(obs), c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 72) @@ -107,7 +107,7 @@ obs <- Start(dat = path.obs, test_that("3. 
no split + merge + no narm", { -skip_on_cran() +suppressWarnings( obs <- Start(dat = path.obs, var = var_name, file_date = file_date, # a vector @@ -124,7 +124,7 @@ obs <- Start(dat = path.obs, longitude = 'dat', time = 'file_date'), retrieve = T) - +) expect_equal( dim(obs), c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 108) @@ -145,3 +145,44 @@ obs <- Start(dat = path.obs, ) }) +test_that("4. split only", { + +obs.path <- "/esarchive/recon/ecmwf/era5/monthly_mean/$var$_f1h-r1440x721cds/$var$_$file_date$.nc" +variable <- "prlr" +dates_file <- c("201311","201312","201411","201412") +dim(dates_file) <- c(smonth = 2, syear = 2) + +suppressWarnings( +obs <- Start(dat = obs.path, + var = variable, + file_date = dates_file, + time = 'all', + latitude = values(list(35.6, 40)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-10, 10)), + longitude_reorder = CircularSort(-180, 180), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + split_multiselected_dims = TRUE, + retrieve = FALSE) +) + +expect_equal( +attr(obs, 'Dimensions'), +c(dat = 1, var = 1, smonth = 2, syear = 2, time = 1, latitude = 18, longitude = 81) +) +expect_equal( +dim(attr(obs, 'Variables')$common$time), +c(file_date = 4, time = 1) +) +expect_equal( +attr(obs, 'Variables')$common$time[1, 1], +as.POSIXct('2013-11-15 23:30:00', tz = 'UTC') +) + + + +})