From 1b04d1d2de8809cdd1d1f522543d11d1e01b372b Mon Sep 17 00:00:00 2001 From: nperez Date: Tue, 2 Mar 2021 13:01:58 +0100 Subject: [PATCH 01/10] split Start step to found_pattern_dims using parent.frame() env --- R/Start.R | 42 ++++-------------------------------------- R/zzz.R | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 38 deletions(-) diff --git a/R/Start.R b/R/Start.R index 33d5809..33c6ab1 100644 --- a/R/Start.R +++ b/R/Start.R @@ -874,45 +874,11 @@ Start <- function(..., # dim = indices/selectors, dim_names <- names(dim_params) # Look for chunked dims chunks <- look_for_chunks(dim_params, dim_names) - + # Check pattern_dims - if (is.null(pattern_dims)) { - .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '", - dim_names[1], "' as 'pattern_dims'.")) - pattern_dims <- dim_names[1] - } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) { - pattern_dims <- unique(pattern_dims) - } else { - stop("Parameter 'pattern_dims' must be a vector of character strings.") - } - if (any(names(var_params) %in% pattern_dims)) { - stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.") - } - # Find the pattern dimension with the pattern specifications - found_pattern_dim <- NULL - for (pattern_dim in pattern_dims) { - # Check all specifications in pattern_dim are valid - dat <- datasets <- dim_params[[pattern_dim]] - if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) { - stop(paste0("Parameter '", pattern_dim, - "' must be a list of lists with pattern specifications or a vector of character strings.")) - } - if (!is.null(dim_reorder_params[[pattern_dim]])) { - .warning(paste0("A reorder for the selectors of '", pattern_dim, - "' has been specified, but it is a pattern dimension and the reorder will be ignored.")) - } - if (is.list(dat) || any(sapply(dat, is.list))) { - if (is.null(found_pattern_dim)) { - found_pattern_dim <- pattern_dim - } else { - stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") - } - } - } - if (is.null(found_pattern_dim)) { - .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) - found_pattern_dim <- pattern_dims[1] - } + # Function found_pattern_dims may change pattern_dims in the .GlobalEnv + found_pattern_dim <- found_pattern_dims(pattern_dims, dim_names, var_params, + dim_params, dim_reorder_params) # Check all *_reorder are NULL or functions, and that they all have # a matching dimension param. diff --git a/R/zzz.R b/R/zzz.R index af04e6e..7dda044 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -182,3 +182,49 @@ look_for_chunks <- function(dim_params, dim_names) { array(indices, dim = setNames(length(indices), dim_name)) } +# Check pattern_dims +# Function found_pattern_dims may change pattern_dims in the parent.env +found_pattern_dims <- function(pattern_dims, dim_names, var_params, + dim_params, dim_reorder_params) { + if (is.null(pattern_dims)) { + .warning(paste0("Parameter 'pattern_dims' not specified. 
Taking the first dimension, '", + dim_names[1], "' as 'pattern_dims'.")) + assign('pattern_dims', dim_names[1], envir = parent.frame()) + pattern_dims <- dim_names[1] + } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) { + assign('pattern_dims', unique(pattern_dims), envir = parent.frame()) + pattern_dims <- unique(pattern_dims) + } else { + stop("Parameter 'pattern_dims' must be a vector of character strings.") + } + if (any(names(var_params) %in% pattern_dims)) { + stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.") + } + # Find the pattern dimension with the pattern specifications + found_pattern_dim <- NULL + for (pattern_dim in pattern_dims) { + # Check all specifications in pattern_dim are valid + dat <- datasets <- dim_params[[pattern_dim]] + if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) { + stop(paste0("Parameter '", pattern_dim, + "' must be a list of lists with pattern specifications or a vector of character strings.")) + } + if (!is.null(dim_reorder_params[[pattern_dim]])) { + .warning(paste0("A reorder for the selectors of '", pattern_dim, + "' has been specified, but it is a pattern dimension and the reorder will be ignored.")) + } + if (is.list(dat) || any(sapply(dat, is.list))) { + if (is.null(found_pattern_dim)) { + found_pattern_dim <- pattern_dim + } else { + stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") + } + } + } + if (is.null(found_pattern_dim)) { + .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) + found_pattern_dim <- pattern_dims[1] + } + return(found_pattern_dim) +} + -- GitLab From e07c0a87ae725aa2d8f3588822443cbb0a97a636 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 16 Jul 2021 17:58:34 +0200 Subject: [PATCH 02/10] Modularize Start() in progress --- R/Start.R | 294 +++++++++++++----------------------------------------- R/zzz.R | 194 ++++++++++++++++++++++++++++++++++- 2 files changed, 262 insertions(+), 226 deletions(-) diff --git a/R/Start.R b/R/Start.R index 00e9877..d723ea4 100644 --- a/R/Start.R +++ b/R/Start.R @@ -885,7 +885,7 @@ Start <- function(..., # dim = indices/selectors, # Function found_pattern_dims may change pattern_dims in the .GlobalEnv found_pattern_dim <- found_pattern_dims(pattern_dims, dim_names, var_params, dim_params, dim_reorder_params) - + # Check all *_reorder are NULL or functions, and that they all have # a matching dimension param. i <- 1 @@ -950,17 +950,17 @@ Start <- function(..., # dim = indices/selectors, # Check if pattern_dims is the first item in metadata_dims if ((pattern_dims %in% metadata_dims) & metadata_dims[1] != pattern_dims) { - metadata_dims <- c(metadata_dims[-1], metadata_dims[1]) + metadata_dims <- c(pattern_dims, metadata_dims[-which(metadata_dim == pattern_dims)]) } # Check if metadata_dims has more than 2 elements if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) { - .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - "function. Keep '", metadata_dims[1], "' and '", metadata_dims[2], "' only.")) - metadata_dims <- metadata_dims[1:2] + .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", + "function. 
Keep '", metadata_dims[1], "' and '", metadata_dims[2], "' only.")) + metadata_dims <- metadata_dims[1:2] } else if (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1) { - .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", - "function. Keep '", metadata_dims[1], "' only.")) - metadata_dims <- metadata_dims[1] + .warning(paste0("Parameter 'metadata_dims' has too many elements which serve repetitive ", + "function. Keep '", metadata_dims[1], "' only.")) + metadata_dims <- metadata_dims[1] } # Once the pattern dimension with dataset specifications is found, @@ -972,64 +972,12 @@ Start <- function(..., # dim = indices/selectors, chunks[[found_pattern_dim]]['n_chunks'], found_pattern_dim) dim_params[[found_pattern_dim]] <- dim_params[[found_pattern_dim]][dats_to_take] - dat <- datasets <- dim_params[[found_pattern_dim]] - dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames') - dat_to_fetch <- c() - dat_names <- c() - if (!is.list(dat)) { - dat <- as.list(dat) - } else { - if (!any(sapply(dat, is.list))) { - dat <- list(dat) - } - } - for (i in 1:length(dat)) { - if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) { - if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) { - dat[[i]] <- list(path = dat[[i]]) - } else { - dat[[i]] <- list(name = dat[[i]]) - } - } else if (!is.list(dat[[i]])) { - stop(paste0("Parameter '", pattern_dim, - "' is incorrect. It must be a list of lists or character strings.")) - } - #if (!(all(names(dat[[i]]) %in% dat_info_names))) { - # stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.") - #} - if (!('name' %in% names(dat[[i]]))) { - dat[[i]][['name']] <- paste0('dat', i) - if (!('path' %in% names(dat[[i]]))) { - stop(paste0("Parameter '", found_pattern_dim, - "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided.")) - } - } else if (!('path' %in% names(dat[[i]]))) { - dat_to_fetch <- c(dat_to_fetch, i) - } - #if ('path' %in% names(dat[[i]])) { - # if (!('nc_var_name' %in% names(dat[[i]]))) { - # dat[[i]][['nc_var_name']] <- '$var_name$' - # } - # if (!('suffix' %in% names(dat[[i]]))) { - # dat[[i]][['suffix']] <- '' - # } - # if (!('var_min' %in% names(dat[[i]]))) { - # dat[[i]][['var_min']] <- '' - # } - # if (!('var_max' %in% names(dat[[i]]))) { - # dat[[i]][['var_max']] <- '' - # } - #} - dat_names <- c(dat_names, dat[[i]][['name']]) - } - if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) { - .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.") - } - if (length(dat_to_fetch) > 0) { - stop("Specified only the name for some data sets, but not the path ", - "pattern. This option has not been yet implemented.") - } - + dat <- dim_params[[found_pattern_dim]] + #NOTE: This function creates the object 'dat_names' + dat <- mount_dat(dat, pattern_dim, found_pattern_dim) + + dim_params[[found_pattern_dim]] <- dat_names + # Reorder inner_dims_across_files (to make the keys be the file dimensions, # and the values to be the inner dimensions that go across it). 
if (!is.null(inner_dims_across_files)) { @@ -1127,7 +1075,24 @@ Start <- function(..., # dim = indices/selectors, if (length(var_entries_to_add) > 0) { synonims[names(var_params)[var_entries_to_add]] <- as.list(names(var_params)[var_entries_to_add]) } - + + # Check if return_vars name is inner dim name. If it is synonim, change back to inner dim name + # and return a warning. + use_syn_names <- which(names(return_vars) %in% unlist(synonims) & + !names(return_vars) %in% names(synonims)) + if (!identical(use_syn_names, integer(0))) { + for (use_syn_name in use_syn_names) { + wrong_name <- names(return_vars)[use_syn_name] + names(return_vars)[use_syn_name] <- names(unlist( + lapply(lapply(synonims, '%in%', + names(return_vars)[use_syn_name]), + which))) + .warning(paste0("The name '", wrong_name, "' in parameter 'return_vars' in synonim. ", + "Change it back to the inner dimension name, '", + names(return_vars)[use_syn_name], "'.")) + } + } + # Check selector_checker if (is.null(selector_checker) || !is.function(selector_checker)) { stop("Parameter 'selector_checker' must be a function.") @@ -1242,8 +1207,6 @@ Start <- function(..., # dim = indices/selectors, stop("Parameter 'silent' must be logical.") } - dim_params[[found_pattern_dim]] <- dat_names - if (!silent) { .message(paste0("Exploring files... This will take a variable amount ", "of time depending on the issued request and the ", @@ -1340,60 +1303,34 @@ Start <- function(..., # dim = indices/selectors, stop("All dimensions in 'metadata_dims' must be file dimensions.") } } + + # Add attributes indicating whether this dimension selector is value or indice + tmp <- lapply(dat_selectors[which(dim_names != pattern_dims)], add_value_indices_flag) + dat_selectors <- c(dat_selectors[pattern_dims], tmp) + ## Look for _var params that should be requested automatically. - for (dim_name in dim_names) { - if (!(dim_name %in% pattern_dims)) { - if (is.null(attr(dat_selectors[[dim_name]], 'values')) || - is.null(attr(dat_selectors[[dim_name]], 'indices'))) { - flag <- ((any(dat_selectors[[dim_name]] %in% c('all', 'first', 'last'))) || - (is.numeric(unlist(dat_selectors[[dim_name]])))) - attr(dat_selectors[[dim_name]], 'values') <- !flag - attr(dat_selectors[[dim_name]], 'indices') <- flag - } - ## The following code 'rewrites' var_params for all datasets. If providing different - ## path pattern repositories with different file/inner dimensions, var_params might - ## have to be handled for each dataset separately. - if ((attr(dat_selectors[[dim_name]], 'values') || (dim_name %in% c('var', 'variable'))) && - !(dim_name %in% names(var_params)) && !(dim_name %in% file_dims)) { - if (dim_name %in% c('var', 'variable')) { - var_params <- c(var_params, setNames(list('var_names'), dim_name)) - .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", - dim_name, "_var' requested. ", '"', dim_name, "_var = '", - 'var_names', "'", '"', " has been automatically added to ", - "the Start call.")) - } else { - var_params <- c(var_params, setNames(list(dim_name), dim_name)) - .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", - dim_name, "_var' requested. ", '"', dim_name, "_var = '", - dim_name, "'", '"', " has been automatically added to ", - "the Start call.")) - } + for (dim_name in dim_names[-which(dim_names == pattern_dims)]) { + ## The following code 'rewrites' var_params for all datasets. 
If providing different + ## path pattern repositories with different file/inner dimensions, var_params might + ## have to be handled for each dataset separately. + if ((attr(dat_selectors[[dim_name]], 'values') || (dim_name %in% c('var', 'variable'))) && + !(dim_name %in% names(var_params)) && !(dim_name %in% file_dims)) { + if (dim_name %in% c('var', 'variable')) { + var_params <- c(var_params, setNames(list('var_names'), dim_name)) + .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", + dim_name, "_var' requested. ", '"', dim_name, "_var = '", + 'var_names', "'", '"', " has been automatically added to ", + "the Start call.")) + } else { + var_params <- c(var_params, setNames(list(dim_name), dim_name)) + .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", + dim_name, "_var' requested. ", '"', dim_name, "_var = '", + dim_name, "'", '"', " has been automatically added to ", + "the Start call.")) } } } -#================================================== - # Check if return_vars name is inner dim name. If it is synonim, change back to - # inner dim name and return a warning. - dim_params_found_file_dims <- dim_params[found_file_dims[[i]]] - if (any(names(return_vars) %in% unlist(synonims) & - !names(return_vars) %in% names(synonims))) { - use_syn_names <- which(names(return_vars) %in% unlist(synonims) & - !names(return_vars) %in% names(synonims)) - for (use_syn_name in use_syn_names) { - wrong_name <- names(return_vars)[use_syn_name] - names(return_vars)[use_syn_name] <- names(unlist( - lapply(lapply(synonims, '%in%', - names(return_vars)[use_syn_name]), - which))) - .warning(paste0("The name '", wrong_name, - "' in parameter 'return_vars' in synonim. ", - "Change it back to the inner dimension name, '", - names(return_vars)[use_syn_name], "'.")) - } - } -#===================================================== - ## (Check the *_var parameters). if (any(!(unlist(var_params) %in% names(return_vars)))) { vars_to_add <- which(!(unlist(var_params) %in% names(return_vars))) @@ -1406,15 +1343,17 @@ Start <- function(..., # dim = indices/selectors, paste(paste0("'", unlist(var_params), "'"), collapse = ', '))) } + # Examine the selectors of file dim and create 'replace_values', which uses the first + # explicit selector (i.e., character) for all file dimensions. replace_values <- vector('list', length = length(file_dims)) names(replace_values) <- file_dims - # Take the first selector for all possible file dimensions for (file_dim in file_dims) { if (file_dim %in% names(var_params)) { .warning(paste0("The '", file_dim, "_var' param will be ignored since '", file_dim, "' is a file dimension (for the dataset with pattern ", dat[[i]][['path']], ").")) } + # If the selector is a vector or a list of 2 without names (represent the value range) if (!is.list(dat_selectors[[file_dim]]) || (is.list(dat_selectors[[file_dim]]) && length(dat_selectors[[file_dim]]) == 2 && @@ -1446,15 +1385,18 @@ Start <- function(..., # dim = indices/selectors, } } sv <- dat_selectors[[file_dim]][[1]] + # 'replace_values' has the first selector (if it's character) or NULL (if it's not explicitly + # defined) for each file dimension. if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) { - replace_values[[file_dim]] <- dat_selectors[[file_dim]][[1]][1] + replace_values[[file_dim]] <- sv[1] } } #print("C") # Now we know which dimensions whose selectors are provided non-explicitly. 
undefined_file_dims <- file_dims[which(sapply(replace_values, is.null))] defined_file_dims <- file_dims[which(!(file_dims %in% undefined_file_dims))] - # Quickly check if the depending dimensions are provided properly. + # Quickly check if the depending dimensions are provided properly. The check is only for + # if the depending and depended file dims are both explicited defined. for (file_dim in file_dims) { if (file_dim %in% names(depending_file_dims)) { ## TODO: Detect multi-dependancies and forbid. @@ -1540,112 +1482,13 @@ Start <- function(..., # dim = indices/selectors, dataset_has_files[i] <- TRUE ## TODO: Improve message here if no variable found: if (length(undefined_file_dims) > 0) { - # Looking for the first values, parsed from first_file. - first_values <- vector('list', length = length(undefined_file_dims)) - names(first_values) <- undefined_file_dims - found_values <- 0 - stop <- FALSE - try_dim <- 1 - last_success <- 1 - while ((found_values < length(undefined_file_dims)) && !stop) { - u_file_dim <- undefined_file_dims[try_dim] - if (is.null(first_values[[u_file_dim]])) { - path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], - replace_values[-which(file_dims == u_file_dim)], - allow_undefined_key_vars = TRUE) - found_value <- .FindTagValue(path_with_globs_and_tag, - first_file, u_file_dim) - if (!is.null(found_value)) { - found_values <- found_values + 1 - last_success <- try_dim - first_values[[u_file_dim]] <- found_value - replace_values[[u_file_dim]] <- found_value - } - } - try_dim <- (try_dim %% length(undefined_file_dims)) + 1 - if (try_dim == last_success) { - stop <- TRUE - } - } - if (found_values < length(undefined_file_dims)) { - stop(paste0("Path pattern of dataset '", dat[[i]][['name']], - "' is too complex. Could not automatically ", - "detect values for all non-explicitly defined ", - "indices. Check its pattern: ", dat[[i]][['path']])) - } - ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above - ## Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical - ## cmor case, requesting all members and chunks for fixed var and sdate. Not fixing - ## sdate raised 'too complex' error. - # Replace shell globs in path pattern and keep the file_dims as tags - dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, - file_dims, dat[[i]][['name']], path_glob_permissive) - # Now time to look for the available values for the non - # explicitly defined selectors for the file dimensions. - #print("H") - # Check first the ones that do not depend on others. - ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))], - undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))]) - - for (u_file_dim in ufd) { - replace_values[undefined_file_dims] <- first_values - replace_values[[u_file_dim]] <- '*' - depended_dim <- NULL - depended_dim_values <- NA - -#NOTE: Here 'selectors' is always 1. Is it supposed to be like this? 
- selectors <- dat_selectors[[u_file_dim]][[1]] - if (u_file_dim %in% names(depending_file_dims)) { - depended_dim <- depending_file_dims[[u_file_dim]] - depended_dim_values <- dat_selectors[[depended_dim]][[1]] - dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values)) - names(dat_selectors[[u_file_dim]]) <- depended_dim_values - } else { - dat_selectors[[u_file_dim]] <- list() - } - if (u_file_dim %in% unlist(depending_file_dims)) { - depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))] - replace_values[depending_dims] <- rep('*', length(depending_dims)) - } - for (j in 1:length(depended_dim_values)) { - parsed_values <- c() - if (!is.null(depended_dim)) { - replace_values[[depended_dim]] <- depended_dim_values[j] - } - path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) - found_files <- Sys.glob(path_with_globs) - ## TODO: Enhance this error message, or change by warning. - ## Raises if a wrong sdate is specified, for example. - if (length(found_files) == 0) { - .warning(paste0("Could not find files for any '", u_file_dim, - "' for '", depended_dim, "' = '", - depended_dim_values[j], "'.")) - dat_selectors[[u_file_dim]][[j]] <- NA - } else { - for (found_file in found_files) { - path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], - replace_values[-which(file_dims == u_file_dim)], - allow_undefined_key_vars = TRUE) - parsed_values <- c(parsed_values, - .FindTagValue(path_with_globs_and_tag, found_file, - u_file_dim)) - } - #TODO: selector_checker() doesn't allow selectors to be characters. For selectors - # like "member = 'r7i1p1f1", it cannot be defined with values. - dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors, - var = unique(parsed_values), - return_indices = FALSE) - # Take chunk if needed - dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][chunk_indices(length(dat_selectors[[u_file_dim]][[j]]), - chunks[[u_file_dim]]['chunk'], - chunks[[u_file_dim]]['n_chunks'], - u_file_dim)] - } - } - } + # Note: "dat[[i]][['path']]" is changed by the function below. + dat_selectors <- find_ufd_value(undefined_file_dims, dat, i, replace_values, + first_file, file_dims, path_glob_permissive, + depending_file_dims, dat_selectors, selector_checker, + chunks) #print("I") } else { - #NOTE: If there is no non-explicitly defined dim, use the first found file # to modify. Problem: '*' doesn't catch all the possible file. Only use # the first file. 
@@ -1654,6 +1497,7 @@ Start <- function(..., # dim = indices/selectors, } } } +#NEW # Now fetch for the first available file if (dataset_has_files[i]) { known_dims <- file_dims diff --git a/R/zzz.R b/R/zzz.R index 7dda044..9d607d2 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -204,7 +204,8 @@ found_pattern_dims <- function(pattern_dims, dim_names, var_params, found_pattern_dim <- NULL for (pattern_dim in pattern_dims) { # Check all specifications in pattern_dim are valid - dat <- datasets <- dim_params[[pattern_dim]] +# dat <- datasets <- dim_params[[pattern_dim]] + dat <- dim_params[[pattern_dim]] if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) { stop(paste0("Parameter '", pattern_dim, "' must be a list of lists with pattern specifications or a vector of character strings.")) @@ -228,3 +229,194 @@ found_pattern_dims <- function(pattern_dims, dim_names, var_params, return(found_pattern_dim) } + +# The variable 'dat' is mounted with the information (name, path) of each dataset. +# NOTE: This function creates the object 'dat_names' in the parent env. +mount_dat <- function(dat, pattern_dim, found_pattern_dim) { + +# dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames') + dat_to_fetch <- c() + dat_names <- c() + if (!is.list(dat)) { + dat <- as.list(dat) + } else { + if (!any(sapply(dat, is.list))) { + dat <- list(dat) + } + } + for (i in 1:length(dat)) { + if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) { + if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) { + dat[[i]] <- list(path = dat[[i]]) + } else { + dat[[i]] <- list(name = dat[[i]]) + } + } else if (!is.list(dat[[i]])) { + stop(paste0("Parameter '", pattern_dim, + "' is incorrect. It must be a list of lists or character strings.")) + } + #if (!(all(names(dat[[i]]) %in% dat_info_names))) { + # stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.") + #} + if (!('name' %in% names(dat[[i]]))) { + dat[[i]][['name']] <- paste0('dat', i) + if (!('path' %in% names(dat[[i]]))) { + stop(paste0("Parameter '", found_pattern_dim, + "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided.")) + } + } else if (!('path' %in% names(dat[[i]]))) { + dat_to_fetch <- c(dat_to_fetch, i) + } + #if ('path' %in% names(dat[[i]])) { + # if (!('nc_var_name' %in% names(dat[[i]]))) { + # dat[[i]][['nc_var_name']] <- '$var_name$' + # } + # if (!('suffix' %in% names(dat[[i]]))) { + # dat[[i]][['suffix']] <- '' + # } + # if (!('var_min' %in% names(dat[[i]]))) { + # dat[[i]][['var_min']] <- '' + # } + # if (!('var_max' %in% names(dat[[i]]))) { + # dat[[i]][['var_max']] <- '' + # } + #} + dat_names <- c(dat_names, dat[[i]][['name']]) + } + if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) { + .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.") + } + if (length(dat_to_fetch) > 0) { + stop("Specified only the name for some data sets, but not the path ", + "pattern. 
This option has not been yet implemented.") + } + + assign('dat_names', dat_names, envir = parent.frame()) + return(dat) +} + +# Add attributes indicating whether this dimension selector is value or indice +add_value_indices_flag <- function(x) { + if (is.null(attr(x, 'values')) || is.null(attr(x, 'indices'))) { + flag <- (any(x %in% c('all', 'first', 'last')) || is.numeric(unlist(x))) + attr(x, 'values') <- !flag + attr(x, 'indices') <- flag + } + return(x) +} + + +# Find the value for the undefined selector (i.e., indices()). Use the value from the first +# found file. +# Note that "dat[[i]][['path']]" in parent env. is changed in this function. +find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, + first_file, file_dims, path_glob_permissive, + depending_file_dims, dat_selectors, selector_checker, chunks) { + first_values <- vector('list', length = length(undefined_file_dims)) + names(first_values) <- undefined_file_dims + found_values <- 0 + stop <- FALSE + try_dim <- 1 + last_success <- 1 + while ((found_values < length(undefined_file_dims)) && !stop) { + u_file_dim <- undefined_file_dims[try_dim] + if (is.null(first_values[[u_file_dim]])) { + path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], + replace_values[-which(file_dims == u_file_dim)], + allow_undefined_key_vars = TRUE) + found_value <- .FindTagValue(path_with_globs_and_tag, + first_file, u_file_dim) + if (!is.null(found_value)) { + found_values <- found_values + 1 + last_success <- try_dim + first_values[[u_file_dim]] <- found_value + replace_values[[u_file_dim]] <- found_value + } + } + try_dim <- (try_dim %% length(undefined_file_dims)) + 1 + if (try_dim == last_success) { + stop <- TRUE + } + } + if (found_values < length(undefined_file_dims)) { + stop(paste0("Path pattern of dataset '", dat[[i]][['name']], + "' is too complex. Could not automatically ", + "detect values for all non-explicitly defined ", + "indices. Check its pattern: ", dat[[i]][['path']])) + } + ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above + ## Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical + ## cmor case, requesting all members and chunks for fixed var and sdate. Not fixing + ## sdate raised 'too complex' error. + # Replace shell globs in path pattern and keep the file_dims as tags + dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, + file_dims, dat[[i]][['name']], path_glob_permissive) + + # Now time to look for the available values for the non + # explicitly defined selectors for the file dimensions. + #print("H") + # Check first the ones that do not depend on others. + ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))], + undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))]) + + for (u_file_dim in ufd) { + replace_values[undefined_file_dims] <- first_values + replace_values[[u_file_dim]] <- '*' + depended_dim <- NULL + depended_dim_values <- NA + + #NOTE: Here 'selectors' is always 1. Is it supposed to be like this? 
+ selectors <- dat_selectors[[u_file_dim]][[1]] + if (u_file_dim %in% names(depending_file_dims)) { + depended_dim <- depending_file_dims[[u_file_dim]] + depended_dim_values <- dat_selectors[[depended_dim]][[1]] + dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values)) + names(dat_selectors[[u_file_dim]]) <- depended_dim_values + } else { + dat_selectors[[u_file_dim]] <- list() + } + if (u_file_dim %in% unlist(depending_file_dims)) { + depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))] + replace_values[depending_dims] <- rep('*', length(depending_dims)) + } + for (j in 1:length(depended_dim_values)) { + parsed_values <- c() + if (!is.null(depended_dim)) { + replace_values[[depended_dim]] <- depended_dim_values[j] + } + path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + found_files <- Sys.glob(path_with_globs) + ## TODO: Enhance this error message, or change by warning. + ## Raises if a wrong sdate is specified, for example. + if (length(found_files) == 0) { + .warning(paste0("Could not find files for any '", u_file_dim, + "' for '", depended_dim, "' = '", + depended_dim_values[j], "'.")) + dat_selectors[[u_file_dim]][[j]] <- NA + } else { + for (found_file in found_files) { + path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], + replace_values[-which(file_dims == u_file_dim)], + allow_undefined_key_vars = TRUE) + parsed_values <- c(parsed_values, + .FindTagValue(path_with_globs_and_tag, found_file, + u_file_dim)) + } + #TODO: selector_checker() doesn't allow selectors to be characters. For selectors + # like "member = 'r7i1p1f1", it cannot be defined with values. + dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors, + var = unique(parsed_values), + return_indices = FALSE) + # Take chunk if needed + dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][chunk_indices(length(dat_selectors[[u_file_dim]][[j]]), + chunks[[u_file_dim]]['chunk'], + chunks[[u_file_dim]]['n_chunks'], + u_file_dim)] + } + } + } + #NOTE: change 'dat' in parent env. because "dat[[i]][['path']]" is changed. + assign('dat', dat, envir = parent.frame()) + return(dat_selectors) +} + -- GitLab From e2ea2a47d1b424a5be8a01a6ad0531dd919b572b Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 2 Aug 2021 09:43:31 +0200 Subject: [PATCH 03/10] Modify unit test for the change of netCDF file timestamps --- .../testthat/test-Start-implicit_inner_dim.R | 2 +- .../test-Start-path_glob_permissive.R | 90 +++++++++++-------- tests/testthat/test-Start-split-merge.R | 2 +- 3 files changed, 57 insertions(+), 37 deletions(-) diff --git a/tests/testthat/test-Start-implicit_inner_dim.R b/tests/testthat/test-Start-implicit_inner_dim.R index 26e3ce7..6a3262a 100644 --- a/tests/testthat/test-Start-implicit_inner_dim.R +++ b/tests/testthat/test-Start-implicit_inner_dim.R @@ -37,7 +37,7 @@ c(file_date = 2, time = 1) ) expect_equal( attr(obs, 'Variables')$common$time[1, 1], -as.POSIXct('2013-11-15 23:30:00', tz = 'UTC') +as.POSIXct('2013-11-15', tz = 'UTC') ) }) diff --git a/tests/testthat/test-Start-path_glob_permissive.R b/tests/testthat/test-Start-path_glob_permissive.R index f298bd4..ddd69be 100644 --- a/tests/testthat/test-Start-path_glob_permissive.R +++ b/tests/testthat/test-Start-path_glob_permissive.R @@ -96,45 +96,65 @@ test_that("2. tag at the end", { # Without the layer that path_glob_permissive allows to contain *, the last item in the path is tag. 
In the example below, the path without path_glob_permissive layer is # "/esarchive/oper/S2S4E-data/weekly_statistics/S2S/$var$/$sdate$/". The last item is "$sdate$" -sdates.seq.thu <- format(seq(as.Date(paste(2020, 06, 11, sep = '-')), as.Date(paste(2020, 09, 17, sep = '-')), - by = 'weeks'), format='%Y%m%d') -path <- "/esarchive/oper/S2S4E-data/weekly_statistics/S2S/$var$/$sdate$/$var$_$sdate$_*.nc" -suppressWarnings( -exp <- Start(dat = path, - var = "tas", - sdate = sdates.seq.thu, - time = 'all', - ensemble = "all", - latitude = indices(1:2), - longitude = indices(1:2), - path_glob_permissive = 1, - retrieve = F) -) - asd <- as.list(attr(exp, 'ExpectedFiles')) - qwe <- sapply(sapply(asd, strsplit, '/'), '[[', 9) - files <- paste0('tas_', sdates.seq.thu, '_', 24:38, '.nc') - expect_equal( - qwe, files - ) +#!!!!!!!!!!This data has been removed!!!!!!!!!!!!! +#sdates.seq.thu <- format(seq(as.Date(paste(2020, 06, 11, sep = '-')), as.Date(paste(2020, 09, 17, sep = '-')), +# by = 'weeks'), format='%Y%m%d') +#path <- "/esarchive/oper/S2S4E-data/weekly_statistics/S2S/$var$/$sdate$/$var$_$sdate$_*.nc" +#suppressWarnings( +#exp <- Start(dat = path, +# var = "tas", +# sdate = sdates.seq.thu, +# time = 'all', +# ensemble = "all", +# latitude = indices(1:2), +# longitude = indices(1:2), +# path_glob_permissive = 1, +# retrieve = F) +#) +# asd <- as.list(attr(exp, 'ExpectedFiles')) +# qwe <- sapply(sapply(asd, strsplit, '/'), '[[', 9) +# files <- paste0('tas_', sdates.seq.thu, '_', 24:38, '.nc') +# expect_equal( +# qwe, files +# ) +# +#suppressWarnings( +#exp <- Start(dat = path, +# var = "tas", +# sdate = sdates.seq.thu, +# time = 'all', +# ensemble = "all", +# latitude = indices(1:2), +# longitude = indices(1:2), +# path_glob_permissive = FALSE, +# retrieve = F) +#) +# asd <- as.list(attr(exp, 'ExpectedFiles')) +# qwe <- sapply(sapply(asd, strsplit, '/'), '[[', 9) +# files <- paste0('tas_', sdates.seq.thu, '_', 24, '.nc') +# expect_equal( +# qwe, files +# ) + +path <- "/esarchive/exp/ecmwf/system4_m1/6hourly/$var$/$var$_$year$0*.nc" suppressWarnings( -exp <- Start(dat = path, - var = "tas", - sdate = sdates.seq.thu, - time = 'all', - ensemble = "all", - latitude = indices(1:2), - longitude = indices(1:2), - path_glob_permissive = FALSE, - retrieve = F) +data <- Start(dat = path, + var = "tas", + year = c('1994', '1995'), + time = indices(1:2), + ensemble = indices(1), + latitude = indices(1:2), + longitude = indices(1:2), + path_glob_permissive = 1, + retrieve = F) ) - asd <- as.list(attr(exp, 'ExpectedFiles')) - qwe <- sapply(sapply(asd, strsplit, '/'), '[[', 9) - files <- paste0('tas_', sdates.seq.thu, '_', 24, '.nc') - expect_equal( - qwe, files - ) +expect_equal( +as.list(attr(data, 'ExpectedFiles')), +list("/esarchive/exp/ecmwf/system4_m1/6hourly/tas/tas_19940501.nc", + "/esarchive/exp/ecmwf/system4_m1/6hourly/tas/tas_19950101.nc") +) }) diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R index 9376f9a..fe686dc 100644 --- a/tests/testthat/test-Start-split-merge.R +++ b/tests/testthat/test-Start-split-merge.R @@ -180,7 +180,7 @@ c(file_date = 4, time = 1) ) expect_equal( attr(obs, 'Variables')$common$time[1, 1], -as.POSIXct('2013-11-15 23:30:00', tz = 'UTC') +as.POSIXct('2013-11-15', tz = 'UTC') ) -- GitLab From f414d54c6396ca361b3fcfc6af46481f1639ecc7 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 4 Aug 2021 12:04:37 +0200 Subject: [PATCH 04/10] ex1_8 has bugs --- inst/doc/usecase/ex1_8_tasandtos.R | 5 +- tests/testthat/test-Start-two_dats.R | 89 
++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/test-Start-two_dats.R diff --git a/inst/doc/usecase/ex1_8_tasandtos.R b/inst/doc/usecase/ex1_8_tasandtos.R index 38fdf95..a384368 100644 --- a/inst/doc/usecase/ex1_8_tasandtos.R +++ b/inst/doc/usecase/ex1_8_tasandtos.R @@ -19,7 +19,7 @@ library(startR) paths = list(list(path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc'), list(path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/Omon/$var$/gr/v20190713/$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc')) data1 <- Start(dataset = paths, - var = c('tas','tos'), + var = c('tas', 'tos'), sdate = paste0(1960:1962), fmonth = 1, lat = values(list(0, 10)), @@ -29,9 +29,10 @@ data1 <- Start(dataset = paths, fyear_depends = 'sdate', fmonth_across = 'fyear', merge_across_dims = TRUE, - synonims = list(fmonth = c('fmonth','time'), + synonims = list(fmonth = c('fmonth', 'time'), lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), + metadata_dims = 'var', return_vars = list(lat = 'dataset', lon = 'dataset'), num_procs = 1, retrieve = TRUE) diff --git a/tests/testthat/test-Start-two_dats.R b/tests/testthat/test-Start-two_dats.R new file mode 100644 index 0000000..74738d0 --- /dev/null +++ b/tests/testthat/test-Start-two_dats.R @@ -0,0 +1,89 @@ +# ex1_8 +context("Start() two dats in one call") + +test_that("1. ex1_8", { + +path_tas <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/', + '$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') +path_tos <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/Omon/$var$/gr/v20190713/', + '$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') +suppressWarnings( +data <- Start(dataset = list(list(path = path_tas), list(path = path_tos)), + var = c('tas', 'tos'), + sdate = paste0(1960:1962), + fmonth = 1, + lat = values(list(8, 10)), + lon = values(list(8, 10)), + fyear = 'all', + member = indices(1), + fyear_depends = 'sdate', + fmonth_across = 'fyear', + merge_across_dims = TRUE, + synonims = list(fmonth = c('fmonth', 'time'), + lon = c('lon', 'longitude'), + lat = c('lat', 'latitude')), + metadata_dims = 'var', + return_vars = list(lat = 'dataset', lon = 'dataset'), + retrieve = TRUE) +) + +expect_equal( +dim(data), +c(dataset = 2, var = 2, sdate = 3, fmonth = 1, lat = 3, lon = 3, member = 1) +) +expect_equal( +sum(data, na.rm = T), +15396.7, +tolerance = 0.0001 +) +expect_equal( +length(data[is.na(data)]), +54 +) +expect_equal( +sum(is.na(data[1,1,,,,,])), +0 +) +# Amon also has tos +expect_equal( +sum(is.na(data[1,2,,,,,])), +0 +) +expect_equal( +sum(is.na(data[2,1,,,,,])), +27 +) +# WRONG!!!! Omon should have tos. 
The value should be 0 +expect_equal( +sum(is.na(data[2,2,,,,,])), +27 +) + +expect_equal( +names(attr(data, 'Variables')), +c("common", "dat1", "dat2") +) +expect_equal( +names(attr(data, 'Variables')$common), +c("fmonth", "tas", "tos") +) +expect_equal( +names(attr(data, 'Variables')$dat1), +c("lat", "lon") +) +expect_equal( +names(attr(data, 'Variables')$dat2), +c("lat", "lon") +) +expect_equal( +length(attr(data, 'Variables')$common$tas), +17 +) +expect_equal( +length(attr(data, 'Variables')$common$tos), +16 +) + +}) -- GitLab From a87d15793e41e2585366363eed7e56e7401a0a58 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 4 Aug 2021 12:05:07 +0200 Subject: [PATCH 05/10] Finish the first stage of modularization --- R/Start.R | 969 +++++++++++------------------------------------------- R/zzz.R | 752 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 944 insertions(+), 777 deletions(-) diff --git a/R/Start.R b/R/Start.R index d723ea4..6f602a5 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1428,6 +1428,7 @@ Start <- function(..., # dim = indices/selectors, } } } + # Find the possible values for the selectors that are provided as # indices. If the requested file is on server, impossible operation. if (length(grep("^http", dat[[i]][['path']])) > 0) { @@ -1497,7 +1498,8 @@ Start <- function(..., # dim = indices/selectors, } } } -#NEW + dat[[i]][['selectors']] <- dat_selectors + # Now fetch for the first available file if (dataset_has_files[i]) { known_dims <- file_dims @@ -1514,24 +1516,33 @@ Start <- function(..., # dim = indices/selectors, sub_array_of_not_found_files <- array(!dataset_has_files[i], dim = files_to_load) names(dim(sub_array_of_not_found_files)) <- known_dims - j <- 1 - if (!exists('selector_indices_save')) { - selector_indices_save <- vector('list', length = length(dat)) - } - if (!exists('selectors_total_list')) { - selectors_total_list <- vector('list', length = length(dat)) + + if (largest_dims_length) { + if (!exists('selector_indices_save')) { + selector_indices_save <- vector('list', length = length(dat)) + } + if (!exists('selectors_total_list')) { + selectors_total_list <- vector('list', length = length(dat)) + } + selector_indices_save[[i]] <- vector('list', length = prod(files_to_load)) + selectors_total_list[[i]] <- vector('list', length = prod(files_to_load)) } - selector_indices_save[[i]] <- vector('list', length = prod(files_to_load)) - selectors_total_list[[i]] <- vector('list', length = prod(files_to_load)) + j <- 1 + # NOTE: This while loop has these objects that are used afterward: 'sub_array_of_files_to_load', + # 'sub_array_of_not_found_files', 'indices_of_first_files_with_data', 'selectors_of_first_files_with_data'; + # 'selector_indices_save' and 'selectors_total_list' are used if 'largest_dims_length = T'. 
while (j <= prod(files_to_load)) { selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] names(selector_indices) <- known_dims - tmp <- selector_indices - tmp[which(known_dims == found_pattern_dim)] <- i - selector_indices_save[[i]][[j]] <- tmp + if (largest_dims_length) { + tmp <- selector_indices + tmp[which(known_dims == found_pattern_dim)] <- i + selector_indices_save[[i]][[j]] <- tmp + } + # This 'selectors' is only used in this while loop selectors <- sapply(1:length(known_dims), function (x) { vector_to_pick <- 1 @@ -1541,9 +1552,13 @@ Start <- function(..., # dim = indices/selectors, dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] }) names(selectors) <- known_dims - selectors_total_list[[i]][[j]] <- selectors - names(selectors_total_list[[i]][[j]]) <- known_dims + if (largest_dims_length) { + selectors_total_list[[i]][[j]] <- selectors + names(selectors_total_list[[i]][[j]]) <- known_dims + } + + # 'replace_values' and 'file_path' are only used in this while loop replace_values[known_dims] <- selectors if (!dataset_has_files[i]) { if (any(is.na(selectors))) { @@ -1561,13 +1576,14 @@ Start <- function(..., # dim = indices/selectors, } else { file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) -#NOTE: After replacing tags, there is still * if path_glob_permissive is not FALSE. + #NOTE: After replacing tags, there is still * if path_glob_permissive is not FALSE. + # Find the possible value to substitute *. if (grepl('\\*', file_path)) { found_files <- Sys.glob(file_path) file_path <- found_files[1] # choose only the first file. -#NOTE: Above line chooses only the first found file. Because * is not tags, which means -# it is not a dimension. So it cannot store more than one item. If use * to define -# the path, that * should only represent one possibility. + #NOTE: Above line chooses only the first found file. Because * is not tags, which means + # it is not a dimension. So it cannot store more than one item. If use * to define + # the path, that * should only represent one possibility. if (length(found_files) > 1) { .warning("Using glob expression * to define the path, but more ", "than one match is found. Choose the first match only.") @@ -1616,7 +1632,6 @@ Start <- function(..., # dim = indices/selectors, array_of_not_found_files <- .MergeArrays(array_of_not_found_files, sub_array_of_not_found_files, along = found_pattern_dim) } - dat[[i]][['selectors']] <- dat_selectors } if (all(sapply(indices_of_first_files_with_data, is.null))) { stop("No data files found for any of the specified datasets.") @@ -1640,6 +1655,7 @@ Start <- function(..., # dim = indices/selectors, # } #////////////////////////////////////////////// + # Separate 'return_vars' into 'common_return_vars' and 'return_vars' (those = 'dat'). common_return_vars <- NULL common_first_found_file <- NULL common_return_vars_pos <- NULL @@ -1652,6 +1668,9 @@ Start <- function(..., # dim = indices/selectors, common_first_found_file <- rep(FALSE, length(which(sapply(common_return_vars, length) == 0))) names(common_first_found_file) <- names(common_return_vars[which(sapply(common_return_vars, length) == 0)]) } + +#!!!!!!!Check here. return_vars has removed the common ones, and here remove 'dat' value???? 
+#It seems like it does some benefits to later parts return_vars <- lapply(return_vars, function(x) { if (found_pattern_dim %in% x) { @@ -1660,7 +1679,6 @@ Start <- function(..., # dim = indices/selectors, x } }) - #//////////////////////////////////////////// # Force return_vars = (time = NULL) to (time = 'sdate') if (1) selector = [sdate = 2, time = 4] or # (2) time_across = 'sdate'. @@ -1672,38 +1690,17 @@ Start <- function(..., # dim = indices/selectors, file_dim_as_selector_array_dim <- 1 if (any(found_file_dims[[i]] %in% names(dim(selector_array)))) { - file_dim_as_selector_array_dim <- found_file_dims[[i]][which(found_file_dims[[i]] %in% names(dim(selector_array)))] + file_dim_as_selector_array_dim <- + found_file_dims[[i]][which(found_file_dims[[i]] %in% names(dim(selector_array)))] } - - if (inner_dim %in% inner_dims_across_files | is.character(file_dim_as_selector_array_dim)) { #(2) or (1) + if (inner_dim %in% inner_dims_across_files | + is.character(file_dim_as_selector_array_dim)) { #(2) or (1) # inner_dim is not in return_vars or is NULL if (((!inner_dim %in% names(common_return_vars)) & (!inner_dim %in% names(return_vars))) | - (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]])) ) { - if (is.character(file_dim_as_selector_array_dim)) { #(1) - if (file_dim_as_selector_array_dim %in% found_pattern_dim) { - stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '", - found_pattern_dim, "', which is not allowed. To assign the dependency on the pattern dim, ", - "use 'return_vars = list(", inner_dim, " = 'dat')' instead.")) - } else { - common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim - tmp <- file_dim_as_selector_array_dim - } - tmp <- file_dim_as_selector_array_dim - } else if (inner_dim %in% inner_dims_across_files) { #(2) - file_dim_name <- names(which(inner_dim == inner_dims_across_files)) - if (file_dim_name %in% found_pattern_dim) { - stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '", - found_pattern_dim, "', which is not allowed.")) - } else { - common_return_vars[[inner_dim]] <- file_dim_name - tmp <- file_dim_name - } - tmp <- file_dim_name - } - .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp, - "', but '", inner_dim, "' is not in return_vars list or is NULL. ", - "To provide the correct metadata, the value of ", inner_dim, - " in 'return_vars' is specified as '", tmp, "'.")) + (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]]))) { + common_return_vars[[inner_dim]] <- correct_return_vars( + inner_dim, inner_dims_across_files, + found_pattern_dim, file_dim_as_selector_array_dim) } } } @@ -1719,6 +1716,7 @@ Start <- function(..., # dim = indices/selectors, } #//////////////////////////////////////////// + # Create 'picked_common_vars' if (length(common_return_vars) > 0) { picked_common_vars <- vector('list', length = length(common_return_vars)) names(picked_common_vars) <- names(common_return_vars) @@ -1727,29 +1725,34 @@ Start <- function(..., # dim = indices/selectors, } picked_common_vars_ordered <- picked_common_vars picked_common_vars_unorder_indices <- picked_common_vars + + # Create 'picked_vars' picked_vars <- vector('list', length = length(dat)) names(picked_vars) <- dat_names picked_vars_ordered <- picked_vars picked_vars_unorder_indices <- picked_vars + for (i in 1:length(dat)) { if (dataset_has_files[i]) { # Put all selectors in a list of a single list/vector of selectors. 
# The dimensions that go across files will later be extended to have # lists of lists/vectors of selectors. for (inner_dim in expected_inner_dims[[i]]) { - if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || - (is.list(dat[[i]][['selectors']][[inner_dim]]) && + if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || # not list, or + (is.list(dat[[i]][['selectors']][[inner_dim]]) && # list of 2 that represents range length(dat[[i]][['selectors']][[inner_dim]]) == 2 && is.null(names(dat[[i]][['selectors']][[inner_dim]])))) { dat[[i]][['selectors']][[inner_dim]] <- list(dat[[i]][['selectors']][[inner_dim]]) } } + if (length(return_vars) > 0) { picked_vars[[i]] <- vector('list', length = length(return_vars)) names(picked_vars[[i]]) <- names(return_vars) picked_vars_ordered[[i]] <- picked_vars[[i]] picked_vars_unorder_indices[[i]] <- picked_vars[[i]] } + indices_of_first_file <- as.list(indices_of_first_files_with_data[[i]]) array_file_dims <- sapply(dat[[i]][['selectors']][found_file_dims[[i]]], function(x) length(x[[1]])) names(array_file_dims) <- found_file_dims[[i]] @@ -1759,35 +1762,30 @@ Start <- function(..., # dim = indices/selectors, array_of_var_files <- do.call('[', c(list(x = array_of_files_to_load), indices_of_first_file, list(drop = FALSE))) array_of_var_indices <- array(1:length(array_of_var_files), dim = dim(array_of_var_files)) array_of_not_found_var_files <- do.call('[', c(list(x = array_of_not_found_files), indices_of_first_file, list(drop = FALSE))) + # Create previous_indices. The initial value is -1 because there is no 'previous' before the + # 1st current_indices. previous_indices <- rep(-1, length(indices_of_first_file)) names(previous_indices) <- names(indices_of_first_file) + # Create first_found_file for vars_to_read defining. It is for the dim value in return_vars + # that is NULL or character(0). Because these dims only need to be read once, so + # first_found_file indicates if these dims have been read or not. + # If read, it turns to TRUE and won't be included in vars_to_read again in the next + # 'for j loop'. 
first_found_file <- NULL if (length(return_vars) > 0) { first_found_file <- rep(FALSE, length(which(sapply(return_vars, length) == 0))) names(first_found_file) <- names(return_vars[which(sapply(return_vars, length) == 0)]) } + for (j in 1:length(array_of_var_files)) { current_indices <- which(array_of_var_indices == j, arr.ind = TRUE)[1, ] names(current_indices) <- names(indices_of_first_file) if (!is.na(array_of_var_files[j]) && !array_of_not_found_var_files[j]) { changed_dims <- which(current_indices != previous_indices) - vars_to_read <- NULL - if (length(return_vars) > 0) { - vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))] - } - if (!is.null(first_found_file)) { - if (any(!first_found_file)) { - vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)])) - } - } - if ((i == 1) && (length(common_return_vars) > 0)) { - vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))]) - } - if (!is.null(common_first_found_file)) { - if (any(!common_first_found_file)) { - vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)])) - } - } + # Prepare vars_to_read for this dataset (i loop) and this file (j loop) + vars_to_read <- generate_vars_to_read(return_vars, changed_dims, first_found_file, + common_return_vars, common_first_found_file, i) + file_object <- file_opener(array_of_var_files[j]) if (!is.null(file_object)) { for (var_to_read in vars_to_read) { @@ -1800,15 +1798,7 @@ Start <- function(..., # dim = indices/selectors, synonims) # file_dim_reader returns dimension names as found in the file. # Need to translate accoridng to synonims: - names(var_dims) <- sapply(names(var_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) + names(var_dims) <- replace_with_synonmins(var_dims, synonims) if (!is.null(var_dims)) { var_file_dims <- NULL if (var_to_read %in% names(common_return_vars)) { @@ -1829,13 +1819,11 @@ Start <- function(..., # dim = indices/selectors, "file dimension which also appears in the dimensions of ", "the variable inside the file.\n", array_of_var_files[j]) } - special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, - 'Date' = as.Date) first_sample <- file_var_reader(NULL, file_object, NULL, var_to_read, synonims) - if (any(class(first_sample) %in% names(special_types))) { + if (any(class(first_sample) %in% names(time_special_types()))) { array_size <- prod(c(var_file_dims, var_dims)) - new_array <- rep(special_types[[class(first_sample)[1]]](NA), array_size) + new_array <- rep(time_special_types()[[class(first_sample)[1]]](NA), array_size) dim(new_array) <- c(var_file_dims, var_dims) } else { new_array <- array(dim = c(var_file_dims, var_dims)) @@ -1895,16 +1883,14 @@ Start <- function(..., # dim = indices/selectors, padding_dims <- full_array_var_dims padding_dims[longer_dims_in_full_array] <- var_dims[longer_dims] - array_var_dims[longer_dims] - special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, - 'Date' = as.Date) if (var_to_read %in% names(common_return_vars)) { var_class <- class(picked_common_vars[[var_to_read]]) } else { var_class <- class(picked_vars[[i]][[var_to_read]]) } - if (any(var_class %in% names(special_types))) { + if (any(var_class %in% names(time_special_types()))) { padding_size <- prod(padding_dims) - padding <- 
rep(special_types[[var_class[1]]](NA), padding_size) + padding <- rep(time_special_types()[[var_class[1]]](NA), padding_size) dim(padding) <- padding_dims } else { padding <- array(dim = padding_dims) @@ -1975,7 +1961,7 @@ Start <- function(..., # dim = indices/selectors, var_store_indices, list(value = var_values))) # Turn time zone back to UTC if this var_to_read is 'time' - if (all(class(picked_common_vars[[var_to_read]]) == c('POSIXct', 'POSIXt'))) { + if (all(class(picked_common_vars[[var_to_read]]) == names(time_special_types))) { attr(picked_common_vars[[var_to_read]], "tzone") <- 'UTC' } } else { @@ -1984,7 +1970,7 @@ Start <- function(..., # dim = indices/selectors, var_store_indices, list(value = var_values))) # Turn time zone back to UTC if this var_to_read is 'time' - if (all(class(picked_vars[[i]][[var_to_read]]) == c('POSIXct', 'POSIXt'))) { + if (all(class(picked_vars[[i]][[var_to_read]]) == names(time_special_types))) { attr(picked_vars[[i]][[var_to_read]], "tzone") <- 'UTC' } } @@ -2051,15 +2037,7 @@ Start <- function(..., # dim = indices/selectors, synonims) # file_dim_reader returns dimension names as found in the file. # Need to translate accoridng to synonims: - names(data_dims) <- sapply(names(data_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) + names(data_dims) <- replace_with_synonmins(data_dims, synonims) } if (is.numeric(largest_dims_length)) { # largest_dims_length is a named vector @@ -2074,57 +2052,15 @@ Start <- function(..., # dim = indices/selectors, } } else { # largest_dims_length == TRUE - # Open and get all the dim from all the files - data_dims_all_files <- vector('list', length = length(selectors_total_list[[i]])) - - for (selectors_kk in 1:length(data_dims_all_files)) { - file_path <- do.call("[", c(list(array_of_files_to_load), as.list(selector_indices_save[[i]][[selectors_kk]]))) - file_to_open <- file_path - - data_dims_all_files[[selectors_kk]] <- try(file_dim_reader(file_to_open, NULL, selectors_total_list[[i]][[selectors_kk]], - lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), - synonims), silent = TRUE) - - } - - # Remove the missing files (i.e., fail try above) - if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { - tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') - data_dims_all_files <- data_dims_all_files[-tmp] - } - - # Find the longest dimensions from all the files - largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) - - # The inner dim order may differ among files. Need to align them before - # find out the largest dim length. - dim_names_first_file <- names(data_dims_all_files[[1]]) - same_dim_order <-lapply(lapply(data_dims_all_files, names), - identical, dim_names_first_file) - for (to_fix in which(!unlist(same_dim_order))) { - data_dims_all_files[[to_fix]] <- data_dims_all_files[[to_fix]][match(dim_names_first_file, - names(data_dims_all_files[[to_fix]]))] - } - - for (kk in 1:length(data_dims_all_files[[1]])) { - largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) - } - names(largest_data_dims) <- names(data_dims_all_files[[1]]) + data_dims <- find_largest_dims_length( + selectors_total_list[[i]], array_of_files_to_load, + selector_indices_save[[i]], dat[[i]], expected_inner_dims[[i]], + synonims, file_dim_reader) # file_dim_reader returns dimension names as found in the file. 
# Need to translate accoridng to synonims: - names(largest_data_dims) <- sapply(names(largest_data_dims), - function(x) { - which_entry <- which(sapply(synonims, function(y) x %in% y)) - if (length(which_entry) > 0) { - names(synonims)[which_entry] - } else { - x - } - }) + names(data_dims) <- replace_with_synonmins(data_dims, synonims) - # replace data_dims with largest_data_dims - data_dims <- largest_data_dims } # end of if (largest_dims_length == TRUE) # Transform the variables if needed and keep them apart. @@ -2133,38 +2069,13 @@ Start <- function(..., # dim = indices/selectors, stop("Could not find all the required variables in 'transform_vars' ", "for the dataset '", dat[[i]][['name']], "'.") } + vars_to_transform <- NULL - picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars) - if (length(picked_vars_to_transform) > 0) { - picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform] - new_vars_to_transform <- picked_vars[[i]][picked_vars_to_transform] - which_are_ordered <- which(!sapply(picked_vars_ordered[[i]][picked_vars_to_transform], is.null)) - - ##NOTE: The following 'if' replaces the original with reordering vector - if (length(which_are_ordered) > 0) { - tmp <- which(!is.na(match(names(picked_vars_ordered[[i]]), names(which_are_ordered)))) - new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[[i]][tmp] - - } - vars_to_transform <- c(vars_to_transform, new_vars_to_transform) - } - - ##NOTE: Above is non-common vars, here is common vars (ie, return_vars = NULL). - picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) - if (length(picked_common_vars_to_transform) > 0) { - picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] - - new_vars_to_transform <- picked_common_vars[picked_common_vars_to_transform] - which_are_ordered <- which(!sapply(picked_common_vars_ordered[picked_common_vars_to_transform], is.null)) - - if (length(which_are_ordered) > 0) { - - tmp <- which(!is.na(match(names(picked_common_vars_ordered), names(which_are_ordered)))) - new_vars_to_transform[which_are_ordered] <- picked_common_vars_ordered[tmp] - } - vars_to_transform <- c(vars_to_transform, new_vars_to_transform) - } - + # picked_vars[[i]] + vars_to_transform <- generate_vars_to_transform(vars_to_transform, picked_vars[[i]], transform_vars, picked_vars_ordered[[i]]) + # picked_common_vars + vars_to_transform <- generate_vars_to_transform(vars_to_transform, picked_common_vars, transform_vars, picked_common_vars_ordered) + # Transform the variables transformed_data <- do.call(transform, c(list(data_array = NULL, variables = vars_to_transform, @@ -2674,6 +2585,7 @@ Start <- function(..., # dim = indices/selectors, print(file_dim) } } + #???????????????? if ((!is.null(file_dim) && (file_dim %in% names(selector_file_dims))) || is.null(file_dim)) { if (length(sub_array_of_selectors) > 0) { if (debug) { @@ -2697,23 +2609,25 @@ Start <- function(..., # dim = indices/selectors, # The selector_checker will return either a vector of indices or a list # with the first and last desired indices. + #NOTE: goes_across_prime_meridian may be TRUE only if the selector is list goes_across_prime_meridian <- FALSE + is_circular_dim <- FALSE if (!is.null(var_ordered) && !selectors_are_indices) { if (!is.null(dim_reorder_params[[inner_dim]])) { if (is.list(sub_array_of_selectors)) { - ## NOTE: The check of 'goes_across_prime_meridian' is moved forward to here. 
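The border-crossing check referred to in the NOTE above can be illustrated with a small, self-contained sketch. The reorder function stored in dim_reorder_params is assumed to behave like CircularSort(): it wraps the values into the requested range and returns the sorted values in $x together with the ordering permutation in $ix. The circular_sort() helper below is a simplified stand-in written only for this example, not the package implementation.

circular_sort <- function(start, end) {
  # Wrap the values into [start, end) and sort them, keeping the permutation,
  # analogous to the $x / $ix output that CircularSort() provides.
  function(x) {
    x <- (x - start) %% (end - start) + start
    sort(x, index.return = TRUE)
  }
}
reorder_fun <- circular_sort(0, 360)
tmp <- reorder_fun(c(-10, 20))$ix              # [-10, 20] maps to [350, 20]; $ix is c(2, 1)
goes_across_prime_meridian <- tmp[1] > tmp[2]  # TRUE: the selector range crosses the border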
- is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") - if (!is.null(is_circular_dim)) { - if (is_circular_dim) { - - # NOTE: Use CircularSort() to put the values in the assigned range, and get the order. - # For example, [-10, 20] in CircularSort(0, 360) is [350, 20]. The $ix list is [2, 1]. - # 'goes_across_prime_meridian' means the selector range across the border. For example, - # CircularSort(-180, 180) with selector [170, 190] -> goes_across_prime_meridian = TRUE. - tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix - goes_across_prime_meridian <- tmp[1] > tmp[2] - } + if ('circular' %in% names(attributes(dim_reorder_params[[inner_dim]]))) { + is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") + } + if (is_circular_dim) { + # NOTE: Use CircularSort() to put the values in the assigned range, and get the order. + # For example, [-10, 20] in CircularSort(0, 360) is [350, 20]. The $ix list is [2, 1]. + # 'goes_across_prime_meridian' means the selector range across the border. For example, + # CircularSort(-180, 180) with selector [170, 190] -> goes_across_prime_meridian = TRUE. + # dim_reorder_params is a list of Reorder function, i.e., + # Sort() or CircularSort(). + tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix + goes_across_prime_meridian <- tmp[1] > tmp[2] } # HERE change to the same code as below (under 'else'). Not sure why originally @@ -2724,19 +2638,12 @@ Start <- function(..., # dim = indices/selectors, #sub_array_of_selectors <- as.list(sub_array_reordered$x[sub_array_unorder]) # Add warning if the boundary is out of range - if (sub_array_of_selectors[1] < range(var_ordered)[1] | sub_array_of_selectors[1] > range(var_ordered)[2]) { - .warning(paste0("The lower boundary of selector of ", - inner_dim, - " is out of range [", - min(var_ordered), ", ", max(var_ordered), "]. ", - "Check if the desired range is all included.")) + if (min(unlist(sub_array_of_selectors)) < range(var_ordered)[1]) { + show_out_of_range_warning(inner_dim, range = range(var_ordered), + bound = 'lower') } - if (sub_array_of_selectors[2] < range(var_ordered)[1] | sub_array_of_selectors[2] > range(var_ordered)[2]) { - .warning(paste0("The upper boundary of selector of ", - inner_dim, - " is out of range [", - min(var_ordered), ", ", max(var_ordered), "]. ", - "Check if the desired range is all included.")) + if (max(unlist(sub_array_of_selectors)) > range(var_ordered)[2]) { + show_out_of_range_warning(inner_dim, range = range(var_ordered), bound = 'upper') } @@ -2781,23 +2688,13 @@ Start <- function(..., # dim = indices/selectors, # Add warning if the boundary is out of range if (is.list(sub_array_of_selectors)) { - if (sub_array_of_selectors[1] < - min(sub_array_of_values) | sub_array_of_selectors[1] > - max(sub_array_of_values)) { - .warning(paste0("The lower boundary of selector of ", - inner_dim, " is out of range [", - min(sub_array_of_values), ", ", - max(sub_array_of_values), "]. ", - "Check if the desired range is all included.")) + if (min(unlist(sub_array_of_selectors)) < min(sub_array_of_values)) { + show_out_of_range_warning(inner_dim, range = range(sub_array_of_values), + bound = 'lower') } - if (sub_array_of_selectors[2] < - min(sub_array_of_values) | sub_array_of_selectors[2] > - max(sub_array_of_values)) { - .warning(paste0("The upper boundary of selector of ", - inner_dim, " is out of range [", - min(sub_array_of_values), ", ", - max(sub_array_of_values), "]. 
", - "Check if the desired range is all included.")) + if (max(unlist(sub_array_of_selectors)) > max(sub_array_of_values)) { + show_out_of_range_warning(inner_dim, range = range(sub_array_of_values), + bound = 'upper') } } @@ -2857,84 +2754,16 @@ Start <- function(..., # dim = indices/selectors, print("-> SELECTORS REQUESTED BEFORE TRANSFORM.") } } - - ###NOTE: Here, the transform, is different from the below part of non-transform. - # search 'if (goes_across_prime_meridian' to find the lines below. - if (goes_across_prime_meridian) { - # NOTE: before changing, the return is already correct. - - #NOTE: The fix below has the same explanation as no with_transform part below. - # Search the next next 'if (goes_across_prime_meridian) {'. - if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { - # global longitude - sub_array_of_fri <- 1:n - # Warning if transform_extra_cell != 0 - if (beta != 0) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - - } else { - # normal case, i.e., not global - first_index <- min(unlist(sub_array_of_indices)) - last_index <- max(unlist(sub_array_of_indices)) - gap_width <- last_index - first_index - 1 - sub_array_of_fri <- c(1:(min(unlist(sub_array_of_indices)) + min(gap_width, beta)), - (max(unlist(sub_array_of_indices)) - min(gap_width, beta)):n) - - if (min(gap_width, beta) != beta) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - } - - } else { - #NOTE: This if seems redundant. - if (is.list(sub_array_of_indices)) { - sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - } - first_index <- min(unlist(sub_array_of_indices)) - last_index <- max(unlist(sub_array_of_indices)) - - start_padding <- min(beta, first_index - 1) - end_padding <- min(beta, n - last_index) - - if (exists("is_circular_dim")) { - if (!is_circular_dim) { #latitude - sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) - if (start_padding != beta | end_padding != beta) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. The border index is used for transformation.")) - } - } else { #longitude - if ((last_index - first_index + 1 + beta * 2) >= n) { - sub_array_of_fri <- 1:n - } else if (start_padding < beta) { # left side too close to border, need to go to right side - sub_array_of_fri <- c((first_index - start_padding):(last_index + end_padding), (n - (beta - start_padding - 1)):n) - } else if (end_padding < beta) { # right side too close to border, need to go to left side - sub_array_of_fri <- c(1: (beta - end_padding), (first_index - start_padding):(last_index + end_padding)) - } else { #normal - sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) - } - } - } else { # when _reorder is not used - sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) - if (start_padding != beta | end_padding != beta) { - .warning(paste0("Adding parameter transform_extra_cells = ", - transform_extra_cells, " to the transformed index excesses ", - "the border. 
The border index is used for transformation.")) - } - } - - } + # Generate sub_array_of_fri + sub_array_of_fri <- generate_sub_array_of_fri( + with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta, + is_circular_dim) + subset_vars_to_transform <- vars_to_transform if (!is.null(var_ordered)) { - ##NOTE: if var_ordered is common_vars, it doesn't have attributes and it is a vector. - ## Turn it into array and add dimension name. + #NOTE: If var_ordered is common_vars, it doesn't have attributes and it is a vector. + # Turn it into array and add dimension name. if (!is.array(var_ordered)) { var_ordered <- as.array(var_ordered) names(dim(var_ordered)) <- inner_dim @@ -3030,9 +2859,11 @@ Start <- function(..., # dim = indices/selectors, if (debug) { if (inner_dim %in% dims_to_check) { print("-> FIRST INDEX:") - print(first_index) +# print(first_index) + print("NOTE: Check function generate_sub_array_of_fri() in zzz.R") print("-> LAST INDEX:") - print(last_index) +# print(last_index) + print("NOTE: Check function generate_sub_array_of_fri() in zzz.R") print("-> STRUCTURE OF FIRST ROUND INDICES:") print(str(sub_array_of_fri)) print("-> STRUCTURE OF SECOND ROUND INDICES:") @@ -3072,30 +2903,11 @@ Start <- function(..., # dim = indices/selectors, ### sub_array_of_sri <- sub_array_of_sri + n_of_extra_cells sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), list(value = sub_array_of_sri))) - } else { - if (goes_across_prime_meridian) { - #NOTE: The potential problem here is, if it is global longitude, - # and the indices overlap (e.g., lon = [0, 359.723] and - # CircularSort(-180, 180), then sub_array_of_indices = list(649, 649)). - # Therefore, sub_array_of_fri will be c(1:649, 649:1296). We'll - # get two 649. - # The fix below may not be the best solution, but it works for - # the example above. - - if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { - # global longitude - sub_array_of_fri <- c(1:n) - } else { - # the common case, i.e., non-global - sub_array_of_fri <- c(1:min(unlist(sub_array_of_indices)), - max(unlist(sub_array_of_indices)):n) - } - - } else if (is.list(sub_array_of_indices)) { - sub_array_of_fri <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] - } else { - sub_array_of_fri <- sub_array_of_indices - } + + } else { # !with_transform + sub_array_of_fri <- generate_sub_array_of_fri( + with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta, + is_circular_dim) } if (!is.null(var_unorder_indices)) { if (is.null(ordered_fri)) { @@ -3111,7 +2923,7 @@ Start <- function(..., # dim = indices/selectors, taken_chunks <- TRUE } } - } else { + } else { #???????????? if (debug) { if (inner_dim %in% dims_to_check) { print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.") @@ -3478,95 +3290,37 @@ Start <- function(..., # dim = indices/selectors, # final_dims will be used for collocation of data, whereas final_dims_fake # will be used for shaping the final array to be returned to the user. 
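As a sketch of that reshaping with hypothetical dimension sizes: merging an inner dimension that goes across a file dimension collapses the two adjacent dimensions into one, using the dims_merge() helper added in zzz.R further below.

inner_dims_across_files <- list(file_date = 'time')   # 'time' goes across 'file_date'
final_dims_fake <- c(dat = 1, var = 1, time = 31, file_date = 12, lat = 256, lon = 512)
dims_merge(inner_dims_across_files, final_dims_fake)
# returns c(dat = 1, var = 1, time = 372, lat = 256, lon = 512)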
final_dims_fake <- final_dims - if (merge_across_dims) { - if (!is.null(inner_dims_across_files)) { - for (file_dim_across in names(inner_dims_across_files)) { - inner_dim_pos <- which(names(final_dims_fake) == inner_dims_across_files[[file_dim_across]]) - new_dims <- c() - if (inner_dim_pos > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(inner_dim_pos - 1)]) - } - new_dims <- c(new_dims, setNames(prod(final_dims_fake[c(inner_dim_pos, inner_dim_pos + 1)]), - inner_dims_across_files[[file_dim_across]])) - if (inner_dim_pos + 1 < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(inner_dim_pos + 2):length(final_dims_fake)]) - } - final_dims_fake <- new_dims - } - } + final_dims_fake <- dims_merge(inner_dims_across_files, final_dims_fake) } #========================================================================= # Find the dimension to split if split_multiselected_dims = TRUE. # If there is no dimension able to be split, change split_multiselected_dims to FALSE. all_split_dims <- NULL if (split_multiselected_dims) { - for (dim_param in 1:length(dim_params)) { - if (!is.null(dim(dim_params[[dim_param]]))) { - if (length(dim(dim_params[[dim_param]])) > 1) { - split_dims <- dim(dim_params[[dim_param]]) - all_split_dims <- c(all_split_dims, setNames(list(split_dims), - names(dim_params)[dim_param])) - if (is.null(names(split_dims))) { - names(split_dims) <- paste0(names(dim_params)[dim_param], - 1:length(split_dims)) - } - old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param]) + tmp <- dims_split(dim_params, final_dims_fake) + final_dims_fake <- tmp[[1]] + # all_split_dims is a list containing all the split dims + all_split_dims <- tmp[[2]] - # If merge_across_dims and split_multiselected_dims are both used, - # on one file dim, and this file dim is multi-dim, it doesn't work. - if (identical(old_dim_pos, integer(0))) { - stop(paste0("The dimension '", names(dim_params)[dim_param], - "' to be split cannot be found after 'merge_across_dims' ", - "is used. Check if the reshape parameters are used appropriately.")) - } - - # NOTE: Three steps to create new dims. - # 1st: Put in the dims before split_dim. - # 2nd: Replace the old_dim with split_dims. - # 3rd: Put in the dims after split_dim. - new_dims <- c() - if (old_dim_pos > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)]) - } - new_dims <- c(new_dims, split_dims) - if (old_dim_pos < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)]) - } - final_dims_fake <- new_dims - } - } - } - if (is.null(all_split_dims)) { - split_multiselected_dims <- FALSE - .warning(paste0("Not found any dimensions able to be split. The parameter ", - "'split_multiselected_dims' is changed to FALSE.")) - } + if (is.null(all_split_dims)) { + split_multiselected_dims <- FALSE + .warning(paste0("Not found any dimensions able to be split. The parameter ", + "'split_multiselected_dims' is changed to FALSE.")) + } } #====================================================================== + # If only merge_across_dims and merge_across_dims_narm and no split_multiselected_dims, + # the length of inner across dim (e.g., time) needs to be adjusted. Sum up the actual length + # without potential NAs. if (merge_across_dims) { - # only merge_across_dims -> the 'time' dim length needs to be adjusted + # Prepare the arguments for later use across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? 
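A sketch of that length adjustment with hypothetical sizes, using the merge_narm_dims() helper added in zzz.R further below: four files contribute 31, 28, 31 and 30 time steps, but after the merge the 'time' dimension was padded to 4 x 31 = 124, so it is shrunk to the actual total of 120.

length_inner_across_dim <- list(31, 28, 31, 30)
final_dims_fake <- c(dat = 1, var = 1, time = 124, lat = 256, lon = 512)
merge_narm_dims(final_dims_fake, 'time', length_inner_across_dim)
# returns c(dat = 1, var = 1, time = 120, lat = 256, lon = 512)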
# Get the length of each inner_dim ('time') along each file_dim ('file_date') length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) - if (merge_across_dims_narm) { - across_file_dim <- names(inner_dims_across_files) #TODO: more than one? - - if (!split_multiselected_dims) { - final_dims_fake_name <- names(final_dims_fake) - pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) - new_length_inner_dim <- sum(unlist(length_inner_across_dim)) - if (pos_across_inner_dim != length(final_dims_fake)) { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim, - final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) - } else { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim) - } - names(final_dims_fake) <- final_dims_fake_name - } + if (merge_across_dims_narm & !split_multiselected_dims) { + final_dims_fake <- merge_narm_dims(final_dims_fake, across_inner_dim, length_inner_across_dim) } } @@ -3594,7 +3348,7 @@ Start <- function(..., # dim = indices/selectors, # NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed. # The inner_dim needs to be the first dim among split dims. - # Cannot control the rest dims are in the same order or not... + # TODO: Cannot control the rest dims are in the same order or not... # Suppose users put the same order of across inner and file dims. if (split_multiselected_dims & merge_across_dims) { # TODO: More than one split? @@ -3602,24 +3356,15 @@ Start <- function(..., # dim = indices/selectors, # if inner_dim is not the first, change! if (inner_dim_pos_in_split_dims != 1) { - split_dims <- c(split_dims[inner_dim_pos_in_split_dims], - split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims]) - split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(split_dims)))) - # Save the current final_dims_fake for later reorder back + # Save the current final_dims_fake for reordering it back later final_dims_fake_output <- final_dims_fake - new_dims <- c() - if (split_dims_pos[1] != 1) { - new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) - } - new_dims <- c(new_dims, split_dims) - if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) - } - final_dims_fake <- new_dims + tmp <- reorder_split_dims(all_split_dims[[1]], inner_dim_pos_in_split_dims, final_dims_fake) + final_dims_fake <- tmp[[1]] + all_split_dims[[1]] <- tmp[[2]] } } - # The following several lines will only be run if retrieve = TRUE + # The following several lines will only run if retrieve = TRUE if (retrieve) { ########## CREATING THE SHARED MATRIX AND DISPATCHING WORK PIECES ########### @@ -3668,136 +3413,22 @@ Start <- function(..., # dim = indices/selectors, work_pieces <- list() for (i in 1:length(dat)) { if (dataset_has_files[i]) { - selectors <- dat[[i]][['selectors']] - file_dims <- found_file_dims[[i]] - inner_dims <- expected_inner_dims[[i]] - sub_array_dims <- final_dims[file_dims] - sub_array_dims[found_pattern_dim] <- 1 - sub_array_of_files_to_load <- array(1:prod(sub_array_dims), - dim = sub_array_dims) - names(dim(sub_array_of_files_to_load)) <- names(sub_array_dims) - # Detect which of the dimensions of the dataset go across files. 
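For instance (hypothetical inner dimension names), when only 'time' goes across the file dimension 'file_date', the mapping built below reduces to:

# inner dims that do not go across any file dimension map to NULL
file_dim_across_files <- list(time = 'file_date', latitude = NULL, longitude = NULL)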
- file_dim_across_files <- lapply(inner_dims, - function(x) { - dim_across <- sapply(inner_dims_across_files, function(y) x %in% y) - if (any(dim_across)) { - names(inner_dims_across_files)[which(dim_across)[1]] - } else { - NULL - } - }) - names(file_dim_across_files) <- inner_dims - j <- 1 - while (j <= prod(sub_array_dims)) { - # Work out file path. - file_to_load_sub_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] - names(file_to_load_sub_indices) <- names(sub_array_dims) - file_to_load_sub_indices[found_pattern_dim] <- i - big_dims <- rep(1, length(dim(array_of_files_to_load))) - names(big_dims) <- names(dim(array_of_files_to_load)) - file_to_load_indices <- .MergeArrayDims(file_to_load_sub_indices, big_dims)[[1]] - file_to_load <- do.call('[[', c(list(array_of_files_to_load), - as.list(file_to_load_indices))) - not_found_file <- do.call('[[', c(list(array_of_not_found_files), - as.list(file_to_load_indices))) - load_file_metadata <- do.call('[', c(list(array_of_metadata_flags), - as.list(file_to_load_indices))) - if (load_file_metadata) { - metadata_file_counter <- metadata_file_counter + 1 - } - if (!is.na(file_to_load) && !not_found_file) { - # Work out indices to take - first_round_indices <- lapply(inner_dims, - function (x) { - if (is.null(file_dim_across_files[[x]])) { - x_dim_name <- attr(attr(selectors[[x]][['fri']], "dim"), "names") - if (!is.null(x_dim_name)) { - which_chunk <- file_to_load_sub_indices[x_dim_name] - selectors[[x]][['fri']][[which_chunk]] - } else { - selectors[[x]][['fri']][[1]] - } - } else { - which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] - selectors[[x]][['fri']][[which_chunk]] - } - }) - names(first_round_indices) <- inner_dims - second_round_indices <- lapply(inner_dims, - function (x) { - if (is.null(file_dim_across_files[[x]])) { - x_dim_name <- attr(attr(selectors[[x]][['sri']], "dim"), "names") - if (!is.null(x_dim_name)) { - which_chunk <- file_to_load_sub_indices[x_dim_name] - selectors[[x]][['sri']][[which_chunk]] - } else { - selectors[[x]][['sri']][[1]] - } - } else { - which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] - selectors[[x]][['sri']][[which_chunk]] - } - }) - if (debug) { - print("-> BUILDING A WORK PIECE") - #print(str(selectors)) - } - names(second_round_indices) <- inner_dims - if (!any(sapply(first_round_indices, length) == 0)) { - work_piece <- list() - work_piece[['first_round_indices']] <- first_round_indices - work_piece[['second_round_indices']] <- second_round_indices - work_piece[['file_indices_in_array_of_files']] <- file_to_load_indices - work_piece[['file_path']] <- file_to_load - work_piece[['store_dims']] <- final_dims - # Work out store position - store_position <- final_dims - store_position[names(file_to_load_indices)] <- file_to_load_indices - store_position[inner_dims] <- rep(1, length(inner_dims)) - work_piece[['store_position']] <- store_position - # Work out file selectors - file_selectors <- sapply(file_dims, - function (x) { - vector_to_pick <- 1 - if (x %in% names(depending_file_dims)) { - vector_to_pick <- file_to_load_indices[depending_file_dims[[x]]] - } - selectors[file_dims][[x]][[vector_to_pick]][file_to_load_indices[x]] - }) - names(file_selectors) <- file_dims - work_piece[['file_selectors']] <- file_selectors - # Send variables for transformation - if (!is.null(transform) && (length(transform_vars) > 0)) { - vars_to_transform <- NULL - picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars) - if 
(length(picked_vars_to_transform) > 0) { - picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform] - vars_to_transform <- c(vars_to_transform, picked_vars[[i]][picked_vars_to_transform]) - if (any(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))) { - picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))] - vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[[i]][picked_vars_ordered_to_transform] - } - } - picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) - if (length(picked_common_vars_to_transform) > 0) { - picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] - vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform]) - if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) { - picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))] - vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform] - } - } - work_piece[['vars_to_transform']] <- vars_to_transform - } - # Send flag to load metadata - if (load_file_metadata) { - work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter) - } - work_pieces <- c(work_pieces, list(work_piece)) - } - } - j <- j + 1 - } + # metadata_file_counter may be changed by the following function + work_pieces <- build_work_pieces( + work_pieces = work_pieces, i = i, selectors = dat[[i]][['selectors']], + file_dims = found_file_dims[[i]], + inner_dims = expected_inner_dims[[i]], final_dims = final_dims, + found_pattern_dim = found_pattern_dim, + inner_dims_across_files = inner_dims_across_files, + array_of_files_to_load = array_of_files_to_load, + array_of_not_found_files = array_of_not_found_files, + array_of_metadata_flags = array_of_metadata_flags, + metadata_file_counter = metadata_file_counter, + depending_file_dims = depending_file_dims, transform = transform, + transform_vars = transform_vars, picked_vars = picked_vars[[i]], + picked_vars_ordered = picked_vars_ordered[[i]], + picked_common_vars = picked_common_vars, + metadata_folder = metadata_folder, debug = debug) } } #print("N") @@ -3807,65 +3438,7 @@ Start <- function(..., # dim = indices/selectors, # Calculate the progress %s that will be displayed and assign them to # the appropriate work pieces. 
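A worked example of that bookkeeping with hypothetical counts: with 48 work pieces and 4 processes, 48 / 4 >= 10, so ten steps of 10% are attached to ten roughly evenly spaced pieces.

n_pieces <- 48; num_procs <- 4
amount <- 10; reps <- 10             # branch taken when n_pieces / num_procs >= 10
progress_steps <- paste0(' + ', round(rep(amount, reps), 2), '%')
selected_pieces <- round(seq(1, n_pieces, length.out = reps + 1))[-1]
# each piece listed in selected_pieces prints one ' + 10%' step once it has been read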
- if (length(work_pieces) / num_procs >= 2 && !silent) { - if (length(work_pieces) / num_procs < 10) { - amount <- 100 / ceiling(length(work_pieces) / num_procs) - reps <- ceiling(length(work_pieces) / num_procs) - } else { - amount <- 10 - reps <- 10 - } - progress_steps <- rep(amount, reps) - if (length(work_pieces) < (reps + 1)) { - selected_pieces <- length(work_pieces) - progress_steps <- c(sum(head(progress_steps, reps)), - tail(progress_steps, reps)) - } else { - selected_pieces <- round(seq(1, length(work_pieces), - length.out = reps + 1))[-1] - } - progress_steps <- paste0(' + ', round(progress_steps, 2), '%') - progress_message <- 'Progress: 0%' - } else { - progress_message <- '' - selected_pieces <- NULL - } - piece_counter <- 1 - step_counter <- 1 - work_pieces <- lapply(work_pieces, - function (x) { - if (piece_counter %in% selected_pieces) { - wp <- c(x, list(progress_amount = progress_steps[step_counter])) - step_counter <<- step_counter + 1 - } else { - wp <- x - } - piece_counter <<- piece_counter + 1 - wp - }) - if (!silent) { - .message("If the size of the requested data is close to or above the free shared RAM memory, R may crash.") - .message("If the size of the requested data is close to or above the half of the free RAM memory, R may crash.") - .message(paste0("Will now proceed to read and process ", length(work_pieces), " data files:")) - if (length(work_pieces) < 30) { - lapply(work_pieces, function (x) .message(x[['file_path']], indent = 2)) - } else { - .message("The list of files is long. You can check it after Start() finishes in the output '$Files'.", indent = 2, exdent = 5) - } - } - - # Build the cluster of processes that will do the work and dispatch work pieces. - # The function .LoadDataFile is applied to each work piece. This function will - # open the data file, regrid if needed, subset, apply the mask, - # compute and apply the weights if needed, - # disable extreme values and store in the shared memory matrix. - #print("O") - if (!silent) { - .message("Loading... This may take several minutes...") - if (progress_message != '') { - .message(progress_message, appendLF = FALSE) - } - } + work_pieces <- retrieve_progress_message(work_pieces, num_procs, silent) # NOTE: In .LoadDataFile(), metadata is saved in metadata_folder/1 or /2 etc. Before here, # the path name is created in work_pieces but the path hasn't been built yet. 
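For reference, each work piece handed to .LoadDataFile() is a plain named list. A minimal hypothetical instance with the fields assembled above (all values invented for illustration; second_round_indices is left empty as when no transformation is requested) looks like:

work_piece <- list(
  first_round_indices = list(time = 1:31, latitude = 1:256, longitude = 1:512),
  second_round_indices = list(time = NULL, latitude = NULL, longitude = NULL),
  file_indices_in_array_of_files = c(dat = 1, var = 1, sdate = 2),
  file_path = '/hypothetical/path/tas_19940101.nc',
  store_dims = c(dat = 1, var = 1, sdate = 4, time = 31, latitude = 256, longitude = 512),
  store_position = c(dat = 1, var = 1, sdate = 2, time = 1, latitude = 1, longitude = 1),
  file_selectors = c(dat = 'dat1', var = 'tas', sdate = '19940101'),
  save_metadata_in = 'metadata_folder/1'  # only set when the metadata flag is TRUE for this file
)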
@@ -3893,7 +3466,8 @@ Start <- function(..., # dim = indices/selectors, } if (!silent) { - if (progress_message != '') { + # if (progress_message != '') + if (length(work_pieces) / num_procs >= 2 && !silent) { .message("\n", tag = '') } } @@ -3907,38 +3481,9 @@ Start <- function(..., # dim = indices/selectors, if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) } else { - # Get the length of these two dimensions in final_dims - length_inner_across_store_dims <- final_dims[across_inner_dim] - length_file_across_store_dims <- final_dims[across_file_dim] - - # Create a logical array for merge_across_dims - logi_array <- array(rep(FALSE, - length_file_across_store_dims * length_inner_across_store_dims), - dim = c(length_inner_across_store_dims, length_file_across_store_dims)) - for (i in 1:length_file_across_store_dims) { #1:4 - logi_array[1:length_inner_across_dim[[i]], i] <- TRUE - } - - # First, get the data array with final_dims dimension - data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) - - # Change the NA derived from additional spaces to -9999, then remove these -9999 - func_remove_blank <- function(data_array, logi_array) { - # dim(data_array) = [time, file_date] - # dim(logi_array) = [time, file_date] - # Change the blank spaces from NA to -9999 - data_array[which(!logi_array)] <- -9999 - return(data_array) - } - data_array_final_dims <- multiApply::Apply(data_array_final_dims, - target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - ## reorder back to the correct dim - tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- .aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector + data_array_tmp <- remove_additional_na_from_merge( + inner_dims_across_files, final_dims, across_inner_dim, + length_inner_across_dim, data_array) } if (length(data_array_tmp) != prod(final_dims_fake)) { @@ -3949,70 +3494,9 @@ Start <- function(..., # dim = indices/selectors, #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. 
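A condensed sketch of that clean-up with hypothetical sizes (the code above uses a -9999 placeholder and multiApply::Apply() to keep the dimension order; a plain logical mask is enough to show the idea):

length_inner_across_dim <- list(31, 28, 31, 30)            # actual 'time' length per file
data_merged <- array(rnorm(31 * 4), dim = c(time = 31, file_date = 4))
data_merged[29:31, 2] <- NA                                 # padding added by the merge
data_merged[31, 4] <- NA
keep <- array(FALSE, dim = c(31, 4))
for (k in 1:4) keep[seq_len(length_inner_across_dim[[k]]), k] <- TRUE
data_vector <- data_merged[keep]                            # length 120, padding removed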
if (split_multiselected_dims) { - - # generate the correct order list from indices_chunk - final_order_list <- list() - i <- 1 - j <- 1 - a <- indices_chunk[i] - while (i <= length(indices_chunk)) { - while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { - a <- c(a, indices_chunk[i+1]) - i <- i + 1 - } - final_order_list[[j]] <- a - a <- indices_chunk[i+1] - i <- i + 1 - j <- j + 1 - } - names(final_order_list) <- sapply(final_order_list, '[[', 1) - final_order_list <- lapply(final_order_list, length) - - if (!all(diff(as.numeric(names(final_order_list))) > 0)) { - # shape the vector into the array without split_dims - split_dims_pos <- match(split_dims, final_dims_fake) - new_dims <- c() - if (split_dims_pos[1] > 1) { - new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) - } - new_dims <- c(new_dims, prod(split_dims)) - names(new_dims)[split_dims_pos[1]] <- across_inner_dim - if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { - new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) - } - final_dims_fake_no_split <- new_dims - data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split) - # seperate 'time' dim into each work_piece length - data_array_seperate <- list() - tmp <- cumsum(unlist(length_inner_across_dim)) - tmp <- c(0, tmp) - for (i in 1:length(length_inner_across_dim)) { - data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, - (tmp[i] + 1):tmp[i + 1]) - } - - # re-build the array: chunk - which_chunk <- as.numeric(names(final_order_list)) - how_many_indices <- unlist(final_order_list) - array_piece <- list() - ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) - for (i in 1:length(final_order_list)) { - array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], - across_inner_dim, - ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) - ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - } - - # re-build the array: paste - data_array_tmp <- array_piece[[1]] - along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) - if (length(array_piece) > 1) { - for (i in 2:length(array_piece)) { - data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], - along = along_pos) - } - } - } + data_array_tmp <- rebuild_array_merge_split( + data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, + across_inner_dim, length_inner_across_dim) } data_array <- array(data_array_tmp, dim = final_dims_fake) @@ -4044,45 +3528,10 @@ Start <- function(..., # dim = indices/selectors, unlink(metadata_folder, recursive = TRUE) -#NOTE: Here, metadata can be saved in one of two ways: one for $common and the other for $dat -# for $common, it is a list of metadata length. For $dat, it is a list of dat length, -# and each sublist has the metadata for each dat. - dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims] - if (!any(names(dim_of_metadata) == pattern_dims) | - (any(names(dim_of_metadata) == pattern_dims) & - dim_of_metadata[pattern_dims] == 1)) { # put under $common; old code - return_metadata <- vector('list', - length = prod(dim(array_of_metadata_flags)[metadata_dims])) - return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata - dim(return_metadata) <- dim(array_of_metadata_flags[metadata_dims]) - - } else { # put under $dat. 
metadata_dims has 'dat' and dat length > 1 - return_metadata <- vector('list', - length = dim_of_metadata[pattern_dims]) - names(return_metadata) <- dat_names - for (kk in 1:length(return_metadata)) { - return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat - } - loaded_metadata_count <- 1 - for (kk in 1:length(return_metadata)) { - for (jj in 1:length(return_metadata[[kk]])) { - - if (dataset_has_files[kk]) { - if (loaded_metadata_count %in% loaded_metadata_files) { - return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]] - names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]]) - } else { - return_metadata[[kk]][jj] <- NULL - } - loaded_metadata_count <- loaded_metadata_count + 1 - } else { - return_metadata[[kk]][jj] <- NULL - } - - } - } - } - attr(data_array, 'Variables') <- return_metadata + # Create a list of metadata of the variable (e.g., tas) + return_metadata <- create_metadata_list(array_of_metadata_flags, metadata_dims, pattern_dims, + loaded_metadata_files, loaded_metadata, dat_names, + dataset_has_files) # TODO: Try to infer data type from loaded_metadata # as.integer(data_array) } @@ -4141,65 +3590,30 @@ Start <- function(..., # dim = indices/selectors, for (i in 1:length(dat)) { file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])] } + if (retrieve) { if (!silent) { .message("Successfully retrieved data.") } - if (all(sapply(attr(data_array, 'Variables'), is.null))) { - var_backup <- NULL + if (all(sapply(return_metadata, is.null))) { + # We don't have metadata of the variable (e.g., tas). The returned metadata list only + # contains those are specified in argument "return_vars". + Variables_list <- c(list(common = picked_common_vars), picked_vars) .warning(paste0("Metadata cannot be retrieved. The reason may be the ", "non-existence of the first file. Use parameter 'metadata_dims'", " to assign to file dimensions along which to return metadata, ", "or check the existence of the first file.")) } else { - -#NOTE: The metadata of variables can be saved in one of the two different structures. -# (1) metadata_dims != 'dat', or (metadata_dims == 'dat' & length(dat) == 1): -# put under $common -# (2) (metadata_dims == 'dat' & length(dat) > 1): -# put under $dat1, $dat2, .... Put it in picked_vars list -#TODO: The current (2) uses the inefficient method. Should define the list structure first -# then fill the list, rather than expand it in the for loop. 
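The two shapes described in the NOTE above, outlined with hypothetical variable names (placeholder strings stand in for the actual metadata objects):

# (1) metadata_dims != 'dat', or metadata_dims == 'dat' with a single dataset:
#     the variable metadata goes under $common.
Variables <- list(common = list(time = '<time values>', tas = '<tas metadata>'),
                  dat1   = list(longitude = '<longitude values>'))
# (2) metadata_dims == 'dat' and more than one dataset:
#     each dataset carries its own copy of the variable metadata.
Variables <- list(common = list(time = '<time values>'),
                  dat1   = list(longitude = '<longitude values>', tas = '<tas metadata of dat1>'),
                  dat2   = list(longitude = '<longitude values>', tas = '<tas metadata of dat2>'))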
- if (any(metadata_dims == pattern_dims) & length(dat) > 1) { # (2) - var_backup <- attr(data_array, 'Variables') - for (kk in 1:length(var_backup)) { - sublist_names <- lapply(var_backup, names)[[kk]] - if (!is.null(sublist_names)) { - for (jj in 1:length(sublist_names)) { - picked_vars[[kk]][[sublist_names[jj]]] <- var_backup[[kk]][[jj]] - } - } - } - var_backup <- NULL - - } else { #(1) - var_backup <- attr(data_array, 'Variables') - len <- unlist(lapply(var_backup, length)) - len <- sum(len) + length(which(len == 0)) #0 means NULL - name_list <- lapply(var_backup, names) - new_list <- vector('list', length = len) - count <- 1 - - for (kk in 1:length(var_backup)) { - if (length(var_backup[[kk]]) == 0) { #NULL - count <- count + 1 - } else { - for (jj in 1:length(var_backup[[kk]])) { - new_list[[count]] <- var_backup[[kk]][[jj]] - names(new_list)[count] <- name_list[[kk]][jj] - count <- count + 1 - } - } - } - var_backup <- new_list + # Add the metadata of the variable (e.g., tas) into the list of picked_vars or + # picked_common_vars. + Variables_list <- combine_metadata_picked_vars( + return_metadata, picked_vars, picked_common_vars, + metadata_dims, pattern_dims, length(dat)) } -} - attr(data_array, 'Variables') <- NULL attributes(data_array) <- c(attributes(data_array), - list(Variables = c(list(common = c(picked_common_vars, var_backup)), - picked_vars), + list(Variables = Variables_list, Files = array_of_files_to_load, NotFoundFiles = array_of_not_found_files, FileSelectors = file_selectors, @@ -4208,7 +3622,8 @@ Start <- function(..., # dim = indices/selectors, ) attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class')) data_array - } else { + + } else { # retrieve = FALSE if (!silent) { .message("Successfully discovered data dimensions.") } diff --git a/R/zzz.R b/R/zzz.R index 9d607d2..a775b47 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -420,3 +420,755 @@ find_ufd_value <- function(undefined_file_dims, dat, i, replace_values, return(dat_selectors) } + +# Adjust the argument 'return_vars' if users don't assign them properly. +# Force return_vars = (time = NULL) to (time = 'sdate') if one of the situations: +# (1) selector = [sdate = 2, time = 4], or +# (2) time_across = 'sdate'. +correct_return_vars <- function(inner_dim, inner_dims_across_files, found_pattern_dim, + file_dim_as_selector_array_dim) { + # inner_dim is not in return_vars or is NULL + if (is.character(file_dim_as_selector_array_dim)) { #(1) + if (file_dim_as_selector_array_dim %in% found_pattern_dim) { + stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '", + found_pattern_dim, + "', which is not allowed. To assign the dependency on the pattern dim, ", + "use 'return_vars = list(", inner_dim, " = 'dat')' instead.")) + } else { + corrected_value <- file_dim_as_selector_array_dim + } + } else if (inner_dim %in% inner_dims_across_files) { #(2) + file_dim_name <- names(which(inner_dim == inner_dims_across_files)) + if (file_dim_name %in% found_pattern_dim) { + stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '", + found_pattern_dim, "', which is not allowed.")) + } else { + corrected_value <- file_dim_name + } + } + .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", corrected_value, + "', but '", inner_dim, "' is not in return_vars list or is NULL. 
", + "To provide the correct metadata, the value of ", inner_dim, + " in 'return_vars' is specified as '", corrected_value, "'.")) + return(corrected_value) +} + +# The time classes that are needed to adjust time zone back to UTC. +time_special_types <- function() { + list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, 'Date' = as.Date) +} + +# Replace the dim names read from netCDF file with the user-specified synonims. +replace_with_synonmins <- function(read_dims, synonims) { + corrected_dim_name <- sapply(names(read_dims), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + return(corrected_dim_name) +} + + +# Prepare vars_to_read for this dataset (i loop) and this file (j loop) +generate_vars_to_read <- function(return_vars, changed_dims, first_found_file, common_return_vars, + common_first_found_file, i) { + vars_to_read <- NULL + if (length(return_vars) > 0) { + #NOTE: because return_vars has changed 'dat' to character(0) above (line 1775), + # 'dat' won't be included in vars_to_read here. + vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))] + } + if (!is.null(first_found_file)) { + if (any(!first_found_file)) { + vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)])) + } + } + if ((i == 1) && (length(common_return_vars) > 0)) { + vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))]) + } + if (!is.null(common_first_found_file)) { + if (any(!common_first_found_file)) { + vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)])) + } + } + return(vars_to_read) +} + +# Find the largest dims length within one dataset. +find_largest_dims_length <- function(selectors_total_list, array_of_files_to_load, + selector_indices_save, dat, expected_inner_dims, + synonims, file_dim_reader) { + # Open and get all the dims from all the files + data_dims_all_files <- vector('list', length = length(selectors_total_list)) + + for (selectors_kk in 1:length(data_dims_all_files)) { + file_to_open <- do.call("[", c(list(array_of_files_to_load), + as.list(selector_indices_save[[selectors_kk]]))) + data_dims_all_files[[selectors_kk]] <- try( + file_dim_reader(file_to_open, NULL, selectors_total_list[[selectors_kk]], + lapply(dat[['selectors']][expected_inner_dims], '[[', 1), + synonims), silent = TRUE) + + } + + # Remove the missing files (i.e., fail try above) + if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) { + tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error') + data_dims_all_files <- data_dims_all_files[-tmp] + } + + # Find the longest dimensions from all the files + largest_data_dims <- rep(0, length(data_dims_all_files[[1]])) + + # The inner dim order may differ among files. Need to align them before + # find out the largest dim length. 
+ dim_names_first_file <- names(data_dims_all_files[[1]]) + same_dim_order <-lapply(lapply(data_dims_all_files, names), + identical, dim_names_first_file) + for (to_fix in which(!unlist(same_dim_order))) { + data_dims_all_files[[to_fix]] <- data_dims_all_files[[to_fix]][match(dim_names_first_file, + names(data_dims_all_files[[to_fix]]))] + } + + for (kk in 1:length(data_dims_all_files[[1]])) { + largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) + } + names(largest_data_dims) <- names(data_dims_all_files[[1]]) + return(largest_data_dims) +} + +# Gererate vars_to_transform from picked_vars[[i]] and picked_common_vars +generate_vars_to_transform <- function(vars_to_transform, picked_vars, transform_vars, + picked_vars_ordered) { + # In Start(), picked_vars can be picked_vars[[i]] or picked_common_vars + picked_vars_to_transform <- which(names(picked_vars) %in% transform_vars) + if (length(picked_vars_to_transform) > 0) { + picked_vars_to_transform <- names(picked_vars)[picked_vars_to_transform] + new_vars_to_transform <- picked_vars[picked_vars_to_transform] + which_are_ordered <- which(!sapply(picked_vars_ordered[picked_vars_to_transform], is.null)) + + if (length(which_are_ordered) > 0) { + tmp <- which(!is.na(match(names(picked_vars_ordered), names(which_are_ordered)))) + new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[tmp] + } + vars_to_transform <- c(vars_to_transform, new_vars_to_transform) + } + return(vars_to_transform) +} + +# Out-of-range warning +show_out_of_range_warning <- function(inner_dim, range, bound) { + # bound: 'lower' or 'upper' + .warning(paste0("The ", bound, " boundary of selector of ", inner_dim, + " is out of range [", min(range), ", ", max(range), "]. ", + "Check if the desired range is all included.")) +} + + +# Generate sub_array_of_fri +generate_sub_array_of_fri <- function(with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta, + is_circular_dim) { + print_warning <- FALSE + if (goes_across_prime_meridian) { + #NOTE: The potential problem here is, if it is global longitude, + # and the indices overlap (e.g., lon = [0, 359.723] and + # CircularSort(-180, 180), then sub_array_of_indices = list(649, 649)). + # Therefore, sub_array_of_fri will be c(1:649, 649:1296). We'll get two 649. + # The fix below may not be the best solution, but it works for the example above. + + if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) { + # global longitude + sub_array_of_fri <- 1:n # n = prod(dim(var_with_selectors)) + + if (with_transform & beta != 0) { + # Warning if transform_extra_cell != 0 + print_warning <- TRUE + } + + } else { + # normal case, i.e., not global + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + if (with_transform) { + gap_width <- last_index - first_index - 1 + actual_beta <- min(gap_width, beta) + sub_array_of_fri <- c(1:(first_index + actual_beta), + (last_index - actual_beta):n) + if (actual_beta != beta) { + print_warning <- TRUE + } + } else { + sub_array_of_fri <- c(1:first_index, last_index:n) + } + } + + } else { + #NOTE: This if seems redundant. 
+# if (is.list(sub_array_of_indices)) { +# sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] +# } + #NOTE: sub_array_of_indices may be vector or list + if (with_transform) { + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + start_padding <- min(beta, first_index - 1) + end_padding <- min(beta, n - last_index) + + if (!is_circular_dim) { #latitude or when _reorder is not used + sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) + if (start_padding != beta | end_padding != beta) { + print_warning <- TRUE + } + } else { #longitude + # if (((last_index + beta) - (first_index - beta) + 1) >= n) { + if (start_padding <= beta & end_padding <= beta) { + sub_array_of_fri <- 1:n + } else if (start_padding < beta) { # left side too close to border, need to go to right side + sub_array_of_fri <- c((first_index - start_padding):(last_index + end_padding), (n - (beta - start_padding - 1)):n) + } else if (end_padding < beta) { # right side too close to border, need to go to left side + sub_array_of_fri <- c(1: (beta - end_padding), (first_index - start_padding):(last_index + end_padding)) + } else { #normal + sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding) + } + } + + } else { + if (is.list(sub_array_of_indices)) { + sub_array_of_fri <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + } else { + sub_array_of_fri <- sub_array_of_indices + } + } + } + if (print_warning) { + .warning(paste0("Adding parameter transform_extra_cells = ", beta, + " to the transformed index excesses ", + "the border. The border index is used for transformation.")) + } + + return(sub_array_of_fri) +} + +# This function merges two dimensions (e.g., time and sdate if "time_across = 'sdate'") into one. +# The two dimensions have to be next to each other. In Start(), it is used to reshape +# final_dims_fake if merge_across_dims = TRUE +dims_merge <- function(inner_dims_across_files, final_dims_fake) { + # inner_dims_across_files would be like: $sdate: "time" + for (file_dim_across in names(inner_dims_across_files)) { + inner_dim_pos <- which(names(final_dims_fake) == inner_dims_across_files[[file_dim_across]]) + new_dims <- c() + # part 1: Put the dims before 'time' in new_dims + if (inner_dim_pos > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(inner_dim_pos - 1)]) + } + # part 2: Merge time and sdate together, and name this dim as 'time' + # The cross and being crossed dims are next to each other, e.g., [time, sdate] + new_dims <- c(new_dims, setNames(prod(final_dims_fake[c(inner_dim_pos, inner_dim_pos + 1)]), + inner_dims_across_files[[file_dim_across]])) + # part 3: Put the dimes after 'sdate' in new_dims + if (inner_dim_pos + 1 < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(inner_dim_pos + 2):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + } + return(final_dims_fake) +} + +# This function splits one dimension into two. In Start(), it is used to reshape final_dims_fake +# if split_multiselected_dims = TRUE. 
+dims_split <- function(dim_params, final_dims_fake) { + all_split_dims <- NULL + for (dim_param in 1:length(dim_params)) { + split_dims <- dim(dim_params[[dim_param]]) + if (!is.null(split_dims)) { + if (length(split_dims) > 1) { + all_split_dims <- c(all_split_dims, setNames(list(split_dims), + names(dim_params)[dim_param])) + if (is.null(names(split_dims))) { + names(split_dims) <- paste0(names(dim_params)[dim_param], + 1:length(split_dims)) + } + old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param]) + + # If merge_across_dims and split_multiselected_dims are both used, + # on one file dim, and this file dim is multi-dim, it doesn't work. + if (identical(old_dim_pos, integer(0))) { + stop(paste0("The dimension '", names(dim_params)[dim_param], + "' to be split cannot be found after 'merge_across_dims' ", + "is used. Check if the reshape parameters are used appropriately.")) + } + + # NOTE: Three steps to create new dims. + # 1st: Put in the dims before split_dim. + # 2nd: Replace the old_dim with split_dims. + # 3rd: Put in the dims after split_dim. + new_dims <- c() + if (old_dim_pos > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)]) + } + new_dims <- c(new_dims, split_dims) + if (old_dim_pos < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + } + } + } + return(list(final_dims_fake, all_split_dims)) +} + + +# This function sums up the length of all the inner across dim (e.g., time: list(31, 29, 31, 30)) +# and use it to replace the value of that inner dim. That is, it returns the actual length of +# time rather than using the one including NAs. In Start(), it is used to reshape final_dims_fake +# if merge_across_dims = TRUE, merge_across_dims_narm = TRUE, and split_multiselected_dims = FALSE. +merge_narm_dims <- function(final_dims_fake, across_inner_dim, length_inner_across_dim) { + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name + return(final_dims_fake) +} + + + +# Adjust the dim order. If split_multiselected_dims + merge_across_dims, the dim order may +# need to be changed. The inner_dim needs to be the first dim among split dims. +reorder_split_dims <- function(all_split_dims, inner_dim_pos_in_split_dims, final_dims_fake) { + all_split_dims <- c(all_split_dims[inner_dim_pos_in_split_dims], + all_split_dims[1:length(all_split_dims)][-inner_dim_pos_in_split_dims]) + split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(all_split_dims)))) + new_dims <- c() + if (split_dims_pos[1] != 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, all_split_dims) + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + + return(list(final_dims_fake, all_split_dims)) +} + + +# Build the work pieces. 
+build_work_pieces <- function(work_pieces, i, selectors, file_dims, inner_dims, final_dims, + found_pattern_dim, inner_dims_across_files, array_of_files_to_load, + array_of_not_found_files, array_of_metadata_flags, + metadata_file_counter, depending_file_dims, transform, + transform_vars, picked_vars, picked_vars_ordered, picked_common_vars, + metadata_folder, debug = debug) { + sub_array_dims <- final_dims[file_dims] + sub_array_dims[found_pattern_dim] <- 1 + sub_array_of_files_to_load <- array(1:prod(sub_array_dims), + dim = sub_array_dims) + names(dim(sub_array_of_files_to_load)) <- names(sub_array_dims) + # Detect which of the dimensions of the dataset go across files. + file_dim_across_files <- lapply(inner_dims, + function(x) { + dim_across <- sapply(inner_dims_across_files, function(y) x %in% y) + if (any(dim_across)) { + names(inner_dims_across_files)[which(dim_across)[1]] + } else { + NULL + } + }) + names(file_dim_across_files) <- inner_dims + j <- 1 + while (j <= prod(sub_array_dims)) { + # Work out file path. + file_to_load_sub_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] + names(file_to_load_sub_indices) <- names(sub_array_dims) + file_to_load_sub_indices[found_pattern_dim] <- i + big_dims <- rep(1, length(dim(array_of_files_to_load))) + names(big_dims) <- names(dim(array_of_files_to_load)) + file_to_load_indices <- .MergeArrayDims(file_to_load_sub_indices, big_dims)[[1]] + file_to_load <- do.call('[[', c(list(array_of_files_to_load), + as.list(file_to_load_indices))) + not_found_file <- do.call('[[', c(list(array_of_not_found_files), + as.list(file_to_load_indices))) + load_file_metadata <- do.call('[', c(list(array_of_metadata_flags), + as.list(file_to_load_indices))) + if (load_file_metadata) { + metadata_file_counter <- metadata_file_counter + 1 + assign('metadata_file_counter', metadata_file_counter, envir = parent.frame()) + } + if (!is.na(file_to_load) && !not_found_file) { + # Work out indices to take + first_round_indices <- lapply(inner_dims, + function (x) { + if (is.null(file_dim_across_files[[x]])) { + x_dim_name <- attr(attr(selectors[[x]][['fri']], "dim"), "names") + if (!is.null(x_dim_name)) { + which_chunk <- file_to_load_sub_indices[x_dim_name] + selectors[[x]][['fri']][[which_chunk]] + } else { + selectors[[x]][['fri']][[1]] + } + } else { + which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] + selectors[[x]][['fri']][[which_chunk]] + } + }) + names(first_round_indices) <- inner_dims + second_round_indices <- lapply(inner_dims, + function (x) { + if (is.null(file_dim_across_files[[x]])) { + x_dim_name <- attr(attr(selectors[[x]][['sri']], "dim"), "names") + if (!is.null(x_dim_name)) { + which_chunk <- file_to_load_sub_indices[x_dim_name] + selectors[[x]][['sri']][[which_chunk]] + } else { + selectors[[x]][['sri']][[1]] + } + } else { + which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] + selectors[[x]][['sri']][[which_chunk]] + } + }) + if (debug) { + print("-> BUILDING A WORK PIECE") + #print(str(selectors)) + } + names(second_round_indices) <- inner_dims + if (!any(sapply(first_round_indices, length) == 0)) { + work_piece <- list() + work_piece[['first_round_indices']] <- first_round_indices + work_piece[['second_round_indices']] <- second_round_indices + work_piece[['file_indices_in_array_of_files']] <- file_to_load_indices + work_piece[['file_path']] <- file_to_load + work_piece[['store_dims']] <- final_dims + # Work out store position + store_position <- final_dims + 
store_position[names(file_to_load_indices)] <- file_to_load_indices + store_position[inner_dims] <- rep(1, length(inner_dims)) + work_piece[['store_position']] <- store_position + # Work out file selectors + file_selectors <- sapply(file_dims, + function (x) { + vector_to_pick <- 1 + if (x %in% names(depending_file_dims)) { + vector_to_pick <- file_to_load_indices[depending_file_dims[[x]]] + } + selectors[file_dims][[x]][[vector_to_pick]][file_to_load_indices[x]] + }) + names(file_selectors) <- file_dims + work_piece[['file_selectors']] <- file_selectors + # Send variables for transformation + if (!is.null(transform) && (length(transform_vars) > 0)) { + vars_to_transform <- NULL + picked_vars_to_transform <- which(names(picked_vars) %in% transform_vars) + if (length(picked_vars_to_transform) > 0) { + picked_vars_to_transform <- names(picked_vars)[picked_vars_to_transform] + vars_to_transform <- c(vars_to_transform, picked_vars[picked_vars_to_transform]) + if (any(picked_vars_to_transform %in% names(picked_vars_ordered))) { + picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered))] + vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[picked_vars_ordered_to_transform] + } + } + picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars) + if (length(picked_common_vars_to_transform) > 0) { + picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform] + vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform]) + if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) { + picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))] + vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform] + } + } + work_piece[['vars_to_transform']] <- vars_to_transform + } + # Send flag to load metadata + if (load_file_metadata) { + work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter) + } + work_pieces <- c(work_pieces, list(work_piece)) + } + } + j <- j + 1 + } + + return(work_pieces) +} + +# Calculate the progress %s that will be displayed and assign them to the appropriate work pieces. 
+retrieve_progress_message <- function(work_pieces, num_procs, silent) { + if (length(work_pieces) / num_procs >= 2 && !silent) { + if (length(work_pieces) / num_procs < 10) { + amount <- 100 / ceiling(length(work_pieces) / num_procs) + reps <- ceiling(length(work_pieces) / num_procs) + } else { + amount <- 10 + reps <- 10 + } + progress_steps <- rep(amount, reps) + if (length(work_pieces) < (reps + 1)) { + selected_pieces <- length(work_pieces) + progress_steps <- c(sum(head(progress_steps, reps)), + tail(progress_steps, reps)) + } else { + selected_pieces <- round(seq(1, length(work_pieces), + length.out = reps + 1))[-1] + } + progress_steps <- paste0(' + ', round(progress_steps, 2), '%') + progress_message <- 'Progress: 0%' + } else { + progress_message <- '' + selected_pieces <- NULL + } + piece_counter <- 1 + step_counter <- 1 + work_pieces <- lapply(work_pieces, + function (x) { + if (piece_counter %in% selected_pieces) { + wp <- c(x, list(progress_amount = progress_steps[step_counter])) + step_counter <<- step_counter + 1 + } else { + wp <- x + } + piece_counter <<- piece_counter + 1 + wp + }) + if (!silent) { + .message("If the size of the requested data is close to or above the free shared RAM memory, R may crash.") + .message("If the size of the requested data is close to or above the half of the free RAM memory, R may crash.") + .message(paste0("Will now proceed to read and process ", length(work_pieces), " data files:")) + if (length(work_pieces) < 30) { + lapply(work_pieces, function (x) .message(x[['file_path']], indent = 2)) + } else { + .message("The list of files is long. You can check it after Start() finishes in the output '$Files'.", indent = 2, exdent = 5) + } + } + + # Build the cluster of processes that will do the work and dispatch work pieces. + # The function .LoadDataFile is applied to each work piece. This function will + # open the data file, regrid if needed, subset, apply the mask, + # compute and apply the weights if needed, + # disable extreme values and store in the shared memory matrix. + #print("O") + if (!silent) { + .message("Loading... This may take several minutes...") + if (progress_message != '') { + .message(progress_message, appendLF = FALSE) + } + } + return(work_pieces) +} + +# If merge_across_dims = TRUE and merge_across_dims_narm = TRUE, remove the additional NAs +# due to unequal inner_dim ('time') length across file_dim ('sdate'). +remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, across_inner_dim, + length_inner_across_dim, data_array) { + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? 
+ # Get the length of these two dimensions in final_dims + length_inner_across_store_dims <- final_dims[across_inner_dim] + length_file_across_store_dims <- final_dims[across_file_dim] + + # Create a logical array for merge_across_dims + logi_array <- array(rep(FALSE, + length_file_across_store_dims * length_inner_across_store_dims), + dim = c(length_inner_across_store_dims, length_file_across_store_dims)) + for (i in 1:length_file_across_store_dims) { #1:4 + logi_array[1:length_inner_across_dim[[i]], i] <- TRUE + } + + # First, get the data array with final_dims dimension + data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) + + # Change the NA derived from additional spaces to -9999, then remove these -9999 + func_remove_blank <- function(data_array, logi_array) { + # dim(data_array) = [time, file_date] + # dim(logi_array) = [time, file_date] + # Change the blank spaces from NA to -9999 + data_array[which(!logi_array)] <- -9999 + return(data_array) + } + data_array_final_dims <- multiApply::Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector + + return(data_array_tmp) +} + + + +# When merge_across_dims = TRUE and split_multiselected_dims = TRUE, rearrange the chunks +# (i.e., work_piece) is necessary if one file contains values for discrete dimensions +rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_dims, + final_dims_fake, across_inner_dim, length_inner_across_dim) { + # generate the correct order list from indices_chunk + final_order_list <- list() + i <- 1 + j <- 1 + a <- indices_chunk[i] + while (i <= length(indices_chunk)) { + while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) { + a <- c(a, indices_chunk[i+1]) + i <- i + 1 + } + final_order_list[[j]] <- a + a <- indices_chunk[i+1] + i <- i + 1 + j <- j + 1 + } + names(final_order_list) <- sapply(final_order_list, '[[', 1) + final_order_list <- lapply(final_order_list, length) + + if (!all(diff(as.numeric(names(final_order_list))) > 0)) { + # shape the vector into the array without split_dims + split_dims_pos <- match(all_split_dims[[1]], final_dims_fake) + new_dims <- c() + if (split_dims_pos[1] > 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, prod(all_split_dims[[1]])) + names(new_dims)[split_dims_pos[1]] <- across_inner_dim + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + data_array_no_split <- array(data_array_tmp, dim = new_dims) + # seperate 'time' dim into each work_piece length + data_array_seperate <- list() + tmp <- cumsum(unlist(length_inner_across_dim)) + tmp <- c(0, tmp) + for (i in 1:length(length_inner_across_dim)) { + data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + # re-build the array: chunk + which_chunk <- as.numeric(names(final_order_list)) + how_many_indices <- unlist(final_order_list) + array_piece <- list() + ind_in_array_seperate <- 
as.list(rep(1, length(data_array_seperate))) + for (i in 1:length(final_order_list)) { + array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]], + across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] + } + + # re-build the array: paste + data_array_tmp <- array_piece[[1]] + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + if (length(array_piece) > 1) { + for (i in 2:length(array_piece)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } + } + } + + return(data_array_tmp) +} + + +# Create a list of metadata of the variable (e.g., tas) +create_metadata_list <- function(array_of_metadata_flags, metadata_dims, pattern_dims, + loaded_metadata_files, loaded_metadata, dat_names, + dataset_has_files) { + #NOTE: Here, metadata can be saved in one of two ways: one for $common and the other for $dat + # for $common, it is a list of metadata length. For $dat, it is a list of dat length, + # and each sublist has the metadata for each dat. + dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims] + if (!any(names(dim_of_metadata) == pattern_dims) | + (any(names(dim_of_metadata) == pattern_dims) & + dim_of_metadata[pattern_dims] == 1)) { # put under $common; old code + return_metadata <- vector('list', + length = prod(dim_of_metadata)) + return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata + dim(return_metadata) <- dim_of_metadata + + } else { # put under $dat. metadata_dims has 'dat' and dat length > 1 + return_metadata <- vector('list', + length = dim_of_metadata[pattern_dims]) + names(return_metadata) <- dat_names + for (kk in 1:length(return_metadata)) { + return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat + } + loaded_metadata_count <- 1 + for (kk in 1:length(return_metadata)) { + for (jj in 1:length(return_metadata[[kk]])) { + + if (dataset_has_files[kk]) { + if (loaded_metadata_count %in% loaded_metadata_files) { + return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]] + names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]]) + } else { + return_metadata[[kk]][jj] <- NULL + } + loaded_metadata_count <- loaded_metadata_count + 1 + } else { + return_metadata[[kk]][jj] <- NULL + } + + } + } + } + + return(return_metadata) +} + +# This function adds the metadata of the variable (e.g., tas) into the list of picked_vars or +# picked_common_vars. The metadata is only retrieved when 'retrieve = TRUE'. +combine_metadata_picked_vars <- function(return_metadata, picked_vars, picked_common_vars, + metadata_dims, pattern_dims, length_dat) { +#NOTE: The metadata of variables can be saved in one of the two different structures. +# (1) metadata_dims != 'dat', or (metadata_dims == 'dat' & length(dat) == 1): +# put under $common +# (2) (metadata_dims == 'dat' & length(dat) > 1): +# put under $dat1, $dat2, .... Put it in picked_vars list +#TODO: The current (2) uses the inefficient method. Should define the list structure first +# then fill the list, rather than expand it in the for loop. 
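+# Illustrative result (dataset names are examples): in case (1) the variable metadata ends
+# up under $common, e.g. Variables_list$common$tas; in case (2) it ends up under each
+# dataset, e.g. Variables_list$system4_m1$tas and Variables_list$system5_m1$tas.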
+ + if (any(metadata_dims == pattern_dims) & length_dat > 1) { # (2) + for (kk in 1:length(return_metadata)) { + sublist_names <- lapply(return_metadata, names)[[kk]] + if (!is.null(sublist_names)) { + for (jj in 1:length(sublist_names)) { + picked_vars[[kk]][[sublist_names[jj]]] <- return_metadata[[kk]][[jj]] + } + } + } + Variables_list <- c(list(common = picked_common_vars), picked_vars) + + } else { #(1) + len <- unlist(lapply(return_metadata, length)) + len <- sum(len) + length(which(len == 0)) #0 means NULL + name_list <- lapply(return_metadata, names) + new_list <- vector('list', length = len) + count <- 1 + + for (kk in 1:length(return_metadata)) { + if (length(return_metadata[[kk]]) == 0) { #NULL + count <- count + 1 + } else { + for (jj in 1:length(return_metadata[[kk]])) { + new_list[[count]] <- return_metadata[[kk]][[jj]] + names(new_list)[count] <- name_list[[kk]][jj] + count <- count + 1 + } + } + } + Variables_list <- c(list(common = c(picked_common_vars, new_list)), picked_vars) + } + + return(Variables_list) +} -- GitLab From 6e7523c429be9ea9a5e20343b65bdebbf6c57c63 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 4 Aug 2021 16:05:44 +0200 Subject: [PATCH 06/10] Add data value check --- tests/testthat/test-Start-metadata_dims.R | 70 ++++++++++++++++++----- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/tests/testthat/test-Start-metadata_dims.R b/tests/testthat/test-Start-metadata_dims.R index b514df6..841786d 100644 --- a/tests/testthat/test-Start-metadata_dims.R +++ b/tests/testthat/test-Start-metadata_dims.R @@ -8,8 +8,8 @@ suppressWarnings( sdate = '20170101', ensemble = indices(1), time = indices(1), - lat = indices(1:10), - lon = indices(1:10), + lat = indices(1:2), + lon = indices(2:3), synonims = list(lat = c('lat', 'latitude'), lon = c('lon', 'longitude')), return_vars = list(time = 'sdate', @@ -39,6 +39,11 @@ suppressWarnings( length(attr(data, 'Variables')$common$tas), 12 ) + expect_equal( + data[1, 1, 1, 1, 1, , 1], + c(248.5012, 248.7815), + tolerance = 0.0001 + ) }) @@ -53,8 +58,8 @@ suppressWarnings( sdate = '20170101', ensemble = indices(1), time = indices(1), - lat = indices(1:10), - lon = indices(1:10), + lat = indices(1:2), + lon = indices(2:3), synonims = list(lat = c('lat', 'latitude'), lon = c('lon', 'longitude')), return_vars = list(time = 'sdate', @@ -92,7 +97,16 @@ suppressWarnings( length(attr(data, 'Variables')$system4_m1$tas), 11 ) - + expect_equal( + data[, 1, 1, 1, 1, 1, 1], + c(247.2570, 248.5012), + tolerance = 0.0001 + ) + expect_equal( + data[, 1, 1, 1, 1, 1, 2], + c(247.2570, 248.5016), + tolerance = 0.0001 + ) }) test_that("3. One data set, two vars", { @@ -100,13 +114,12 @@ test_that("3. One data set, two vars", { 'EC-Earth3/historical/r24i1p1f1/Amon/$var$/gr/v20190312/', '$var$_Amon_EC-Earth3_historical_r24i1p1f1_gr_185001-185012.nc') var <- c('tas', 'clt') - sdate <- '20170101' suppressWarnings( data <- Start(dat = repos, var = var, time = indices(1), - lat = indices(1:10), - lon = indices(10:19), + lat = indices(9:10), + lon = indices(10:11), return_vars = list(lat = NULL, lon = NULL), metadata_dims = 'var', synonims = list(lat = c('lat', 'latitude'), @@ -138,7 +151,16 @@ suppressWarnings( length(attr(data, 'Variables')$common$clt), 16 ) - + expect_equal( + data[1, , 1, 1, 1], + c(249.42436, 32.45226), + tolerance = 0.0001 + ) + expect_equal( + data[1, , 1, 2, 1], + c(250.00110, 25.04345), + tolerance = 0.0001 + ) }) test_that("4. 
Two data sets, two vars", { @@ -151,8 +173,8 @@ suppressWarnings( sdate = '20170101', ensemble = indices(1), time = indices(1), - lat = indices(1:10), - lon = indices(1:10), + lat = indices(1:2), + lon = indices(1:2), synonims = list(lat = c('lat', 'latitude'), lon = c('lon', 'longitude')), return_vars = list(time = 'sdate', @@ -190,7 +212,18 @@ suppressWarnings( length(attr(data, 'Variables')$system4_m1$tas), 11 ) + expect_equal( + data[1, , 1, 1, 1, 2, 2], + c(247.227219, 6.370782), + tolerance = 0.0001 + ) + expect_equal( + data[2, , 1, 1, 1, 2, 2], + c(248.781540, 5.794801), + tolerance = 0.0001 + ) +#------------------------------------------------------------- suppressWarnings( data <- Start(dat = list(list(name = 'system4_m1', path = repos2), list(name = 'system5_m1', path = repos)), @@ -198,8 +231,8 @@ suppressWarnings( sdate = '20170101', ensemble = indices(1), time = indices(1), - lat = indices(1:10), - lon = indices(1:10), + lat = indices(1:2), + lon = indices(1:2), synonims = list(lat = c('lat', 'latitude'), lon = c('lon', 'longitude')), return_vars = list(time = 'sdate', @@ -246,7 +279,16 @@ suppressWarnings( length(attr(data, 'Variables')$system4_m1$sfcWind), 11 ) - + expect_equal( + data[1, , 1, 1, 1, 2, 2], + c(247.227219, 6.370782), + tolerance = 0.0001 + ) + expect_equal( + data[2, , 1, 1, 1, 2, 2], + c(248.781540, 5.794801), + tolerance = 0.0001 + ) }) test_that("5. Specify metadata_dims with another file dimension", { -- GitLab From 87c0f579be4aa897f57172db8cfae2419f4643c3 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 4 Aug 2021 18:13:16 +0200 Subject: [PATCH 07/10] Revise ex1_8 and its corresponding unit test --- inst/doc/usecase/ex1_8_tasandtos.R | 137 +++++++++++++++++---------- tests/testthat/test-Start-two_dats.R | 61 +++++++----- 2 files changed, 125 insertions(+), 73 deletions(-) diff --git a/inst/doc/usecase/ex1_8_tasandtos.R b/inst/doc/usecase/ex1_8_tasandtos.R index a384368..2c9e75e 100644 --- a/inst/doc/usecase/ex1_8_tasandtos.R +++ b/inst/doc/usecase/ex1_8_tasandtos.R @@ -1,6 +1,7 @@ # ----------------------------------------------------- # Loading tas and tos for EC-Earth decadal predictions: # Authors: Carlos Delgado and Núria Pérez-Zanón +# Revised by An-Chi Ho on 4th Aug. 2021 # ------------------------------------------------------ # Three ways to load the same data are provided: @@ -9,21 +10,35 @@ # 3) two Start call (one for each path and variable) -# Case 1) returns dimensions 'dataset' and 'var' with length 2 , but only the positions of the diagonal are filled: -# tas is stored in {dataset = 1, var = 1} -# tos is stored in {dataset = 2, var = 2} -# NOTE!!! check {datastet = 1, var = 2} because an issue in ESMValTool:https://earth.bsc.es/gitlab/es/auto-ecearth3/issues/1258 +# Case 1: +# Return dimensions 'dataset' and 'var' with length 2, but only the positions of the diagonal +# are filled: +# = Amon/tas is stored in {dataset = 1, var = 1} +# = Omon/tos is stored in {dataset = 2, var = 2} +# We choose an ocean region (lon = 150:170; lat = 10:20) so 'tos' will have values. + +## NOTE!!! [dataset = 1, var = 2] has values because an issue in ESMValTool:https://earth.bsc.es/gitlab/es/auto-ecearth3/issues/1258. +## However, the file seems incorrect. tos shouldn't have values on land. But it is a file issue +## rather than Start()'s problem. 
library(startR) -paths = list(list(path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc'), - list(path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/Omon/$var$/gr/v20190713/$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc')) -data1 <- Start(dataset = paths, +path_tas <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/', + '$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') +path_tos <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/Omon/$var$/gr/v20190713/', + '$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') + +data1 <- Start(dataset = list(list(path = path_tas), + list(path = path_tos)), var = c('tas', 'tos'), sdate = paste0(1960:1962), fmonth = 1, - lat = values(list(0, 10)), - lon = values(list(0, 10)), + lat = values(list(10, 20)), + lat_reorder = Sort(), + lon = values(list(150, 170)), + lon_reorder = CircularSort(0, 360), fyear = 'all', member = indices(1), fyear_depends = 'sdate', @@ -32,75 +47,101 @@ data1 <- Start(dataset = paths, synonims = list(fmonth = c('fmonth', 'time'), lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), - metadata_dims = 'var', + metadata_dims = c('dataset', 'var'), return_vars = list(lat = 'dataset', lon = 'dataset'), num_procs = 1, retrieve = TRUE) dim(data1) #dataset var sdate fmonth lat lon member -# 2 2 3 12 14 15 1 +# 2 2 3 1 15 28 1 # Check empty and filled dimensions: -sum(is.na(data1[1,1,,,,,])) == (3*12*14*15) -#[1] FALSE -sum(is.na(data1[1,2,,,,,])) == (3*12*14*15) -#[1] TRUE -sum(is.na(data1[2,2,,,,,])) == (3*12*14*15) -#[1] FALSE -sum(is.na(data1[2,1,,,,,])) == (3*12*14*15) -#[1] TRUE +sum(is.na(data1[1, 1, , , , , ])) +#[1] 0 +sum(is.na(data1[1, 2, , , , , ])) # It should be 1260 if Amon/tos doesn't exist +#[1] 0 +sum(is.na(data1[2, 1, , , , , ])) +#[1] 1260 +sum(is.na(data1[2, 2, , , , , ])) +#[1] 0 lat1 <- as.vector(attributes(data1)$Variables$dat1$lat) lon1 <- as.vector(attributes(data1)$Variables$dat1$lon) +# Check metadata. 'dat1' has 'tas' and 'tos'; 'dat2' has 'tos' +names(attr(data1, 'Variables')$common) +#[1] "fmonth" +names(attr(data1, 'Variables')$dat1) +#[1] "lat" "lon" "tas" "tos" +names(attr(data1, 'Variables')$dat2) +#[1] "lat" "lon" "tos" + # --------------------------------------------------------------- -# Case 2) using a single path, {dataset = 1, var = 2, type = 1} -# 'type' dimension is necessary to distinguish between 'Amon' and 'Omon'. +# Case 2: +# Use a single path, {dataset = 1, var = 2, type = 1}. +# 'type' dimension is necessary to distinguish between 'Amon' and 'Omon', and the dependency +# needs to be specified. 
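+# That is, with type = list('tas' = 'Amon', 'tos' = 'Omon') and type_depends = 'var',
+# the $type$ tag in the path is replaced by 'Amon' when reading 'tas' and by 'Omon' when
+# reading 'tos'.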
library(startR) -path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/$type$/$var$/gr/v20190713/$var$_$type$_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc' +path <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/$type$/$var$/gr/v20190713/', + '$var$_$type$_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') data2 <- Start(dataset = path, var = c('tas', 'tos'), - type = 'all', + type = list('tas' = 'Amon', 'tos' = 'Omon'), type_depends = 'var', sdate = paste0(1960:1962), fmonth = 1, - lat = values(list(0, 10)), - lon = values(list(0, 10)), + lat = values(list(10, 20)), + lat_reorder = Sort(), + lon = values(list(150, 170)), + lon_reorder = CircularSort(0, 360), fyear = indices(1), member = indices(1), fyear_depends = 'sdate', fmonth_across = 'fyear', merge_across_dims = TRUE, - synonims = list(fmonth = c('fmonth','time'), + synonims = list(fmonth = c('fmonth', 'time'), lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), + metadata_dims = 'var', return_vars = list(lat = 'dataset', lon = 'dataset'), num_procs = 1, retrieve = TRUE) dim(data2) -# dataset var type sdate fmonth lat lon member -# 1 2 1 3 1 14 15 1 +#dataset var type sdate fmonth lat lon member +# 1 2 1 3 1 15 28 1 +# Compare data1 and data2 +identical(as.vector(data1[1, 1, , 1, , , 1]), as.vector(data2[1, 1, 1, , 1, , , 1])) +#[1] TRUE +identical(as.vector(data1[2, 2, , 1, , , 1]), as.vector(data2[1, 2, 1, , 1, , , 1])) +#[1] TRUE # --------------------------------------------------------------- -# Case 3) Two different Start calls can save data_tas and data_tos both with {dataset = 1 and var = 1} dimensions and avoiding extra dimensions like 'type'. +# Case 3: +# Two different Start calls can save data_tas and data_tos both with {dataset = 1 and var = 1} +# dimensions and avoid extra dimensions like 'type'. 
-path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc' +path <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/', + '$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') data_tas <- Start(dataset = path, var = 'tas', sdate = paste0(1960:1962), fmonth = 1, - lat = values(list(0, 10)), - lon = values(list(0, 10)), + lat = values(list(10, 20)), + lat_reorder = Sort(), + lon = values(list(150, 170)), + lon_reorder = CircularSort(0, 360), fyear = indices(1), member = indices(1), fyear_depends = 'sdate', fmonth_across = 'fyear', merge_across_dims = TRUE, - synonims = list(fmonth = c('fmonth','time'), + synonims = list(fmonth = c('fmonth', 'time'), lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), return_vars = list(lat = 'dataset', lon = 'dataset'), @@ -108,21 +149,25 @@ data_tas <- Start(dataset = path, dim(data_tas) #dataset var sdate fmonth lat lon member -# 1 1 3 1 14 15 1 +# 1 1 3 1 15 28 1 -path = '/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/Omon/$var$/gr/v20190713/$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc' +path <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', + 'dcppA-hindcast/$member$/Omon/$var$/gr/v20190713/', + '$var$_Omon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') data_tos <- Start(dataset = path, var = 'tos', sdate = paste0(1960:1962), fmonth = 1, - lat = values(list(0, 10)), - lon = values(list(0, 10)), + lat = values(list(10, 20)), + lat_reorder = Sort(), + lon = values(list(150, 170)), + lon_reorder = CircularSort(0, 360), fyear = indices(1), member = indices(1), fyear_depends = 'sdate', fmonth_across = 'fyear', merge_across_dims = TRUE, - synonims = list(fmonth = c('fmonth','time'), + synonims = list(fmonth = c('fmonth', 'time'), lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), return_vars = list(lat = 'dataset', lon = 'dataset'), @@ -130,15 +175,11 @@ data_tos <- Start(dataset = path, dim(data_tos) #dataset var sdate fmonth lat lon member -# 1 1 3 1 14 15 1 - -# --------------------------------------------------------------------- +# 1 1 3 1 15 28 1 -# Comparison cases 1) to 3): -#---------------------------------------------------------------------- - -all(data1[1, 1, , , , , ] == data_tas[1, 1, , , , , ]) -all((data1[2, 2, , , , , ]) == data_tos[1, 1, , , , , ], na.rm = TRUE) -all(data2[1, 1, 1, , , , , ] == data_tas[1, 1, , , , ,]) -all((data2[1, 2, 1, , , , , ]) == data_tos[1, 1, , , , , ], na.rm = TRUE) +# Compare with previous results +identical(as.vector(data2[1, 1, 1, , 1, , , 1]), as.vector(data_tas)) +#[1] TRUE +identical(as.vector(data2[1, 2, 1, , 1, , , 1]), as.vector(data_tos)) +#[1] TRUE diff --git a/tests/testthat/test-Start-two_dats.R b/tests/testthat/test-Start-two_dats.R index 74738d0..4fa8642 100644 --- a/tests/testthat/test-Start-two_dats.R +++ b/tests/testthat/test-Start-two_dats.R @@ -1,7 +1,7 @@ # ex1_8 -context("Start() two dats in one call") +context("Start() two dats and two vars in one call") -test_that("1. ex1_8", { +test_that("1. 
ex1_8, case 1", { path_tas <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/', 'dcppA-hindcast/$member$/Amon/$var$/gr/v20190713/', @@ -12,53 +12,49 @@ path_tos <- paste0('/esarchive/exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consorti suppressWarnings( data <- Start(dataset = list(list(path = path_tas), list(path = path_tos)), var = c('tas', 'tos'), - sdate = paste0(1960:1962), - fmonth = 1, - lat = values(list(8, 10)), - lon = values(list(8, 10)), + sdate = paste0(1960), + time = 1, + lat = values(list(10, 20)), + lat_reorder = Sort(), + lon = values(list(150, 170)), + lon_reorder = CircularSort(0, 360), fyear = 'all', member = indices(1), fyear_depends = 'sdate', - fmonth_across = 'fyear', + time_across = 'fyear', merge_across_dims = TRUE, - synonims = list(fmonth = c('fmonth', 'time'), - lon = c('lon', 'longitude'), + synonims = list(lon = c('lon', 'longitude'), lat = c('lat', 'latitude')), - metadata_dims = 'var', + metadata_dims = c('dataset', 'var'), return_vars = list(lat = 'dataset', lon = 'dataset'), retrieve = TRUE) ) expect_equal( dim(data), -c(dataset = 2, var = 2, sdate = 3, fmonth = 1, lat = 3, lon = 3, member = 1) +c(dataset = 2, var = 2, sdate = 1, time = 1, lat = 15, lon = 28, member = 1) ) expect_equal( sum(data, na.rm = T), -15396.7, +264710, tolerance = 0.0001 ) expect_equal( -length(data[is.na(data)]), -54 -) -expect_equal( sum(is.na(data[1,1,,,,,])), 0 ) -# Amon also has tos +# Amon also has tos (though the file is incorrect) expect_equal( sum(is.na(data[1,2,,,,,])), 0 ) expect_equal( sum(is.na(data[2,1,,,,,])), -27 +420 ) -# WRONG!!!! Omon should have tos. The value should be 0 expect_equal( sum(is.na(data[2,2,,,,,])), -27 +0 ) expect_equal( @@ -67,23 +63,38 @@ c("common", "dat1", "dat2") ) expect_equal( names(attr(data, 'Variables')$common), -c("fmonth", "tas", "tos") +c("time") ) expect_equal( names(attr(data, 'Variables')$dat1), -c("lat", "lon") +c("lat", "lon", "tas", "tos") ) expect_equal( names(attr(data, 'Variables')$dat2), -c("lat", "lon") +c("lat", "lon", "tos") ) expect_equal( -length(attr(data, 'Variables')$common$tas), +length(attr(data, 'Variables')$dat1$tas), 17 ) expect_equal( -length(attr(data, 'Variables')$common$tos), +length(attr(data, 'Variables')$dat1$tos), 16 ) +expect_equal( +length(attr(data, 'Variables')$dat2$tos), +16 +) + +expect_equal( +data[1, , 1, 1, 1, 1, 1], +c(299.9199, 302.5184), +tolerance = 0.0001 +) +expect_equal( +data[2, , 1, 1, 1, 1, 1], +c(NA, 29.3684), +tolerance = 0.0001 +) }) -- GitLab From 93ac5f8c4b076c3ec94e343e8a56f8792fc8cc4a Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 4 Aug 2021 18:15:12 +0200 Subject: [PATCH 08/10] Bugfix for metadata file count --- R/Start.R | 4 ++-- R/zzz.R | 64 +++++++++++++++++++++++++++---------------------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/R/Start.R b/R/Start.R index 6f602a5..5f4a295 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3440,8 +3440,8 @@ Start <- function(..., # dim = indices/selectors, # the appropriate work pieces. work_pieces <- retrieve_progress_message(work_pieces, num_procs, silent) -# NOTE: In .LoadDataFile(), metadata is saved in metadata_folder/1 or /2 etc. Before here, -# the path name is created in work_pieces but the path hasn't been built yet. + # NOTE: In .LoadDataFile(), metadata is saved in metadata_folder/1 or /2 etc. Before here, + # the path name is created in work_pieces but the path hasn't been built yet. 
if (num_procs == 1) { found_files <- lapply(work_pieces, .LoadDataFile, shared_matrix_pointer = shared_matrix_pointer, diff --git a/R/zzz.R b/R/zzz.R index a775b47..d5429d1 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1087,41 +1087,41 @@ create_metadata_list <- function(array_of_metadata_flags, metadata_dims, pattern #NOTE: Here, metadata can be saved in one of two ways: one for $common and the other for $dat # for $common, it is a list of metadata length. For $dat, it is a list of dat length, # and each sublist has the metadata for each dat. - dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims] - if (!any(names(dim_of_metadata) == pattern_dims) | - (any(names(dim_of_metadata) == pattern_dims) & - dim_of_metadata[pattern_dims] == 1)) { # put under $common; old code - return_metadata <- vector('list', - length = prod(dim_of_metadata)) - return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata - dim(return_metadata) <- dim_of_metadata - - } else { # put under $dat. metadata_dims has 'dat' and dat length > 1 - return_metadata <- vector('list', - length = dim_of_metadata[pattern_dims]) - names(return_metadata) <- dat_names - for (kk in 1:length(return_metadata)) { - return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat - } - loaded_metadata_count <- 1 - for (kk in 1:length(return_metadata)) { - for (jj in 1:length(return_metadata[[kk]])) { - - if (dataset_has_files[kk]) { - if (loaded_metadata_count %in% loaded_metadata_files) { - return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]] - names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]]) - } else { - return_metadata[[kk]][jj] <- NULL - } - loaded_metadata_count <- loaded_metadata_count + 1 - } else { - return_metadata[[kk]][jj] <- NULL - } - + dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims] + if (!any(names(dim_of_metadata) == pattern_dims) | + (any(names(dim_of_metadata) == pattern_dims) & + dim_of_metadata[pattern_dims] == 1)) { # put under $common; old code + return_metadata <- vector('list', + length = prod(dim_of_metadata)) + return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata + dim(return_metadata) <- dim_of_metadata + + } else { # put under $dat. 
metadata_dims has 'dat' and dat length > 1 + return_metadata <- vector('list', + length = dim_of_metadata[pattern_dims]) + names(return_metadata) <- dat_names + for (kk in 1:length(return_metadata)) { + return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat + } + loaded_metadata_count <- 1 + for (kk in 1:length(return_metadata)) { + for (jj in 1:length(return_metadata[[kk]])) { + if (dataset_has_files[kk]) { + if (loaded_metadata_count %in% loaded_metadata_files) { + return_metadata[[kk]][jj] <- loaded_metadata[[which(loaded_metadata_files == loaded_metadata_count)]] + names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[which(loaded_metadata_files == loaded_metadata_count)]]) + + } else { + return_metadata[[kk]][jj] <- NULL } + loaded_metadata_count <- loaded_metadata_count + 1 + } else { + return_metadata[[kk]][jj] <- NULL } + } + } + } return(return_metadata) } -- GitLab From 5834f9315400f9a541463b808c72b2e1babf210b Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 16 Aug 2021 16:48:13 +0200 Subject: [PATCH 09/10] Fix the warning and note from check --- DESCRIPTION | 3 ++- R/Start.R | 8 +++++--- R/zzz.R | 8 ++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 27d4a77..6ad8a64 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,7 +32,8 @@ Imports: PCICt Suggests: stats, - utils + utils, + testthat License: LGPL-3 URL: https://earth.bsc.es/gitlab/es/startR/ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues diff --git a/R/Start.R b/R/Start.R index 5f4a295..5571dae 100644 --- a/R/Start.R +++ b/R/Start.R @@ -950,7 +950,7 @@ Start <- function(..., # dim = indices/selectors, # Check if pattern_dims is the first item in metadata_dims if ((pattern_dims %in% metadata_dims) & metadata_dims[1] != pattern_dims) { - metadata_dims <- c(pattern_dims, metadata_dims[-which(metadata_dim == pattern_dims)]) + metadata_dims <- c(pattern_dims, metadata_dims[-which(metadata_dims == pattern_dims)]) } # Check if metadata_dims has more than 2 elements if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) { @@ -974,7 +974,8 @@ Start <- function(..., # dim = indices/selectors, dim_params[[found_pattern_dim]] <- dim_params[[found_pattern_dim]][dats_to_take] dat <- dim_params[[found_pattern_dim]] #NOTE: This function creates the object 'dat_names' - dat <- mount_dat(dat, pattern_dim, found_pattern_dim) + dat_names <- c() + dat <- mount_dat(dat, pattern_dims, found_pattern_dim, dat_names) dim_params[[found_pattern_dim]] <- dat_names @@ -3427,7 +3428,8 @@ Start <- function(..., # dim = indices/selectors, depending_file_dims = depending_file_dims, transform = transform, transform_vars = transform_vars, picked_vars = picked_vars[[i]], picked_vars_ordered = picked_vars_ordered[[i]], - picked_common_vars = picked_common_vars, + picked_common_vars = picked_common_vars, + picked_common_vars_ordered = picked_common_vars_ordered, metadata_folder = metadata_folder, debug = debug) } } diff --git a/R/zzz.R b/R/zzz.R index d5429d1..b045e85 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -232,11 +232,11 @@ found_pattern_dims <- function(pattern_dims, dim_names, var_params, # The variable 'dat' is mounted with the information (name, path) of each dataset. # NOTE: This function creates the object 'dat_names' in the parent env. 
-mount_dat <- function(dat, pattern_dim, found_pattern_dim) { +mount_dat <- function(dat, pattern_dims, found_pattern_dim, dat_names) { # dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames') dat_to_fetch <- c() - dat_names <- c() +# dat_names <- c() if (!is.list(dat)) { dat <- as.list(dat) } else { @@ -252,7 +252,7 @@ mount_dat <- function(dat, pattern_dim, found_pattern_dim) { dat[[i]] <- list(name = dat[[i]]) } } else if (!is.list(dat[[i]])) { - stop(paste0("Parameter '", pattern_dim, + stop(paste0("Parameter '", pattern_dims, "' is incorrect. It must be a list of lists or character strings.")) } #if (!(all(names(dat[[i]]) %in% dat_info_names))) { @@ -770,7 +770,7 @@ build_work_pieces <- function(work_pieces, i, selectors, file_dims, inner_dims, array_of_not_found_files, array_of_metadata_flags, metadata_file_counter, depending_file_dims, transform, transform_vars, picked_vars, picked_vars_ordered, picked_common_vars, - metadata_folder, debug = debug) { + picked_common_vars_ordered, metadata_folder, debug = debug) { sub_array_dims <- final_dims[file_dims] sub_array_dims[found_pattern_dim] <- 1 sub_array_of_files_to_load <- array(1:prod(sub_array_dims), -- GitLab From e054a9814ebc106e4701fbbe5727fc6397356701 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 16 Aug 2021 17:37:41 +0200 Subject: [PATCH 10/10] Remove lubridate dependency in tests --- tests/testthat.R | 4 ++++ .../test-Start-implicit_dependency_by_selector.R | 13 +++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/testthat.R b/tests/testthat.R index d424073..5073b5e 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,5 +1,9 @@ library(testthat) library(startR) +library(SpecsVerification) +library(dplyr) +library(plyr) +library(s2dv) test_check("startR") diff --git a/tests/testthat/test-Start-implicit_dependency_by_selector.R b/tests/testthat/test-Start-implicit_dependency_by_selector.R index bcc5ac1..5c2f050 100644 --- a/tests/testthat/test-Start-implicit_dependency_by_selector.R +++ b/tests/testthat/test-Start-implicit_dependency_by_selector.R @@ -79,19 +79,16 @@ data2[1, 1, 1, 1, ] test_that("2. time depends on sdate", { -library(lubridate) - repos <- '/esarchive/exp/ecmwf/system4_m1/daily_mean/$var$_f24h/$var$_$sdate$.nc' - -sdates <- ymd("20010501") + rep(years(0:2), each = 1) -times <- array(ymd("20010501") + days(0:30) + rep(years(0:2), each = 31), - dim = c(time = 31, sdate = 3)) -times <- as.POSIXct(times * 86400, tz = 'UTC', origin = '1970-01-01') +sdates <- paste0(2001:2003, '0501') +tmp <- as.POSIXct(sapply(2001:2003, function(x) paste0(x, '-05-', sprintf('%02d', 1:31))), tz = 'UTC') +tmp <- array(tmp, dim = c(time = 31, sdate = 3)) +times <- as.POSIXct(tmp, tz = 'UTC', origin = '1970-01-01') suppressWarnings( exp <- Start(dat = repos, var = 'tos', - sdate = format(sdates, "%Y%m%d"), + sdate = sdates, time = times, #dim: [time = 31, sdate = 3]. time is corresponding to each sdate ensemble = indices(1:2), lat = indices(1:3), -- GitLab