diff --git a/R/Apply.R b/R/Apply.R index 32f725b69febe8389a701329a0468c15e83a87ef..ba213f58993616dcfa92819f1114305fb872b74a 100644 --- a/R/Apply.R +++ b/R/Apply.R @@ -4,10 +4,12 @@ #' #' @param data One or a list of vectors, matrices or arrays. They must be in the same order as expected by the function provided in the parameter 'fun'. The dimensions do not necessarily have to be ordered. If the 'target_dims' require a different order than the provided, \code{Apply} will automatically reorder the dimensions as needed. #' @param target_dims One or a list of vectors (or NULLs) containing the dimensions to be input into fun for each of the objects in the data. If a single vector of target dimensions is specified and multiple inputs are provided in 'data, then the single set of target dimensions is re-used for all of the inputs. These vectors can contain either integers specifying the position of the dimensions, or character strings corresponding to the dimension names. This parameter is mandatory if 'margins' are not specified. If both 'margins' and 'target_dims' are specified, 'margins' takes priority. -#' @param fun Function to be applied to the arrays. Must receive as many inputs as provided in 'data', each with as many dimensions as specified in 'target_dims' or as the total number of dimensions in 'data' minus the ones specified in 'margins'. The function can receive other additional fixed parameters (see parameter '...' of \code{Apply}). The function can return one or a list of vectors or multidimensional arrays, optionally with dimension names which will be propagated to the final result. The returned list can optionally be named, with a name for each output, which will be propagated to the resulting array. The function can optionally be provided with the attributes 'target_dims' and 'output_dims'. In that case, the corresponding parameters of \code{Apply} do not need to be provided. 
The function can expect named dimensions for each of its inputs, in the same order as specified in 'target_dims' or, if no 'target_dims' have been provided, in the same order as provided in 'data'. +#' @param fun Function to be applied to the arrays. Must receive as many inputs as provided in 'data', each with as many dimensions as specified in 'target_dims' or as the total number of dimensions in 'data' minus the ones specified in 'margins'. The function can receive other additional fixed parameters (see parameter '...' of \code{Apply}). The function can return one or a list of vectors or multidimensional arrays, optionally with dimension names which will be propagated to the final result. The returned list can optionally be named, with a name for each output, which will be propagated to the resulting array. The function can optionally be provided with the attributes 'target_dims' and 'output_dims'. In that case, the corresponding parameters of \code{Apply} do not need to be provided. The function can expect named dimensions for each of its inputs, in the same order as specified in 'target_dims' or, if no 'target_dims' have been provided, in the same order as provided in 'data'. The function can access the variable \code{.margin_indices}, a named numeric vector that provides the indices of the current iteration over the margins, as well as any other variables specified in the parameter \code{extra_info} or input attributes specified in the parameter \code{use_attributes}. #' @param ... Additional fixed arguments expected by the function provided in the parameter 'fun'. #' @param output_dims Optional list of vectors containing the names of the dimensions to be output from the fun for each of the objects it returns (or a single vector if the function has only one output). #' @param margins One or a list of vectors (or NULLs) containing the 'margin' dimensions to be looped over for each input in 'data'. 
If a single vector of margins is specified and multiple inputs are provided in 'data', then the single set of margins is re-used for all of the inputs. These vectors can contain either integers specifying the position of the margins, or character strings corresponding to the dimension names. If both 'margins' and 'target_dims' are specified, 'margins' takes priority. +#' @param use_attributes List of vectors of character strings with names of attributes of each object in 'data' to be propagated to the subsets of data sent as inputs to the function specified in 'fun'. If this parameter is not specified (NULL), all attributes are dropped. This parameter can be specified as a named list (then the names of this list must match those of the names of parameter 'data'), or as an unnamed list (then the vectors of attribute names will be assigned in order to the input arrays in 'data'). +#' @param extra_info Named list of extra variables to be defined for them to be accessible from within the function specified in 'fun'. The variable names will automatically be prepended a heading dot ('.'). So, if the variable 'name = "Tony"' is sent through this parameter, it will be accessible from within 'fun' via '.name'. #' @param guess_dim_names Whether to automatically guess missing dimension names for dimensions of equal length across different inputs in 'data' with a warning (TRUE; default), or to crash whenever unnamed dimensions of equa length are identified across different inputs (FALSE). #' @param ncores The number of parallel processes to spawn for the use for parallel computation in multiple cores. #' @param split_factor Factor telling to which degree the input data should be split into smaller pieces to be processed by the available cores. By default (split_factor = 1) the data is split into 4 pieces for each of the cores (as specified in ncores). A split_factor of 2 will result in 8 pieces for each of the cores, and so on. 
The special value 'greatest' will split the input data into as many pieces as possible. @@ -32,6 +34,7 @@ #' @importFrom utils capture.output Apply <- function(data, target_dims = NULL, fun, ..., output_dims = NULL, margins = NULL, + use_attributes = NULL, extra_info = NULL, guess_dim_names = TRUE, ncores = NULL, split_factor = 1) { # Check data @@ -295,6 +298,102 @@ Apply <- function(data, target_dims = NULL, fun, ..., } } + # Check use_attributes + if (!is.null(use_attributes)) { + if (!is.list(use_attributes)) { + stop("Parameter 'use_attributes' must be a list.") + } + if (is.null(names(data)) && !is.null(names(use_attributes))) { + warning("Parameter 'use_attributes' provided with names, but ", + "no names provided for 'data'. All names will be ", + "disregarded.") + names(use_attributes) <- NULL + } + if (!is.null(names(use_attributes))) { + if (!all(sapply(names(use_attributes), function(x) nchar(x) > 0))) { + stop("If providing names for the list 'use_attributes', all ", + "components must be named.") + } + if (length(unique(names(use_attributes))) != + length(names(use_attributes))) { + stop("The list in parameter 'use_attributes' must not ", + "contain repeated names.") + } + if (any(!(names(use_attributes) %in% names(data)))) { + stop("Provided some names in parameter 'use_attributes' not present ", + "in parameter 'data'.") + } + use_attributes <- use_attributes[names(data)] + } else { + if (length(use_attributes) != length(data)) { + warning("Provided different number of items in 'use_attributes' ", + "and in 'data'. 
Assuming same order.")
+      }
+      use_attributes <- use_attributes[seq_along(data)]
+    }
+  } else {
+    use_attributes <- vector('list', length = length(data))
+  }
+  for (i in seq_along(data)) {
+    if (is.character(use_attributes[[i]])) {
+      use_attributes[[i]] <- as.list(use_attributes[[i]])
+    }
+    if (is.list(use_attributes[[i]])) {
+      if (length(use_attributes[[i]]) == 0) {
+        use_attributes[i] <- list(NULL)
+      } else {
+        if (!all(sapply(use_attributes[[i]],
+                        function(x) all(is.character(x) & nchar(x) > 0)))) {
+          stop("All entries in 'use_attributes' must be character strings ",
+               "of length > 0.")
+        }
+      }
+    } else if (!is.null(use_attributes[[i]])) {
+      stop("Parameter 'use_attributes' must be a list of character vectors or ",
+           "a list of lists of character vectors.")
+    }
+    for (j in seq_along(use_attributes[[i]])) {
+      if (length(use_attributes[[i]][[j]]) == 1 &&
+          use_attributes[[i]][[j]] == 'dim') {
+        stop("Requesting the attribute 'dim' via the parameter ",
+             "'use_attributes' is forbidden.")
+      }
+      # '[[' returns NULL instead of failing for an absent name: check both.
+      entry <- try({`[[`(attributes(data[[i]]),
+                         use_attributes[[i]][[j]])}, silent = TRUE)
+      if (inherits(entry, 'try-error') || is.null(entry)) {
+        stop("Parameter 'use_attributes' contains some attribute names ",
+             "that are not present in the attributes of the corresponding ",
+             "object in parameter 'data'.")
+      }
+    }
+  }
+
+  # Check extra_info (fully named list; names get a leading dot for 'fun')
+  if (is.null(extra_info)) {
+    extra_info <- list()
+  }
+  raise_error <- FALSE
+  if (!is.list(extra_info)) {
+    raise_error <- TRUE
+  } else if (length(extra_info) > 0) {
+    if (is.null(names(extra_info))) {
+      raise_error <- TRUE
+    }
+    if (any(sapply(names(extra_info), function(x) nchar(x) == 0))) {
+      raise_error <- TRUE
+    }
+    names(extra_info) <- paste0('.', names(extra_info))
+  }
+  if (raise_error) {
+    stop("Parameter 'extra_info' must be a list with all components named.")
+  }
+
+  # Check guess_dim_names
+  if (!is.logical(guess_dim_names)) {
+    stop("Parameter 'guess_dim_names' must be logical.")
+  }
+
   # Check ncores
   if
(is.null(ncores)) {
     ncores <- 1
@@ -407,6 +506,22 @@ Apply <- function(data, target_dims = NULL, fun, ...,
     chunk_sizes <- c(chunk_sizes, total_size %% chunk_size)
   }
 
+  # Run 'fun' in a dedicated environment that holds the 'extra_info' variables
+  # (and, on each iteration, '.margin_indices'). Its parent is the original
+  # enclosure of 'fun', so that 'fun' keeps resolving its own free variables
+  # and cannot see or be shadowed by the local variables of Apply().
+  fun_parent <- environment(fun)
+  if (is.null(fun_parent)) {
+    # Primitives have no enclosure (and perform no lexical lookup).
+    fun_parent <- globalenv()
+  }
+  fun_env <- new.env(parent = fun_parent)
+  for (i in seq_along(extra_info)) {
+    assign(names(extra_info)[i], extra_info[[i]], envir = fun_env)
+  }
+  environment(fun) <- fun_env
+  splatted_f <- splat(fun)
+
   input_margin_weights <- vector('list', length(data))
   for (i in 1:length(data)) {
     marg_sizes <- dim(data[[i]])[margins[[i]]]
@@ -415,7 +530,6 @@ Apply <- function(data, target_dims = NULL, fun, ...,
   }
 
   # TODO: need to add progress bar
-  splatted_f <- splat(fun)
   # For a selected use case, these are the timings:
   # - total: 17 s
   # - preparation + post: 1 s
@@ -435,10 +549,38 @@ Apply <- function(data, target_dims = NULL, fun, ...,
     names(first_marg_indices) <- names(mad)
     sub_arrays_of_results <- list()
     found_first_sub_result <- FALSE
+    attributes_to_send <- vector('list', length = length(data))
+    # Stores 'value' in the nested list 'x' under the chain of names given in
+    # 'path', creating the intermediate levels that do not exist yet.
+    set_nested_entry <- function(x, path, value) {
+      if (length(path) > 1) {
+        child <- x[[path[[1]]]]
+        if (!is.list(child)) {
+          child <- list()
+        }
+        value <- set_nested_entry(child, path[-1], value)
+      }
+      x[[path[[1]]]] <- value
+      x
+    }
     iteration_indices_to_take <- list()
     for (i in 1:length(data)) {
       iteration_indices_to_take[[i]] <- as.list(rep(TRUE, length(dim(data[[i]]))))
       names(iteration_indices_to_take[[i]]) <- names(dim(data[[i]]))
+      # Gather the attributes requested for this input. Each entry of
+      # 'use_attributes[[i]]' is a chain of names leading to one (nested) value.
+      if (length(use_attributes[[i]]) > 0) {
+        attributes_to_send[[i]] <- list()
+        for (j in seq_along(use_attributes[[i]])) {
+          attr_path <- use_attributes[[i]][[j]]
+          entry <- try(attributes(data[[i]])[[attr_path]], silent = TRUE)
+          if (inherits(entry, 'try-error')) {
+            stop("Unexpected error with the attributes of the inputs.")
+          }
+          attributes_to_send[[i]] <- set_nested_entry(attributes_to_send[[i]],
+                                                      attr_path, entry)
+        }
+      }
     }
 
     add_one_multidim <- function(index, dims) {
@@ -485,14 +627,22 @@ Apply <- function(data, target_dims = NULL, fun, ...,
             #if only one dim remains, make as.vector
           }
         }
-    }
-    if
(!is.null(mad)) {
-      first_marg_indices <- add_one_multidim(first_marg_indices, mad)
+      attributes(iteration_input[[i]]) <- c(attributes(iteration_input[[i]]),
+                                            attributes_to_send[[i]])
     }
 
+    assign(".margin_indices",
+           setNames(as.integer(first_marg_indices),
+                    names(first_marg_indices)),
+           envir = fun_env)
+
     # SPLATTED_F
     result <- splatted_f(iteration_input, ...)
 
+    if (!is.null(mad)) {
+      first_marg_indices <- add_one_multidim(first_marg_indices, mad)
+    }
+
     # SUB-ITERATION OUTRO
     if (!is.list(result)) {
       result <- list(result)
diff --git a/R/zzz.R b/R/zzz.R
index 2bf747e7d38039995b33abe6636e771e6f2f30f5..662624d9cb94cff6436d71016703c558c974c579 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -1,6 +1,10 @@
 # Function to permute arrays of non-atomic elements (e.g. POSIXct)
 .aperm2 <- function(x, new_order) {
   old_dims <- dim(x)
+  # Back up every attribute except 'dim', 'dimnames' and 'names': those are
+  # tied to the layout of the array and would not match it once permuted.
+  attr_bk <- attributes(x)
+  attr_bk <- attr_bk[!(names(attr_bk) %in% c('dim', 'dimnames', 'names'))]
   if (is.numeric(x)) {
     x <- aperm(x, new_order)
   } else {
@@ -9,5 +13,6 @@
     x <- x[as.vector(y)]
   }
   dim(x) <- old_dims[new_order]
+  attributes(x) <- c(attributes(x), attr_bk)
   x
 }
diff --git a/man/Apply.Rd b/man/Apply.Rd
index 4fe37e951ce203ef6ca4b605a45d0b86cd41c5c7..b93c173352638ae4b7e37aa65e42b1cc72f41b63 100644
--- a/man/Apply.Rd
+++ b/man/Apply.Rd
@@ -5,8 +5,8 @@
 \title{Apply Functions to Multiple Multidimensional Arrays or Vectors}
 \usage{
 Apply(data, target_dims = NULL, fun, ..., output_dims = NULL,
-  margins = NULL, guess_dim_names = TRUE, ncores = NULL,
-  split_factor = 1)
+  margins = NULL, use_attributes = NULL, extra_info = NULL,
+  guess_dim_names = TRUE, ncores = NULL, split_factor = 1)
 }
 \arguments{
 \item{data}{One or a list of numeric object (vector, matrix or array). They must be in the same order as expected by the function provided in the parameter 'fun'. The dimensions do not necessarily have to be ordered.
If the 'target_dims' require a different order than the provided, \code{Apply} will automatically reorder the dimensions as needed.} @@ -21,6 +21,10 @@ Apply(data, target_dims = NULL, fun, ..., output_dims = NULL, \item{margins}{One or a list of vectors (or NULLs) containing the 'margin' dimensions to be looped over for each input in 'data'. If a single vector of margins is specified and multiple inputs are provided in 'data', then the single set of margins is re-used for all of the inputs. These vectors can contain either integers specifying the position of the margins, or character strings corresponding to the dimension names. If both 'margins' and 'target_dims' are specified, 'margins' takes priority.} +\item{use_attributes}{List of vectors of character strings with names of attributes of each object in 'data' to be propagated to the subsets of data sent as inputs to the function specified in 'fun'. If this parameter is not specified (NULL), all attributes are dropped. This parameter can be specified as a named list (then the names of this list must match those of the names of parameter 'data'), or as an unnamed list (then the vectors of attribute names will be assigned in order to the input arrays in 'data').} + +\item{extra_info}{Named list of extra variables to be defined for them to be accessible from within the function specified in 'fun'. The variable names will automatically be prepended a heading dot ('.'). 
So, if the variable 'name = "Tony"' is sent through this parameter, it will be accessible from within 'fun' via '.name'.} + \item{guess_dim_names}{Whether to automatically guess missing dimension names for dimensions of equal length across different inputs in 'data' with a warning (TRUE; default), or to crash whenever unnamed dimensions of equa length are identified across different inputs (FALSE).} \item{ncores}{The number of parallel processes to spawn for the use for parallel computation in multiple cores.} diff --git a/tests/testthat/test-use-cases.R b/tests/testthat/test-use-cases.R index 62d3399d0b95601815edd2ac4a3e5160a07d2fa3..6dd8d87a2605b5f2306e23f747ca8d6b28757e0f 100644 --- a/tests/testthat/test-use-cases.R +++ b/tests/testthat/test-use-cases.R @@ -1255,6 +1255,80 @@ test_that("real use case - standardization", { }) +# Test margin indices and extra info +test_that("Margin indices and extra info are provided correctly.", { + a <- array(1:prod(1:6), dim = c(a = 1, b = 2, c = 3, d = 4, e = 5, f = 6)) + b <- array(1:prod(c(1, 2, 3, 5, 6)), dim = c(a = 1, b = 2, c = 3, e = 5, f = 6)) + + attr(a, 'test_attr_a') <- 'test_a' + attr(b, 'test_attr_b') <- list(x = 1, z = 2) + + f <- function(a, b) { + stopifnot(length(.margin_indices) == 3) + stopifnot(identical(names(.margin_indices), c('a', 'e', 'f'))) + stopifnot(all(is.integer(.margin_indices))) + stopifnot(identical(.test_info, 'test')) + stopifnot(!is.null(attr(a, 'test_attr_a'))) + stopifnot(identical(attr(a, 'test_attr_a'), 'test_a')) + stopifnot(!is.null(attr(b, 'test_attr_b'))) + stopifnot(identical(attr(b, 'test_attr_b'), list(x = 1, z = 2))) + } + + r <- multiApply::Apply(list(a, b), + list(c('b', 'c', 'd'), + c('b', 'c')), + extra_info = list(test_info = 'test'), + use_attributes = list(a = 'test_attr_a', + b = 'test_attr_b'), + f) + + r <- multiApply::Apply(list(a = a, b = b), + list(c('b', 'c', 'd'), + c('b', 'c')), + extra_info = list(test_info = 'test'), + use_attributes = list(a = 'test_attr_a', 
+ b = 'test_attr_b'), + f) + + r <- multiApply::Apply(list(a = a, b = b), + list(c('b', 'c', 'd'), + c('b', 'c')), + extra_info = list(test_info = 'test'), + use_attributes = list(b = 'test_attr_b', + a = 'test_attr_a'), + f) + + attr(b, 'test_attr_b') <- list(x = 1, z = 2) + attr(b, 'z') <- 3 + + f <- function(a, b) { + stopifnot(identical(attr(b, 'test_attr_b')$z, 2)) + stopifnot(identical(attr(b, 'z'), 3)) + } + + r <- multiApply::Apply(list(a = a, b = b), + list(c('b', 'c', 'd'), + c('b', 'c')), + extra_info = list(test_info = 'test'), + use_attributes = list(b = c('test_attr_b', 'z'), + a = 'test_attr_a'), + f) + + f <- function(a, b) { + stopifnot(identical(attr(b, 'test_attr_b')$z, 2)) + stopifnot(is.null(attr(b, 'test_attr_b')$x)) + stopifnot(is.null(attr(b, 'z'))) + } + + r <- multiApply::Apply(list(a = a, b = b), + list(c('b', 'c', 'd'), + c('b', 'c')), + extra_info = list(test_info = 'test'), + use_attributes = list(b = list(c('test_attr_b', 'z')), + a = 'test_attr_a'), + f) +}) + # Test .aperm2 test_that(".aperm2", { data <- seq(as.POSIXct('1990-11-01'),