diff --git a/R/Start.R b/R/Start.R index 29f7db64c6f071bb798ddf22b9fa59bb6e407f16..fc05ee5495ffabf03263108cc8c84589d7f77282 100644 --- a/R/Start.R +++ b/R/Start.R @@ -23,7 +23,8 @@ Start <- function(..., # dim = indices/selectors, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, retrieve = FALSE, - num_procs = 1, silent = FALSE, debug = FALSE) { + num_procs = 1, + silent = FALSE, debug = FALSE) { #, config_file = NULL #dictionary_dim_names = , #dictionary_var_names = @@ -2972,6 +2973,7 @@ print(str(sub_array)) metadata <- attr(sub_array, 'variables') + names_bk <- names(store_indices) store_indices <- lapply(names(store_indices), function (x) { if (!(x %in% names(first_round_indices))) { @@ -2993,6 +2995,7 @@ print(str(sub_array)) } } }) + names(store_indices) <- names_bk if (debug) { if (all(unlist(store_indices) == 1)) { print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:") @@ -3004,17 +3007,77 @@ print(str(store_indices)) } } - # Converting array indices to vector indices + store_indices <- lapply(store_indices, as.integer) store_dims <- work_piece[['store_dims']] - store_indices <- do.call('expand.grid', store_indices) - ### Given a matrix where each row is a set of array indices of an element - ### the vector indices are computed + + # split the storage work of the loaded subset in parts + largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))] + max_parts <- length(store_indices[[largest_dim_name]]) + + # Indexing a data file of N MB with expand.grid takes 30*N MB + # The peak ram of Start is, minimum, 2 * total data to load from all files + # due to inefficiencies in other regions of the code + # The more parts we split the indexing done below in, the lower + # the memory footprint of the indexing and the fast. + # But more than 10 indexing iterations (parts) for each MB processed + # makes the iteration slower (tested empirically on BSC workstations). + subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024 + best_n_parts <- ceiling(subset_size_in_mb * 10) + # We want to set n_parts to a greater value than the one that would + # result in a memory footprint (of the subset indexing code below) equal + # to 2 * total data to load from all files. + # s = subset size in MB + # p = number of parts to break it in + # T = total size of data to load + # then, s / p * 30 = 2 * T + # then, p = s * 15 / T + min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims)) + # Make sure we pick n_parts much greater than the minimum calculated + n_parts <- min_n_parts * 10 + if (n_parts > best_n_parts) { + n_parts <- best_n_parts + } + # Boundary checks + if (n_parts < 1) { + n_parts <- 1 + } + if (n_parts > max_parts) { + n_parts <- max_parts + } + + if (n_parts > 1) { + make_parts <- function(length, n) { + clusters <- cut(1:length, n, labels = FALSE) + lapply(1:n, function(y) which(clusters == y)) + } + part_indices <- make_parts(max_parts, n_parts) + parts <- lapply(part_indices, + function(x) { + store_indices[[largest_dim_name]][x] + }) + } else { + part_indices <- list(1:max_parts) + parts <- store_indices[largest_dim_name] + } + + # do the storage work weights <- sapply(1:length(store_dims), function(i) prod(c(1, store_dims)[1:i])) - matrix_indices <- 1 + colSums(t(store_indices - 1) * weights) - ### + part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array)))) + names(part_indices_in_sub_array) <- names(dim(sub_array)) data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) - data_array[matrix_indices] <- as.vector(sub_array) + for (i in 1:n_parts) { + store_indices[[largest_dim_name]] <- parts[[i]] + # Converting array indices to vector indices + matrix_indices <- do.call("expand.grid", store_indices) + # Given a matrix where each row is a set of array indices of an element + # the vector indices are computed + matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights) + part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]] + data_array[matrix_indices] <- as.vector(do.call('[', + c(list(x = sub_array), + part_indices_in_sub_array))) + } rm(data_array) gc()