From 981e5fc9ef22be29206c191942c8d7e658ef6662 Mon Sep 17 00:00:00 2001 From: Roberto Serrano-Notivoli Date: Fri, 15 Jun 2018 09:54:25 +0200 Subject: [PATCH 1/3] Progress. Reducing memory consumption. Very slow. --- R/Start.R | 55 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/R/Start.R b/R/Start.R index 1462c17..72253a7 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2940,20 +2940,49 @@ print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:") print(str(store_indices)) } } + ## Converting array indices to vector indices + #store_dims <- work_piece[['store_dims']] + #store_indices <- do.call('expand.grid', store_indices) + #### Given a matrix where each row is a set of array indices of an element + #### the vector indices are computed + #weights <- sapply(1:length(store_dims), + # function(i) prod(c(1, store_dims)[1:i])) + #matrix_indices <- 1 + colSums(t(store_indices - 1) * weights) + #### + #data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) + #data_array[matrix_indices] <- as.vector(sub_array) + #rm(data_array) + #gc() - # Converting array indices to vector indices - store_dims <- work_piece[['store_dims']] - store_indices <- do.call('expand.grid', store_indices) - ### Given a matrix where each row is a set of array indices of an element - ### the vector indices are computed - weights <- sapply(1:length(store_dims), - function(i) prod(c(1, store_dims)[1:i])) - matrix_indices <- 1 + colSums(t(store_indices - 1) * weights) - ### - data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) - data_array[matrix_indices] <- as.vector(sub_array) - rm(data_array) - gc() +######new code +store_dims <- work_piece[['store_dims']] +sub_array_vec <- as.vector(sub_array) +total_length <- prod(store_dims) +w <- sapply(1:length(store_dims), function(i) prod(c(store_dims)[1:i])) +f_sel <- function(j, w = w, h = h, store_indices, total_length){ + if(length(store_indices[[j]]) == 1) return(store_indices[[j]]) else{ + rr <- rep(sort(rep(store_indices[[j]], w[j-1])), length.out = total_length) + return(rr[h]) + } +} +lsd <- 1:length(store_dims) +for(h in 1:total_length){ + #select the correct sequence of indices + pos <- sapply(lsd, f_sel, w, h, store_indices, total_length) + if(mean(pos) == 1) res <- rep(1, length(lsd)) else{ + #computes the index in single number mode + i <- length(lsd) + while(i <= length(store_dims)){ + if(pos[i] == 1) i <- i - 1 else{ + res <- prod(c(store_dims[1:(i-1)], (pos[i]-1))) + pos[i-1] + i <- length(store_dims) + 1 + } + } + } + #insert the data in the big.matrix + data_array[res] <- sub_array_vec[h] +} +###### if (!is.null(work_piece[['save_metadata_in']])) { saveRDS(metadata, file = work_piece[['save_metadata_in']]) -- GitLab From bc08f48b39986af5d6991637eb248fc43c5d40dc Mon Sep 17 00:00:00 2001 From: Roberto Serrano-Notivoli Date: Wed, 20 Jun 2018 10:20:07 +0200 Subject: [PATCH 2/3] Memory consumption reduced --- R/Start.R | 58 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/R/Start.R b/R/Start.R index 72253a7..a0b1fa1 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2941,7 +2941,7 @@ print(str(store_indices)) } } ## Converting array indices to vector indices - #store_dims <- work_piece[['store_dims']] + store_dims <- work_piece[['store_dims']] #store_indices <- do.call('expand.grid', store_indices) #### Given a matrix where each row is a set of array indices of an element #### the vector indices are computed @@ -2954,34 +2954,36 @@ print(str(store_indices)) #rm(data_array) #gc() -######new code -store_dims <- work_piece[['store_dims']] -sub_array_vec <- as.vector(sub_array) -total_length <- prod(store_dims) -w <- sapply(1:length(store_dims), function(i) prod(c(store_dims)[1:i])) -f_sel <- function(j, w = w, h = h, store_indices, total_length){ - if(length(store_indices[[j]]) == 1) return(store_indices[[j]]) else{ - rr <- rep(sort(rep(store_indices[[j]], w[j-1])), length.out = total_length) - return(rr[h]) - } -} -lsd <- 1:length(store_dims) -for(h in 1:total_length){ - #select the correct sequence of indices - pos <- sapply(lsd, f_sel, w, h, store_indices, total_length) - if(mean(pos) == 1) res <- rep(1, length(lsd)) else{ - #computes the index in single number mode - i <- length(lsd) - while(i <= length(store_dims)){ - if(pos[i] == 1) i <- i - 1 else{ - res <- prod(c(store_dims[1:(i-1)], (pos[i]-1))) + pos[i-1] - i <- length(store_dims) + 1 - } - } - } - #insert the data in the big.matrix - data_array[res] <- sub_array_vec[h] +########################### +#split the work in parts +wl <- which.max(store_dims) +partition <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE)) +#less than 5 parts: increase of RAM consumption + #mor than 5 parts: non-significant reduction of RAM consumption +parts <- partition(store_indices[[wl]], 5) +for(i in 1:5){ + #print(paste(i, 'of 5')) + #compute indices in data_array + stind <- store_indices + stind[[wl]] <- parts[[i]] + stind <- do.call("expand.grid", stind) + stdims <- store_dims + stdims[[wl]] <- length(parts[[i]]) + weights <- sapply(1:length(stdims), function(i) prod(c(1, stdims)[1:i])) + matrix_indices <- 1 + colSums(t(stind - 1) * weights) + #compute length of indices in sub_array + stdims_sa <- sapply(first_round_indices, length) + #indices in sub_array + stind_sa <- stind[,match(names(first_round_indices),names(stdims))] + weights_sa <- sapply(1:length(stdims_sa), function(i) prod(c(1, stdims_sa)[1:i])) + matrix_indices_sa <- 1 + colSums(t(stind_sa - 1) * weights_sa) + #load data_array and assign data from sub_array + data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) + data_array[matrix_indices] <- as.vector(sub_array)[matrix_indices_sa] + rm(data_array) + gc() } + ###### if (!is.null(work_piece[['save_metadata_in']])) { -- GitLab From 5814009a966738be9e3f962757ecaef6d90e372c Mon Sep 17 00:00:00 2001 From: Nicolau Manubens Date: Sat, 19 Jan 2019 00:27:21 +0100 Subject: [PATCH 3/3] Improved method for computing store indices + automatic adjustment of parameters for that method. --- R/Start.R | 118 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 43 deletions(-) diff --git a/R/Start.R b/R/Start.R index 212ae02..fc05ee5 100644 --- a/R/Start.R +++ b/R/Start.R @@ -23,7 +23,8 @@ Start <- function(..., # dim = indices/selectors, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, retrieve = FALSE, - num_procs = 1, silent = FALSE, debug = FALSE) { + num_procs = 1, + silent = FALSE, debug = FALSE) { #, config_file = NULL #dictionary_dim_names = , #dictionary_var_names = @@ -2972,6 +2973,7 @@ print(str(sub_array)) metadata <- attr(sub_array, 'variables') + names_bk <- names(store_indices) store_indices <- lapply(names(store_indices), function (x) { if (!(x %in% names(first_round_indices))) { @@ -2993,6 +2995,7 @@ print(str(sub_array)) } } }) + names(store_indices) <- names_bk if (debug) { if (all(unlist(store_indices) == 1)) { print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:") @@ -3003,51 +3006,80 @@ print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:") print(str(store_indices)) } } - ## Converting array indices to vector indices + + store_indices <- lapply(store_indices, as.integer) store_dims <- work_piece[['store_dims']] - #store_indices <- do.call('expand.grid', store_indices) - #### Given a matrix where each row is a set of array indices of an element - #### the vector indices are computed - #weights <- sapply(1:length(store_dims), - # function(i) prod(c(1, store_dims)[1:i])) - #matrix_indices <- 1 + colSums(t(store_indices - 1) * weights) - #### - #data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) - #data_array[matrix_indices] <- as.vector(sub_array) - #rm(data_array) - #gc() -########################### -#split the work in parts -wl <- which.max(store_dims) -partition <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE)) -#less than 5 parts: increase of RAM consumption - #mor than 5 parts: non-significant reduction of RAM consumption -parts <- partition(store_indices[[wl]], 5) -for(i in 1:5){ - #print(paste(i, 'of 5')) - #compute indices in data_array - stind <- store_indices - stind[[wl]] <- parts[[i]] - stind <- do.call("expand.grid", stind) - stdims <- store_dims - stdims[[wl]] <- length(parts[[i]]) - weights <- sapply(1:length(stdims), function(i) prod(c(1, stdims)[1:i])) - matrix_indices <- 1 + colSums(t(stind - 1) * weights) - #compute length of indices in sub_array - stdims_sa <- sapply(first_round_indices, length) - #indices in sub_array - stind_sa <- stind[,match(names(first_round_indices),names(stdims))] - weights_sa <- sapply(1:length(stdims_sa), function(i) prod(c(1, stdims_sa)[1:i])) - matrix_indices_sa <- 1 + colSums(t(stind_sa - 1) * weights_sa) - #load data_array and assign data from sub_array - data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) - data_array[matrix_indices] <- as.vector(sub_array)[matrix_indices_sa] - rm(data_array) - gc() -} + # split the storage work of the loaded subset in parts + largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))] + max_parts <- length(store_indices[[largest_dim_name]]) + + # Indexing a data file of N MB with expand.grid takes 30*N MB + # The peak ram of Start is, minimum, 2 * total data to load from all files + # due to inefficiencies in other regions of the code + # The more parts we split the indexing done below in, the lower + # the memory footprint of the indexing and the fast. + # But more than 10 indexing iterations (parts) for each MB processed + # makes the iteration slower (tested empirically on BSC workstations). + subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024 + best_n_parts <- ceiling(subset_size_in_mb * 10) + # We want to set n_parts to a greater value than the one that would + # result in a memory footprint (of the subset indexing code below) equal + # to 2 * total data to load from all files. + # s = subset size in MB + # p = number of parts to break it in + # T = total size of data to load + # then, s / p * 30 = 2 * T + # then, p = s * 15 / T + min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims)) + # Make sure we pick n_parts much greater than the minimum calculated + n_parts <- min_n_parts * 10 + if (n_parts > best_n_parts) { + n_parts <- best_n_parts + } + # Boundary checks + if (n_parts < 1) { + n_parts <- 1 + } + if (n_parts > max_parts) { + n_parts <- max_parts + } + + if (n_parts > 1) { + make_parts <- function(length, n) { + clusters <- cut(1:length, n, labels = FALSE) + lapply(1:n, function(y) which(clusters == y)) + } + part_indices <- make_parts(max_parts, n_parts) + parts <- lapply(part_indices, + function(x) { + store_indices[[largest_dim_name]][x] + }) + } else { + part_indices <- list(1:max_parts) + parts <- store_indices[largest_dim_name] + } -###### + # do the storage work + weights <- sapply(1:length(store_dims), + function(i) prod(c(1, store_dims)[1:i])) + part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array)))) + names(part_indices_in_sub_array) <- names(dim(sub_array)) + data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer) + for (i in 1:n_parts) { + store_indices[[largest_dim_name]] <- parts[[i]] + # Converting array indices to vector indices + matrix_indices <- do.call("expand.grid", store_indices) + # Given a matrix where each row is a set of array indices of an element + # the vector indices are computed + matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights) + part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]] + data_array[matrix_indices] <- as.vector(do.call('[', + c(list(x = sub_array), + part_indices_in_sub_array))) + } + rm(data_array) + gc() if (!is.null(work_piece[['save_metadata_in']])) { saveRDS(metadata, file = work_piece[['save_metadata_in']]) -- GitLab