From 981e5fc9ef22be29206c191942c8d7e658ef6662 Mon Sep 17 00:00:00 2001
From: Roberto Serrano-Notivoli <roberto.serrano@bsc.es>
Date: Fri, 15 Jun 2018 09:54:25 +0200
Subject: [PATCH 1/3] Progress. Reducing memory consumption. Very slow.

---
 R/Start.R | 55 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index 1462c17..72253a7 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -2940,20 +2940,49 @@ print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
 print(str(store_indices))
 }
 }
+    ## Converting array indices to vector indices
+    #store_dims <- work_piece[['store_dims']]
+    #store_indices <- do.call('expand.grid', store_indices)
+    #### Given a matrix where each row is a set of array indices of an element
+    #### the vector indices are computed
+    #weights <- sapply(1:length(store_dims), 
+    #                  function(i) prod(c(1, store_dims)[1:i]))
+    #matrix_indices <- 1 + colSums(t(store_indices - 1) * weights)
+    ####
+    #data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
+    #data_array[matrix_indices] <- as.vector(sub_array)
+    #rm(data_array)
+    #gc()
 
-    # Converting array indices to vector indices
-    store_dims <- work_piece[['store_dims']]
-    store_indices <- do.call('expand.grid', store_indices)
-    ### Given a matrix where each row is a set of array indices of an element
-    ### the vector indices are computed
-    weights <- sapply(1:length(store_dims), 
-                      function(i) prod(c(1, store_dims)[1:i]))
-    matrix_indices <- 1 + colSums(t(store_indices - 1) * weights)
-    ###
-    data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
-    data_array[matrix_indices] <- as.vector(sub_array)
-    rm(data_array)
-    gc()
+######new code
+store_dims <- work_piece[['store_dims']]
+sub_array_vec <- as.vector(sub_array)
+total_length <- prod(store_dims)
+w <- sapply(1:length(store_dims), function(i) prod(c(store_dims)[1:i]))
+f_sel <- function(j, w = w, h = h, store_indices, total_length){
+  if(length(store_indices[[j]]) == 1) return(store_indices[[j]]) else{
+    rr <- rep(sort(rep(store_indices[[j]], w[j-1])), length.out = total_length)
+    return(rr[h])
+  }
+}
+lsd <- 1:length(store_dims)
+for(h in 1:total_length){
+  #select the correct sequence of indices
+  pos <- sapply(lsd, f_sel, w, h, store_indices, total_length)
+  if(mean(pos) == 1) res <- rep(1, length(lsd)) else{
+    #computes the index in single number mode
+    i <- length(lsd)
+    while(i <= length(store_dims)){
+      if(pos[i] == 1) i <- i - 1 else{
+        res <- prod(c(store_dims[1:(i-1)], (pos[i]-1))) + pos[i-1]
+        i <- length(store_dims) + 1
+      }
+    }
+  }
+  #insert the data in the big.matrix
+  data_array[res] <- sub_array_vec[h]
+}
+######
 
     if (!is.null(work_piece[['save_metadata_in']])) {
       saveRDS(metadata, file = work_piece[['save_metadata_in']])
-- 
GitLab


From bc08f48b39986af5d6991637eb248fc43c5d40dc Mon Sep 17 00:00:00 2001
From: Roberto Serrano-Notivoli <roberto.serrano@bsc.es>
Date: Wed, 20 Jun 2018 10:20:07 +0200
Subject: [PATCH 2/3] Memory consumption reduced

---
 R/Start.R | 58 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index 72253a7..a0b1fa1 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -2941,7 +2941,7 @@ print(str(store_indices))
 }
 }
     ## Converting array indices to vector indices
-    #store_dims <- work_piece[['store_dims']]
+    store_dims <- work_piece[['store_dims']]
     #store_indices <- do.call('expand.grid', store_indices)
     #### Given a matrix where each row is a set of array indices of an element
     #### the vector indices are computed
@@ -2954,34 +2954,36 @@ print(str(store_indices))
     #rm(data_array)
     #gc()
 
-######new code
-store_dims <- work_piece[['store_dims']]
-sub_array_vec <- as.vector(sub_array)
-total_length <- prod(store_dims)
-w <- sapply(1:length(store_dims), function(i) prod(c(store_dims)[1:i]))
-f_sel <- function(j, w = w, h = h, store_indices, total_length){
-  if(length(store_indices[[j]]) == 1) return(store_indices[[j]]) else{
-    rr <- rep(sort(rep(store_indices[[j]], w[j-1])), length.out = total_length)
-    return(rr[h])
-  }
-}
-lsd <- 1:length(store_dims)
-for(h in 1:total_length){
-  #select the correct sequence of indices
-  pos <- sapply(lsd, f_sel, w, h, store_indices, total_length)
-  if(mean(pos) == 1) res <- rep(1, length(lsd)) else{
-    #computes the index in single number mode
-    i <- length(lsd)
-    while(i <= length(store_dims)){
-      if(pos[i] == 1) i <- i - 1 else{
-        res <- prod(c(store_dims[1:(i-1)], (pos[i]-1))) + pos[i-1]
-        i <- length(store_dims) + 1
-      }
-    }
-  }
-  #insert the data in the big.matrix
-  data_array[res] <- sub_array_vec[h]
+###########################
+#split the work in parts
+wl <- which.max(store_dims)
+partition <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
+#less than 5 parts: increase of RAM consumption
+  #mor than 5 parts: non-significant reduction of RAM consumption
+parts <- partition(store_indices[[wl]], 5)
+for(i in 1:5){
+  #print(paste(i, 'of 5'))
+  #compute indices in data_array
+  stind <- store_indices
+  stind[[wl]] <- parts[[i]]
+  stind <- do.call("expand.grid", stind)
+  stdims <- store_dims
+  stdims[[wl]] <- length(parts[[i]])
+  weights <- sapply(1:length(stdims), function(i) prod(c(1, stdims)[1:i]))
+  matrix_indices <- 1 + colSums(t(stind - 1) * weights)
+  #compute length of indices in sub_array
+  stdims_sa <- sapply(first_round_indices, length)
+  #indices in sub_array
+  stind_sa <- stind[,match(names(first_round_indices),names(stdims))]
+  weights_sa <- sapply(1:length(stdims_sa), function(i) prod(c(1, stdims_sa)[1:i]))
+  matrix_indices_sa <- 1 + colSums(t(stind_sa - 1) * weights_sa)
+  #load data_array and assign data from sub_array
+  data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
+  data_array[matrix_indices] <- as.vector(sub_array)[matrix_indices_sa]
+  rm(data_array)
+  gc()
 }
+
 ######
 
     if (!is.null(work_piece[['save_metadata_in']])) {
-- 
GitLab


From 5814009a966738be9e3f962757ecaef6d90e372c Mon Sep 17 00:00:00 2001
From: Nicolau Manubens <nicolau.manubens@ic3.cat>
Date: Sat, 19 Jan 2019 00:27:21 +0100
Subject: [PATCH 3/3] Improved method for computing store indices + automatic
 adjustment of parameters for that method.

---
 R/Start.R | 118 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 75 insertions(+), 43 deletions(-)

diff --git a/R/Start.R b/R/Start.R
index 212ae02..fc05ee5 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -23,7 +23,8 @@ Start <- function(..., # dim = indices/selectors,
                   split_multiselected_dims = FALSE,
                   path_glob_permissive = FALSE,
                   retrieve = FALSE, 
-                  num_procs = 1, silent = FALSE, debug = FALSE) {
+                  num_procs = 1, 
+                  silent = FALSE, debug = FALSE) {
                   #, config_file = NULL
                   #dictionary_dim_names = ,
                   #dictionary_var_names =  
@@ -2972,6 +2973,7 @@ print(str(sub_array))
 
     metadata <- attr(sub_array, 'variables')
 
+    names_bk <- names(store_indices)
     store_indices <- lapply(names(store_indices), 
       function (x) {
         if (!(x %in% names(first_round_indices))) {
@@ -2993,6 +2995,7 @@ print(str(sub_array))
           }
         }
       })
+    names(store_indices) <- names_bk
 if (debug) {
 if (all(unlist(store_indices) == 1)) {
 print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:")
@@ -3003,51 +3006,80 @@ print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
 print(str(store_indices))
 }
 }
-    ## Converting array indices to vector indices
+
+    store_indices <- lapply(store_indices, as.integer)
     store_dims <- work_piece[['store_dims']]
-    #store_indices <- do.call('expand.grid', store_indices)
-    #### Given a matrix where each row is a set of array indices of an element
-    #### the vector indices are computed
-    #weights <- sapply(1:length(store_dims), 
-    #                  function(i) prod(c(1, store_dims)[1:i]))
-    #matrix_indices <- 1 + colSums(t(store_indices - 1) * weights)
-    ####
-    #data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
-    #data_array[matrix_indices] <- as.vector(sub_array)
-    #rm(data_array)
-    #gc()
 
-###########################
-#split the work in parts
-wl <- which.max(store_dims)
-partition <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
-#less than 5 parts: increase of RAM consumption
-  #mor than 5 parts: non-significant reduction of RAM consumption
-parts <- partition(store_indices[[wl]], 5)
-for(i in 1:5){
-  #print(paste(i, 'of 5'))
-  #compute indices in data_array
-  stind <- store_indices
-  stind[[wl]] <- parts[[i]]
-  stind <- do.call("expand.grid", stind)
-  stdims <- store_dims
-  stdims[[wl]] <- length(parts[[i]])
-  weights <- sapply(1:length(stdims), function(i) prod(c(1, stdims)[1:i]))
-  matrix_indices <- 1 + colSums(t(stind - 1) * weights)
-  #compute length of indices in sub_array
-  stdims_sa <- sapply(first_round_indices, length)
-  #indices in sub_array
-  stind_sa <- stind[,match(names(first_round_indices),names(stdims))]
-  weights_sa <- sapply(1:length(stdims_sa), function(i) prod(c(1, stdims_sa)[1:i]))
-  matrix_indices_sa <- 1 + colSums(t(stind_sa - 1) * weights_sa)
-  #load data_array and assign data from sub_array
-  data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
-  data_array[matrix_indices] <- as.vector(sub_array)[matrix_indices_sa]
-  rm(data_array)
-  gc()
-}
+    # split the storage work of the loaded subset in parts
+    largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))]
+    max_parts <- length(store_indices[[largest_dim_name]])
+
+    # Indexing a data file of N MB with expand.grid takes 30*N MB
+    # The peak ram of Start is, minimum, 2 * total data to load from all files
+    # due to inefficiencies in other regions of the code
+    # The more parts we split the indexing done below in, the lower
+    # the memory footprint of the indexing and the fast. 
+    # But more than 10 indexing iterations (parts) for each MB processed 
+    # makes the iteration slower (tested empirically on BSC workstations).
+    subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024
+    best_n_parts <- ceiling(subset_size_in_mb * 10)
+    # We want to set n_parts to a greater value than the one that would 
+    # result in a memory footprint (of the subset indexing code below) equal
+    # to 2 * total data to load from all files.
+    # s = subset size in MB
+    # p = number of parts to break it in
+    # T = total size of data to load
+    # then, s / p * 30 = 2 * T
+    # then, p = s * 15 / T
+    min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims))
+    # Make sure we pick n_parts much greater than the minimum calculated
+    n_parts <- min_n_parts * 10
+    if (n_parts > best_n_parts) {
+      n_parts <- best_n_parts
+    }
+    # Boundary checks
+    if (n_parts < 1) {
+      n_parts <- 1
+    }
+    if (n_parts > max_parts) {
+      n_parts <- max_parts
+    }
+
+    if (n_parts > 1) {
+      make_parts <- function(length, n) {
+        clusters <- cut(1:length, n, labels = FALSE)
+        lapply(1:n, function(y) which(clusters == y))
+      }
+      part_indices <- make_parts(max_parts, n_parts)
+      parts <- lapply(part_indices, 
+                      function(x) {
+                        store_indices[[largest_dim_name]][x]
+                      })
+    } else {
+      part_indices <- list(1:max_parts)
+      parts <- store_indices[largest_dim_name]
+    }
 
-######
+    # do the storage work
+    weights <- sapply(1:length(store_dims), 
+                      function(i) prod(c(1, store_dims)[1:i]))
+    part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array))))
+    names(part_indices_in_sub_array) <- names(dim(sub_array))
+    data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
+    for (i in 1:n_parts) {
+      store_indices[[largest_dim_name]] <- parts[[i]]
+      # Converting array indices to vector indices
+      matrix_indices <- do.call("expand.grid", store_indices)
+      # Given a matrix where each row is a set of array indices of an element
+      # the vector indices are computed
+      matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights)
+      part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]]
+      data_array[matrix_indices] <- as.vector(do.call('[',
+        c(list(x = sub_array), 
+          part_indices_in_sub_array)))
+    }
+    rm(data_array)
+    gc()
 
     if (!is.null(work_piece[['save_metadata_in']])) {
       saveRDS(metadata, file = work_piece[['save_metadata_in']])
-- 
GitLab