Start.R

    if (num_procs == 1) {
      found_files <- lapply(work_pieces, .LoadDataFile, 
                            shared_matrix_pointer = shared_matrix_pointer,
                            file_data_reader = file_data_reader, 
                            synonims = synonims,
                            transform = transform, 
                            transform_params = transform_params,
                            silent = silent, debug = debug)
    } else {
      cluster <- parallel::makeCluster(num_procs, outfile = "")
      # Send the heavy work to the workers
      work_errors <- try({
        found_files <- parallel::clusterApplyLB(cluster, work_pieces, .LoadDataFile, 
                                                shared_matrix_pointer = shared_matrix_pointer,
                                                file_data_reader = file_data_reader,
                                                synonims = synonims,
                                                transform = transform, 
                                                transform_params = transform_params,
                                                silent = silent, debug = debug)
      })
      parallel::stopCluster(cluster)
    }
    
    if (!silent) {
      if (progress_message != '') {
        .message("\n", tag = '')
      }
    }
    #print("P")
    
    # NOTE: If merge_across_dims = TRUE, there might be additional NAs due to
    #       unequal inner_dim ('time') length across file_dim ('file_date').
    #       If merge_across_dims_narm = TRUE, add additional lines to remove these NAs.
    # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. 
    if (merge_across_dims_narm) {
      
      # Get the length of these two dimensions in final_dims
      length_inner_across_store_dims <- final_dims[across_inner_dim]
      length_file_across_store_dims <- final_dims[across_file_dim]
      
      # Create a logical array for merge_across_dims
      logi_array <- array(rep(FALSE,
                              length_file_across_store_dims * length_inner_across_store_dims),
                          dim = c(length_inner_across_store_dims, length_file_across_store_dims))
      for (i in 1:length_file_across_store_dims) {  #1:4
        logi_array[1:length_inner_across_dim[[i]], i] <- TRUE
      }
      
      # First, get the data array with final_dims dimension
      data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims)
      
      # Change the NA derived from additional spaces to -9999, then remove these -9999
      func_remove_blank <- function(data_array, logi_array) {
        # dim(data_array) = [time, file_date]
        # dim(logi_array) = [time, file_date]
        # Change the blank spaces from NA to -9999
        data_array[which(!logi_array)] <- -9999
        return(data_array)
      }
      data_array_final_dims <- multiApply::Apply(data_array_final_dims,
                                                 target_dims = c(across_inner_dim, across_file_dim),  #c('time', 'file_date')
                                                 output_dims = c(across_inner_dim, across_file_dim),
                                                 fun = func_remove_blank,
                                                 logi_array = logi_array)$output1
      ## reorder back to the correct dim
      tmp <- match(names(final_dims), names(dim(data_array_final_dims)))
      data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
      data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999]  # become a vector
      
 #NOTE: When one file contains values for dicrete dimensions, rearrange the 
 #      chunks (i.e., work_piece) is necessary.
      if (split_multiselected_dims) {

      # generate the correct order list from indices_chunk 
        final_order_list <- list()
        i <- 1
        j <- 1
        a <- indices_chunk[i]
        while (i < length(indices_chunk)) {
          while (indices_chunk[i+1] == indices_chunk[i] & i < length(indices_chunk)) {
            a <- c(a, indices_chunk[i+1])
            i <- i + 1
          }
          final_order_list[[j]] <- a
          a <- indices_chunk[i+1]
          i <- i + 1
          j <- j + 1
        }
        names(final_order_list) <- sapply(final_order_list, '[[', 1)
        final_order_list <- lapply(final_order_list, length)
        
        if (!all(diff(as.numeric(names(final_order_list))) > 0)) {


      # shape the vector into the array without split_dims
        split_dims_pos <- match(split_dims, final_dims_fake)
        new_dims <- c()
        if (split_dims_pos[1] > 1) {
          new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
        }
        new_dims <- c(new_dims,  prod(split_dims))
        names(new_dims)[split_dims_pos[1]] <- across_inner_dim
        if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
          new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
        }
        final_dims_fake_no_split <- new_dims
        data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split)
      # seperate 'time' dim into each work_piece length
        data_array_seperate <- list()
        tmp <- cumsum(unlist(length_inner_across_dim))
        tmp <- c(0, tmp)
        for (i in 1:length(length_inner_across_dim)) {
          data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
                                             (tmp[i] + 1):tmp[i + 1])
        }

      # re-build the array: chunk 
        which_chunk <- as.numeric(names(final_order_list))
        how_many_indices <- unlist(final_order_list)
        array_piece <- list()
        ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
        for (i in 1:length(final_order_list)) {
          array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]],
                                     across_inner_dim,
                                     ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
          ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
        }
        
        # re-build the array: paste
        data_array_tmp <- array_piece[[1]]
        along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
        if (length(array_piece) > 1) {
          for (i in 2:length(array_piece)) {
          data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
                                        along = along_pos)
          }
        }
      }        
      }
  
      data_array <- array(data_array_tmp, dim = final_dims_fake)
      
    } else {  # merge_across_dims_narm = F (old version)
      data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
    }
    
    # NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above.
    #       To get the user-required dim order, we need to reorder the array again.
    if (split_multiselected_dims & merge_across_dims) {
      if (inner_dim_pos_in_split_dims != 1) {
        correct_order <- match(names(final_dims_fake_output), names(final_dims_fake))
        data_array <- .aperm2(data_array, correct_order)
      }
    }
    
    gc()
    
    # Load metadata and remove the metadata folder
    if (!is.null(metadata_dims)) {
      loaded_metadata_files <- list.files(metadata_folder)

      if (!identical(loaded_metadata_files, character(0))) {  # old code
        loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS)
      } else {
        loaded_metadata <- NULL
      }

      unlink(metadata_folder, recursive = TRUE)

#NOTE: Here, metadata can be saved in one of two ways: one for $common and the other for $dat
#      for $common, it is a list of metadata length. For $dat, it is a list of dat length,
#      and each sublist has the metadata for each dat.
      dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims]
      if (!any(names(dim_of_metadata) == pattern_dims) |
          (any(names(dim_of_metadata) == pattern_dims) &
           dim_of_metadata[pattern_dims] == 1)) {  # put under $common; old code
        return_metadata <- vector('list',
                                  length = prod(dim(array_of_metadata_flags)[metadata_dims]))
        return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata
        dim(return_metadata) <- dim(array_of_metadata_flags[metadata_dims])

      } else { # put under $dat. metadata_dims has 'dat' and dat length > 1
        return_metadata <- vector('list',
                                  length = dim_of_metadata[pattern_dims])
        names(return_metadata) <- dat_names
        for (kk in 1:length(return_metadata)) {
          return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat
        }
        loaded_metadata_count <- 1
        for (kk in 1:length(return_metadata)) {
          for (jj in 1:length(return_metadata[[kk]])) {

            if (dataset_has_files[kk]) {
              if (loaded_metadata_count %in% loaded_metadata_files) {
                return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]]
                names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]])
              } else {
                return_metadata[[kk]][jj] <- NULL
              }
              loaded_metadata_count <- loaded_metadata_count + 1
            } else {
              return_metadata[[kk]][jj] <- NULL
            }

          }
        }
      }
      attr(data_array, 'Variables') <- return_metadata
      # TODO: Try to infer data type from loaded_metadata
      # as.integer(data_array) 
    }
    
    failed_pieces <- work_pieces[which(unlist(found_files))]
    for (failed_piece in failed_pieces) {
      array_of_not_found_files <- do.call('[<-', 
                                          c(list(array_of_not_found_files), 
                                            as.list(failed_piece[['file_indices_in_array_of_files']]),
                                            list(value = TRUE)))
    }
    if (any(array_of_not_found_files)) {
      for (i in 1:prod(dim(array_of_files_to_load))) {
        if (is.na(array_of_not_found_files[i])) {
          array_of_files_to_load[i] <- NA
        } else {
          if (array_of_not_found_files[i]) {
            array_of_not_found_files[i] <- array_of_files_to_load[i]
            array_of_files_to_load[i] <- NA
          } else {
            array_of_not_found_files[i] <- NA
          }
        }
      }
    } else {
      array_of_not_found_files <- NULL
    }
    
  } # End if (retrieve)
  
  # Change final_dims_fake back because retrieve = FALSE will use it for attributes later
  if (exists("final_dims_fake_output")) {
    final_dims_fake <- final_dims_fake_output
  }
  # Replace the vars and common vars by the transformed vars and common vars
  for (i in 1:length(dat)) {
    if (length(names(transformed_vars[[i]])) > 0) {
      picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]]
    } else if (length(names(picked_vars_ordered[[i]])) > 0) {
      picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]]
    }
  }
  if (length(names(transformed_common_vars)) > 0) {
    picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars
  } else if (length(names(picked_common_vars_ordered)) > 0) {
    picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered
  }
  if (debug) {
    print("-> THE TRANSFORMED VARS:")
    print(str(transformed_vars))
    print("-> THE PICKED VARS:")
    print(str(picked_vars))
  }
  
  file_selectors <- NULL
  for (i in 1:length(dat)) {
    file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])]
  }
  if (retrieve) {
    if (!silent) {
      .message("Successfully retrieved data.")
    }

    if (all(sapply(attr(data_array, 'Variables'), is.null))) {
      var_backup <- NULL
      .warning(paste0("Metadata cannot be retrieved. The reason may be the ",
                      "non-existence of the first file. Use parameter 'metadata_dims'",
                      " to assign to file dimensions along which to return metadata, ",
                      "or check the existence of the first file."))
    } else {

#NOTE: The metadata of variables can be saved in one of the two different structures.
#      (1) metadata_dims != 'dat', or (metadata_dims == 'dat' & length(dat) == 1):
#          put under $common
#      (2) (metadata_dims == 'dat' & length(dat) > 1):
#          put under $dat1, $dat2, .... Put it in picked_vars list
#TODO: The current (2) uses the inefficient method. Should define the list structure first
#      then fill the list, rather than expand it in the for loop.
    if (any(metadata_dims == pattern_dims) & length(dat) > 1) { # (2)
      var_backup <- attr(data_array, 'Variables')
      for (kk in 1:length(var_backup)) {
        sublist_names <- lapply(var_backup, names)[[kk]]
        if (!is.null(sublist_names)) {
          for (jj in 1:length(sublist_names)) {
            picked_vars[[kk]][[sublist_names[jj]]] <- var_backup[[kk]][[jj]]
          }
        }
      }
      var_backup <- NULL

    } else {  #(1)
      var_backup <- attr(data_array, 'Variables')
      len <- unlist(lapply(var_backup, length))
      len <- sum(len) + length(which(len == 0))  #0 means NULL
      name_list <- lapply(var_backup, names)
      new_list <- vector('list', length = len)
      count <- 1

      for (kk in 1:length(var_backup)) {
        if (length(var_backup[[kk]]) == 0) {  #NULL
          count <- count + 1
        } else {
          for (jj in 1:length(var_backup[[kk]])) {
            new_list[[count]] <- var_backup[[kk]][[jj]]
            names(new_list)[count] <- name_list[[kk]][jj]
            count <- count + 1
          }
        }
      }
      var_backup <- new_list
    }
}

    attr(data_array, 'Variables') <- NULL
    attributes(data_array) <- c(attributes(data_array), 
                                list(Variables = c(list(common = c(picked_common_vars, var_backup)), 
                                                   picked_vars),
                                     Files = array_of_files_to_load, 
                                     NotFoundFiles = array_of_not_found_files,
                                     FileSelectors = file_selectors,
                                     PatternDim = found_pattern_dim,
                                     ObjectBigmemory = name_bigmemory_obj) #attr(shared_matrix_pointer, 'description')$sharedName)
    )
    attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class'))
    data_array
  } else {
    if (!silent) {
      .message("Successfully discovered data dimensions.")
    }
    start_call <- match.call()
    for (i in 2:length(start_call)) {
      if (class(start_call[[i]]) %in% c('name', 'call')) {
        start_call[[i]] <- eval.parent(start_call[[i]])
      }
    }
    start_call[['retrieve']] <- TRUE
    attributes(start_call) <- c(attributes(start_call),
                                list(Dimensions = final_dims_fake,
                                     Variables = c(list(common = picked_common_vars), picked_vars),
                                     ExpectedFiles = array_of_files_to_load,
                                     FileSelectors = file_selectors,
                                     PatternDim = found_pattern_dim,
                                     MergedDims = if (merge_across_dims) {
                                       inner_dims_across_files
                                     } else {
                                       NULL
                                     },
                                     SplitDims = if (split_multiselected_dims) {
                                       all_split_dims
                                     } else {
                                       NULL
                                     })
    )
    attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class'))
    start_call
  }
}

# This function is the responsible for loading the data of each work
# piece.
.LoadDataFile <- function(work_piece, shared_matrix_pointer, 
                          file_data_reader, synonims,
                          transform, transform_params,
                          silent = FALSE, debug = FALSE) {
  #warning(attr(shared_matrix_pointer, 'description')$sharedName)
  #  suppressPackageStartupMessages({library(bigmemory)})
  ### TODO: Specify dependencies as parameter
  #  suppressPackageStartupMessages({library(ncdf4)})
  
  #print("1")
  store_indices <- as.list(work_piece[['store_position']])
  first_round_indices <- work_piece[['first_round_indices']]
  second_round_indices <- work_piece[['second_round_indices']]
  #print("2")
  file_to_open <- work_piece[['file_path']]
  sub_array <- file_data_reader(file_to_open, NULL, 
                                work_piece[['file_selectors']],
                                first_round_indices, synonims)
  if (debug) {
    if (all(unlist(store_indices[1:6]) == 1)) {
      print("-> LOADING A WORK PIECE")
      print("-> STRUCTURE OF READ UNTRANSFORMED DATA:")
      print(str(sub_array))
      print("-> STRUCTURE OF VARIABLES TO TRANSFORM:")
      print(str(work_piece[['vars_to_transform']]))
      print("-> COMMON ARRAY DIMENSIONS:")
      print(str(work_piece[['store_dims']]))
    }
  }
  if (!is.null(sub_array)) {
    # Apply data transformation once we have the data arrays.
    if (!is.null(transform)) {
      if (debug) {
        if (all(unlist(store_indices[1:6]) == 1)) {
          print("-> PROCEEDING TO TRANSFORM ARRAY")
          print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:")
          print(dim(sub_array))
        }
      }
      sub_array <- do.call(transform, c(list(data_array = sub_array,
                                             variables = work_piece[['vars_to_transform']],
                                             file_selectors = work_piece[['file_selectors']]),
                                        transform_params))
      if (debug) {
        if (all(unlist(store_indices[1:6]) == 1)) {
          print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:")
          print(str(sub_array))
          print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:")
          print(dim(sub_array$data_array))
        }
      }
      sub_array <- sub_array$data_array
      # Subset with second round of indices
      dims_to_crop <- which(!sapply(second_round_indices, is.null))
      if (length(dims_to_crop) > 0) {
        dimnames_to_crop <- names(second_round_indices)[dims_to_crop]
        sub_array <- Subset(sub_array, dimnames_to_crop, 
                            second_round_indices[dimnames_to_crop])
      }
      if (debug) {
        if (all(unlist(store_indices[1:6]) == 1)) {
          print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:")
          print(str(sub_array))
        }
      }
    }
    
    metadata <- attr(sub_array, 'variables')
    
    names_bk <- names(store_indices)
    store_indices <- lapply(names(store_indices), 
                            function (x) {
                              if (!(x %in% names(first_round_indices))) {
                                store_indices[[x]]
                              } else if (is.null(second_round_indices[[x]])) {
                                1:dim(sub_array)[x]
                              } else {
                                if (is.numeric(second_round_indices[[x]])) {
                                  ## TODO: Review carefully this line. Inner indices are all 
                                  ## aligned to the left-most positions. If dataset A has longitudes
                                  ## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then
                                  ## they will be stored as follows:
                                  ## 1, 2, 3, 4
                                  ## 3, 4, NA, NA
                                  ##x - min(x) + 1
                                  1:length(second_round_indices[[x]])
                                } else {
                                  1:length(second_round_indices[[x]])
                                }
                              }
                            })
    names(store_indices) <- names_bk
    if (debug) {
      if (all(unlist(store_indices) == 1)) {
        print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:")
        print(str(first_round_indices))
        print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:")
        print(str(second_round_indices))
        print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
        print(str(store_indices))
      }
    }
    
    store_indices <- lapply(store_indices, as.integer)
    store_dims <- work_piece[['store_dims']]
    
    # split the storage work of the loaded subset in parts
    largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))]
    max_parts <- length(store_indices[[largest_dim_name]])
    
    # Indexing a data file of N MB with expand.grid takes 30*N MB
    # The peak ram of Start is, minimum, 2 * total data to load from all files
    # due to inefficiencies in other regions of the code
    # The more parts we split the indexing done below in, the lower
    # the memory footprint of the indexing and the fast. 
    # But more than 10 indexing iterations (parts) for each MB processed 
    # makes the iteration slower (tested empirically on BSC workstations).
    subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024
    best_n_parts <- ceiling(subset_size_in_mb * 10)
    # We want to set n_parts to a greater value than the one that would 
    # result in a memory footprint (of the subset indexing code below) equal
    # to 2 * total data to load from all files.
    # s = subset size in MB
    # p = number of parts to break it in
    # T = total size of data to load
    # then, s / p * 30 = 2 * T
    # then, p = s * 15 / T
    min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims))
    # Make sure we pick n_parts much greater than the minimum calculated
    n_parts <- min_n_parts * 10
    if (n_parts > best_n_parts) {
      n_parts <- best_n_parts
    }
    # Boundary checks
    if (n_parts < 1) {
      n_parts <- 1
    }
    if (n_parts > max_parts) {
      n_parts <- max_parts
    }
    
    if (n_parts > 1) {
      make_parts <- function(length, n) {
        clusters <- cut(1:length, n, labels = FALSE)
        lapply(1:n, function(y) which(clusters == y))
      }
      part_indices <- make_parts(max_parts, n_parts)
      parts <- lapply(part_indices, 
                      function(x) {
                        store_indices[[largest_dim_name]][x]
                      })
    } else {
      part_indices <- list(1:max_parts)
      parts <- store_indices[largest_dim_name]
    }
    
    # do the storage work
    weights <- sapply(1:length(store_dims), 
                      function(i) prod(c(1, store_dims)[1:i]))
    part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array))))
    names(part_indices_in_sub_array) <- names(dim(sub_array))
    data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
    for (i in 1:n_parts) {
      store_indices[[largest_dim_name]] <- parts[[i]]
      # Converting array indices to vector indices
      matrix_indices <- do.call("expand.grid", store_indices)
      # Given a matrix where each row is a set of array indices of an element
      # the vector indices are computed
      matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights)
      part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]]
      data_array[matrix_indices] <- as.vector(do.call('[',
                                                      c(list(x = sub_array), 
                                                        part_indices_in_sub_array)))
    }
    rm(data_array)
    gc()
    
    if (!is.null(work_piece[['save_metadata_in']])) {
      saveRDS(metadata, file = work_piece[['save_metadata_in']])
    }
  }
  if (!is.null(work_piece[['progress_amount']]) && !silent) {
    message(work_piece[['progress_amount']], appendLF = FALSE)
  }
  is.null(sub_array)
}