Start.R


      # shape the vector into the array without split_dims
        split_dims_pos <- match(split_dims, final_dims_fake)
        new_dims <- c()
        if (split_dims_pos[1] > 1) {
          new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
        }
        new_dims <- c(new_dims,  prod(split_dims))
        names(new_dims)[split_dims_pos[1]] <- across_inner_dim
        if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
          new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
        }
        final_dims_fake_no_split <- new_dims
        data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split)
      # seperate 'time' dim into each work_piece length
        data_array_seperate <- list()
        tmp <- cumsum(unlist(length_inner_across_dim))
        tmp <- c(0, tmp)
        for (i in 1:length(length_inner_across_dim)) {
          data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
                                             (tmp[i] + 1):tmp[i + 1])
        }

      # re-build the array: chunk 
        which_chunk <- as.numeric(names(final_order_list))
        how_many_indices <- unlist(final_order_list)
        array_piece <- list()
        ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
        for (i in 1:length(final_order_list)) {
          array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]],
                                     across_inner_dim,
                                     ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
          ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
        }
        
        # re-build the array: paste
        data_array_tmp <- array_piece[[1]]
        along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
        if (length(array_piece) > 1) {
          for (i in 2:length(array_piece)) {
          data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
                                        along = along_pos)
          }
        }
      }        
      }
  
      data_array <- array(data_array_tmp, dim = final_dims_fake)
      
    } else {  # merge_across_dims_narm = F (old version)
      data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
    }
    
    # NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above.
    #       To get the user-required dim order, we need to reorder the array again.
    if (split_multiselected_dims & merge_across_dims) {
      if (inner_dim_pos_in_split_dims != 1) {
        correct_order <- match(names(final_dims_fake_output), names(final_dims_fake))
        data_array <- .aperm2(data_array, correct_order)
      }
    }
    
    gc()
    
    # Load metadata and remove the metadata folder
    if (!is.null(metadata_dims)) {
      loaded_metadata_files <- list.files(metadata_folder)

      if (!identical(loaded_metadata_files, character(0))) {  # old code
        loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS)
      } else {
        loaded_metadata <- NULL
      }

      unlink(metadata_folder, recursive = TRUE)

#NOTE: Here, metadata can be saved in one of two ways: one for $common and the other for $dat
#      for $common, it is a list of metadata length. For $dat, it is a list of dat length,
#      and each sublist has the metadata for each dat.
      dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims]
      if (!any(names(dim_of_metadata) == pattern_dims) |
          (any(names(dim_of_metadata) == pattern_dims) &
           dim_of_metadata[pattern_dims] == 1)) {  # put under $common; old code
        return_metadata <- vector('list',
                                  length = prod(dim(array_of_metadata_flags)[metadata_dims]))
        return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata
        dim(return_metadata) <- dim(array_of_metadata_flags[metadata_dims])

      } else { # put under $dat. metadata_dims has 'dat' and dat length > 1
        return_metadata <- vector('list',
                                  length = dim_of_metadata[pattern_dims])
        names(return_metadata) <- dat_names
        for (kk in 1:length(return_metadata)) {
          return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat
        }
        loaded_metadata_count <- 1
        for (kk in 1:length(return_metadata)) {
          for (jj in 1:length(return_metadata[[kk]])) {
            return_metadata[[kk]][jj] <- loaded_metadata[[loaded_metadata_count]]
            names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[loaded_metadata_count]])
            loaded_metadata_count <- loaded_metadata_count + 1
          }
        }
      }
      attr(data_array, 'Variables') <- return_metadata
      # TODO: Try to infer data type from loaded_metadata
      # as.integer(data_array) 
    }
    
    failed_pieces <- work_pieces[which(unlist(found_files))]
    for (failed_piece in failed_pieces) {
      array_of_not_found_files <- do.call('[<-', 
                                          c(list(array_of_not_found_files), 
                                            as.list(failed_piece[['file_indices_in_array_of_files']]),
                                            list(value = TRUE)))
    }
    if (any(array_of_not_found_files)) {
      for (i in 1:prod(dim(array_of_files_to_load))) {
        if (is.na(array_of_not_found_files[i])) {
          array_of_files_to_load[i] <- NA
        } else {
          if (array_of_not_found_files[i]) {
            array_of_not_found_files[i] <- array_of_files_to_load[i]
            array_of_files_to_load[i] <- NA
          } else {
            array_of_not_found_files[i] <- NA
          }
        }
      }
    } else {
      array_of_not_found_files <- NULL
    }
    
  } # End if (retrieve)
  
  # Change final_dims_fake back because retrieve = FALSE will use it for attributes later
  if (exists("final_dims_fake_output")) {
    final_dims_fake <- final_dims_fake_output
  }
  # Replace the vars and common vars by the transformed vars and common vars
  for (i in 1:length(dat)) {
    if (length(names(transformed_vars[[i]])) > 0) {
      picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]]
    } else if (length(names(picked_vars_ordered[[i]])) > 0) {
      picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]]
    }
  }
  if (length(names(transformed_common_vars)) > 0) {
    picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars
  } else if (length(names(picked_common_vars_ordered)) > 0) {
    picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered
  }
  if (debug) {
    print("-> THE TRANSFORMED VARS:")
    print(str(transformed_vars))
    print("-> THE PICKED VARS:")
    print(str(picked_vars))
  }
  
  file_selectors <- NULL
  for (i in 1:length(dat)) {
    file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])]
  }
  if (retrieve) {
    if (!silent) {
      .message("Successfully retrieved data.")
    }

    if (all(sapply(attr(data_array, 'Variables'), is.null))) {
      var_backup <- NULL
      .warning(paste0("Metadata cannot be retrieved. The reason may be the ",
                      "non-existence of the first file. Use parameter 'metadata_dims'",
                      " to assign to file dimensions along which to return metadata, ",
                      "or check the existence of the first file."))
    } else {

#NOTE: The metadata of variables can be saved in one of the two different structures.
#      (1) metadata_dims != 'dat', or (metadata_dims == 'dat' & length(dat) == 1):
#          put under $common
#      (2) (metadata_dims == 'dat' & length(dat) > 1):
#          put under $dat1, $dat2, .... Put it in picked_vars list
#TODO: The current (2) uses the inefficient method. Should define the list structure first
#      then fill the list, rather than expand it in the for loop.
    if (any(metadata_dims == pattern_dims) & length(dat) > 1) { # (2)
      var_backup <- attr(data_array, 'Variables')
      for (kk in 1:length(var_backup)) {
        sublist_names <- lapply(var_backup, names)[[kk]]
        for (jj in 1:length(sublist_names)) {
          picked_vars[[kk]][[sublist_names[jj]]] <- var_backup[[kk]][[jj]]
        }
      }
      var_backup <- NULL

    } else {  #(1)
      var_backup <- attr(data_array, 'Variables')
      var_backup_name <- lapply(var_backup, names)
      var_backup <- lapply(var_backup, '[[', 1)
      names(var_backup) <- unlist(var_backup_name)
    }
}
    attr(data_array, 'Variables') <- NULL
    attributes(data_array) <- c(attributes(data_array), 
                                list(Variables = c(list(common = c(picked_common_vars, var_backup)), 
                                                   picked_vars),
                                     Files = array_of_files_to_load, 
                                     NotFoundFiles = array_of_not_found_files,
                                     FileSelectors = file_selectors,
                                     PatternDim = found_pattern_dim)
    )
    attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class'))
    data_array
  } else {
    if (!silent) {
      .message("Successfully discovered data dimensions.")
    }
    start_call <- match.call()
    for (i in 2:length(start_call)) {
      if (class(start_call[[i]]) %in% c('name', 'call')) {
        start_call[[i]] <- eval.parent(start_call[[i]])
      }
    }
    start_call[['retrieve']] <- TRUE
    attributes(start_call) <- c(attributes(start_call),
                                list(Dimensions = final_dims_fake,
                                     Variables = c(list(common = picked_common_vars), picked_vars),
                                     ExpectedFiles = array_of_files_to_load,
                                     FileSelectors = file_selectors,
                                     PatternDim = found_pattern_dim,
                                     MergedDims = if (merge_across_dims) {
                                       inner_dims_across_files
                                     } else {
                                       NULL
                                     },
                                     SplitDims = if (split_multiselected_dims) {
                                       all_split_dims
                                     } else {
                                       NULL
                                     })
    )
    attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class'))
    start_call
  }
}

# This function is the responsible for loading the data of each work
# piece.
.LoadDataFile <- function(work_piece, shared_matrix_pointer, 
                          file_data_reader, synonims,
                          transform, transform_params,
                          silent = FALSE, debug = FALSE) {
  #  suppressPackageStartupMessages({library(bigmemory)})
  ### TODO: Specify dependencies as parameter
  #  suppressPackageStartupMessages({library(ncdf4)})
  
  #print("1")
  store_indices <- as.list(work_piece[['store_position']])
  first_round_indices <- work_piece[['first_round_indices']]
  second_round_indices <- work_piece[['second_round_indices']]
  #print("2")
  file_to_open <- work_piece[['file_path']]
  sub_array <- file_data_reader(file_to_open, NULL, 
                                work_piece[['file_selectors']],
                                first_round_indices, synonims)
  if (debug) {
    if (all(unlist(store_indices[1:6]) == 1)) {
      print("-> LOADING A WORK PIECE")
      print("-> STRUCTURE OF READ UNTRANSFORMED DATA:")
      print(str(sub_array))
      print("-> STRUCTURE OF VARIABLES TO TRANSFORM:")
      print(str(work_piece[['vars_to_transform']]))
      print("-> COMMON ARRAY DIMENSIONS:")
      print(str(work_piece[['store_dims']]))
    }
  }
  if (!is.null(sub_array)) {
    # Apply data transformation once we have the data arrays.
    if (!is.null(transform)) {
      if (debug) {
        if (all(unlist(store_indices[1:6]) == 1)) {
          print("-> PROCEEDING TO TRANSFORM ARRAY")
          print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:")
          print(dim(sub_array))
        }
      }
      sub_array <- do.call(transform, c(list(data_array = sub_array,
                                             variables = work_piece[['vars_to_transform']],
                                             file_selectors = work_piece[['file_selectors']]),
                                        transform_params))
      if (debug) {
        if (all(unlist(store_indices[1:6]) == 1)) {
          print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:")
          print(str(sub_array))
          print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:")
          print(dim(sub_array$data_array))
        }
      }
      sub_array <- sub_array$data_array
      # Subset with second round of indices
      dims_to_crop <- which(!sapply(second_round_indices, is.null))
      if (length(dims_to_crop) > 0) {
        dimnames_to_crop <- names(second_round_indices)[dims_to_crop]
        sub_array <- Subset(sub_array, dimnames_to_crop, 
                            second_round_indices[dimnames_to_crop])
      }
      if (debug) {
        if (all(unlist(store_indices[1:6]) == 1)) {
          print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:")
          print(str(sub_array))
        }
      }
    }
    
    metadata <- attr(sub_array, 'variables')
    
    names_bk <- names(store_indices)
    store_indices <- lapply(names(store_indices), 
                            function (x) {
                              if (!(x %in% names(first_round_indices))) {
                                store_indices[[x]]
                              } else if (is.null(second_round_indices[[x]])) {
                                1:dim(sub_array)[x]
                              } else {
                                if (is.numeric(second_round_indices[[x]])) {
                                  ## TODO: Review carefully this line. Inner indices are all 
                                  ## aligned to the left-most positions. If dataset A has longitudes
                                  ## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then
                                  ## they will be stored as follows:
                                  ## 1, 2, 3, 4
                                  ## 3, 4, NA, NA
                                  ##x - min(x) + 1
                                  1:length(second_round_indices[[x]])
                                } else {
                                  1:length(second_round_indices[[x]])
                                }
                              }
                            })
    names(store_indices) <- names_bk
    if (debug) {
      if (all(unlist(store_indices) == 1)) {
        print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:")
        print(str(first_round_indices))
        print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:")
        print(str(second_round_indices))
        print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
        print(str(store_indices))
      }
    }
    
    store_indices <- lapply(store_indices, as.integer)
    store_dims <- work_piece[['store_dims']]
    
    # split the storage work of the loaded subset in parts
    largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))]
    max_parts <- length(store_indices[[largest_dim_name]])
    
    # Indexing a data file of N MB with expand.grid takes 30*N MB
    # The peak ram of Start is, minimum, 2 * total data to load from all files
    # due to inefficiencies in other regions of the code
    # The more parts we split the indexing done below in, the lower
    # the memory footprint of the indexing and the fast. 
    # But more than 10 indexing iterations (parts) for each MB processed 
    # makes the iteration slower (tested empirically on BSC workstations).
    subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024
    best_n_parts <- ceiling(subset_size_in_mb * 10)
    # We want to set n_parts to a greater value than the one that would 
    # result in a memory footprint (of the subset indexing code below) equal
    # to 2 * total data to load from all files.
    # s = subset size in MB
    # p = number of parts to break it in
    # T = total size of data to load
    # then, s / p * 30 = 2 * T
    # then, p = s * 15 / T
    min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims))
    # Make sure we pick n_parts much greater than the minimum calculated
    n_parts <- min_n_parts * 10
    if (n_parts > best_n_parts) {
      n_parts <- best_n_parts
    }
    # Boundary checks
    if (n_parts < 1) {
      n_parts <- 1
    }
    if (n_parts > max_parts) {
      n_parts <- max_parts
    }
    
    if (n_parts > 1) {
      make_parts <- function(length, n) {
        clusters <- cut(1:length, n, labels = FALSE)
        lapply(1:n, function(y) which(clusters == y))
      }
      part_indices <- make_parts(max_parts, n_parts)
      parts <- lapply(part_indices, 
                      function(x) {
                        store_indices[[largest_dim_name]][x]
                      })
    } else {
      part_indices <- list(1:max_parts)
      parts <- store_indices[largest_dim_name]
    }
    
    # do the storage work
    weights <- sapply(1:length(store_dims), 
                      function(i) prod(c(1, store_dims)[1:i]))
    part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array))))
    names(part_indices_in_sub_array) <- names(dim(sub_array))
    data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
    for (i in 1:n_parts) {
      store_indices[[largest_dim_name]] <- parts[[i]]
      # Converting array indices to vector indices
      matrix_indices <- do.call("expand.grid", store_indices)
      # Given a matrix where each row is a set of array indices of an element
      # the vector indices are computed
      matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights)
      part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]]
      data_array[matrix_indices] <- as.vector(do.call('[',
                                                      c(list(x = sub_array), 
                                                        part_indices_in_sub_array)))
    }
    rm(data_array)
    gc()
    
    if (!is.null(work_piece[['save_metadata_in']])) {
      saveRDS(metadata, file = work_piece[['save_metadata_in']])
    }
  }
  if (!is.null(work_piece[['progress_amount']]) && !silent) {
    message(work_piece[['progress_amount']], appendLF = FALSE)
  }
  is.null(sub_array)
}