zzz.R 70.4 KB
Newer Older
# Take *_var parameters apart
take_var_params <- function(dim_params) {
 # Take *_var parameters apart
  var_params_ind <- grep('_var$', names(dim_params))
  var_params <- dim_params[var_params_ind]
  # Check all *_var are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (var_param in var_params) {
    if (!is.character(var_param)) {
      stop("All '*_var' parameters must be character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(var_params)[i],
                                               '_var$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_var' parameters must be associated to a dimension parameter. Found parameter '",
                  names(var_params)[i], "' but no parameter '",
                  strsplit(names(var_params)[i], '_var$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'var_params' to be the name of 
  # the corresponding dimension.
  if (length(var_params) < 1) {
    var_params <- NULL
  } else {
    names(var_params) <- gsub('_var$', '', names(var_params))
  }
  return(var_params)
}

# Take *_reorder parameters apart
take_var_reorder <- function(dim_params) {
  # Take *_reorder parameters apart
  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
  dim_reorder_params <- dim_params[dim_reorder_params_ind]
  # Make the keys of 'dim_reorder_params' to be the name of 
  # the corresponding dimension.
  if (length(dim_reorder_params) < 1) {
    dim_reorder_params <- NULL
  } else {
    names(dim_reorder_params) <- gsub('_reorder$', '', names(dim_reorder_params))
  }
  return(dim_reorder_params)
}

# Take *_depends parameters apart
take_var_depends <- function(dim_params) {
  depends_params_ind <- grep('_depends$', names(dim_params))
  depends_params <- dim_params[depends_params_ind]
  # Check all *_depends are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (depends_param in depends_params) {
    if (!is.character(depends_param) || (length(depends_param) > 1)) {
      stop("All '*_depends' parameters must be single character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(depends_params)[i],
                                               '_depends$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_depends' parameters must be associated to a dimension parameter. Found parameter '",
                  names(depends_params)[i], "' but no parameter '",
                  strsplit(names(depends_params)[i], '_depends$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'depends_params' to be the name of 
  # the corresponding dimension.
  if (length(depends_params) < 1) {
    depends_params <- NULL
  } else {
    names(depends_params) <- gsub('_depends$', '', names(depends_params))
  }
  return(depends_params)
}

# Take *_across parameters apart
take_var_across <- function(dim_params) {
  across_params_ind <- grep('_across$', names(dim_params))
  across_params <- dim_params[across_params_ind]
  # Check all *_across are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (across_param in across_params) {
    if (!is.character(across_param) || (length(across_param) > 1)) {
      stop("All '*_across' parameters must be single character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(across_params)[i],
                                               '_across$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_across' parameters must be associated to a dimension parameter. Found parameter '",
                  names(across_params)[i], "' but no parameter '",
                  strsplit(names(across_params)[i], '_across$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'across_params' to be the name of 
  # the corresponding dimension.
  if (length(across_params) < 1) {
    across_params <- NULL
  } else {
    names(across_params) <- gsub('_across$', '', names(across_params))
  }
  return(across_params)
}

# Leave alone the dimension parameters in the variable dim_params
rebuild_dim_params <- function(dim_params, merge_across_dims,
                               inner_dims_across_files) {
  var_params_ind <- grep('_var$', names(dim_params))
  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
  tolerance_params_ind <- grep('_tolerance$', names(dim_params))
  depends_params_ind <- grep('_depends$', names(dim_params))
  across_params_ind <- grep('_across$', names(dim_params))
  # Leave alone the dimension parameters in the variable dim_params
  if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind,
               depends_params_ind, across_params_ind)) > 0) {
    dim_params <- dim_params[-c(var_params_ind, dim_reorder_params_ind,
                                tolerance_params_ind, depends_params_ind,
                                across_params_ind)]
    # Reallocating pairs of across file and inner dimensions if they have
    # to be merged. They are put one next to the other to ease merge later.
    if (merge_across_dims) {
      if (any(!names(inner_dims_across_files) %in% names(dim_params)) |
          any(!unlist(inner_dims_across_files) %in% names(dim_params)))
        stop("All *_across parameters must have value as a file dimension name.")
      for (inner_dim_across in names(inner_dims_across_files)) {
        inner_dim_pos <- which(names(dim_params) == inner_dim_across)
        file_dim_pos <- which(names(dim_params) == inner_dims_across_files[[inner_dim_across]])
        new_pos <- inner_dim_pos
        if (file_dim_pos < inner_dim_pos) {
          new_pos <- new_pos - 1
        }
        dim_params_to_move <- dim_params[c(inner_dim_pos, file_dim_pos)]
        dim_params <- dim_params[-c(inner_dim_pos, file_dim_pos)]
        new_dim_params <- list()
        if (new_pos > 1) {
          new_dim_params <- c(new_dim_params, dim_params[1:(new_pos - 1)])
        }
        new_dim_params <- c(new_dim_params, dim_params_to_move)
        if (length(dim_params) >= new_pos) {
          new_dim_params <- c(new_dim_params, dim_params[new_pos:length(dim_params)])
        }
        dim_params <- new_dim_params
      }
    }
  }
  dim_names <- names(dim_params)
  if (is.null(dim_names)) {
    stop("At least one pattern dim must be specified.")
  }
  return(dim_params)
}

# Look for chunked dims
look_for_chunks <- function(dim_params, dim_names) {
    chunks <- vector('list', length(dim_names))
  names(chunks) <- dim_names
  for (dim_name in dim_names) {
    if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) {
      chunks[[dim_name]] <- attr(dim_params[[dim_name]], 'chunk')
      attributes(dim_params[[dim_name]]) <- attributes(dim_params[[dim_name]])[-which(names(attributes(dim_params[[dim_name]])) == 'chunk')]
    } else {
      chunks[[dim_name]] <- c(chunk = 1, n_chunks = 1)
    }
  }
  return(chunks)
}

# This is a helper function to compute the chunk indices to take once the total
# number of indices for a dimension has been discovered.
  get_chunk_indices <- function(n_indices, chunk, n_chunks, dim_name) {
    if (n_chunks > n_indices) {
      stop("Requested to divide dimension '", dim_name, "' of length ",
           n_indices, " in ", n_chunks, " chunks, which is not possible.")
    }
    chunk_sizes <- rep(floor(n_indices / n_chunks), n_chunks)
    chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks
    if (chunks_to_extend > 0) {
      chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1
    }
    chunk_size <- chunk_sizes[chunk]
    offset <- 0
    if (chunk > 1) {
      offset <- sum(chunk_sizes[1:(chunk - 1)])
    }
    indices <- 1:chunk_sizes[chunk] + offset
    array(indices, dim = setNames(length(indices), dim_name))
  }

# Check pattern_dims
# Function found_pattern_dims may change pattern_dims in the parent.env
found_pattern_dims <- function(pattern_dims, dim_names, var_params,
                               dim_params, dim_reorder_params) {
  if (is.null(pattern_dims)) {
    .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '",
                    dim_names[1], "' as 'pattern_dims'."))
    assign('pattern_dims', dim_names[1], envir = parent.frame())
    pattern_dims <- dim_names[1]
  } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) {
    assign('pattern_dims', unique(pattern_dims), envir = parent.frame())
    pattern_dims <- unique(pattern_dims)
  } else {
    stop("Parameter 'pattern_dims' must be a vector of character strings.")
  }
  if (any(names(var_params) %in% pattern_dims)) {
    stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.")
  }
  # Find the pattern dimension with the pattern specifications
  found_pattern_dim <- NULL
  for (pattern_dim in pattern_dims) {
    # Check all specifications in pattern_dim are valid
aho's avatar
aho committed
#    dat <- datasets <- dim_params[[pattern_dim]]
    dat <- dim_params[[pattern_dim]]
    if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) {
      stop(paste0("Parameter '", pattern_dim,
                  "' must be a list of lists with pattern specifications or a vector of character strings."))
    }
    if (!is.null(dim_reorder_params[[pattern_dim]])) {
      .warning(paste0("A reorder for the selectors of '", pattern_dim,
                      "' has been specified, but it is a pattern dimension and the reorder will be ignored."))
    }
    if (is.list(dat) || any(sapply(dat, is.list))) {
      if (is.null(found_pattern_dim)) {
        found_pattern_dim <- pattern_dim
      } else {
        stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.")
      }
    }
  }
  if (is.null(found_pattern_dim)) {
   .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications."))
    found_pattern_dim <- pattern_dims[1]
  }
  return(found_pattern_dim)
}

aho's avatar
aho committed

# The variable 'dat' is mounted with the information (name, path) of each dataset.
# NOTE: This function creates the object 'dat_names' in the parent env.
aho's avatar
aho committed
mount_dat <- function(dat, pattern_dims, found_pattern_dim, dat_names) {
aho's avatar
aho committed

#  dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames')
  dat_to_fetch <- c()
aho's avatar
aho committed
#  dat_names <- c()
aho's avatar
aho committed
  if (!is.list(dat)) {
    dat <- as.list(dat)
  } else {
    if (!any(sapply(dat, is.list))) {
      dat <- list(dat)
    }
  }
  for (i in 1:length(dat)) {
    if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) {
      if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) {
        dat[[i]] <- list(path = dat[[i]])
      } else {
        dat[[i]] <- list(name = dat[[i]])
      }
    } else if (!is.list(dat[[i]])) {
aho's avatar
aho committed
      stop(paste0("Parameter '", pattern_dims, 
aho's avatar
aho committed
                  "' is incorrect. It must be a list of lists or character strings."))
    }
    #if (!(all(names(dat[[i]]) %in% dat_info_names))) {
    #  stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.")
    #}
    if (!('name' %in% names(dat[[i]]))) {
      dat[[i]][['name']] <- paste0('dat', i)
      if (!('path' %in% names(dat[[i]]))) {
        stop(paste0("Parameter '", found_pattern_dim, 
                    "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided."))
      }
    } else if (!('path' %in% names(dat[[i]]))) {
      dat_to_fetch <- c(dat_to_fetch, i)
    }
    #if ('path' %in% names(dat[[i]])) {
    #  if (!('nc_var_name' %in% names(dat[[i]]))) {
    #    dat[[i]][['nc_var_name']] <- '$var_name$'
    #  }
    #  if (!('suffix' %in% names(dat[[i]]))) {
    #    dat[[i]][['suffix']] <- ''
    #  }
    #  if (!('var_min' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_min']] <- ''
    #  }
    #  if (!('var_max' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_max']] <- ''
    #  }
    #}
    dat_names <- c(dat_names, dat[[i]][['name']])
  }
  if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) {
    .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.")
  }
  if (length(dat_to_fetch) > 0) {
    stop("Specified only the name for some data sets, but not the path ",
         "pattern. This option has not been yet implemented.")
  }

  assign('dat_names', dat_names, envir = parent.frame())
  return(dat)
}

# Add attributes indicating whether this dimension selector is value or indice
add_value_indices_flag <- function(x) {
  if (is.null(attr(x, 'values')) || is.null(attr(x, 'indices'))) {
    flag <- (any(x %in% c('all', 'first', 'last')) || is.numeric(unlist(x)))
    attr(x, 'values') <- !flag
    attr(x, 'indices') <- flag
  }
  return(x)
}


# Find the value for the undefined selector (i.e., indices()). Use the value from the first 
# found file.
# Note that "dat[[i]][['path']]" in parent env. is changed in this function.
find_ufd_value <- function(undefined_file_dims, dat, i, replace_values,
                           first_file, file_dims, path_glob_permissive, 
                           depending_file_dims, dat_selectors, selector_checker, chunks) {
  first_values <- vector('list', length = length(undefined_file_dims))
  names(first_values) <- undefined_file_dims
  found_values <- 0
  stop <- FALSE
  try_dim <- 1
  last_success <- 1
  while ((found_values < length(undefined_file_dims)) && !stop) {
    u_file_dim <- undefined_file_dims[try_dim]
    if (is.null(first_values[[u_file_dim]])) {
      path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], 
                                                           replace_values[-which(file_dims == u_file_dim)],
                                                           allow_undefined_key_vars = TRUE)
      found_value <- .FindTagValue(path_with_globs_and_tag, 
                                   first_file, u_file_dim)
      if (!is.null(found_value)) {
        found_values <- found_values + 1
        last_success <- try_dim
        first_values[[u_file_dim]] <- found_value
        replace_values[[u_file_dim]] <- found_value
      }
    }
    try_dim <- (try_dim %% length(undefined_file_dims)) + 1
    if (try_dim == last_success) {
      stop <- TRUE
    }
  }
  if (found_values < length(undefined_file_dims)) {
    stop(paste0("Path pattern of dataset '", dat[[i]][['name']], 
                "' is too complex. Could not automatically ",
                "detect values for all non-explicitly defined ",
                "indices. Check its pattern: ", dat[[i]][['path']]))
  }
  ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above
  ##       Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical
  ##       cmor case, requesting all members and chunks for fixed var and sdate. Not fixing
  ##       sdate raised 'too complex' error.
  # Replace shell globs in path pattern and keep the file_dims as tags
  dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, 
                                                file_dims, dat[[i]][['name']], path_glob_permissive)

  # Now time to look for the available values for the non 
  # explicitly defined selectors for the file dimensions.
  #print("H")
  # Check first the ones that do not depend on others.
  ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))],
           undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))])
  
  for (u_file_dim in ufd) {
    replace_values[undefined_file_dims] <- first_values
    replace_values[[u_file_dim]] <- '*'
    depended_dim <- NULL
    depended_dim_values <- NA

    #NOTE: Here 'selectors' is always 1. Is it supposed to be like this?
    selectors <- dat_selectors[[u_file_dim]][[1]]
    if (u_file_dim %in% names(depending_file_dims)) {
      depended_dim <- depending_file_dims[[u_file_dim]]
      depended_dim_values <- dat_selectors[[depended_dim]][[1]]
      dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values))
      names(dat_selectors[[u_file_dim]]) <- depended_dim_values
    } else {
      dat_selectors[[u_file_dim]] <- list()
    }
    if (u_file_dim %in% unlist(depending_file_dims)) {
      depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))]
      replace_values[depending_dims] <- rep('*', length(depending_dims))
    }
    for (j in 1:length(depended_dim_values)) {
      parsed_values <- c()
      if (!is.null(depended_dim)) {
        replace_values[[depended_dim]] <- depended_dim_values[j]
      }
      path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
      found_files <- Sys.glob(path_with_globs)
      ## TODO: Enhance this error message, or change by warning.
      ##       Raises if a wrong sdate is specified, for example.
      if (length(found_files) == 0) {
        .warning(paste0("Could not find files for any '", u_file_dim, 
                        "' for '", depended_dim, "' = '", 
                        depended_dim_values[j], "'."))
        dat_selectors[[u_file_dim]][[j]] <- NA
      } else {
        for (found_file in found_files) {
          path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']],
                                                               replace_values[-which(file_dims == u_file_dim)],
                                                               allow_undefined_key_vars = TRUE)
          parsed_values <- c(parsed_values, 
                             .FindTagValue(path_with_globs_and_tag, found_file, 
                                           u_file_dim))
        }
        #TODO: selector_checker() doesn't allow selectors to be characters. For selectors
        #      like "member = 'r7i1p1f1", it cannot be defined with values.
        dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors,
                                                             var = unique(parsed_values),
                                                             return_indices = FALSE)
        # Take chunk if needed
        dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][get_chunk_indices(length(dat_selectors[[u_file_dim]][[j]]),
aho's avatar
aho committed
                                                                                           chunks[[u_file_dim]]['chunk'],
                                                                                           chunks[[u_file_dim]]['n_chunks'], 
                                                                                           u_file_dim)]
      }
    }
  }
  #NOTE: change 'dat' in parent env. because "dat[[i]][['path']]" is changed.
  assign('dat', dat, envir = parent.frame())
  return(dat_selectors)
}


# Adjust the argument 'return_vars' if users don't assign them properly.
# Force return_vars = (time = NULL) to (time = 'sdate') if one of the situations:
# (1) selector = [sdate = 2, time = 4], or
# (2) time_across = 'sdate'.
correct_return_vars <- function(inner_dim, inner_dims_across_files, found_pattern_dim,
                                file_dim_as_selector_array_dim) {
    # inner_dim is not in return_vars or is NULL
      if (is.character(file_dim_as_selector_array_dim)) { #(1)
        if (any(file_dim_as_selector_array_dim %in% found_pattern_dim)) {
          stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '",
                      found_pattern_dim, 
                      "', which is not allowed. To assign the dependency on the pattern dim, ",
                      "use 'return_vars = list(", inner_dim, " = 'dat')' instead."))
        } else {
          corrected_value <- file_dim_as_selector_array_dim
        }
      } else if (inner_dim %in% inner_dims_across_files) { #(2)
        file_dim_name <- names(which(inner_dim == inner_dims_across_files))
        if (file_dim_name %in% found_pattern_dim) {
          stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '",
                      found_pattern_dim, "', which is not allowed."))
        } else {
          corrected_value <- file_dim_name
        }
      }
aho's avatar
aho committed
      .warning(paste0("Found '", inner_dim, "' dependency on file dimension '", corrected_value,
                      "', but '", inner_dim, "' is not in return_vars list or does not include '", corrected_value,
                      "'. To provide the correct metadata, '", corrected_value, "' is included under '", inner_dim,
                      "' in 'return_vars."))
  return(corrected_value)
}

# The time classes that are needed to adjust time zone back to UTC.
time_special_types <- function() {
  list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, 'Date' = as.Date)
}

# Replace the dim names read from netCDF file with the user-specified synonims.
replace_with_synonmins <- function(read_dims, synonims) {
  corrected_dim_name <- sapply(names(read_dims), 
                            function(x) {
                              which_entry <- which(sapply(synonims, function(y) x %in% y))
                              if (length(which_entry) > 0) {
                                names(synonims)[which_entry]
                              } else {
                                x
                              }
                            })
  return(corrected_dim_name)
}


# Prepare vars_to_read for this dataset (i loop) and this file (j loop)
generate_vars_to_read <- function(return_vars, changed_dims, first_found_file, common_return_vars,
                                  common_first_found_file, i) {
  vars_to_read <- NULL
  if (length(return_vars) > 0) {
    #NOTE: because return_vars has changed 'dat' to character(0) above (line 1775),
    #      'dat' won't be included in vars_to_read here.
    vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))]
  }
  if (!is.null(first_found_file)) {
    if (any(!first_found_file)) {
      vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)]))
    }
  }
  if ((i == 1) && (length(common_return_vars) > 0)) {
    vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))])
  }
  if (!is.null(common_first_found_file)) {
    if (any(!common_first_found_file)) {
      vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)]))
    }
  }
  return(vars_to_read)
}

# Find the largest dims length within one dataset.
find_largest_dims_length <- function(selectors_total_list, array_of_files_to_load,
                                     selector_indices_save, dat, expected_inner_dims,
                                     synonims, file_dim_reader) {
  # Open and get all the dims from all the files
  data_dims_all_files <- vector('list', length = length(selectors_total_list))

  for (selectors_kk in 1:length(data_dims_all_files)) {
    file_to_open <- do.call("[", c(list(array_of_files_to_load), 
                         as.list(selector_indices_save[[selectors_kk]])))
    data_dims_all_files[[selectors_kk]] <- try(
      file_dim_reader(file_to_open, NULL, selectors_total_list[[selectors_kk]],
                      lapply(dat[['selectors']][expected_inner_dims], '[[', 1),
                      synonims), silent = TRUE)

  }

  # Remove the missing files (i.e., fail try above)
  if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) {
      tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error')
      data_dims_all_files <- data_dims_all_files[-tmp]
  }

  # Find the longest dimensions from all the files
  largest_data_dims <- rep(0, length(data_dims_all_files[[1]]))

  # The inner dim order may differ among files. Need to align them before
  # find out the largest dim length.
  dim_names_first_file <- names(data_dims_all_files[[1]])
  same_dim_order <-lapply(lapply(data_dims_all_files, names),      
                          identical, dim_names_first_file)
  for (to_fix in which(!unlist(same_dim_order))) {
    data_dims_all_files[[to_fix]] <- data_dims_all_files[[to_fix]][match(dim_names_first_file,
                                       names(data_dims_all_files[[to_fix]]))]
  }

  for (kk in 1:length(data_dims_all_files[[1]])) {
    largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk))
  }
  names(largest_data_dims) <- names(data_dims_all_files[[1]])
  return(list(largest_data_dims = largest_data_dims,
              data_dims_all_files = data_dims_all_files))
}

# Gererate vars_to_transform from picked_vars[[i]] and picked_common_vars
generate_vars_to_transform <- function(vars_to_transform, picked_vars, transform_vars, 
                                       picked_vars_ordered) {
  # In Start(), picked_vars can be picked_vars[[i]] or picked_common_vars
  picked_vars_to_transform <- which(names(picked_vars) %in% transform_vars)
  if (length(picked_vars_to_transform) > 0) {
    picked_vars_to_transform <- names(picked_vars)[picked_vars_to_transform]
    new_vars_to_transform <- picked_vars[picked_vars_to_transform]
    which_are_ordered <- which(!sapply(picked_vars_ordered[picked_vars_to_transform], is.null))
            
    if (length(which_are_ordered) > 0) {    
      tmp <- which(!is.na(match(names(picked_vars_ordered), names(which_are_ordered))))
      new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[tmp]
    }
    vars_to_transform <- c(vars_to_transform, new_vars_to_transform)
  }
  return(vars_to_transform)
}

# Turn indices to values for transform_crop_domain
generate_transform_crop_domain_values <- function(transform_crop_domain, picked_vars) {
  if (any(transform_crop_domain == 'all')) {
    transform_crop_domain <- c(picked_vars[1], tail(picked_vars, 1))
  } else {  # indices()
    if (is.list(transform_crop_domain)) {
      transform_crop_domain <- picked_vars[unlist(transform_crop_domain)]
    } else {  # vector
      transform_crop_domain <-
        c(picked_vars[transform_crop_domain[1]],
          picked_vars[tail(transform_crop_domain, 1)])
    }
  }
  return(transform_crop_domain)
}

# Out-of-range warning
show_out_of_range_warning <- function(inner_dim, range, bound) {
  # bound: 'lower' or 'upper'
  .warning(paste0("The ", bound, " boundary of selector of ", inner_dim,
                  " is out of range [", min(range), ", ", max(range), "]. ",
                  "Check if the desired range is all included."))
}

# 'sub_sub_array_of_values' is for sri chunking. If this inner dim is chunked,
# the sri has to follow the chunking of fri. Therefore, we save the original 
# value of this chunk here for later use. We'll find the corresponding 
# transformed value within 'sub_sub_array_of_values' and chunk sri. This
# function also returns 'previous_sub_subarray_of_values', which is used for
# checking if there is sri being skipped.
generate_sub_sub_array_of_values <- function(input_array_of_values, sub_array_of_indices,
                                             number_of_chunk) {
  previous_sub_sub_array_of_values <- NULL

  if (is.list(sub_array_of_indices)) {
    sub_sub_array_of_values <- list(input_array_of_values[sub_array_of_indices[[1]]],
                                    input_array_of_values[sub_array_of_indices[[2]]])
    if (number_of_chunk > 1) {
aho's avatar
aho committed
      if (diff(unlist(sub_array_of_indices)) > 0) {
        previous_sub_sub_array_of_values <- 
          input_array_of_values[sub_array_of_indices[[1]] - 1]
      } else {
        previous_sub_sub_array_of_values <-
          input_array_of_values[sub_array_of_indices[[1]] + 1]
      }
    }
  } else {  # is vector
    sub_sub_array_of_values <- input_array_of_values[sub_array_of_indices]
    if (number_of_chunk > 1) {
aho's avatar
aho committed
      if (diff(sub_array_of_indices[1:2]) > 0) {
        previous_sub_sub_array_of_values <- 
          input_array_of_values[sub_array_of_indices[1] - 1]
      } else {
        previous_sub_sub_array_of_values <-
          input_array_of_values[sub_array_of_indices[1] + 1]
      }
    }
  }

  return(list(sub_sub_array_of_values = sub_sub_array_of_values,
              previous_sub_sub_array_of_values = previous_sub_sub_array_of_values))
}


# Generate sub_array_of_fri 
generate_sub_array_of_fri <- function(with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta,
                                      is_circular_dim, add_beta = TRUE) {
  print_warning <- FALSE
  if (goes_across_prime_meridian) {
    #NOTE: The potential problem here is, if it is global longitude,
    #      and the indices overlap (e.g., lon = [0, 359.723] and 
    #      CircularSort(-180, 180), then sub_array_of_indices = list(649, 649)). 
    #      Therefore, sub_array_of_fri will be c(1:649, 649:1296). We'll get two 649.
    #      The fix below may not be the best solution, but it works for the example above.

    if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) {
      # global longitude
      sub_array_of_fri <- 1:n  # n = prod(dim(var_with_selectors))

      if (with_transform & beta != 0 & add_beta) {
        # Warning if transform_extra_cell != 0
        print_warning <- TRUE
      }

    } else {
      # normal case, i.e., not global
      first_index <- min(unlist(sub_array_of_indices))
      last_index <- max(unlist(sub_array_of_indices))
      if (with_transform & add_beta) {
        gap_width <- last_index - first_index - 1
        actual_beta <- min(gap_width, beta)
        sub_array_of_fri <- c(1:(first_index + actual_beta),                                      
                              (last_index - actual_beta):n)
        if (actual_beta != beta) {
          print_warning <- TRUE
        }
      } else {
        sub_array_of_fri <- c(1:first_index, last_index:n)
      }
    }

  } else {
    #NOTE: This if seems redundant.
#    if (is.list(sub_array_of_indices)) {
#      sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]]
#    }
    #NOTE: sub_array_of_indices may be vector or list
    if (with_transform & add_beta) {
      first_index <- min(unlist(sub_array_of_indices))
      last_index <- max(unlist(sub_array_of_indices))
      start_padding <- min(beta, first_index - 1)
      end_padding <- min(beta, n - last_index)
  
      if (!is_circular_dim) {  #latitude or when <var>_reorder is not used
        sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding)
        if (start_padding != beta | end_padding != beta) {
          print_warning <- TRUE
        }
      } else {  #longitude
        if (start_padding == beta & end_padding == beta) {
          # normal regional situation
          sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding)
        } else if (start_padding < beta & end_padding < beta) {
          # global
          sub_array_of_fri <- 1:n
        } else if (start_padding < beta) {
          # left side too close to border, need to go to right side
          sub_array_of_fri <- c((first_index - start_padding):(last_index + end_padding), (n - (beta - start_padding - 1)):n)
        } else if (end_padding < beta) {
          # right side too close to border, need to go to left side
          sub_array_of_fri <- c(1: (beta - end_padding), (first_index - start_padding):(last_index + end_padding))
        }
      }

    } else {
      if (is.list(sub_array_of_indices)) {
        sub_array_of_fri <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]]
      } else {
        sub_array_of_fri <- sub_array_of_indices
      }
    }
  }
  if (print_warning) {
    .warning(paste0("Adding parameter transform_extra_cells =  ", beta,
                    " to the transformed index excesses ",
                    "the border. The border index is used for transformation."))
  }

  return(sub_array_of_fri)
}

# This function merges two dimensions (e.g., time and sdate if "time_across = 'sdate'") into one. 
# The two dimensions have to be next to each other. In Start(), it is used to reshape 
# final_dims_fake if merge_across_dims = TRUE
dims_merge <- function(inner_dims_across_files, final_dims_fake) {
  # inner_dims_across_files would be like: $sdate: "time"
  for (file_dim_across in names(inner_dims_across_files)) {
    inner_dim_pos <- which(names(final_dims_fake) == inner_dims_across_files[[file_dim_across]])
    new_dims <- c()
    # part 1: Put the dims before 'time' in new_dims
    if (inner_dim_pos > 1) {
      new_dims <- c(new_dims, final_dims_fake[1:(inner_dim_pos - 1)])
    }
    # part 2: Merge time and sdate together, and name this dim as 'time'
    # The cross and being crossed dims are next to each other, e.g., [time, sdate]
    new_dims <- c(new_dims, setNames(prod(final_dims_fake[c(inner_dim_pos, inner_dim_pos + 1)]), 
                                     inner_dims_across_files[[file_dim_across]]))
    # part 3: Put the dimes after 'sdate' in new_dims
    if (inner_dim_pos + 1 < length(final_dims_fake)) {
      new_dims <- c(new_dims, final_dims_fake[(inner_dim_pos + 2):length(final_dims_fake)])
    }
    final_dims_fake <- new_dims
  }
  return(final_dims_fake)
}

# This function splits one dimension into two. In Start(), it is used to reshape final_dims_fake
# if split_multiselected_dims = TRUE. 
dims_split <- function(dim_params, final_dims_fake) {
  all_split_dims <- NULL
  for (dim_param in 1:length(dim_params)) {
    split_dims <- dim(dim_params[[dim_param]])
    if (!is.null(split_dims)) {
      if (length(split_dims) > 1) {
        all_split_dims <- c(all_split_dims, setNames(list(split_dims), 
                                                     names(dim_params)[dim_param]))
        if (is.null(names(split_dims))) {
          names(split_dims) <- paste0(names(dim_params)[dim_param], 
                                      1:length(split_dims))
        }
        old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param])

        # If merge_across_dims and split_multiselected_dims are both used,
        # on one file dim, and this file dim is multi-dim, it doesn't work.
        if (identical(old_dim_pos, integer(0))) {
          stop(paste0("The dimension '", names(dim_params)[dim_param], 
                      "' to be split cannot be found after 'merge_across_dims' ",
                      "is used. Check if the reshape parameters are used appropriately."))
        }
 
        # NOTE: Three steps to create new dims.
        # 1st: Put in the dims before split_dim.
        # 2nd: Replace the old_dim with split_dims.
        # 3rd: Put in the dims after split_dim.
        new_dims <- c()
        if (old_dim_pos > 1) {
          new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)])
        }
        new_dims <- c(new_dims, split_dims)
        if (old_dim_pos < length(final_dims_fake)) {
          new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)])
        }
        final_dims_fake <- new_dims
      }
    }
  }
  return(list(final_dims_fake, all_split_dims))
}


# This function sums up the length of all the inner across dim (e.g., time: list(31, 29, 31, 30))
# and use it to replace the value of that inner dim. That is, it returns the actual length of 
# time rather than using the one including NAs. In Start(), it is used to reshape final_dims_fake
# if merge_across_dims = TRUE, merge_across_dims_narm = TRUE, and split_multiselected_dims = FALSE. 
merge_narm_dims <- function(final_dims_fake, across_inner_dim, length_inner_across_dim) {
  final_dims_fake_name <- names(final_dims_fake)
  pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim)
  new_length_inner_dim <- sum(unlist(length_inner_across_dim))
  if (pos_across_inner_dim != length(final_dims_fake)) {
    final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
                         new_length_inner_dim,
                         final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)])
  } else {
    final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
                         new_length_inner_dim)
  }
  names(final_dims_fake) <- final_dims_fake_name
  return(final_dims_fake)
}



# Adjust the dim order. If split_multiselected_dims + merge_across_dims, the dim order may 
# need to be changed. The inner_dim needs to be the first dim among split dims.
reorder_split_dims <- function(all_split_dims, inner_dim_pos_in_split_dims, final_dims_fake) {
  all_split_dims <- c(all_split_dims[inner_dim_pos_in_split_dims],
                      all_split_dims[1:length(all_split_dims)][-inner_dim_pos_in_split_dims])
  split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(all_split_dims))))
  new_dims <- c()
  if (split_dims_pos[1] != 1) {
    new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
  }
  new_dims <- c(new_dims, all_split_dims)
  if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
    new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
  }
  final_dims_fake <- new_dims

  return(list(final_dims_fake, all_split_dims))
}

aho's avatar
aho committed
# Find the final_dims_fake for metadata if it needs to be reshaped
find_final_dims_fake_metadata <- function(merge_across_dims, split_multiselected_dims,
                                          picked_common_vars, across_inner_dim, final_dims_fake,
                                          dims_of_merge_dim, all_split_dims) {
aho's avatar
aho committed
  if (merge_across_dims) {
    if (!split_multiselected_dims) {
      final_dims_fake_metadata <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)]
    } else {
      final_dims_fake_metadata <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])]
    }
  } else if (split_multiselected_dims) {
    target_split_dim_ind <- which(names(dim(picked_common_vars)) == names(all_split_dims))
    margin_dim_ind <- c(1:length(dim(picked_common_vars)))[-target_split_dim_ind]
    if (identical(margin_dim_ind, numeric(0)) | identical(margin_dim_ind, integer(0))) {
      final_dims_fake_metadata <- all_split_dims[[1]]
    } else {
      final_dims_fake_metadata <- .ReplaceElementInVector(dim(picked_common_vars), target = names(all_split_dims), new_val = all_split_dims[[1]])
aho's avatar
aho committed
    }
  }
aho's avatar
aho committed
  return(final_dims_fake_metadata)
}

# Build the work pieces.
build_work_pieces <- function(work_pieces, i, selectors, file_dims, inner_dims, final_dims, 
                              found_pattern_dim, inner_dims_across_files, array_of_files_to_load,
                              array_of_not_found_files, array_of_metadata_flags,
                              metadata_file_counter, depending_file_dims, transform, 
                              transform_vars, picked_vars, picked_vars_ordered, picked_common_vars,
aho's avatar
aho committed
                              picked_common_vars_ordered, metadata_folder, debug = debug) {
  sub_array_dims <- final_dims[file_dims]
  sub_array_dims[found_pattern_dim] <- 1
  sub_array_of_files_to_load <- array(1:prod(sub_array_dims), 
                                      dim = sub_array_dims)
  names(dim(sub_array_of_files_to_load)) <- names(sub_array_dims)
  # Detect which of the dimensions of the dataset go across files.
  file_dim_across_files <- lapply(inner_dims, 
                                  function(x) {
                                    dim_across <- sapply(inner_dims_across_files, function(y) x %in% y)
                                    if (any(dim_across)) {
                                      names(inner_dims_across_files)[which(dim_across)[1]]
                                    } else {
                                      NULL
                                    }
                                  })
  names(file_dim_across_files) <- inner_dims
  j <- 1
  while (j <= prod(sub_array_dims)) {
    # Work out file path.
    file_to_load_sub_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ]
    names(file_to_load_sub_indices) <- names(sub_array_dims)
    file_to_load_sub_indices[found_pattern_dim] <- i
    big_dims <- rep(1, length(dim(array_of_files_to_load)))
    names(big_dims) <- names(dim(array_of_files_to_load))
    file_to_load_indices <- .MergeArrayDims(file_to_load_sub_indices, big_dims)[[1]]
    file_to_load <- do.call('[[', c(list(array_of_files_to_load), 
                            as.list(file_to_load_indices)))
    not_found_file <- do.call('[[', c(list(array_of_not_found_files),
                              as.list(file_to_load_indices)))
    load_file_metadata <- do.call('[', c(list(array_of_metadata_flags), 
                                  as.list(file_to_load_indices)))
    if (load_file_metadata) {
       metadata_file_counter <- metadata_file_counter + 1
       assign('metadata_file_counter', metadata_file_counter, envir = parent.frame())
    }
    if (!is.na(file_to_load) && !not_found_file) {
      # Work out indices to take
      first_round_indices <- lapply(inner_dims, 
                                    function (x) {
                                      if (is.null(file_dim_across_files[[x]])) {
                                        x_dim_name <- attr(attr(selectors[[x]][['fri']], "dim"), "names")
                                        if (!is.null(x_dim_name)) {
                                          which_chunk <- file_to_load_sub_indices[x_dim_name]
                                         if (length(which_chunk) > 1) {
                                            tmp_dim <- attr(selectors[[x]][['fri']], "dim")
                                            vec_ind <- which_chunk[1]
                                            for (i_dim in length(tmp_dim):2) {
                                              vec_ind <- vec_ind + (which_chunk[i_dim] - 1) * prod(tmp_dim[1:(i_dim - 1)])
                                            }
                                            selectors[[x]][['fri']][[vec_ind]]
                                          } else {  #old code
                                            selectors[[x]][['fri']][[which_chunk]]
                                          }
                                        } else {
                                          selectors[[x]][['fri']][[1]]
                                        }
                                      } else {
                                        which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]] 
                                        selectors[[x]][['fri']][[which_chunk]]
                                      }
                                    })
      names(first_round_indices) <- inner_dims
      second_round_indices <- lapply(inner_dims, 
                                     function (x) {
                                       if (is.null(file_dim_across_files[[x]])) {
                                         x_dim_name <- attr(attr(selectors[[x]][['sri']], "dim"), "names")
                                         if (!is.null(x_dim_name)) {
                                           which_chunk <- file_to_load_sub_indices[x_dim_name]
                                          if (length(which_chunk) > 1) {
                                            tmp_dim <- attr(selectors[[x]][['sri']], "dim")
                                            vec_ind <- which_chunk[1]
                                            for (i_dim in length(tmp_dim):2) {
                                              vec_ind <- vec_ind + (which_chunk[i_dim] - 1) * prod(tmp_dim[1:(i_dim - 1)])
                                            }
                                            selectors[[x]][['sri']][[vec_ind]]
                                          } else {  #old code
                                             selectors[[x]][['sri']][[which_chunk]]
                                          }
                                         } else {
                                           selectors[[x]][['sri']][[1]]
                                         }
                                       } else {
                                         which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]]
                                         selectors[[x]][['sri']][[which_chunk]]
                                       }
                                     })
      if (debug) {
        print("-> BUILDING A WORK PIECE")
        #print(str(selectors))
      }
      names(second_round_indices) <- inner_dims
      if (!any(sapply(first_round_indices, length) == 0)) {
        work_piece <- list()
        work_piece[['first_round_indices']] <- first_round_indices
        work_piece[['second_round_indices']] <- second_round_indices
        work_piece[['file_indices_in_array_of_files']] <- file_to_load_indices
        work_piece[['file_path']] <- file_to_load
        work_piece[['store_dims']] <- final_dims
        # Work out store position
        store_position <- final_dims
        store_position[names(file_to_load_indices)] <- file_to_load_indices
        store_position[inner_dims] <- rep(1, length(inner_dims))
        work_piece[['store_position']] <- store_position
        # Work out file selectors
        file_selectors <- sapply(file_dims, 
                                 function (x) {
                                   vector_to_pick <- 1
                                   if (x %in% names(depending_file_dims)) {
                                     vector_to_pick <- file_to_load_indices[depending_file_dims[[x]]]
                                   }
                                   selectors[file_dims][[x]][[vector_to_pick]][file_to_load_indices[x]]
                                 })
        names(file_selectors) <- file_dims
        work_piece[['file_selectors']] <- file_selectors
        # Send variables for transformation
        if (!is.null(transform) && (length(transform_vars) > 0)) {
          vars_to_transform <- NULL
          picked_vars_to_transform <- which(names(picked_vars) %in% transform_vars)
          if (length(picked_vars_to_transform) > 0) {
            picked_vars_to_transform <- names(picked_vars)[picked_vars_to_transform]
            vars_to_transform <- c(vars_to_transform, picked_vars[picked_vars_to_transform])
            if (any(picked_vars_to_transform %in% names(picked_vars_ordered))) {
              picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered))]
              vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[picked_vars_ordered_to_transform]
            }
          }
          picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars)
          if (length(picked_common_vars_to_transform) > 0) {
            picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform]
            vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform])
            if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) {
              picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))]
              vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform]
            }
          }
          work_piece[['vars_to_transform']] <- vars_to_transform
        }
        # Send flag to load metadata
        if (load_file_metadata) {
          work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter)
        }
        work_pieces <- c(work_pieces, list(work_piece))
      }