Start.R 240 KB
Newer Older
aho's avatar
aho committed
      dat[[i]][['name']] <- paste0('dat', i)
      if (!('path' %in% names(dat[[i]]))) {
        stop(paste0("Parameter '", found_pattern_dim, 
                    "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided."))
      }
    } else if (!('path' %in% names(dat[[i]]))) {
      dat_to_fetch <- c(dat_to_fetch, i)
    }
    #if ('path' %in% names(dat[[i]])) {
    #  if (!('nc_var_name' %in% names(dat[[i]]))) {
    #    dat[[i]][['nc_var_name']] <- '$var_name$'
    #  }
    #  if (!('suffix' %in% names(dat[[i]]))) {
    #    dat[[i]][['suffix']] <- ''
    #  }
    #  if (!('var_min' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_min']] <- ''
    #  }
    #  if (!('var_max' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_max']] <- ''
    #  }
    #}
    dat_names <- c(dat_names, dat[[i]][['name']])
  }
  if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) {
    .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.")
  }
  if (length(dat_to_fetch) > 0) {
    stop("Specified only the name for some data sets, but not the path ",
         "pattern. This option has not been yet implemented.")
  }
  
  # Reorder inner_dims_across_files (to make the keys be the file dimensions,
  # and the values to be the inner dimensions that go across it).
  if (!is.null(inner_dims_across_files)) {
    # Reorder: example, convert list(ftime = 'chunk', ensemble = 'member', xx = 'chunk')
    #                        to list(chunk = c('ftime', 'xx'), member = 'ensemble')
    new_idaf <- list()
    for (i in names(inner_dims_across_files)) {
      if (!(inner_dims_across_files[[i]] %in% names(new_idaf))) {
        new_idaf[[inner_dims_across_files[[i]]]] <- i
      } else {
        new_idaf[[inner_dims_across_files[[i]]]] <- c(new_idaf[[inner_dims_across_files[[i]]]], i)
      }
    }
    inner_dims_across_files <- new_idaf
  }
  
  # Check return_vars
  if (is.null(return_vars)) {
    return_vars <- list()
    #    if (length(var_params) > 0) {
    #      return_vars <- as.list(paste0(names(var_params), '_var'))
    #    } else {
    #      return_vars <- list()
    #    }
  }
  if (!is.list(return_vars)) {
    stop("Parameter 'return_vars' must be a list or NULL.")
  }
  if (length(return_vars) > 0 && is.null(names(return_vars))) {
    #    names(return_vars) <- rep('', length(return_vars))
    stop("Parameter 'return_vars' must be a named list.")
  }
  i <- 1
  while (i <= length(return_vars)) {
    #    if (names(return_vars)[i] == '') {
    #      if (!(is.character(return_vars[[i]]) && (length(return_vars[[i]]) == 1))) {
    #        stop("The ", i, "th specification in 'return_vars' is malformed.")
    #      } 
    #      if (!grepl('_var$', return_vars[[i]])) {
    #        stop("The ", i, "th specification in 'return_vars' is malformed.")
    #      }
    #      dim_name <- strsplit(return_vars[[i]], '_var$')[[1]][1]
    #      if (!(dim_name %in% names(var_params))) {
    #        stop("'", dim_name, "_var' requested in 'return_vars' but ",
    #             "no '", dim_name, "_var' specified in the .Load call.")
    #      }
    #      names(return_vars)[i] <- var_params[[dim_name]]
    #      return_vars[[i]] <- found_pattern_dim
    #    } else
    if (length(return_vars[[i]]) > 0) { 
      if (!is.character(return_vars[[i]])) {
        stop("The ", i, "th specification in 'return_vars' is malformed. It ",
             "must be a vector of character strings of valid file dimension ",
             "names.")
      }
    }
    i <- i + 1
  }
  
  # Check synonims
  if (!is.null(synonims)) {
    error <- FALSE
    if (!is.list(synonims)) {
      error <- TRUE
    }
    for (synonim_entry in names(synonims)) {
      if (!(synonim_entry %in% names(dim_params)) &&
          !(synonim_entry %in% names(return_vars))) {
        error <- TRUE
      }
      if (!is.character(synonims[[synonim_entry]]) ||
          length(synonims[[synonim_entry]]) < 1) {
        error <- TRUE
      }
    }
    if (error) {
      stop("Parameter 'synonims' must be a named list, where the names are ",
           "a name of a requested dimension or variable and the values are ",
           "vectors of character strings with at least one alternative name ",
           " for each dimension or variable in 'synonims'.")
    }
  }
  if (length(unique(names(synonims))) < length(names(synonims))) {
    stop("There must not be repeated entries in 'synonims'.")
  }
  if (length(unique(unlist(synonims))) < length(unlist(synonims))) {
    stop("There must not be repeated values in 'synonims'.")
  }
  # Make that all dims and vars have an entry in synonims, even if only dim_name = dim_name
  dim_entries_to_add <- which(!(names(dim_params) %in% names(synonims)))
  if (length(dim_entries_to_add) > 0) {
    synonims[names(dim_params)[dim_entries_to_add]] <- as.list(names(dim_params)[dim_entries_to_add])
  }
  var_entries_to_add <- which(!(names(var_params) %in% names(synonims)))
  if (length(var_entries_to_add) > 0) {
    synonims[names(var_params)[var_entries_to_add]] <- as.list(names(var_params)[var_entries_to_add])
  }
  
  # Check selector_checker
  if (is.null(selector_checker) || !is.function(selector_checker)) {
    stop("Parameter 'selector_checker' must be a function.")
  }
  
  # Check file_opener
  if (is.null(file_opener) || !is.function(file_opener)) {
    stop("Parameter 'file_opener' must be a function.")
  }
  
  # Check file_var_reader
  if (!is.null(file_var_reader) && !is.function(file_var_reader)) {
    stop("Parameter 'file_var_reader' must be a function.")
  }
  
  # Check file_dim_reader
  if (!is.null(file_dim_reader) && !is.function(file_dim_reader)) {
    stop("Parameter 'file_dim_reader' must be a function.")
  }
  
  # Check file_data_reader
  if (is.null(file_data_reader) || !is.function(file_data_reader)) {
    stop("Parameter 'file_data_reader' must be a function.")
  }
  
  # Check file_closer
  if (is.null(file_closer) || !is.function(file_closer)) {
    stop("Parameter 'file_closer' must be a function.")
  }
  
  # Check transform
  if (!is.null(transform)) {
    if (!is.function(transform)) {
      stop("Parameter 'transform' must be a function.")
    }
  }
  
  # Check transform_params
  if (!is.null(transform_params)) {
    if (!is.list(transform_params)) {
      stop("Parameter 'transform_params' must be a list.")
    }
    if (is.null(names(transform_params))) {
      stop("Parameter 'transform_params' must be a named list.")
    }
  }
  
  # Check transform_vars
  if (!is.null(transform_vars)) {
    if (!is.character(transform_vars)) {
      stop("Parameter 'transform_vars' must be a vector of character strings.")
    }
  }
  if (any(!(transform_vars %in% names(return_vars)))) {
    stop("All the variables specified in 'transform_vars' must also be specified in 'return_vars'.")
  }
  
  # Check apply_indices_after_transform
  if (!is.logical(apply_indices_after_transform)) {
    stop("Parameter 'apply_indices_after_transform' must be either TRUE or FALSE.")
  }
  aiat <- apply_indices_after_transform
  
  # Check transform_extra_cells
  if (!is.numeric(transform_extra_cells)) {
    stop("Parameter 'transform_extra_cells' must be numeric.")
  }
  transform_extra_cells <- round(transform_extra_cells)
  
  # Check split_multiselected_dims
  if (!is.logical(split_multiselected_dims)) {
    stop("Parameter 'split_multiselected_dims' must be TRUE or FALSE.")
  }
  
  # Check path_glob_permissive
  if (!is.numeric(path_glob_permissive) && !is.logical(path_glob_permissive)) {
    stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or an integer.")
  }
  if (length(path_glob_permissive) != 1) {
    stop("Parameter 'path_glob_permissive' must be of length 1.")
  }
  
  # Check largest_dims_length
  if (!is.numeric(largest_dims_length) && !is.logical(largest_dims_length)) {
    stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
  }
  if (is.numeric(largest_dims_length)) {
    if (any(largest_dims_length %% 1 != 0) | any(largest_dims_length < 0) | is.null(names(largest_dims_length))) {
      stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
    }
  }
  if (is.logical(largest_dims_length) && length(largest_dims_length) != 1) {
    stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or a named integer vector.")
  }

aho's avatar
aho committed
  # Check retrieve
  if (!is.logical(retrieve)) {
    stop("Parameter 'retrieve' must be TRUE or FALSE.")
  }
  
  # Check num_procs
  if (!is.null(num_procs)) {
    if (!is.numeric(num_procs)) {
      stop("Parameter 'num_procs' must be numeric.")
    } else {
      num_procs <- round(num_procs)
    }
  }
  
  # Check silent
  if (!is.logical(silent)) {
    stop("Parameter 'silent' must be logical.")
  }
  
  dim_params[[found_pattern_dim]] <- dat_names
  
  if (!silent) {
    .message(paste0("Exploring files... This will take a variable amount ",
                    "of time depending on the issued request and the ",
                    "performance of the file server..."))
  }
  
  if (!is.character(debug)) {
    dims_to_check <- c('time')
  } else {
    dims_to_check <- debug
    debug <- TRUE
  }
  
  ############################## READING FILE DIMS ############################
  # Check that no unrecognized variables are present in the path patterns
  # and also that no file dimensions are requested to THREDDs catalogs.
  # And in the mean time, build all the work pieces and look for the 
  # first available file of each dataset.
  array_of_files_to_load <- NULL
  array_of_not_found_files <- NULL
  indices_of_first_files_with_data <- vector('list', length(dat))
  selectors_of_first_files_with_data <- vector('list', length(dat))
  dataset_has_files <- rep(FALSE, length(dat))
  found_file_dims <- vector('list', length(dat))
  expected_inner_dims <- vector('list', length(dat))
  
  #print("A")
  for (i in 1:length(dat)) {
    #print("B")
    dat_selectors <- dim_params
    dat_selectors[[found_pattern_dim]] <- dat_selectors[[found_pattern_dim]][i]
    dim_vars <- paste0('$', dim_names, '$')
    file_dims <- which(sapply(dim_vars, grepl, dat[[i]][['path']], fixed = TRUE))
    if (length(file_dims) > 0) {
      file_dims <- dim_names[file_dims]
    }
    file_dims <- unique(c(pattern_dims, file_dims))
    found_file_dims[[i]] <- file_dims
    expected_inner_dims[[i]] <- dim_names[which(!(dim_names %in% file_dims))]
    # (Check the depending_file_dims).
    if (any(c(names(depending_file_dims), unlist(depending_file_dims)) %in% 
            expected_inner_dims[[i]])) {
      stop(paste0("The dimension dependancies specified in ",
                  "'depending_file_dims' can only be between file ",
                  "dimensions, but some inner dimensions found in ",
                  "dependancies for '", dat[[i]][['name']], "', which ",
                  "has the following file dimensions: ", 
                  paste(paste0("'", file_dims, "'"), collapse = ', '), ".")) 
    } else {
      a <- names(depending_file_dims) %in% file_dims
      b <- unlist(depending_file_dims) %in% file_dims
      ab <- a & b
      if (any(!ab)) {
        .warning(paste0("Detected some dependancies in 'depending_file_dims' with ",
                        "non-existing dimension names. These will be disregarded."))
        depending_file_dims <- depending_file_dims[-which(!ab)]
      }
      if (any(names(depending_file_dims) == unlist(depending_file_dims))) {
        depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == unlist(depending_file_dims))]
      }
    }
    # (Check the inner_dims_across_files).
    if (any(!(names(inner_dims_across_files) %in% file_dims)) ||
        any(!(unlist(inner_dims_across_files) %in% expected_inner_dims[[i]]))) {
      stop(paste0("All relationships specified in ",
                  "'_across' parameters must be between a inner ",
                  "dimension and a file dimension. Found wrong ",
                  "specification for '", dat[[i]][['name']], "', which ",
                  "has the following file dimensions: ", 
                  paste(paste0("'", file_dims, "'"), collapse = ', '),
                  ", and the following inner dimensions: ", 
                  paste(paste0("'", expected_inner_dims[[i]], "'"), 
                        collapse = ', '), "."))
    }
    # (Check the return_vars).
    j <- 1
    while (j <= length(return_vars)) {
      if (any(!(return_vars[[j]] %in% file_dims))) {
        if (any(return_vars[[j]] %in% expected_inner_dims[[i]])) {
          stop("Found variables in 'return_vars' requested ",
               "for some inner dimensions (for dataset '",
               dat[[i]][['name']], "'), but variables can only be ",
               "requested for file dimensions.")
        } else {
          stop("Found variables in 'return_vars' requested ",
               "for non-existing dimensions.")
        }
      }
      j <- j + 1
    }
    # (Check the metadata_dims).
    if (!is.null(metadata_dims)) {
      if (any(!(metadata_dims %in% file_dims))) {
        stop("All dimensions in 'metadata_dims' must be file dimensions.")
      }
    }
    ## Look for _var params that should be requested automatically.
    for (dim_name in dim_names) {
      if (!(dim_name %in% pattern_dims)) {
        if (is.null(attr(dat_selectors[[dim_name]], 'values')) ||
            is.null(attr(dat_selectors[[dim_name]], 'indices'))) {
          flag <- ((any(dat_selectors[[dim_name]] %in% c('all', 'first', 'last'))) || 
                     (is.numeric(unlist(dat_selectors[[dim_name]]))))
          attr(dat_selectors[[dim_name]], 'values') <- !flag
          attr(dat_selectors[[dim_name]], 'indices') <- flag
        }
        ## The following code 'rewrites' var_params for all datasets. If providing different
        ## path pattern repositories with different file/inner dimensions, var_params might
        ## have to be handled for each dataset separately.
        if ((attr(dat_selectors[[dim_name]], 'values') || (dim_name %in% c('var', 'variable'))) &&
            !(dim_name %in% names(var_params)) && !(dim_name %in% file_dims))  {
          if (dim_name %in% c('var', 'variable')) {
            var_params <- c(var_params, setNames(list('var_names'), dim_name))
            .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", 
                            dim_name, "_var' requested. ", '"', dim_name, "_var = '", 
                            'var_names', "'", '"', " has been automatically added to ",
                            "the Start call."))
          } else {
            var_params <- c(var_params, setNames(list(dim_name), dim_name))
            .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", 
                            dim_name, "_var' requested. ", '"', dim_name, "_var = '", 
                            dim_name, "'", '"', " has been automatically added to ",
                            "the Start call."))
          }
        }
      }
    }
#==================================================
    # Check if return_vars name is inner dim name. If it is synonim, change back to
    # inner dim name and return a warning.
    dim_params_found_file_dims <- dim_params[found_file_dims[[i]]]
    if (any(names(return_vars) %in% unlist(synonims) &
            !names(return_vars) %in% names(synonims))) {
      use_syn_names <- which(names(return_vars) %in% unlist(synonims) & 
                             !names(return_vars) %in% names(synonims))
      for (use_syn_name in use_syn_names) {
        wrong_name <- names(return_vars)[use_syn_name]
        names(return_vars)[use_syn_name] <- names(unlist(
                                              lapply(lapply(synonims, '%in%',
                                                     names(return_vars)[use_syn_name]),
                                              which)))
        .warning(paste0("The name '", wrong_name,
                        "' in parameter 'return_vars' in synonim. ",
                        "Change it back to the inner dimension name, '",
                        names(return_vars)[use_syn_name], "'."))
#=====================================================
aho's avatar
aho committed
    ## (Check the *_var parameters).
    if (any(!(unlist(var_params) %in% names(return_vars)))) {
      vars_to_add <- which(!(unlist(var_params) %in% names(return_vars)))
      new_return_vars <- vector('list', length(vars_to_add))
      names(new_return_vars) <- unlist(var_params)[vars_to_add]
      return_vars <- c(return_vars, new_return_vars)
      .warning(paste0("All '*_var' params must associate a dimension to one of the ",
                      "requested variables in 'return_vars'. The following variables",
                      " have been added to 'return_vars': ", 
                      paste(paste0("'", unlist(var_params), "'"), collapse = ', ')))
    }
    
    replace_values <- vector('list', length = length(file_dims))
    names(replace_values) <- file_dims
    # Take the first selector for all possible file dimensions
    for (file_dim in file_dims) {
      if (file_dim %in% names(var_params)) {
        .warning(paste0("The '", file_dim, "_var' param will be ignored since '", 
                        file_dim, "' is a file dimension (for the dataset with pattern ", 
                        dat[[i]][['path']], ")."))
      }
      if (!is.list(dat_selectors[[file_dim]]) || 
          (is.list(dat_selectors[[file_dim]]) && 
           length(dat_selectors[[file_dim]]) == 2 &&
           is.null(names(dat_selectors[[file_dim]])))) {
        dat_selectors[[file_dim]] <- list(dat_selectors[[file_dim]])
      }
      first_class <- class(dat_selectors[[file_dim]][[1]])
      first_length <- length(dat_selectors[[file_dim]][[1]])
      for (j in 1:length(dat_selectors[[file_dim]])) {
        sv <- selector_vector <- dat_selectors[[file_dim]][[j]]
        if (!identical(first_class, class(sv)) ||
            !identical(first_length, length(sv))) {
          stop("All provided selectors for depending dimensions must ",
               "be vectors of the same length and of the same class.")
        }
        if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) {
          dat_selectors[[file_dim]][[j]] <- selector_checker(selectors = sv,
                                                             return_indices = FALSE)
          # Take chunk if needed
          dat_selectors[[file_dim]][[j]] <- dat_selectors[[file_dim]][[j]][chunk_indices(length(dat_selectors[[file_dim]][[j]]),
                                                                                         chunks[[file_dim]]['chunk'],
                                                                                         chunks[[file_dim]]['n_chunks'], 
                                                                                         file_dim)]
        } else if (!(is.numeric(sv) || 
                     (is.character(sv) && (length(sv) == 1) && (sv %in% c('all', 'first', 'last'))) || 
                     (is.list(sv) && (length(sv) == 2) && (all(sapply(sv, is.character)) || 
                                                           all(sapply(sv, is.numeric)))))) {
          stop("All explicitly provided selectors for file dimensions must be character strings.")
        } 
      }
      sv <- dat_selectors[[file_dim]][[1]]
      if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) {
        replace_values[[file_dim]] <- dat_selectors[[file_dim]][[1]][1]
      }
    }
    #print("C")
    # Now we know which dimensions whose selectors are provided non-explicitly.
    undefined_file_dims <- file_dims[which(sapply(replace_values, is.null))]
    defined_file_dims <- file_dims[which(!(file_dims %in% undefined_file_dims))]
    # Quickly check if the depending dimensions are provided properly.
    for (file_dim in file_dims) {
      if (file_dim %in% names(depending_file_dims)) {
        ## TODO: Detect multi-dependancies and forbid.
        #NOTE: The if statement below is tricky. It tries to distinguish if the depending dim
        #      has the depended dim as the names of the list. However, if the depending dim
        #      doesn't have list names and its length is 2 (i.e., list( , )), Start() thinks
        #      it means the range, just like `lat = values(list(10, 20))`. And because of this,
        #      we won't enter the following if statement, and the error will occur later in
        #      SelectorChecker(). Need to find a way to distinguish if list( , ) means range or
        #      just the values.
aho's avatar
aho committed
        if (all(c(file_dim, depending_file_dims[[file_dim]]) %in% defined_file_dims)) {
          if (length(dat_selectors[[file_dim]]) != length(dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) {
            stop(paste0("If providing selectors for the depending ",
                        "dimension '", file_dim, "', a ",
                        "vector of selectors must be provided for ",
                        "each selector of the dimension it depends on, '",
                        depending_file_dims[[file_dim]], "'."))
          } else if (!all(names(dat_selectors[[file_dim]]) == dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) {
            stop(paste0("If providing selectors for the depending ",
                        "dimension '", file_dim, "', the name of the ",
                        "provided vectors of selectors must match ",
                        "exactly the selectors of the dimension it ",
                        "depends on, '", depending_file_dims[[file_dim]], "'."))
          } else if (is.null(names(dat_selectors[[file_dim]]))) {
            .warning(paste0("The selectors for the depending dimension '", file_dim, "' do not ",
                            "have list names. Assume that the order of the selectors matches the ",
                            "depended dimensions '", depending_file_dims[[file_dim]], "''s order."))
aho's avatar
aho committed
          }
        }
      }
    }
    # Find the possible values for the selectors that are provided as
    # indices. If the requested file is on server, impossible operation.
    if (length(grep("^http", dat[[i]][['path']])) > 0) {
      if (length(undefined_file_dims) > 0) {
        stop(paste0("All selectors for the file dimensions must be ",
                    "character strings if requesting data to a remote ",
                    "server. Found invalid selectors for the file dimensions ",
                    paste(paste0("'", undefined_file_dims, "'"), collapse = ', '), "."))
      }
      dataset_has_files[i] <- TRUE
    } else {
      dat[[i]][['path']] <- path.expand(dat[[i]][['path']])
      # Iterate over the known dimensions to find the first existing file.
      # The path to the first existing file will be used to find the 
      # values for the non explicitly defined selectors.
      first_file <- NULL
      first_file_selectors <- NULL
      if (length(undefined_file_dims) > 0) {
        replace_values[undefined_file_dims] <- '*'
      }
      ## TODO: What if length of defined_file_dims is 0? code might crash (in practice it worked for an example case)
      files_to_check <- sapply(dat_selectors[defined_file_dims], function(x) length(x[[1]]))
      sub_array_of_files_to_check <- array(1:prod(files_to_check), dim = files_to_check)
      j <- 1
      #print("D")
      while (j <= prod(files_to_check) && is.null(first_file)) {
        selector_indices <- which(sub_array_of_files_to_check == j, arr.ind = TRUE)[1, ]
        selectors <- sapply(1:length(defined_file_dims), 
                            function (x) {
                              vector_to_pick <- 1
                              if (defined_file_dims[x] %in% names(depending_file_dims)) {
                                vector_to_pick <- selector_indices[which(defined_file_dims == depending_file_dims[[defined_file_dims[x]]])]
                              }
                              dat_selectors[defined_file_dims][[x]][[vector_to_pick]][selector_indices[x]]
                            })
        replace_values[defined_file_dims] <- selectors
        file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
        file_path <- Sys.glob(file_path)
        if (length(file_path) > 0) {
          first_file <- file_path[1]
          first_file_selectors <- selectors
        }
        j <- j + 1
      }
      #print("E")
      # Start looking for values for the non-explicitly defined selectors.
      if (is.null(first_file)) {
        .warning(paste0("No found files for the datset '", dat[[i]][['name']], 
                        "'. Provide existing selectors for the file dimensions ",
                        " or check and correct its path pattern: ", dat[[i]][['path']]))
      } else {
        dataset_has_files[i] <- TRUE
        ## TODO: Improve message here if no variable found:
        if (length(undefined_file_dims) > 0) {
          # Looking for the first values, parsed from first_file.
          first_values <- vector('list', length = length(undefined_file_dims))
          names(first_values) <- undefined_file_dims
          found_values <- 0
          stop <- FALSE
          try_dim <- 1
          last_success <- 1
          while ((found_values < length(undefined_file_dims)) && !stop) {
            u_file_dim <- undefined_file_dims[try_dim]
            if (is.null(first_values[[u_file_dim]])) {
              path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], 
                                                                   replace_values[-which(file_dims == u_file_dim)],
                                                                   allow_undefined_key_vars = TRUE)
              found_value <- .FindTagValue(path_with_globs_and_tag, 
                                           first_file, u_file_dim)
              if (!is.null(found_value)) {
                found_values <- found_values + 1
                last_success <- try_dim
                first_values[[u_file_dim]] <- found_value
                replace_values[[u_file_dim]] <- found_value
              }
            }
            try_dim <- (try_dim %% length(undefined_file_dims)) + 1
            if (try_dim == last_success) {
              stop <- TRUE
            }
          }
          if (found_values < length(undefined_file_dims)) {
            stop(paste0("Path pattern of dataset '", dat[[i]][['name']], 
                        "' is too complex. Could not automatically ",
                        "detect values for all non-explicitly defined ",
                        "indices. Check its pattern: ", dat[[i]][['path']]))
          }
          ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above
          ##       Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical
          ##       cmor case, requesting all members and chunks for fixed var and sdate. Not fixing
          ##       sdate raised 'too complex' error.
          # Replace shell globs in path pattern and keep the file_dims as tags
          dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, 
                                                        file_dims, dat[[i]][['name']], path_glob_permissive)
          # Now time to look for the available values for the non 
          # explicitly defined selectors for the file dimensions.
          #print("H")
          # Check first the ones that do not depend on others.
          ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))],
                   undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))])
          
          for (u_file_dim in ufd) {
            replace_values[undefined_file_dims] <- first_values
            replace_values[[u_file_dim]] <- '*'
            depended_dim <- NULL
            depended_dim_values <- NA
aho's avatar
aho committed

#NOTE: Here 'selectors' is always 1. Is it supposed to be like this?
aho's avatar
aho committed
            selectors <- dat_selectors[[u_file_dim]][[1]]
            if (u_file_dim %in% names(depending_file_dims)) {
              depended_dim <- depending_file_dims[[u_file_dim]]
              depended_dim_values <- dat_selectors[[depended_dim]][[1]]
              dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values))
              names(dat_selectors[[u_file_dim]]) <- depended_dim_values
            } else {
              dat_selectors[[u_file_dim]] <- list()
            }
            if (u_file_dim %in% unlist(depending_file_dims)) {
              depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))]
              replace_values[depending_dims] <- rep('*', length(depending_dims))
            }
            for (j in 1:length(depended_dim_values)) {
              parsed_values <- c()
              if (!is.null(depended_dim)) {
                replace_values[[depended_dim]] <- depended_dim_values[j]
              }
              path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
              found_files <- Sys.glob(path_with_globs)
              ## TODO: Enhance this error message, or change by warning.
              ##       Raises if a wrong sdate is specified, for example.
              if (length(found_files) == 0) {
                .warning(paste0("Could not find files for any '", u_file_dim, 
                                "' for '", depended_dim, "' = '", 
                                depended_dim_values[j], "'."))
                dat_selectors[[u_file_dim]][[j]] <- NA
              } else {
                for (found_file in found_files) {
                  path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']],
                                                                       replace_values[-which(file_dims == u_file_dim)],
                                                                       allow_undefined_key_vars = TRUE)
                  parsed_values <- c(parsed_values, 
                                     .FindTagValue(path_with_globs_and_tag, found_file, 
                                                   u_file_dim))
                }
aho's avatar
aho committed
                #TODO: selector_checker() doesn't allow selectors to be characters. For selectors
                #      like "member = 'r7i1p1f1", it cannot be defined with values.
aho's avatar
aho committed
                dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors,
                                                                     var = unique(parsed_values),
                                                                     return_indices = FALSE)
                # Take chunk if needed
                dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][chunk_indices(length(dat_selectors[[u_file_dim]][[j]]),
                                                                                                   chunks[[u_file_dim]]['chunk'],
                                                                                                   chunks[[u_file_dim]]['n_chunks'], 
                                                                                                   u_file_dim)]
              }
            }
          }
          #print("I")
        } else {
aho's avatar
aho committed

          #NOTE: If there is no non-explicitly defined dim, use the first found file
          #      to modify. Problem: '*' doesn't catch all the possible file. Only use
          #      the first file.
aho's avatar
aho committed
          dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, 
                                                        defined_file_dims, dat[[i]][['name']], path_glob_permissive)
        }
      }
    }
    # Now fetch for the first available file
    if (dataset_has_files[i]) {
      known_dims <- file_dims
    } else {
      known_dims <- defined_file_dims
    }
    replace_values <- vector('list', length = length(known_dims))
    names(replace_values) <- known_dims
    files_to_load <- sapply(dat_selectors[known_dims], function(x) length(x[[1]]))
    files_to_load[found_pattern_dim] <- 1
    sub_array_of_files_to_load <- array(1:prod(files_to_load), 
                                        dim = files_to_load)
    names(dim(sub_array_of_files_to_load)) <- known_dims
    sub_array_of_not_found_files <- array(!dataset_has_files[i], 
                                          dim = files_to_load)
    names(dim(sub_array_of_not_found_files)) <- known_dims
    j <- 1
aho's avatar
aho committed
    if (!exists('selector_indices_save')) {
      selector_indices_save <- vector('list', length = length(dat))
    }
    if (!exists('selectors_total_list')) {
      selectors_total_list <- vector('list', length = length(dat))
    }
    selector_indices_save[[i]] <- vector('list', length = prod(files_to_load))
    selectors_total_list[[i]] <- vector('list', length = prod(files_to_load))
aho's avatar
aho committed
    while (j <= prod(files_to_load)) {
      selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ]
      names(selector_indices) <- known_dims
aho's avatar
aho committed

      tmp <- selector_indices
      tmp[which(known_dims == found_pattern_dim)] <- i
      selector_indices_save[[i]][[j]] <- tmp

aho's avatar
aho committed
      selectors <- sapply(1:length(known_dims), 
                          function (x) {
                            vector_to_pick <- 1
                            if (known_dims[x] %in% names(depending_file_dims)) {
                              vector_to_pick <- selector_indices[which(known_dims == depending_file_dims[[known_dims[x]]])]
                            }
                            dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]]
                          })
      names(selectors) <- known_dims
aho's avatar
aho committed
      selectors_total_list[[i]][[j]] <- selectors
      names(selectors_total_list[[i]][[j]]) <- known_dims
aho's avatar
aho committed
      replace_values[known_dims] <- selectors
      if (!dataset_has_files[i]) {
        if (any(is.na(selectors))) {
          replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))]
        }
        file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE)
        sub_array_of_files_to_load[j] <- file_path
        #sub_array_of_not_found_files[j] <- TRUE???
      } else {
        if (any(is.na(selectors))) {
          replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))]
          file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE)
          sub_array_of_files_to_load[j] <- file_path
          sub_array_of_not_found_files[j] <- TRUE
        } else {
          file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
aho's avatar
aho committed

#NOTE: After replacing tags, there is still * if path_glob_permissive is not FALSE.
          if (grepl('\\*', file_path)) {
            found_files <- Sys.glob(file_path)
            file_path <- found_files[1]   # choose only the first file.
#NOTE: Above line chooses only the first found file. Because * is not tags, which means
#      it is not a dimension. So it cannot store more than one item. If use * to define
#      the path, that * should only represent one possibility.
            if (length(found_files) > 1) {
              .warning("Using glob expression * to define the path, but more ",
                       "than one match is found. Choose the first match only.")
            }
          }

aho's avatar
aho committed
          if (!(length(grep("^http", file_path)) > 0)) {
            if (grepl(file_path, '*', fixed = TRUE)) {
              file_path_full <- Sys.glob(file_path)[1]
              if (nchar(file_path_full) > 0) {
                file_path <- file_path_full
              }
            }
          }
          sub_array_of_files_to_load[j] <- file_path
          if (is.null(indices_of_first_files_with_data[[i]])) {
            if (!(length(grep("^http", file_path)) > 0)) {
              if (!file.exists(file_path)) {
                file_path <- NULL
              }
            }
            if (!is.null(file_path)) {
              test_file <- NULL
              ## TODO: suppress error messages
              test_file <- file_opener(file_path)
              if (!is.null(test_file)) {
                selector_indices[which(known_dims == found_pattern_dim)] <- i
                indices_of_first_files_with_data[[i]] <- selector_indices
                selectors_of_first_files_with_data[[i]] <- selectors
                file_closer(test_file)
              }
            }
          }
        }
      }
      j <- j + 1
    }
    # Extend array as needed progressively
    if (is.null(array_of_files_to_load)) {
      array_of_files_to_load <- sub_array_of_files_to_load
      array_of_not_found_files <- sub_array_of_not_found_files
    } else {
      array_of_files_to_load <- .MergeArrays(array_of_files_to_load, sub_array_of_files_to_load,
                                             along = found_pattern_dim)
      ## TODO: file_dims, and variables like that.. are still ok now? I don't think so
      array_of_not_found_files <- .MergeArrays(array_of_not_found_files, sub_array_of_not_found_files,
                                               along = found_pattern_dim)
    }    
    dat[[i]][['selectors']] <- dat_selectors
  }
  if (all(sapply(indices_of_first_files_with_data, is.null))) {
    stop("No data files found for any of the specified datasets.")
  }
  
  ########################### READING INNER DIMS. #############################
  #print("J")
  ## TODO: To be run in parallel (local multi-core)
  # Now time to work out the inner file dimensions.
  # First pick the requested variables.

#//// This part is moved below the new code////
# NOTE: 'dims_to_iterate' only consider common_return_vars. Move to below can save some work 
#       and get the revised common_return_vars if it is changed.
#  dims_to_iterate <- NULL
#  for (return_var in names(return_vars)) {
#    dims_to_iterate <- unique(c(dims_to_iterate, return_vars[[return_var]]))
#  }
#  if (found_pattern_dim %in% dims_to_iterate) {
#    dims_to_iterate <- dims_to_iterate[-which(dims_to_iterate == found_pattern_dim)]
#  }
#//////////////////////////////////////////////

aho's avatar
aho committed
  common_return_vars <- NULL
  common_first_found_file <- NULL
  common_return_vars_pos <- NULL
  if (length(return_vars) > 0) {
    common_return_vars_pos <- which(sapply(return_vars, function(x) !(found_pattern_dim %in% x)))
  }
  if (length(common_return_vars_pos) > 0) {
    common_return_vars <- return_vars[common_return_vars_pos]
    return_vars <- return_vars[-common_return_vars_pos]
    common_first_found_file <- rep(FALSE, length(which(sapply(common_return_vars, length) == 0)))
    names(common_first_found_file) <- names(common_return_vars[which(sapply(common_return_vars, length) == 0)])
  }
  return_vars <- lapply(return_vars, 
                        function(x) {
                          if (found_pattern_dim %in% x) {
                            x[-which(x == found_pattern_dim)]
                          } else {
                            x
                          }
                        })
 
#////////////////////////////////////////////
  # Force return_vars = (time = NULL) to (time = 'sdate') if (1) selector = [sdate = 2, time = 4] or
  # (2) time_across = 'sdate'.
  # NOTE: Here is not in for loop of dat[[i]]
  for (i in 1:length(dat)) {
    for (inner_dim in expected_inner_dims[[i]]) {
      # The selectors for the inner dimension are taken.
      selector_array <- dat[[i]][['selectors']][[inner_dim]]
      file_dim_as_selector_array_dim <- 1

      if (any(found_file_dims[[i]] %in% names(dim(selector_array)))) {
        file_dim_as_selector_array_dim <- found_file_dims[[i]][which(found_file_dims[[i]] %in% names(dim(selector_array)))]
      }

      if (inner_dim %in% inner_dims_across_files | is.character(file_dim_as_selector_array_dim)) {  #(2) or (1)
        # inner_dim is not in return_vars or is NULL
        if (((!inner_dim %in% names(common_return_vars)) & (!inner_dim %in% names(return_vars))) |
            (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]])) ) {
          if (is.character(file_dim_as_selector_array_dim)) { #(1)
            if (file_dim_as_selector_array_dim %in% found_pattern_dim) {
              stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '",
                          found_pattern_dim, "', which is not allowed. To assign the dependency on the pattern dim, ",
                          "use 'return_vars = list(", inner_dim, " = 'dat')' instead."))
            } else {
              common_return_vars[[inner_dim]] <- file_dim_as_selector_array_dim
              tmp <- file_dim_as_selector_array_dim
            tmp <- file_dim_as_selector_array_dim
          } else if (inner_dim %in% inner_dims_across_files) { #(2)
            file_dim_name <- names(which(inner_dim == inner_dims_across_files))
            if (file_dim_name %in% found_pattern_dim) {
              stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '",
                          found_pattern_dim, "', which is not allowed."))
            } else {
              common_return_vars[[inner_dim]] <- file_dim_name
          }
          .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", tmp,
                          "', but '", inner_dim, "' is not in return_vars list or is NULL. ",
                          "To provide the correct metadata, the value of ", inner_dim,
                          " in 'return_vars' is specified as '", tmp, "'."))
        }
      }
    }
  }
#////////////////////////////////////////////

# This part was above where return_vars is seperated into return_vars and common_return_vars
# NOTE: 'dims_to_iterate' only consider common_return_vars. Move to here can save some work 
#       and get the revised common_return_vars if it is changed in the part right above.
  dims_to_iterate <- NULL
  for (common_return_var in names(common_return_vars)) {
    dims_to_iterate <- unique(c(dims_to_iterate, common_return_vars[[common_return_var]]))
  }
#////////////////////////////////////////////

aho's avatar
aho committed
  if (length(common_return_vars) > 0) {
    picked_common_vars <- vector('list', length = length(common_return_vars))
    names(picked_common_vars) <- names(common_return_vars)
  } else {
    picked_common_vars <- NULL
  }
  picked_common_vars_ordered <- picked_common_vars
  picked_common_vars_unorder_indices <- picked_common_vars
  picked_vars <- vector('list', length = length(dat))
  names(picked_vars) <- dat_names
  picked_vars_ordered <- picked_vars
  picked_vars_unorder_indices <- picked_vars
  for (i in 1:length(dat)) {
    if (dataset_has_files[i]) {
      # Put all selectors in a list of a single list/vector of selectors. 
      # The dimensions that go across files will later be extended to have 
      # lists of lists/vectors of selectors.
      for (inner_dim in expected_inner_dims[[i]]) {
        if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || 
            (is.list(dat[[i]][['selectors']][[inner_dim]]) && 
             length(dat[[i]][['selectors']][[inner_dim]]) == 2 &&
             is.null(names(dat[[i]][['selectors']][[inner_dim]])))) {
          dat[[i]][['selectors']][[inner_dim]] <- list(dat[[i]][['selectors']][[inner_dim]])
        }
      }
      if (length(return_vars) > 0) {
        picked_vars[[i]] <- vector('list', length = length(return_vars))
        names(picked_vars[[i]]) <- names(return_vars)
        picked_vars_ordered[[i]] <- picked_vars[[i]]
        picked_vars_unorder_indices[[i]] <- picked_vars[[i]]
      }
      indices_of_first_file <- as.list(indices_of_first_files_with_data[[i]])
      array_file_dims <- sapply(dat[[i]][['selectors']][found_file_dims[[i]]], function(x) length(x[[1]]))
      names(array_file_dims) <- found_file_dims[[i]]
      if (length(dims_to_iterate) > 0) {
        indices_of_first_file[dims_to_iterate] <- lapply(array_file_dims[dims_to_iterate], function(x) 1:x)
      }
      array_of_var_files <- do.call('[', c(list(x = array_of_files_to_load), indices_of_first_file, list(drop = FALSE)))
      array_of_var_indices <- array(1:length(array_of_var_files), dim = dim(array_of_var_files))
      array_of_not_found_var_files <- do.call('[', c(list(x = array_of_not_found_files), indices_of_first_file, list(drop = FALSE)))
      previous_indices <- rep(-1, length(indices_of_first_file))
      names(previous_indices) <- names(indices_of_first_file)
      first_found_file <- NULL
      if (length(return_vars) > 0) {
        first_found_file <- rep(FALSE, length(which(sapply(return_vars, length) == 0)))
        names(first_found_file) <- names(return_vars[which(sapply(return_vars, length) == 0)])
      }
      for (j in 1:length(array_of_var_files)) {
        current_indices <- which(array_of_var_indices == j, arr.ind = TRUE)[1, ]
        names(current_indices) <- names(indices_of_first_file)
        if (!is.na(array_of_var_files[j]) && !array_of_not_found_var_files[j]) {
          changed_dims <- which(current_indices != previous_indices)
          vars_to_read <- NULL
          if (length(return_vars) > 0) {
            vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))]
          }
          if (!is.null(first_found_file)) {
            if (any(!first_found_file)) {
              vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)]))
            }
          }
          if ((i == 1) && (length(common_return_vars) > 0)) {
            vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))])
          }
          if (!is.null(common_first_found_file)) {
            if (any(!common_first_found_file)) {
              vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)]))
            }
          }
          file_object <- file_opener(array_of_var_files[j])
          if (!is.null(file_object)) {
            for (var_to_read in vars_to_read) {
              if (var_to_read %in% unlist(var_params)) {
                associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)]
              }
              var_name_to_reader <- var_to_read
              names(var_name_to_reader) <- 'var'
              var_dims <- file_dim_reader(NULL, file_object, var_name_to_reader, NULL,
                                          synonims)
              # file_dim_reader returns dimension names as found in the file.
              # Need to translate accoridng to synonims:
              names(var_dims) <- sapply(names(var_dims), 
                                        function(x) {
                                          which_entry <- which(sapply(synonims, function(y) x %in% y))
                                          if (length(which_entry) > 0) {
                                            names(synonims)[which_entry]
                                          } else {
                                            x
                                          }
                                        })
              if (!is.null(var_dims)) {
                var_file_dims <- NULL
                if (var_to_read %in% names(common_return_vars)) {
                  var_to_check <- common_return_vars[[var_to_read]]
                } else {
                  var_to_check <- return_vars[[var_to_read]]
                }
                if (any(names(dim(array_of_files_to_load)) %in% var_to_check)) {
                  var_file_dims <- dim(array_of_files_to_load)[which(names(dim(array_of_files_to_load)) %in% 
                                                                       var_to_check)]
                }
                if (((var_to_read %in% names(common_return_vars)) && 
                     is.null(picked_common_vars[[var_to_read]])) ||
                    ((var_to_read %in% names(return_vars)) && 
                     is.null(picked_vars[[i]][[var_to_read]]))) {
                  if (any(names(var_file_dims) %in% names(var_dims))) {
                    stop("Found a requested var in 'return_var' requested for a ",
                         "file dimension which also appears in the dimensions of ",
                         "the variable inside the file.\n", array_of_var_files[j])
                  }
                  special_types <- list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, 
                                        'Date' = as.Date)
                  first_sample <- file_var_reader(NULL, file_object, NULL, 
                                                  var_to_read, synonims)
                  if (any(class(first_sample) %in% names(special_types))) {
                    array_size <- prod(c(var_file_dims, var_dims))
                    new_array <- rep(special_types[[class(first_sample)[1]]](NA), array_size)
                    dim(new_array) <- c(var_file_dims, var_dims)
                  } else {
                    new_array <- array(dim = c(var_file_dims, var_dims))
                  }
                  attr(new_array, 'variables') <- attr(first_sample, 'variables')
                  if (var_to_read %in% names(common_return_vars)) {