Start.R 130 KB
Newer Older
Start <- function(..., # dim = indices/selectors, 
                       # dim_var = 'var', 
                       # dim_reorder = Sort/CircularSort, 
                       # dim_tolerance = number, 
                       # dim_depends = 'file_dim', 
                       # dim_across = 'file_dim', 
                  synonims = NULL, 
                  file_opener = NcOpener, 
                  file_var_reader = NcVarReader, 
                  file_dim_reader = NcDimReader, 
                  file_data_reader = NcDataReader, 
                  file_closer = NcCloser, 
                  transform = NULL, 
                  transform_params = NULL, 
                  transform_vars = NULL, 
                  transform_extra_cells = 0, 
                  apply_indices_after_transform = FALSE, 
                  pattern_dims = NULL, 
                  selector_checker = SelectorChecker, 
                  num_procs = NULL, silent = FALSE, debug = FALSE) {
                  #, config_file = NULL
                  #dictionary_dim_names = ,
                  #dictionary_var_names =  
  dim_params <- list(...)

  # Take *_var parameters apart
  var_params_ind <- grep('_var$', names(dim_params))
  var_params <- dim_params[var_params_ind]
  # Check all *_var are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (var_param in var_params) {
    if (!is.character(var_param)) {
      stop("All '*_var' parameters must be character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(var_params)[i], 
                                               '_var$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_var' parameters must be associated to a dimension parameter. Found parameter '", 
                  names(var_params)[i], "' but no parameter '", 
                  strsplit(names(var_params)[i], '_var$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'var_params' to be the name of 
  # the corresponding dimension.
  if (length(var_params) < 1) {
    var_params <- NULL
  } else {
    names(var_params) <- gsub('_var$', '', names(var_params))
  }

  # Take *_reorder parameters apart
  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
  dim_reorder_params <- dim_params[dim_reorder_params_ind]
  # Check all *_reorder are NULL or functions, and that they all have 
  # a matching dimension param.
  i <- 1
  for (dim_reorder_param in dim_reorder_params) {
    if (!is.function(dim_reorder_param)) {
      stop("All '*_reorder' parameters must be functions.")
    } else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i], 
                                               '_reorder$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter. Found parameter '", 
                  names(dim_reorder_params)[i], "' but no parameter '", 
                  strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "'."))
    } else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i], 
                                               '_reorder$')[[1]][1], '$'),
                          names(var_params)))) {
      stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter associated to a ",
                  "variable. Found parameter '", names(dim_reorder_params)[i], "' and dimension parameter '", 
                  strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "' but did not find variable ",
                  "parameter '", strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "_var'."))
    }
    i <- i + 1
  }
  # Make the keys of 'dim_reorder_params' to be the name of 
  # the corresponding dimension.
  if (length(dim_reorder_params) < 1) {
    dim_reorder_params <- NULL
  } else {
    names(dim_reorder_params) <- gsub('_reorder$', '', names(dim_reorder_params))
  }

  # Take *_tolerance parameters apart
  tolerance_params_ind <- grep('_tolerance$', names(dim_params))
  tolerance_params <- dim_params[tolerance_params_ind]
  # Check all *_tolerance are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (tolerance_param in tolerance_params) {
    if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i], 
                                               '_tolerance$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter. Found parameter '", 
                  names(tolerance_params)[i], "' but no parameter '", 
                  strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "'."))
    } else if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i], 
                                               '_tolerance$')[[1]][1], '$'),
                          names(var_params)))) {
      stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter associated to a ",
                  "variable. Found parameter '", names(tolerance_params)[i], "' and dimension parameter '", 
                  strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "' but did not find variable ",
                  "parameter '", strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "_var'."))
    }
    i <- i + 1
  }
  # Make the keys of 'tolerance_params' to be the name of 
  # the corresponding dimension.
  if (length(tolerance_params) < 1) {
    tolerance_params <- NULL
  } else {
    names(tolerance_params) <- gsub('_tolerance$', '', names(tolerance_params))
  }

  # Take *_depends parameters apart
  depends_params_ind <- grep('_depends$', names(dim_params))
  depends_params <- dim_params[depends_params_ind]
  # Check all *_depends are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (depends_param in depends_params) {
    if (!is.character(depends_param) || (length(depends_param) > 1)) {
      stop("All '*_depends' parameters must be single character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(depends_params)[i], 
                                               '_depends$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_depends' parameters must be associated to a dimension parameter. Found parameter '", 
                  names(depends_params)[i], "' but no parameter '", 
                  strsplit(names(depends_params)[i], '_depends$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'depends_params' to be the name of 
  # the corresponding dimension.
  if (length(depends_params) < 1) {
    depends_params <- NULL
  } else {
    names(depends_params) <- gsub('_depends$', '', names(depends_params))
  }
  # Change name to depending_file_dims
  depending_file_dims <- depends_params

  # Take *_across parameters apart
  across_params_ind <- grep('_across$', names(dim_params))
  across_params <- dim_params[across_params_ind]
  # Check all *_across are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (across_param in across_params) {
    if (!is.character(across_param) || (length(across_param) > 1)) {
      stop("All '*_across' parameters must be single character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(across_params)[i], 
                                               '_across$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_across' parameters must be associated to a dimension parameter. Found parameter '", 
                  names(across_params)[i], "' but no parameter '", 
                  strsplit(names(across_params)[i], '_across$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'across_params' to be the name of 
  # the corresponding dimension.
  if (length(across_params) < 1) {
    across_params <- NULL
  } else {
    names(across_params) <- gsub('_across$', '', names(across_params))
  }
  # Change name to inner_dims_across_files
  inner_dims_across_files <- across_params

  # Leave alone the dimension parameters in the variable dim_params
  if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind,
               depends_params_ind, across_params_ind)) > 0) {
    dim_params <- dim_params[-c(var_params_ind, dim_reorder_params_ind,
                                tolerance_params_ind, depends_params_ind,
                                across_params_ind)]
  }
  dim_names <- names(dim_params)
  if (is.null(dim_names)) {
    stop("At least one pattern dim must be specified.")
  }

  # Check pattern_dims
  if (is.null(pattern_dims)) {
    .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '", 
                    dim_names[1], "' as 'pattern_dims'."))
    pattern_dims <- dim_names[1]
  } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) {
    pattern_dims <- unique(pattern_dims)
  } else {
    stop("Parameter 'pattern_dims' must be a vector of character strings.")
  }
  if (any(names(var_params) %in% pattern_dims)) {
    stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.")
  }
  # Find the pattern dimension with the pattern specifications
  found_pattern_dim <- NULL
  for (pattern_dim in pattern_dims) {
    # Check all specifications in pattern_dim are valid
    dat <- datasets <- dim_params[[pattern_dim]]
    if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) {
      stop(paste0("Parameter '", pattern_dim, 
                  "' must be a list of lists with pattern specifications or a vector of character strings."))
    }
    if (!is.null(dim_reorder_params[[pattern_dim]])) {
      .warning(paste0("A reorder for the selectors of '", pattern_dim, 
                      "' has been specified, but it is a pattern dimension and the reorder will be ignored."))
    }
    if (any(sapply(dat, is.list))) {
      if (is.null(found_pattern_dim)) {
        found_pattern_dim <- pattern_dim
      } else {
        stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.")
      }
    }
  }
  if (is.null(found_pattern_dim)) {
    .warning(paste0("Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications (to be fetched in configuration file)."))
    found_pattern_dim <- pattern_dims[1]
  }
  # Once the pattern dimension with dataset specifications is found,
  # the variable 'dat' is mounted with the information of each
  # dataset.
  dat <- datasets <- dim_params[[found_pattern_dim]]
  dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames')
  dat_to_fetch <- c()
  dat_names <- c()
  if (!is.list(dat)) {
    dat <- as.list(dat)
  }
  for (i in 1:length(dat)) {
    if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) {
      if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) {
        dat[[i]] <- list(path = dat[[i]])
      } else {
        dat[[i]] <- list(name = dat[[i]])
      }
    } else if (!is.list(dat[[i]])) {
      stop(paste0("Parameter '", pattern_dim, 
                  "' is incorrect. It must be a list of lists or character strings."))
    }
    #if (!(all(names(dat[[i]]) %in% dat_info_names))) {
    #  stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.")
    #}
    if (!('name' %in% names(dat[[i]]))) {
      dat[[i]][['name']] <- paste0('dat', i)
      if (!('path' %in% names(dat[[i]]))) {
        stop(paste0("Parameter '", found_pattern_dim, 
                    "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided."))
      }
    } else if (!('path' %in% names(dat[[i]]))) {
      dat_to_fetch <- c(dat_to_fetch, i)
    }
    #if ('path' %in% names(dat[[i]])) {
    #  if (!('nc_var_name' %in% names(dat[[i]]))) {
    #    dat[[i]][['nc_var_name']] <- '$var_name$'
    #  }
    #  if (!('suffix' %in% names(dat[[i]]))) {
    #    dat[[i]][['suffix']] <- ''
    #  }
    #  if (!('var_min' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_min']] <- ''
    #  }
    #  if (!('var_max' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_max']] <- ''
    #  }
    #}
    dat_names <- c(dat_names, dat[[i]][['name']])
  }
  if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) {
    .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.")
  }

  # Reorder inner_dims_across_files (to make the keys be the file dimensions,
  # and the values to be the inner dimensions that go across it).
  if (!is.null(inner_dims_across_files)) {
    # Reorder: example, convert list(ftime = 'chunk', ensemble = 'member', xx = 'chunk')
    #                        to list(chunk = c('ftime', 'xx'), member = 'ensemble')
    new_idaf <- list()
    for (i in names(inner_dims_across_files)) {
      if (!(inner_dims_across_files[[i]] %in% names(new_idaf))) {
        new_idaf[[inner_dims_across_files[[i]]]] <- i
      } else {
        new_idaf[[inner_dims_across_files[[i]]]] <- c(new_idaf[[inner_dims_across_files[[i]]]], i)
      }
    }
    inner_dims_across_files <- new_idaf
  }

  # Check return_vars
  if (is.null(return_vars)) {
    return_vars <- list()
#    if (length(var_params) > 0) {
#      return_vars <- as.list(paste0(names(var_params), '_var'))
#    } else {
#      return_vars <- list()
#    }
  }
  if (!is.list(return_vars)) {
    stop("Parameter 'return_vars' must be a list or NULL.")
  }
  if (length(return_vars) > 0 && is.null(names(return_vars))) {
#    names(return_vars) <- rep('', length(return_vars))
    stop("Parameter 'return_vars' must be a named list.")
  }
  i <- 1
  while (i <= length(return_vars)) {
#    if (names(return_vars)[i] == '') {
#      if (!(is.character(return_vars[[i]]) && (length(return_vars[[i]]) == 1))) {
#        stop("The ", i, "th specification in 'return_vars' is malformed.")
#      } 
#      if (!grepl('_var$', return_vars[[i]])) {
#        stop("The ", i, "th specification in 'return_vars' is malformed.")
#      }
#      dim_name <- strsplit(return_vars[[i]], '_var$')[[1]][1]
#      if (!(dim_name %in% names(var_params))) {
#        stop("'", dim_name, "_var' requested in 'return_vars' but ",
#             "no '", dim_name, "_var' specified in the .Load call.")
#      }
#      names(return_vars)[i] <- var_params[[dim_name]]
#      return_vars[[i]] <- found_pattern_dim
#    } else
    if (length(return_vars[[i]]) > 0) { 
      if (!is.character(return_vars[[i]])) {
        stop("The ", i, "th specification in 'return_vars' is malformed. It ",
             "must be a vector of character strings of valid file dimension ",
             "names.")
      }
    }
    i <- i + 1
  }

  # Check synonims
  if (!is.null(synonims)) {
    error <- FALSE
    if (!is.list(synonims)) {
      error <- TRUE
    }
    for (synonim_entry in names(synonims)) {
      if (!(synonim_entry %in% names(dim_params)) &&
          !(synonim_entry %in% names(return_vars))) {
        error <- TRUE
      }
      if (!is.character(synonims[[synonim_entry]]) ||
          length(synonims[[synonim_entry]]) < 1) {
        error <- TRUE
      }
    }
    if (error) {
      stop("Parameter 'synonims' must be a named list, where the names are ",
           "a name of a requested dimension or variable and the values are ",
           "vectors of character strings with at least one alternative name ",
           " for each dimension or variable in 'synonims'.")
    }
  }
  if (length(unique(names(synonims))) < length(names(synonims))) {
    stop("There must not be repeated entries in 'synonims'.")
  }
  if (length(unique(unlist(synonims))) < length(unlist(synonims))) {
    stop("There must not be repeated values in 'synonims'.")
  }
  # Make that all dims and vars have an entry in synonims, even if only dim_name = dim_name
  dim_entries_to_add <- which(!(names(dim_params) %in% names(synonims)))
  if (length(dim_entries_to_add) > 0) {
    synonims[names(dim_params)[dim_entries_to_add]] <- as.list(names(dim_params)[dim_entries_to_add])
  }
  var_entries_to_add <- which(!(names(var_params) %in% names(synonims)))
  if (length(var_entries_to_add) > 0) {
    synonims[names(var_params)[var_entries_to_add]] <- as.list(names(var_params)[var_entries_to_add])
  }
  # Check selector_checker
  if (is.null(selector_checker) || !is.function(selector_checker)) {
    stop("Parameter 'selector_checker' must be a function.")
  }

  # Check file_opener
  if (is.null(file_opener) || !is.function(file_opener)) {
    stop("Parameter 'file_opener' must be a function.")
  }

  # Check file_var_reader
  if (!is.null(file_var_reader) && !is.function(file_var_reader)) {
    stop("Parameter 'file_var_reader' must be a function.")
  }

  # Check file_dim_reader
  if (!is.null(file_dim_reader) && !is.function(file_dim_reader)) {
    stop("Parameter 'file_dim_reader' must be a function.")
  }

  # Check file_data_reader
  if (is.null(file_data_reader) || !is.function(file_data_reader)) {
    stop("Parameter 'file_data_reader' must be a function.")
  }

  # Check file_closer
  if (is.null(file_closer) || !is.function(file_closer)) {
    stop("Parameter 'file_closer' must be a function.")
  }

  # Check transform
  if (!is.null(transform)) {
    if (!is.function(transform)) {
      stop("Parameter 'transform' must be a function.")
    }
  }

  # Check transform_params
  if (!is.null(transform_params)) {
    if (!is.list(transform_params)) {
      stop("Parameter 'transform_params' must be a list.")
    }
    if (is.null(names(transform_params))) {
      stop("Parameter 'transform_params' must be a named list.")
    }
  }

  # Check transform_vars
  if (!is.null(transform_vars)) {
    if (!is.character(transform_vars)) {
      stop("Parameter 'transform_vars' must be a vector of character strings.")
    }
  }
  if (any(!(transform_vars %in% names(return_vars)))) {
    stop("All the variables specified in 'transform_vars' must also be specified in 'return_vars'.")
  }

Nicolau Manubens's avatar
Nicolau Manubens committed
  # Check apply_indices_after_transform
  if (!is.logical(apply_indices_after_transform)) {
    stop("Parameter 'apply_indices_after_transform' must be either TRUE or FALSE.")
Nicolau Manubens's avatar
Nicolau Manubens committed
  aiat <- apply_indices_after_transform

  # Check transform_extra_cells
  if (!is.numeric(transform_extra_cells)) {
    stop("Parameter 'transform_extra_cells' must be numeric.")
  }
  transform_extra_cells <- round(transform_extra_cells)

  # Check num_procs
  if (!is.null(num_procs)) {
    if (!is.numeric(num_procs)) {
      stop("Parameter 'num_procs' must be numeric.")
    } else {
      num_procs <- round(num_procs)
    }
  }

  # Check silent
  if (!is.logical(silent)) {
    stop("Parameter 'silent' must be logical.")
  }

  dim_params[[found_pattern_dim]] <- dat_names

  if (!silent) {
    .message(paste0("Exploring files... This will take a variable amount ",
                    "of time depending on the issued request and the ",
                    "performance of the file server..."))
  }

if (!is.character(debug)) {
dims_to_check <- c('time')
} else {
dims_to_check <- debug
debug <- TRUE
}

  ############################## READING FILE DIMS ############################
  # Check that no unrecognized variables are present in the path patterns
  # and also that no file dimensions are requested to THREDDs catalogs.
  # And in the mean time, build all the work pieces and look for the 
  # first available file of each dataset.
  array_of_files_to_load <- NULL
  array_of_not_found_files <- NULL
  indices_of_first_files_with_data <- vector('list', length(dat))
  selectors_of_first_files_with_data <- vector('list', length(dat))
  found_file_dims <- vector('list', length(dat))
  expected_inner_dims <- vector('list', length(dat))
  dataset_has_files <- rep(FALSE, length(dat))
#print("A")
  for (i in 1:length(dat)) {
#print("B")
    dat_selectors <- dim_params
    dat_selectors[[found_pattern_dim]] <- dat_selectors[[found_pattern_dim]][i]
    dim_vars <- paste0('$', dim_names, '$')
    file_dims <- dim_names[which(sapply(dim_vars, grepl, dat[[i]][['path']], fixed = TRUE))]
    file_dims <- unique(c(pattern_dims, file_dims))
    found_file_dims[[i]] <- file_dims
    expected_inner_dims[[i]] <- dim_names[which(!(dim_names %in% file_dims))]
    # (Check the depending_file_dims).
    if (any(c(names(depending_file_dims), unlist(depending_file_dims)) %in% 
            expected_inner_dims[[i]])) {
      stop(paste0("The dimension dependancies specified in ",
                  "'depending_file_dims' can only be between file ",
                  "dimensions, but some inner dimensions found in ",
                  "dependancies for '", dat[[i]][['name']], "', which ",
                  "has the following file dimensions: ", 
                  paste(paste0("'", file_dims, "'"), collapse = ', '), ".")) 
    } else {
      a <- names(depending_file_dims) %in% file_dims
      b <- unlist(depending_file_dims) %in% file_dims
      ab <- a & b
      if (any(!ab)) {
        .warning("Detected some dependancies in 'depending_file_dims' with ",
                 "non-existing dimension names. These will be disregarded.")
        depending_file_dims <- depending_file_dims[-which(!ab)]
      }
      if (any(names(depending_file_dims) == unlist(depending_file_dims))) {
        depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == unlist(depending_file_dims))]
      }
    }
    # (Check the inner_dims_across_files).
    if (any(!(names(inner_dims_across_files) %in% file_dims)) ||
        any(!(unlist(inner_dims_across_files) %in% expected_inner_dims[[i]]))) {
      stop(paste0("All relationships specified in ",
                  "'inner_dims_across_files' must be between a inner ",
                  "dimension and a file dimension. Found wrong ",
                  "specification for '", dat[[i]][['name']], "', which ",
                  "has the following file dimensions: ", 
                  paste(paste0("'", file_dims, "'"), collapse = ', '),
                  ", and the following inner dimensions: ", 
                  paste(paste0("'", expected_inner_dims[[i]], "'"), 
                        collapse = ', '), "."))
    }
    # (Check the return_vars).
    j <- 1
    while (j <= length(return_vars)) {
      if (any(!(return_vars[[j]] %in% file_dims))) {
        if (any(return_vars[[j]] %in% expected_inner_dims[[i]])) {
          stop("Found variables in 'return_vars' requested ",
               "for some inner dimensions (for dataset '",
               dat[[i]][['name']], "'), but variables can only be ",
               "requested for file dimensions.")
        } else {
          stop("Found variables in 'return_vars' requested ",
               "for non-existing dimensions.")
        }
      }
      j <- j + 1
    }
    # (Check the *_var parameters).
    if (any(!(unlist(var_params) %in% names(return_vars)))) {
      stop("All '*_var' params must associate a dimension to one of the ",
           "requested variables in 'return_vars'.")
    }
#    # (Check the circular_dims parameters). 
#    if (!all(names(circular_dims) %in% expected_inner_dims[[i]])) {
#      stop("Only inner dimensions can be requested as 'circular_dims'.")
#    }
    replace_values <- vector('list', length = length(file_dims))
    names(replace_values) <- file_dims
    # Take the first selector for all possible file dimensions
    for (file_dim in file_dims) {
      if (file_dim %in% names(var_params)) {
        .warning(paste0("The '", file_dim, "_var' param will be ignored since '", 
                        file_dim, "' is a file dimension (for the dataset with pattern ", 
                        dat[[i]][['path']], ")."))
      }
      if (!is.list(dat_selectors[[file_dim]]) || 
          (is.list(dat_selectors[[file_dim]]) && 
           length(dat_selectors[[file_dim]]) == 2 &&
           is.null(names(dat_selectors[[file_dim]])))) {
        dat_selectors[[file_dim]] <- list(dat_selectors[[file_dim]])
      }
      first_class <- class(dat_selectors[[file_dim]][[1]])
      first_length <- length(dat_selectors[[file_dim]][[1]])
      for (j in 1:length(dat_selectors[[file_dim]])) {
        sv <- selector_vector <- dat_selectors[[file_dim]][[j]]
        if (!identical(first_class, class(sv)) ||
            !identical(first_length, length(sv))) {
          stop("All provided selectors for depending dimensions must ",
               "be vectors of the same length and of the same class.")
        }
        if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) {
          dat_selectors[[file_dim]][[j]] <- selector_checker(selectors = sv,
                                                             return_indices = FALSE)
        } else if (!(is.numeric(sv) || 
                     (is.character(sv) && (length(sv) == 1) && (sv %in% c('all', 'first', 'last'))) || 
                     (is.list(sv) && (length(sv) == 2) && (all(sapply(sv, is.character)) || 
                                                           all(sapply(sv, is.numeric)))))) {
          stop("All explicitly provided selectors for file dimensions must be character strings.")
        } 
      }
      sv <- dat_selectors[[file_dim]][[1]]
      if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) {
        replace_values[[file_dim]] <- dat_selectors[[file_dim]][[1]][1]
      }
    }
#print("C")
    # Now we know which dimensions whose selectors are provided non-explicitly.
    undefined_file_dims <- file_dims[which(sapply(replace_values, is.null))]
    defined_file_dims <- file_dims[which(!(file_dims %in% undefined_file_dims))]
    # Quickly check if the depending dimensions are provided properly.
    for (file_dim in file_dims) {
      if (file_dim %in% names(depending_file_dims)) {
        ## TODO: Detect multi-dependancies and forbid.
        if (all(c(file_dim, depending_file_dims[[file_dim]]) %in% defined_file_dims)) {
          if (length(dat_selectors[[file_dim]]) != length(dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) {
            stop(paste0("If providing selectors for the depending ",
                        "dimension '", file_dim, "', a ",
                        "vector of selectors must be provided for ",
                        "each selector of the dimension it depends on, '",
                        depending_file_dims[[file_dim]], "'."))
          } else if (!all(names(dat_selectors[[file_dim]]) == dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) {
            stop(paste0("If providing selectors for the depending ",
                        "dimension '", file_dim, "', the name of the ",
                        "provided vectors of selectors must match ",
                        "exactly the selectors of the dimension it ",
                        "depends on, '", depending_file_dims[[file_dim]], "'."))
          }
        }
      }
    }
    # Find the possible values for the selectors that are provided as
    # indices. If the requested file is on server, impossible operation.
    if (length(grep("^http", dat[[i]][['path']])) > 0) {
      if (length(undefined_file_dims) > 0) {
        stop(paste0("All selectors for the file dimensions must be ",
                    "character strings if requesting data to a remote ",
                    "server. Found invalid selectors for the file dimensions ",
                    paste(paste0("'", undefined_file_dims, "'"), collapse = ', '), "."))
      }
      dataset_has_files[i] <- TRUE
    } else {
Nicolau Manubens's avatar
Nicolau Manubens committed
      dat[[i]] <- path.expand(dat[[i]][['path']])
      # Iterate over the known dimensions to find the first existing file.
      # The path to the first existing file will be used to find the 
      # values for the non explicitly defined selectors.
      first_file <- NULL
      first_file_selectors <- NULL
      if (length(undefined_file_dims) > 0) {
        replace_values[undefined_file_dims] <- '*'
      }
      ## TODO: What if length of defined_file_dims is 0? code might crash (in practice it worked for an example case)
      files_to_check <- sapply(dat_selectors[defined_file_dims], function(x) length(x[[1]]))
      sub_array_of_files_to_check <- array(1:prod(files_to_check), dim = files_to_check)
      j <- 1
#print("D")
      while (j <= prod(files_to_check) && is.null(first_file)) {
        selector_indices <- which(sub_array_of_files_to_check == j, arr.ind = TRUE)[1, ]
        selectors <- sapply(1:length(defined_file_dims), 
          function (x) {
            vector_to_pick <- 1
            if (defined_file_dims[x] %in% names(depending_file_dims)) {
              vector_to_pick <- selector_indices[which(defined_file_dims == depending_file_dims[[defined_file_dims[x]]])]
            }
            dat_selectors[defined_file_dims][[x]][[vector_to_pick]][selector_indices[x]]
          })
        replace_values[defined_file_dims] <- selectors
        file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
        file_path <- Sys.glob(file_path)
        if (length(file_path) > 0) {
          first_file <- file_path[1]
          first_file_selectors <- selectors
        }
        j <- j + 1
      }
#print("E")
      # Start looking for values for the non-explicitly defined selectors.
      if (is.null(first_file)) {
        .warning(paste0("No found files for the datset '", dat[[i]][['name']], 
                        "'. Provide existing selectors for the file dimensions ",
                        " or check and correct its path pattern: ", dat[[i]][['path']]))
      } else {
        dataset_has_files[i] <- TRUE
        ## TODO: Improve message here if no variable found:
        if (length(undefined_file_dims) > 0) {
          # Looking for the first values, parsed from first_file.
          first_values <- vector('list', length = length(undefined_file_dims))
          names(first_values) <- undefined_file_dims
          found_values <- 0
          stop <- FALSE
          try_dim <- 1
          last_success <- 1
          while ((found_values < length(undefined_file_dims)) && !stop) {
            u_file_dim <- undefined_file_dims[try_dim]
            if (is.null(first_values[[u_file_dim]])) {
              path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], 
                                           replace_values[-which(file_dims == u_file_dim)],
                                           allow_undefined_key_vars = TRUE)
              found_value <- .FindTagValue(path_with_globs_and_tag, 
                                           first_file, u_file_dim)
              if (!is.null(found_value)) {
                found_values <- found_values + 1
                last_success <- try_dim
                first_values[[u_file_dim]] <- found_value
                replace_values[[u_file_dim]] <- found_value
              }
            }
            try_dim <- (try_dim %% length(undefined_file_dims)) + 1
            if (try_dim == last_success) {
              stop <- TRUE
            }
          }
          if (found_values < length(undefined_file_dims)) {
            stop(paste0("Path pattern of dataset '", dat[[i]][['name']], 
                        "' is too complex. Could not automatically ",
                        "detect values for all non-explicitly defined ",
                        "indices. Check its pattern: ", dat[[i]][['path']]))
          }
          ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above
          ##       Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical
          ##       cmor case, requesting all members and chunks for fixed var and sdate. Not fixing
          ##       sdate raised 'too complex' error.
          # Replace shell globs in path pattern and keep the file_dims as tags
          dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, 
                                                        file_dims, dat[[i]][['name']], FALSE)
          # Now time to look for the available values for the non 
          # explicitly defined selectors for the file dimensions.
#print("H")
          # Check first the ones that do not depend on others.
          ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))],
                   undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))])
                  
          for (u_file_dim in ufd) {
            replace_values[undefined_file_dims] <- first_values
            replace_values[[u_file_dim]] <- '*'
            depended_dim <- NULL
            depended_dim_values <- NA
            selectors <- dat_selectors[[u_file_dim]][[1]]
            if (u_file_dim %in% names(depending_file_dims)) {
              depended_dim <- depending_file_dims[[u_file_dim]]
              depended_dim_values <- dat_selectors[[depended_dim]][[1]]
              dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values))
              names(dat_selectors[[u_file_dim]]) <- depended_dim_values
            } else {
              dat_selectors[[u_file_dim]] <- list()
            }
            if (u_file_dim %in% unlist(depending_file_dims)) {
              depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))]
              replace_values[depending_dims] <- rep('*', length(depending_dims))
            }
            for (j in 1:length(depended_dim_values)) {
              parsed_values <- c()
              if (!is.null(depended_dim)) {
                replace_values[[depended_dim]] <- depended_dim_values[j]
              }
              path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
              found_files <- Sys.glob(path_with_globs)
              ## TODO: Enhance this error message, or change by warning.
              ##       Raises if a wrong sdate is specified, for example.
              if (length(found_files) == 0) {
                .warning(paste0("Could not find files for any '", u_file_dim, 
                                "' for '", depended_dim, "' = '", 
                                depended_dim_values[j], "'."))
                dat_selectors[[u_file_dim]][[j]] <- NA
              } else {
                for (found_file in found_files) {
                  path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']],
                                               replace_values[-which(file_dims == u_file_dim)],
                                               allow_undefined_key_vars = TRUE)
                  parsed_values <- c(parsed_values, 
                                     .FindTagValue(path_with_globs_and_tag, found_file, 
                                                   u_file_dim))
                }
                dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors,
                                                                     var = unique(parsed_values),
                                                                     return_indices = FALSE)
              }
            }
          }
#print("I")
        } else {
          dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, 
                                                        defined_file_dims, dat[[i]][['name']], FALSE)
        }
      }
    }
    # Now fetch for the first available file
    if (dataset_has_files[i]) {
      known_dims <- file_dims
    } else {
      known_dims <- defined_file_dims
    }
    replace_values <- vector('list', length = length(known_dims))
    names(replace_values) <- known_dims
    files_to_load <- sapply(dat_selectors[known_dims], function(x) length(x[[1]]))
    files_to_load[found_pattern_dim] <- 1
    sub_array_of_files_to_load <- array(1:prod(files_to_load), 
Nicolau Manubens's avatar
Nicolau Manubens committed
                                        dim = files_to_load)
    names(dim(sub_array_of_files_to_load)) <- known_dims
    sub_array_of_not_found_files <- array(!dataset_has_files[i], 
Nicolau Manubens's avatar
Nicolau Manubens committed
                                          dim = files_to_load)
    names(dim(sub_array_of_not_found_files)) <- known_dims
    j <- 1
    while (j <= prod(files_to_load)) {
      selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ]
      names(selector_indices) <- known_dims
      selectors <- sapply(1:length(known_dims), 
        function (x) {
          vector_to_pick <- 1
          if (known_dims[x] %in% names(depending_file_dims)) {
            vector_to_pick <- selector_indices[which(known_dims == depending_file_dims[[known_dims[x]]])]
          }
          dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]]
        })
      names(selectors) <- known_dims
      replace_values[known_dims] <- selectors
      if (!dataset_has_files[i]) {
        if (any(is.na(selectors))) {
          replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))]
        }
        file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE)
        sub_array_of_files_to_load[j] <- file_path        
      } else {
        if (any(is.na(selectors))) {
          replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))]
          file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE)
          sub_array_of_files_to_load[j] <- file_path
          sub_array_of_not_found_files[j] <- TRUE
        } else {
          file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
          sub_array_of_files_to_load[j] <- file_path
          if (is.null(indices_of_first_files_with_data[[i]])) {
            if (!(length(grep("^http", file_path)) > 0)) {
              if (!file.exists(file_path)) {
                file_path <- NULL
              }
            }
            if (!is.null(file_path)) {
              test_file <- NULL
              ## TODO: suppress error messages
              test_file <- file_opener(file_path)
              if (!is.null(test_file)) {
                selector_indices[which(known_dims == found_pattern_dim)] <- i
                indices_of_first_files_with_data[[i]] <- selector_indices
                selectors_of_first_files_with_data[[i]] <- selectors
                file_closer(test_file)
              }
            }
          }
        }
      }
      j <- j + 1
    }
    # Extend array as needed progressively
    if (is.null(array_of_files_to_load)) {
        array_of_files_to_load <- sub_array_of_files_to_load
        array_of_not_found_files <- sub_array_of_not_found_files
    } else {
      array_of_files_to_load <- .MergeArrays(array_of_files_to_load, sub_array_of_files_to_load,
                                             along = found_pattern_dim)
      ## TODO: file_dims, and variables like that.. are still ok now? I don't think so
      array_of_not_found_files <- .MergeArrays(array_of_not_found_files, sub_array_of_not_found_files,
                                               along = found_pattern_dim)
    }    
    dat[[i]][['selectors']] <- dat_selectors
  }
  if (all(sapply(indices_of_first_files_with_data, is.null))) {
    stop("No data files found for any of the specified datasets.")
  }

  ########################### READING INNER DIMS. #############################
#print("J")
  ## TODO: To be run in parallel (local multi-core)
  # Now time to work out the inner file dimensions.
  # First pick the requested variables.
  dims_to_iterate <- NULL
  for (return_var in names(return_vars)) {
    dims_to_iterate <- unique(c(dims_to_iterate, return_vars[[return_var]]))
  }
  if (found_pattern_dim %in% dims_to_iterate) {
    dims_to_iterate <- dims_to_iterate[-which(dims_to_iterate == found_pattern_dim)]
  }
  common_return_vars <- NULL
  common_first_found_file <- NULL
  common_return_vars_pos <- NULL
  if (length(return_vars) > 0) {
    common_return_vars_pos <- which(sapply(return_vars, function(x) !(found_pattern_dim %in% x)))
  }
  if (length(common_return_vars_pos) > 0) {
    common_return_vars <- return_vars[common_return_vars_pos]
    return_vars <- return_vars[-common_return_vars_pos]
    common_first_found_file <- rep(FALSE, length(which(sapply(common_return_vars, length) == 0)))
    names(common_first_found_file) <- names(common_return_vars[which(sapply(common_return_vars, length) == 0)])
  }
  return_vars <- lapply(return_vars, 
    function(x) {
      if (found_pattern_dim %in% x) {
        x[-which(x == found_pattern_dim)]
      } else {
        x
      }
    })
  if (length(common_return_vars) > 0) {
    picked_common_vars <- vector('list', length = length(common_return_vars))
    names(picked_common_vars) <- names(common_return_vars)
  } else {
    picked_common_vars <- NULL
  }
  picked_common_vars_ordered <- picked_common_vars
  picked_common_vars_unorder_indices <- picked_common_vars
  picked_vars <- vector('list', length = length(dat))
  names(picked_vars) <- dat_names
  picked_vars_ordered <- picked_vars
  picked_vars_unorder_indices <- picked_vars
  for (i in 1:length(dat)) {
    if (dataset_has_files[i]) {
      # Put all selectors in a list of a single list/vector of selectors. 
      # The dimensions that go across files will later be extended to have 
      # lists of lists/vectors of selectors.
      for (inner_dim in expected_inner_dims[[i]]) {
        if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || 
            (is.list(dat[[i]][['selectors']][[inner_dim]]) && 
             length(dat[[i]][['selectors']][[inner_dim]]) == 2 &&
             is.null(names(dat[[i]][['selectors']][[inner_dim]])))) {
          dat[[i]][['selectors']][[inner_dim]] <- list(dat[[i]][['selectors']][[inner_dim]])
        }
      }
      if (length(return_vars) > 0) {
        picked_vars[[i]] <- vector('list', length = length(return_vars))
        names(picked_vars[[i]]) <- names(return_vars)
        picked_vars_ordered[[i]] <- picked_vars[[i]]
        picked_vars_unorder_indices[[i]] <- picked_vars[[i]]
      }
      indices_of_first_file <- as.list(indices_of_first_files_with_data[[i]])
      array_file_dims <- sapply(dat[[i]][['selectors']][found_file_dims[[i]]], function(x) length(x[[1]]))
      names(array_file_dims) <- found_file_dims[[i]]
      if (length(dims_to_iterate) > 0) {
        indices_of_first_file[dims_to_iterate] <- lapply(array_file_dims[dims_to_iterate], function(x) 1:x)
      }
      array_of_var_files <- do.call('[', c(list(x = array_of_files_to_load), indices_of_first_file, list(drop = FALSE)))
      array_of_var_indices <- array(1:length(array_of_var_files), dim = dim(array_of_var_files))
      array_of_not_found_var_files <- do.call('[', c(list(x = array_of_not_found_files), indices_of_first_file, list(drop = FALSE)))
      previous_indices <- rep(-1, length(indices_of_first_file))
      names(previous_indices) <- names(indices_of_first_file)
      first_found_file <- NULL
      if (length(return_vars) > 0) {
        first_found_file <- rep(FALSE, length(which(sapply(return_vars, length) == 0)))
        names(first_found_file) <- names(return_vars[which(sapply(return_vars, length) == 0)])
      }
      for (j in 1:length(array_of_var_files)) {
        current_indices <- which(array_of_var_indices == j, arr.ind = TRUE)[1, ]
        names(current_indices) <- names(indices_of_first_file)
        if (!is.na(array_of_var_files[j]) && !array_of_not_found_var_files[j]) {
          changed_dims <- which(current_indices != previous_indices)
          vars_to_read <- NULL
          if (length(return_vars) > 0) {
            vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))]
          }
          if (!is.null(first_found_file)) {
            if (any(!first_found_file)) {
              vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)]))
            }
          }
          if ((i == 1) && (length(common_return_vars) > 0)) {
            vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))])
          }
          if (!is.null(common_first_found_file)) {
            if (any(!common_first_found_file)) {
              vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)]))
            }
          }
          file_object <- file_opener(array_of_var_files[j])
          if (is.null(file_object)) {
            stop("Unexpected error. Could not find the file ", array_of_var_files[j])
          }
          for (var_to_read in vars_to_read) {
            if (var_to_read %in% unlist(var_params)) {
              associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)]
            }
            var_name_to_reader <- var_to_read
            names(var_name_to_reader) <- 'var'
            var_dims <- file_dim_reader(NULL, file_object, var_name_to_reader, NULL,
                                        synonims)
            # file_dim_reader returns dimension names as found in the file.
            # Need to translate accoridng to synonims:
            names(var_dims) <- sapply(names(var_dims), 
              function(x) {
                which_entry <- which(sapply(synonims, function(y) x %in% y))
                if (length(which_entry) > 0) {
                  names(synonims)[which_entry]
                } else {
                  x
                }
              })
            if (!is.null(var_dims)) {
              var_file_dims <- NULL
              if (var_to_read %in% names(common_return_vars)) {
                var_to_check <- common_return_vars[[var_to_read]]
              } else {
                var_to_check <- return_vars[[var_to_read]]
              }
              if (any(names(dim(array_of_files_to_load)) %in% var_to_check)) {
                var_file_dims <- dim(array_of_files_to_load)[which(names(dim(array_of_files_to_load)) %in% 
                                                                   var_to_check)]
              }
              if (((var_to_read %in% names(common_return_vars)) && 
                   is.null(picked_common_vars[[var_to_read]])) ||
                  ((var_to_read %in% names(return_vars)) && 
                   is.null(picked_vars[[i]][[var_to_read]]))) {
                if (any(names(var_file_dims) %in% names(var_dims))) {
                  stop("Found a requested var in 'return_var' requested for a ",
                       "file dimension which also appears in the dimensions of ",
                       "the variable inside the file.\n", array_of_var_files[j])