zzz.R 18.6 KB
Newer Older
# Take *_var parameters apart
take_var_params <- function(dim_params) {
 # Take *_var parameters apart
  var_params_ind <- grep('_var$', names(dim_params))
  var_params <- dim_params[var_params_ind]
  # Check all *_var are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (var_param in var_params) {
    if (!is.character(var_param)) {
      stop("All '*_var' parameters must be character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(var_params)[i],
                                               '_var$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_var' parameters must be associated to a dimension parameter. Found parameter '",
                  names(var_params)[i], "' but no parameter '",
                  strsplit(names(var_params)[i], '_var$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'var_params' to be the name of 
  # the corresponding dimension.
  if (length(var_params) < 1) {
    var_params <- NULL
  } else {
    names(var_params) <- gsub('_var$', '', names(var_params))
  }
  return(var_params)
}

# Take *_reorder parameters apart
take_var_reorder <- function(dim_params) {
  # Take *_reorder parameters apart
  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
  dim_reorder_params <- dim_params[dim_reorder_params_ind]
  # Make the keys of 'dim_reorder_params' to be the name of 
  # the corresponding dimension.
  if (length(dim_reorder_params) < 1) {
    dim_reorder_params <- NULL
  } else {
    names(dim_reorder_params) <- gsub('_reorder$', '', names(dim_reorder_params))
  }
  return(dim_reorder_params)
}

# Take *_depends parameters apart
take_var_depends <- function(dim_params) {
  depends_params_ind <- grep('_depends$', names(dim_params))
  depends_params <- dim_params[depends_params_ind]
  # Check all *_depends are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (depends_param in depends_params) {
    if (!is.character(depends_param) || (length(depends_param) > 1)) {
      stop("All '*_depends' parameters must be single character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(depends_params)[i],
                                               '_depends$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_depends' parameters must be associated to a dimension parameter. Found parameter '",
                  names(depends_params)[i], "' but no parameter '",
                  strsplit(names(depends_params)[i], '_depends$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'depends_params' to be the name of 
  # the corresponding dimension.
  if (length(depends_params) < 1) {
    depends_params <- NULL
  } else {
    names(depends_params) <- gsub('_depends$', '', names(depends_params))
  }
  return(depends_params)
}

# Take *_across parameters apart
take_var_across <- function(dim_params) {
  across_params_ind <- grep('_across$', names(dim_params))
  across_params <- dim_params[across_params_ind]
  # Check all *_across are NULL or vectors of character strings, and 
  # that they all have a matching dimension param.
  i <- 1
  for (across_param in across_params) {
    if (!is.character(across_param) || (length(across_param) > 1)) {
      stop("All '*_across' parameters must be single character strings.")
    } else if (!any(grepl(paste0('^', strsplit(names(across_params)[i],
                                               '_across$')[[1]][1], '$'),
                          names(dim_params)))) {
      stop(paste0("All '*_across' parameters must be associated to a dimension parameter. Found parameter '",
                  names(across_params)[i], "' but no parameter '",
                  strsplit(names(across_params)[i], '_across$')[[1]][1], "'."))
    }
    i <- i + 1
  }
  # Make the keys of 'across_params' to be the name of 
  # the corresponding dimension.
  if (length(across_params) < 1) {
    across_params <- NULL
  } else {
    names(across_params) <- gsub('_across$', '', names(across_params))
  }
  return(across_params)
}

# Leave alone the dimension parameters in the variable dim_params
rebuild_dim_params <- function(dim_params, merge_across_dims,
                               inner_dims_across_files) {
  var_params_ind <- grep('_var$', names(dim_params))
  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
  tolerance_params_ind <- grep('_tolerance$', names(dim_params))
  depends_params_ind <- grep('_depends$', names(dim_params))
  across_params_ind <- grep('_across$', names(dim_params))
  # Leave alone the dimension parameters in the variable dim_params
  if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind,
               depends_params_ind, across_params_ind)) > 0) {
    dim_params <- dim_params[-c(var_params_ind, dim_reorder_params_ind,
                                tolerance_params_ind, depends_params_ind,
                                across_params_ind)]
    # Reallocating pairs of across file and inner dimensions if they have
    # to be merged. They are put one next to the other to ease merge later.
    if (merge_across_dims) {
      for (inner_dim_across in names(inner_dims_across_files)) {
        inner_dim_pos <- which(names(dim_params) == inner_dim_across)
        file_dim_pos <- which(names(dim_params) == inner_dims_across_files[[inner_dim_across]])
        new_pos <- inner_dim_pos
        if (file_dim_pos < inner_dim_pos) {
          new_pos <- new_pos - 1
        }
        dim_params_to_move <- dim_params[c(inner_dim_pos, file_dim_pos)]
        dim_params <- dim_params[-c(inner_dim_pos, file_dim_pos)]
        new_dim_params <- list()
        if (new_pos > 1) {
          new_dim_params <- c(new_dim_params, dim_params[1:(new_pos - 1)])
        }
        new_dim_params <- c(new_dim_params, dim_params_to_move)
        if (length(dim_params) >= new_pos) {
          new_dim_params <- c(new_dim_params, dim_params[new_pos:length(dim_params)])
        }
        dim_params <- new_dim_params
      }
    }
  }
  dim_names <- names(dim_params)
  if (is.null(dim_names)) {
    stop("At least one pattern dim must be specified.")
  }
  return(dim_params)
}

# Look for chunked dims
look_for_chunks <- function(dim_params, dim_names) {
    chunks <- vector('list', length(dim_names))
  names(chunks) <- dim_names
  for (dim_name in dim_names) {
    if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) {
      chunks[[dim_name]] <- attr(dim_params[[dim_name]], 'chunk')
      attributes(dim_params[[dim_name]]) <- attributes(dim_params[[dim_name]])[-which(names(attributes(dim_params[[dim_name]])) == 'chunk')]
    } else {
      chunks[[dim_name]] <- c(chunk = 1, n_chunks = 1)
    }
  }
  return(chunks)
}

# This is a helper function to compute the chunk indices to take once the total
# number of indices for a dimension has been discovered.
  chunk_indices <- function(n_indices, chunk, n_chunks, dim_name) {
    if (n_chunks > n_indices) {
      stop("Requested to divide dimension '", dim_name, "' of length ",
           n_indices, " in ", n_chunks, " chunks, which is not possible.")
    }
    chunk_sizes <- rep(floor(n_indices / n_chunks), n_chunks)
    chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks
    if (chunks_to_extend > 0) {
      chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1
    }
    chunk_size <- chunk_sizes[chunk]
    offset <- 0
    if (chunk > 1) {
      offset <- sum(chunk_sizes[1:(chunk - 1)])
    }
    indices <- 1:chunk_sizes[chunk] + offset
    array(indices, dim = setNames(length(indices), dim_name))
  }

# Check pattern_dims
# Function found_pattern_dims may change pattern_dims in the parent.env
found_pattern_dims <- function(pattern_dims, dim_names, var_params,
                               dim_params, dim_reorder_params) {
  if (is.null(pattern_dims)) {
    .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '",
                    dim_names[1], "' as 'pattern_dims'."))
    assign('pattern_dims', dim_names[1], envir = parent.frame())
    pattern_dims <- dim_names[1]
  } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) {
    assign('pattern_dims', unique(pattern_dims), envir = parent.frame())
    pattern_dims <- unique(pattern_dims)
  } else {
    stop("Parameter 'pattern_dims' must be a vector of character strings.")
  }
  if (any(names(var_params) %in% pattern_dims)) {
    stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.")
  }
  # Find the pattern dimension with the pattern specifications
  found_pattern_dim <- NULL
  for (pattern_dim in pattern_dims) {
    # Check all specifications in pattern_dim are valid
aho's avatar
aho committed
#    dat <- datasets <- dim_params[[pattern_dim]]
    dat <- dim_params[[pattern_dim]]
    if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) {
      stop(paste0("Parameter '", pattern_dim,
                  "' must be a list of lists with pattern specifications or a vector of character strings."))
    }
    if (!is.null(dim_reorder_params[[pattern_dim]])) {
      .warning(paste0("A reorder for the selectors of '", pattern_dim,
                      "' has been specified, but it is a pattern dimension and the reorder will be ignored."))
    }
    if (is.list(dat) || any(sapply(dat, is.list))) {
      if (is.null(found_pattern_dim)) {
        found_pattern_dim <- pattern_dim
      } else {
        stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.")
      }
    }
  }
  if (is.null(found_pattern_dim)) {
   .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications."))
    found_pattern_dim <- pattern_dims[1]
  }
  return(found_pattern_dim)
}

aho's avatar
aho committed

# The variable 'dat' is mounted with the information (name, path) of each dataset.
# NOTE: This function creates the object 'dat_names' in the parent env.
mount_dat <- function(dat, pattern_dim, found_pattern_dim) {

#  dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames')
  dat_to_fetch <- c()
  dat_names <- c()
  if (!is.list(dat)) {
    dat <- as.list(dat)
  } else {
    if (!any(sapply(dat, is.list))) {
      dat <- list(dat)
    }
  }
  for (i in 1:length(dat)) {
    if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) {
      if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) {
        dat[[i]] <- list(path = dat[[i]])
      } else {
        dat[[i]] <- list(name = dat[[i]])
      }
    } else if (!is.list(dat[[i]])) {
      stop(paste0("Parameter '", pattern_dim, 
                  "' is incorrect. It must be a list of lists or character strings."))
    }
    #if (!(all(names(dat[[i]]) %in% dat_info_names))) {
    #  stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.")
    #}
    if (!('name' %in% names(dat[[i]]))) {
      dat[[i]][['name']] <- paste0('dat', i)
      if (!('path' %in% names(dat[[i]]))) {
        stop(paste0("Parameter '", found_pattern_dim, 
                    "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided."))
      }
    } else if (!('path' %in% names(dat[[i]]))) {
      dat_to_fetch <- c(dat_to_fetch, i)
    }
    #if ('path' %in% names(dat[[i]])) {
    #  if (!('nc_var_name' %in% names(dat[[i]]))) {
    #    dat[[i]][['nc_var_name']] <- '$var_name$'
    #  }
    #  if (!('suffix' %in% names(dat[[i]]))) {
    #    dat[[i]][['suffix']] <- ''
    #  }
    #  if (!('var_min' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_min']] <- ''
    #  }
    #  if (!('var_max' %in% names(dat[[i]]))) {
    #    dat[[i]][['var_max']] <- ''
    #  }
    #}
    dat_names <- c(dat_names, dat[[i]][['name']])
  }
  if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) {
    .warning("'path' has been provided for some datasets. Any information in the configuration file related to these will be ignored.")
  }
  if (length(dat_to_fetch) > 0) {
    stop("Specified only the name for some data sets, but not the path ",
         "pattern. This option has not been yet implemented.")
  }

  assign('dat_names', dat_names, envir = parent.frame())
  return(dat)
}

# Add attributes indicating whether this dimension selector is value or indice
add_value_indices_flag <- function(x) {
  if (is.null(attr(x, 'values')) || is.null(attr(x, 'indices'))) {
    flag <- (any(x %in% c('all', 'first', 'last')) || is.numeric(unlist(x)))
    attr(x, 'values') <- !flag
    attr(x, 'indices') <- flag
  }
  return(x)
}


# Find the value for the undefined selector (i.e., indices()). Use the value from the first 
# found file.
# Note that "dat[[i]][['path']]" in parent env. is changed in this function.
find_ufd_value <- function(undefined_file_dims, dat, i, replace_values,
                           first_file, file_dims, path_glob_permissive, 
                           depending_file_dims, dat_selectors, selector_checker, chunks) {
  first_values <- vector('list', length = length(undefined_file_dims))
  names(first_values) <- undefined_file_dims
  found_values <- 0
  stop <- FALSE
  try_dim <- 1
  last_success <- 1
  while ((found_values < length(undefined_file_dims)) && !stop) {
    u_file_dim <- undefined_file_dims[try_dim]
    if (is.null(first_values[[u_file_dim]])) {
      path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']], 
                                                           replace_values[-which(file_dims == u_file_dim)],
                                                           allow_undefined_key_vars = TRUE)
      found_value <- .FindTagValue(path_with_globs_and_tag, 
                                   first_file, u_file_dim)
      if (!is.null(found_value)) {
        found_values <- found_values + 1
        last_success <- try_dim
        first_values[[u_file_dim]] <- found_value
        replace_values[[u_file_dim]] <- found_value
      }
    }
    try_dim <- (try_dim %% length(undefined_file_dims)) + 1
    if (try_dim == last_success) {
      stop <- TRUE
    }
  }
  if (found_values < length(undefined_file_dims)) {
    stop(paste0("Path pattern of dataset '", dat[[i]][['name']], 
                "' is too complex. Could not automatically ",
                "detect values for all non-explicitly defined ",
                "indices. Check its pattern: ", dat[[i]][['path']]))
  }
  ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above
  ##       Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical
  ##       cmor case, requesting all members and chunks for fixed var and sdate. Not fixing
  ##       sdate raised 'too complex' error.
  # Replace shell globs in path pattern and keep the file_dims as tags
  dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, 
                                                file_dims, dat[[i]][['name']], path_glob_permissive)

  # Now time to look for the available values for the non 
  # explicitly defined selectors for the file dimensions.
  #print("H")
  # Check first the ones that do not depend on others.
  ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))],
           undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))])
  
  for (u_file_dim in ufd) {
    replace_values[undefined_file_dims] <- first_values
    replace_values[[u_file_dim]] <- '*'
    depended_dim <- NULL
    depended_dim_values <- NA

    #NOTE: Here 'selectors' is always 1. Is it supposed to be like this?
    selectors <- dat_selectors[[u_file_dim]][[1]]
    if (u_file_dim %in% names(depending_file_dims)) {
      depended_dim <- depending_file_dims[[u_file_dim]]
      depended_dim_values <- dat_selectors[[depended_dim]][[1]]
      dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values))
      names(dat_selectors[[u_file_dim]]) <- depended_dim_values
    } else {
      dat_selectors[[u_file_dim]] <- list()
    }
    if (u_file_dim %in% unlist(depending_file_dims)) {
      depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))]
      replace_values[depending_dims] <- rep('*', length(depending_dims))
    }
    for (j in 1:length(depended_dim_values)) {
      parsed_values <- c()
      if (!is.null(depended_dim)) {
        replace_values[[depended_dim]] <- depended_dim_values[j]
      }
      path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
      found_files <- Sys.glob(path_with_globs)
      ## TODO: Enhance this error message, or change by warning.
      ##       Raises if a wrong sdate is specified, for example.
      if (length(found_files) == 0) {
        .warning(paste0("Could not find files for any '", u_file_dim, 
                        "' for '", depended_dim, "' = '", 
                        depended_dim_values[j], "'."))
        dat_selectors[[u_file_dim]][[j]] <- NA
      } else {
        for (found_file in found_files) {
          path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']],
                                                               replace_values[-which(file_dims == u_file_dim)],
                                                               allow_undefined_key_vars = TRUE)
          parsed_values <- c(parsed_values, 
                             .FindTagValue(path_with_globs_and_tag, found_file, 
                                           u_file_dim))
        }
        #TODO: selector_checker() doesn't allow selectors to be characters. For selectors
        #      like "member = 'r7i1p1f1", it cannot be defined with values.
        dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors,
                                                             var = unique(parsed_values),
                                                             return_indices = FALSE)
        # Take chunk if needed
        dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][chunk_indices(length(dat_selectors[[u_file_dim]][[j]]),
                                                                                           chunks[[u_file_dim]]['chunk'],
                                                                                           chunks[[u_file_dim]]['n_chunks'], 
                                                                                           u_file_dim)]
      }
    }
  }
  #NOTE: change 'dat' in parent env. because "dat[[i]][['path']]" is changed.
  assign('dat', dat, envir = parent.frame())
  return(dat_selectors)
}