From cd10405a42d191bf04e98138ed0eac9db909a9e3 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 16 Jul 2024 12:11:21 +0200 Subject: [PATCH 1/6] Unify multipath = TRUE and multipath = FALSE options for decadal loading (WIP) --- modules/Loading/R/helper_loading_decadal.R | 66 ++++++---------------- modules/Loading/R/load_decadal.R | 45 +++------------ 2 files changed, 26 insertions(+), 85 deletions(-) diff --git a/modules/Loading/R/helper_loading_decadal.R b/modules/Loading/R/helper_loading_decadal.R index b93f3279..444739a3 100644 --- a/modules/Loading/R/helper_loading_decadal.R +++ b/modules/Loading/R/helper_loading_decadal.R @@ -109,57 +109,25 @@ correct_daily_for_leap <- function(data = NULL, time_attr, return_time = TRUE) { # table, grid, version: A list with variables as name. E.g., list(tas = 'Amon') get_dcpp_path <- function(archive, exp.name, table, grid, version, sdates) { - # Define path (monthly and daily) - multi_path <- FALSE - if (is.null(archive$System[[exp.name]]$src$first_dcppB_syear) | - isTRUE(all(sdates < archive$System[[exp.name]]$src$first_dcppB_syear))) { # only dcppA - if (length(table) == 1) { # only one variable - fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$hcst, - '$ensemble$', table, '$var$', grid, version) - fcst.files <- paste0('$var$_', table, '_*_dcppA-hindcast_s$syear$-$ensemble$_', grid, '_$chunk$.nc') - } else { # multiple vars - fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$hcst, - '$ensemble$', '$table$', '$var$', '$grid$', '$version$') - fcst.files <- paste0('$var$_', '$table$', '_*_dcppA-hindcast_s$syear$-$ensemble$_', '$grid$', '_$chunk$.nc') - } - path_list <- file.path(fcst.path, fcst.files) + if (length(table) == 1) { # only one variable + fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$hcst, + '$ensemble$', table, '$var$', grid, version) + fcst.files <- paste0('$var$_', table, '_*_$dcpp$_s$syear$-$ensemble$_', grid, '_$chunk$.nc') + } else { # multiple vars + fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$hcst, + '$ensemble$', '$table$', '$var$', '$grid$', '$version$') + fcst.files <- paste0('$var$_', '$table$', '_*_$dcpp$_s$syear$-$ensemble$_', '$grid$', '_$chunk$.nc') + } + path_list <- file.path(fcst.path, fcst.files) - } else { - if (all(sdates >= archive$System[[exp.name]]$src$first_dcppB_syear)) { # only dcppB - if (length(table) == 1) { # only one variable - fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$fcst, - '$ensemble$', table, '$var$', grid, version) - - fcst.files <- paste0('$var$_', table, '_*_dcppB-forecast_s$syear$-$ensemble$_', grid, '_$chunk$.nc') - } else { - fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$fcst, - '$ensemble$', '$table$', '$var$', '$grid$', '$version$') - fcst.files <- paste0('$var$_', '$table$', '_*_dcppB-forecast_s$syear$-$ensemble$_', '$grid$', '_$chunk$.nc') - } - path_list <- file.path(fcst.path, fcst.files) - - } else { # have both dcppA and dcppB - # Create one path for each sdate - #TODO: When *_depends = 'syear' can be more than one, use one path with $dcpp$ - multi_path <- TRUE - path_list <- vector('list', length = length(sdates)) - for (i_sdate in 1:length(sdates)) { - if (sdates[i_sdate] >= archive$System[[exp.name]]$src$first_dcppB_syear) { - path_list[[i_sdate]] <- - list(path = file.path(archive$src, archive$System[[exp.name]]$src$fcst, - '$ensemble$', table, '$var$', grid, #version, - paste0('v*/$var$_', table, '_*_dcppB-forecast_s', sdates[i_sdate], - '-$ensemble$_', grid, 
'_$chunk$.nc'))) - } else { - path_list[[i_sdate]] <- - list(path = file.path(archive$src, archive$System[[exp.name]]$src$hcst, - '$ensemble$', table, '$var$', grid, #version, - paste0('v*/$var$_', table, '_*_dcppA-hindcast_s', sdates[i_sdate], - '-$ensemble$_', grid, '_$chunk$.nc'))) - } - } + dcpp_list <- vector('list', length = length(sdates)) + for (i_sdate in 1:length(sdates)) { + if (sdates[i_sdate] >= archive$System[[exp.name]]$src$first_dcppB_syear) { + dcpp_list[[i_sdate]] <- "dcppB-forecast*" + } else { + dcpp_list[[i_sdate]] <- "dcppA-hindcast" } } - return(list(path_list = path_list, multi_path = multi_path)) + return(list(path_list = path_list, dcpp_list = dcpp_list)) } diff --git a/modules/Loading/R/load_decadal.R b/modules/Loading/R/load_decadal.R index d3b4f439..ea890014 100644 --- a/modules/Loading/R/load_decadal.R +++ b/modules/Loading/R/load_decadal.R @@ -88,6 +88,7 @@ load_decadal <- function(recipe) { version = version, sdates = sdates_hcst) path_list <- tmp$path_list multi_path <- tmp$multi_path + dcpp_list <- tmp$dcpp_list #TODO: to make this case work; enhance Start() if it's possible if (multi_path & length(variable) > 1) { @@ -100,6 +101,8 @@ load_decadal <- function(recipe) { syear = paste0(sdates_hcst), chunk = 'all', chunk_depends = 'syear', + dcpp = dcpp_list, + dcpp_depends = 'syear', time = indices(time_ind), time_across = 'chunk', merge_across_dims = TRUE, @@ -129,38 +132,13 @@ load_decadal <- function(recipe) { metadata_dims = 'var')) } - if (!multi_path) { - Start_hcst_arg_list <- Start_default_arg_list - hcst <- do.call(Start, Start_hcst_arg_list) + Start_hcst_arg_list <- Start_default_arg_list + hcst <- do.call(Start, Start_hcst_arg_list) - } else { - Start_hcst_arg_list <- Start_default_arg_list - Start_hcst_arg_list[['syear']] <- NULL - Start_hcst_arg_list[['chunk_depends']] <- NULL - remove_ind <- which(Start_hcst_arg_list[['return_vars']][['time']] == 'syear') - Start_hcst_arg_list[['return_vars']][['time']] <- Start_hcst_arg_list[['return_vars']][['time']][-remove_ind] - - hcst <- do.call(Start, Start_hcst_arg_list) - - # Reshape and reorder dimensions - ## dat should be 1, syear should be length of dat; reorder dimensions - dim(hcst) <- c(dat = 1, syear = as.numeric(dim(hcst))[1], dim(hcst)[2:6]) - hcst <- s2dv::Reorder(hcst, c('dat', 'var', 'syear', 'time', 'latitude', 'longitude', 'ensemble')) - - # Manipulate time attr because Start() cannot read it correctly - wrong_time_attr <- attr(hcst, 'Variables')$common$time # dim: [time], the first syear only - tmp <- array(dim = c(dim(hcst)[c('syear', 'time')])) - tmp[1, ] <- wrong_time_attr - yr_diff <- (sdates_hcst - sdates_hcst[1])[-1] #diff(sdates_hcst) - for (i_syear in 1:length(yr_diff)) { - tmp[(i_syear + 1), ] <- wrong_time_attr + lubridate::years(yr_diff[i_syear]) - } - attr(hcst, 'Variables')$common$time <- as.POSIXct(tmp, origin = '1970-01-01', tz = 'UTC') - - } - tmp_time_attr <- attr(hcst, 'Variables')$common$time - + + ## TODO: Remove extra dcpp dimension + ## TODO: Remove this part? 
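## [Editor's note] Minimal sketch, not part of the patch: how the unified
## '$dcpp$' placeholder replaces the old multi_path branching. The start dates
## and 'first_dcppB_syear' below are hypothetical; in the real code
## 'first_dcppB_syear' comes from conf/archive_decadal.yml.
sdates <- c(2018, 2019, 2020, 2021)
first_dcppB_syear <- 2020
dcpp_list <- as.list(ifelse(sdates >= first_dcppB_syear,
                            "dcppB-forecast*", "dcppA-hindcast"))
unlist(dcpp_list)
# [1] "dcppA-hindcast"  "dcppA-hindcast"  "dcppB-forecast*" "dcppB-forecast*"
## Passing 'dcpp = dcpp_list' together with 'dcpp_depends = "syear"' lets
## Start() substitute one experiment name into '$dcpp$' per start date, so
## dcppA and dcppB files load through a single path pattern instead of one
## explicit path per sdate.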
# change syear to c(sday, sweek, syear) # dim(hcst) should be [dat, var, sday, sweek, syear, time, latitude, longitude, ensemble] dim(hcst) <- c(dim(hcst)[1:2], sday = 1, sweek = 1, dim(hcst)[3:7]) @@ -171,11 +149,6 @@ load_decadal <- function(recipe) { } dim(attr(hcst, 'Variables')$common$time) <- c(sday = 1, sweek = 1, dim(tmp_time_attr)) - #TODO: as.s2dv_cube() needs to be improved to recognize "variable" is under $dat1 - if (multi_path) { - attributes(hcst)$Variables$common[[variable]] <- attributes(hcst)$Variables$dat1[[variable]] - } - # Change class from startR_array to s2dv_cube suppressWarnings( hcst <- as.s2dv_cube(hcst) @@ -191,7 +164,7 @@ load_decadal <- function(recipe) { path_list <- tmp$path_list multi_path <- tmp$multi_path - #TODO: to make this case work; enhance Start() if it's possible + ## TODO: Try this case if (multi_path & length(variable) > 1) { stop("The recipe requests multiple variables and start dates from both dpccA-hindcast and dcppB-forecast. This case is not available for now.") } -- GitLab From e1b89c175b1c9d3b90681c5eaf85a0a67db7bba5 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 16 Jul 2024 16:12:37 +0200 Subject: [PATCH 2/6] Merge multi_path = TRUE and multi_path = FALSE for decadal loading --- conf/archive_decadal.yml | 16 +++++ modules/Loading/Loading.R | 4 ++ modules/Loading/R/helper_loading_decadal.R | 25 +++++-- modules/Loading/R/load_decadal.R | 76 +++++++--------------- 4 files changed, 63 insertions(+), 58 deletions(-) diff --git a/conf/archive_decadal.yml b/conf/archive_decadal.yml index 0697fc3f..da889a08 100644 --- a/conf/archive_decadal.yml +++ b/conf/archive_decadal.yml @@ -8,6 +8,7 @@ esarchive: src: hcst: "exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/" fcst: + startR: "exp/ecearth/a1ua/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/$dcpp$/" monthly_mean: #NOTE: tos is under both Amon and Omon --> wait to be changed table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "tos":["Amon", "Omon"]} @@ -31,6 +32,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/ec-earth3/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/ec-earth3/DCPP/EC-Earth-Consortium/EC-Earth3/$dcpp$/" monthly_mean: table: {"tas":"Amon"} grid: {"tas":"gr"} @@ -52,6 +54,7 @@ esarchive: src: hcst: "exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/" fcst: "exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppB-forecast/" + startR: "exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/$dcpp$/" first_dcppB_syear: 2021 # hcst: "exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/" # fcst: "exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppB-forecast/" @@ -91,6 +94,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/" fcst: "exp/CMIP6/dcppB-forecast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppB-forecast/" + startR: "exp/CMIP6/$dcpp$/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/$dcpp$/" first_dcppB_syear: 2019 monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "ts":"Amon", "tos":"Omon"} @@ -113,6 +117,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/BCC-CSM2-MR/DCPP/BCC/BCC-CSM2-MR/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/BCC-CSM2-MR/DCPP/BCC/BCC-CSM2-MR/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon"} grid: {"tas":"gn", "pr":"gn", "psl":"gn"} @@ 
-135,6 +140,7 @@ esarchive: src: hcst: "exp/canesm5/cmip6-dcppA-hindcast/original_files/cmorfiles/DCPP/CCCma/CanESM5/dcppA-hindcast/" fcst: "exp/canesm5/cmip6-dcppB-forecast_i1p2/original_files/cmorfiles/DCPP/CCCma/CanESM5/dcppB-forecast/" + startR: "exp/canesm5/cmip6-$dcpp$/original_files/cmorfiles/DCPP/CCCma/CanESM5/$dcpp$/" first_dcppB_syear: 2020 monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "tasmin":"Amon", "tasmax":"Amon", "tos":"Omon"} @@ -158,6 +164,7 @@ esarchive: src: hcst: "exp/ncar/cesm-dple-dcppA-hindcast/cmorfiles/DCPP/NCAR/CESM1-1-CAM5-CMIP5/dcppA-hindcast" fcst: + startR: "exp/ncar/cesm-dple-$dcpp$/cmorfiles/DCPP/NCAR/CESM1-1-CAM5-CMIP5/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon"} grid: {"tas":"gn", "pr":"gn"} @@ -180,6 +187,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/CMCC-CM2-SR5/DCPP/CMCC/CMCC-CM2-SR5/dcppA-hindcast/" fcst: "exp/CMIP6/dcppB-forecast/CMCC-CM2-SR5/DCPP/CMCC/CMCC-CM2-SR5/dcppB-forecast/" + startR: "exp/CMIP6/$dcpp$/CMCC-CM2-SR5/DCPP/CMCC/CMCC-CM2-SR5/$dcpp$/" first_dcppB_syear: 2020 monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "prc":"Amon", "ts":"Amon"} @@ -201,6 +209,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/FGOALS-f3-L/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/" fcst: "exp/CMIP6/dcppB-forecast/FGOALS-f3-L/DCPP/CAS/FGOALS-f3-L/dcppB-forecast/" + startR: "exp/CMIP6/$dcpp$/FGOALS-f3-L/DCPP/CAS/FGOALS-f3-L/$dcpp$/" first_dcppB_syear: 2017 monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "tos":"Omon"} @@ -223,6 +232,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/IPSL-CM6A-LR/DCPP/IPSL/IPSL-CM6A-LR/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/IPSL-CM6A-LR/DCPP/IPSL/IPSL-CM6A-LR/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "sfcWind":"Amon"} grid: {"tas":"gr", "pr":"gr", "psl":"gr", "sfcWind":"gr"} @@ -243,6 +253,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/MIROC6/DCPP/MIROC/MIROC6/dcppA-hindcast/" fcst: "exp/CMIP6/dcppA-hindcast/MIROC6/DCPP/MIROC/MIROC6/dcppA-hindcast/" + startR: "exp/CMIP6/$dcpp$/MIROC6/DCPP/MIROC/MIROC6/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "tasmin":"Amon", "tasmax":"Amon"} grid: {"tas":"gn", "pr":"gn", "psl":"gn", "tasmin":"gn", "tasmax":"gn"} @@ -263,6 +274,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/MPI-ESM1-2-HR/DCPP/MPI-M/MPI-ESM1-2-HR/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/MPI-ESM1-2-HR/DCPP/MPI-M/MPI-ESM1-2-HR/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "tasmin":"Amon", "tasmax":"Amon"} grid: {"tas":"gn", "pr":"gn", "psl":"gn", "tasmin":"gn", "tasmax":"gn"} @@ -283,6 +295,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/MPI-ESM1-2-LR/DCPP/MPI-M/MPI-ESM1-2-LR/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/MPI-ESM1-2-LR/DCPP/MPI-M/MPI-ESM1-2-LR/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon", "ts":"Amon"} grid: {"tas":"gn", "pr":"gn", "psl":"gn", "ts":"gn"} @@ -303,6 +316,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/MRI-ESM2-0/DCPP/MRI/MRI-ESM2-0/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/MRI-ESM2-0/DCPP/MRI/MRI-ESM2-0/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon"} grid: {"tas":"gn", "pr":"gn", "psl":"gn"} @@ -324,6 +338,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/NorCPM1/DCPP/NCC/NorCPM1/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/NorCPM1/DCPP/NCC/NorCPM1/$dcpp$/" monthly_mean: table: {"tas":"Amon", "pr":"Amon", "psl":"Amon"} grid: {"tas":"gn", 
"pr":"gn", "psl":"gn"} @@ -344,6 +359,7 @@ esarchive: src: hcst: "exp/CMIP6/dcppA-hindcast/NorCPM1/DCPP/NCC/NorCPM1/dcppA-hindcast/" fcst: + startR: "exp/CMIP6/$dcpp$/NorCPM1/DCPP/NCC/NorCPM1/$dcpp$/" monthly_mean: table: {"pr":"Amon", "psl":"Amon"} grid: {"pr":"gn", "psl":"gn"} diff --git a/modules/Loading/Loading.R b/modules/Loading/Loading.R index 6e219bf1..f5361218 100644 --- a/modules/Loading/Loading.R +++ b/modules/Loading/Loading.R @@ -1,6 +1,10 @@ source("tools/libs.R") Loading <- function(recipe) { + ## TODO: remove + path <- "/esarchive/scratch/vagudets/repos/startR/R/" + ff <- lapply(list.files(path), function(x) paste0(path, x)) + invisible(lapply(ff, source)) # Source correct function depending on filesystem and time horizon # Case: CERISE (Mars) if (tolower(recipe$Run$filesystem) == "mars") { diff --git a/modules/Loading/R/helper_loading_decadal.R b/modules/Loading/R/helper_loading_decadal.R index 444739a3..4533100f 100644 --- a/modules/Loading/R/helper_loading_decadal.R +++ b/modules/Loading/R/helper_loading_decadal.R @@ -109,21 +109,36 @@ correct_daily_for_leap <- function(data = NULL, time_attr, return_time = TRUE) { # table, grid, version: A list with variables as name. E.g., list(tas = 'Amon') get_dcpp_path <- function(archive, exp.name, table, grid, version, sdates) { + if (length(table) == 1) { # only one variable - fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$hcst, + fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$startR, '$ensemble$', table, '$var$', grid, version) fcst.files <- paste0('$var$_', table, '_*_$dcpp$_s$syear$-$ensemble$_', grid, '_$chunk$.nc') } else { # multiple vars - fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$hcst, + fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$startR, '$ensemble$', '$table$', '$var$', '$grid$', '$version$') fcst.files <- paste0('$var$_', '$table$', '_*_$dcpp$_s$syear$-$ensemble$_', '$grid$', '_$chunk$.nc') } path_list <- file.path(fcst.path, fcst.files) - + dcppB_string <- "dcppB-forecast" + ## NOTE: CanESM5 has a 'special' path pattern... Adding the global expression '*' + ## only works if Start() is loading both dcppA and dcppB. For dcppB only, + ## it crashes. 
+ if (exp.name == "CanESM5") { + if (all(sdates >= archive$System[[exp.name]]$src$first_dcppB_syear)) { + # Replace first instance of $dcpp$ with the actual subdirectory name + path_list <- str_replace(path_list, fixed("$dcpp$"), "dcppB-forecast_i1p2") + } else { + # Add global expression to the string so that it will find 'dcppB-forecast_i1p2' + dcppB_string <- paste0(dcppB_string, "*") + } + } dcpp_list <- vector('list', length = length(sdates)) + names(dcpp_list) <- as.character(sdates) for (i_sdate in 1:length(sdates)) { - if (sdates[i_sdate] >= archive$System[[exp.name]]$src$first_dcppB_syear) { - dcpp_list[[i_sdate]] <- "dcppB-forecast*" + if ((!is.null(archive$System[[exp.name]]$src$first_dcppB_syear)) && + (sdates[i_sdate] >= archive$System[[exp.name]]$src$first_dcppB_syear)) { + dcpp_list[[i_sdate]] <- dcppB_string } else { dcpp_list[[i_sdate]] <- "dcppA-hindcast" } diff --git a/modules/Loading/R/load_decadal.R b/modules/Loading/R/load_decadal.R index ea890014..a598afe0 100644 --- a/modules/Loading/R/load_decadal.R +++ b/modules/Loading/R/load_decadal.R @@ -16,7 +16,7 @@ load_decadal <- function(recipe) { archive <- read_yaml(paste0("conf/archive_decadal.yml"))[[recipe$Run$filesystem]] # Print Start() info or not - DEBUG <- FALSE + DEBUG <- TRUE ## TODO: this should come from the main script # Create output folder and log: @@ -27,7 +27,6 @@ load_decadal <- function(recipe) { exp.name <- recipe$Analysis$Datasets$System$name # 'HadGEM3' ref.name <- recipe$Analysis$Datasets$Reference$name # 'era5' member <- strsplit(recipe$Analysis$Datasets$System$member, ', | |,')[[1]] #c("r1i1p1f2", "r2i1p1f2") - # variable <- recipe$Analysis$Variables$name #'tas' variable <- strsplit(recipe$Analysis$Variables$name, ", | |,")[[1]] store.freq <- recipe$Analysis$Variables$freq #monthly_mean lats.min <- as.numeric(recipe$Analysis$Region$latmin) #0 @@ -77,9 +76,11 @@ load_decadal <- function(recipe) { regrid_params <- get_regrid_params(recipe, archive) # Only if the time length in each chunk may differ that we need largest_dims_length to be TRUE. Otherwise, set FALSE to increase efficiency. - need_largest_dims_length <- ifelse(exp.name %in% c('HadGEM3-GC31-MM', 'EC-Earth3-i2'), TRUE, FALSE) - + need_largest_dims_length <- ifelse(exp.name %in% c('HadGEM3-GC31-MM', 'EC-Earth3-i2'), TRUE, FALSE) + ## TODO: Remove if possible + multi_path <- FALSE + #------------------------------------------- # Step 1: Load the hcst #------------------------------------------- @@ -87,10 +88,9 @@ load_decadal <- function(recipe) { tmp <- get_dcpp_path(archive = archive, exp.name = exp.name, table = table, grid = grid, version = version, sdates = sdates_hcst) path_list <- tmp$path_list - multi_path <- tmp$multi_path dcpp_list <- tmp$dcpp_list - #TODO: to make this case work; enhance Start() if it's possible + #TODO: test this case if (multi_path & length(variable) > 1) { stop("The recipe requests multiple variables and start dates from both dpccA-hindcast and dcppB-forecast. This case is not available for now.") } @@ -137,11 +137,10 @@ load_decadal <- function(recipe) { tmp_time_attr <- attr(hcst, 'Variables')$common$time - ## TODO: Remove extra dcpp dimension ## TODO: Remove this part? 
# change syear to c(sday, sweek, syear) # dim(hcst) should be [dat, var, sday, sweek, syear, time, latitude, longitude, ensemble] - dim(hcst) <- c(dim(hcst)[1:2], sday = 1, sweek = 1, dim(hcst)[3:7]) + dim(hcst) <- c(dim(hcst)[1:2], sday = 1, sweek = 1, dim(hcst)[3:length(dim(hcst))]) if (!identical(dim(tmp_time_attr), dim(hcst)[c('syear', 'time')])) { error(recipe$Run$logger, "hcst has problem in matching data and time attr dimension.") @@ -149,6 +148,8 @@ load_decadal <- function(recipe) { } dim(attr(hcst, 'Variables')$common$time) <- c(sday = 1, sweek = 1, dim(tmp_time_attr)) + # Remove 'dcpp' dimension: + hcst <- Subset(hcst, along = "dcpp", indices = 1, drop = "selected") # Change class from startR_array to s2dv_cube suppressWarnings( hcst <- as.s2dv_cube(hcst) @@ -162,67 +163,35 @@ load_decadal <- function(recipe) { tmp <- get_dcpp_path(archive = archive, exp.name = exp.name, table = table, grid = grid, version = version, sdates = sdates_fcst) path_list <- tmp$path_list - multi_path <- tmp$multi_path + dcpp_list <- tmp$dcpp_list ## TODO: Try this case if (multi_path & length(variable) > 1) { - stop("The recipe requests multiple variables and start dates from both dpccA-hindcast and dcppB-forecast. This case is not available for now.") + stop("The recipe requests multiple variables and start dates from both", + "dpccA-hindcast and dcppB-forecast. This case is not available for now.") } # monthly & daily - if (!multi_path) { - #NOTE: the adjustment for two cases (multiple files per sdate or not) has been made in hcst - Start_fcst_arg_list <- Start_default_arg_list - Start_fcst_arg_list[['dat']] <- path_list - Start_fcst_arg_list[['syear']] <- paste0(sdates_fcst) - fcst <- do.call(Start, Start_fcst_arg_list) - - - } else { # multi_path + Start_fcst_arg_list <- Start_default_arg_list + Start_fcst_arg_list[['dat']] <- path_list + Start_fcst_arg_list[['syear']] <- paste0(sdates_fcst) + Start_fcst_arg_list[['dcpp']] <- dcpp_list + fcst <- do.call(Start, Start_fcst_arg_list) - #TODO: time attribute is not correct. Improve Start(). 
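## [Editor's note] Toy version, not part of the patch, of the workaround being
## deleted below. With one path per start date, Start() returned the time
## attribute of the first syear only, so the code rebuilt the [syear, time]
## matrix by shifting that row forward by whole years. Dates and start dates
## here are hypothetical.
library(lubridate)
wrong_time_attr <- as.POSIXct(c("2018-11-16", "2018-12-16"), tz = "UTC")
sdates_fcst <- c(2018, 2019, 2020)
tmp <- array(dim = c(syear = length(sdates_fcst), time = length(wrong_time_attr)))
tmp[1, ] <- wrong_time_attr
yr_diff <- (sdates_fcst - sdates_fcst[1])[-1]
for (i_syear in seq_along(yr_diff)) {
  tmp[(i_syear + 1), ] <- wrong_time_attr + years(yr_diff[i_syear])
}
as.POSIXct(tmp, origin = "1970-01-01", tz = "UTC")
## With the new '$dcpp$' dimension there is a single Start() call, the time
## metadata comes back correct for every start date, and this manual fix
## can be dropped.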
- Start_fcst_arg_list <- Start_default_arg_list - Start_fcst_arg_list[['dat']] <- path_list - Start_fcst_arg_list[['syear']] <- NULL - Start_fcst_arg_list[['chunk_depends']] <- NULL - remove_ind <- which(Start_fcst_arg_list[['return_vars']][['time']] == 'syear') - Start_fcst_arg_list[['return_vars']][['time']] <- Start_fcst_arg_list[['return_vars']][['time']][-remove_ind] - fcst <- do.call(Start, Start_fcst_arg_list) - - # Reshape and reorder dimensions - ## dat should be 1, syear should be length of dat; reorder dimensions - ## dim(fcst) should be [dat, var, syear, time, latitude, longitude, ensemble] - dim(fcst) <- c(dat = 1, syear = as.numeric(dim(fcst))[1], dim(fcst)[2:6]) - fcst <- s2dv::Reorder(fcst, c('dat', 'var', 'syear', 'time', 'latitude', 'longitude', 'ensemble')) - - # Manipulate time attr because Start() cannot read it correctly - wrong_time_attr <- attr(fcst, 'Variables')$common$time # dim: [time], the first syear only - tmp <- array(dim = c(dim(fcst)[c('syear', 'time')])) - tmp[1, ] <- wrong_time_attr - yr_diff <- (sdates_fcst - sdates_fcst[1])[-1] #diff(sdates_fcst) - for (i_syear in 1:length(yr_diff)) { - tmp[(i_syear + 1), ] <- wrong_time_attr + lubridate::years(yr_diff[i_syear]) - } - attr(fcst, 'Variables')$common$time <- as.POSIXct(tmp, origin = '1970-01-01', tz = 'UTC') - - } - tmp_time_attr <- attr(fcst, 'Variables')$common$time # change syear to c(sday, sweek, syear) # dim(fcst) should be [dat, var, sday, sweek, syear, time, latitude, longitude, ensemble] - dim(fcst) <- c(dim(fcst)[1:2], sday = 1, sweek = 1, dim(fcst)[3:7]) + dim(fcst) <- c(dim(fcst)[1:2], sday = 1, sweek = 1, dim(fcst)[3:length(dim(fcst))]) if (!identical(dim(tmp_time_attr), dim(fcst)[c('syear', 'time')])) { error(recipe$Run$logger, "fcst has problem in matching data and time attr dimension.") stop() } dim(attr(fcst, 'Variables')$common$time) <- c(sday = 1, sweek = 1, dim(tmp_time_attr)) - - #TODO: as.s2dv_cube() needs to be improved to recognize "variable" is under $dat1 - if (multi_path) { - attributes(fcst)$Variables$common[[variable]] <- attributes(fcst)$Variables$dat1[[variable]] - } + + # Remove 'dcpp' dimension: + fcst <- Subset(fcst, along = "dcpp", indices = 1, drop = "selected") # Change class from startR_array to s2dv_cube suppressWarnings( @@ -230,7 +199,8 @@ load_decadal <- function(recipe) { ) # Only syear could be different - if (!identical(dim(hcst$data)[-5], dim(fcst$data)[-5])) { + syear_dim <- which(names(dim(hcst$data)) == 'syear') + if (!identical(dim(hcst$data)[-syear_dim], dim(fcst$data)[-syear_dim])) { error(recipe$Run$logger, "hcst and fcst do not share the same dimension structure.") stop() -- GitLab From 11f089acf5051661622db34720a2d3329ca87598 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 16 Jul 2024 16:50:19 +0200 Subject: [PATCH 3/6] Minor change --- modules/Loading/R/load_decadal.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/Loading/R/load_decadal.R b/modules/Loading/R/load_decadal.R index a598afe0..d3ce1e1e 100644 --- a/modules/Loading/R/load_decadal.R +++ b/modules/Loading/R/load_decadal.R @@ -238,7 +238,7 @@ load_decadal <- function(recipe) { lubridate::minute(dates) <- 00 # Restore correct dimensions dim(dates) <- dim(dates_file) - + obs <- Start(dat = obs.path, var = variable, var_dir = var_dir_obs, @@ -268,7 +268,7 @@ load_decadal <- function(recipe) { #//////////////// # Method 2: reshape hcst time attr's date into an array with time dim then as obs date selector #//////////////// - + obs <- Start(dat = obs.path, var = 
variable,
 var_dir = var_dir_obs,
-- 
GitLab


From bd3d8162e87dbef264c7eebd73852c32f90613d4 Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 17 Jul 2024 17:07:34 +0200
Subject: [PATCH 4/6] Update archive and adapt for multivariable and daily case

---
 conf/archive_decadal.yml | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/conf/archive_decadal.yml b/conf/archive_decadal.yml
index da889a08..a2f340e2 100644
--- a/conf/archive_decadal.yml
+++ b/conf/archive_decadal.yml
@@ -390,10 +390,34 @@ esarchive:
     name: "ERA5"
     institution: "European Centre for Medium-Range Weather Forecasts"
     src: "recon/ecmwf/era5/"
-    monthly_mean: {"tas":"_f1h-r1440x721cds", "prlr":"_f1h-r1440x721cds", "psl":"_f1h-r1440x721cds", "tos":"_f1h-r1440x721cds"}
-    daily_mean: {"tas":"_f1h-r1440x721cds/", "rsds":"_f1h-r1440x721cds/",
-                 "prlr":"_f1h-r1440x721cds/", "sfcWind":"_f1h-r1440x721cds/",
-                 "tos":"_f1h-r1440x721cds"}
+    daily_mean: {"tas":"daily_mean/tas_f1h-r1440x721cds/",
+                 "rsds":"daily_mean/rsds_f1h-r1440x721cds/",
+                 "prlr":"daily_mean/prlr_f1h-r1440x721cds/",
+                 "g300":"daily_mean/g300_f1h-r1440x721cds/",
+                 "g500":"daily_mean/g500_f1h-r1440x721cds/",
+                 "g850":"daily_mean/g850_f1h-r1440x721cds/",
+                 "sfcWind":"daily_mean/sfcWind_f1h-r1440x721cds/",
+                 "tasmax":"daily/tasmax-r1440x721cds/",
+                 "tasmin":"daily/tasmin-r1440x721cds/",
+                 "ta300":"daily_mean/ta300_f1h-r1440x721cds/",
+                 "ta500":"daily_mean/ta500_f1h-r1440x721cds/",
+                 "ta850":"daily_mean/ta850_f1h-r1440x721cds/",
+                 "hurs":"daily_mean/hurs_f1h-r1440x721cds/"}
+    monthly_mean: {"tas":"monthly_mean/tas_f1h-r1440x721cds/",
+                   "psl":"monthly_mean/psl_f1h-r1440x721cds/",
+                   "prlr":"monthly_mean/prlr_f1h-r1440x721cds/",
+                   "rsds":"monthly_mean/rsds_f1h-r1440x721cds/",
+                   "g300":"monthly_mean/g300_f1h-r1440x721cds/",
+                   "g500":"monthly_mean/g500_f1h-r1440x721cds/",
+                   "g850":"monthly_mean/g850_f1h-r1440x721cds/",
+                   "sfcWind":"monthly_mean/sfcWind_f1h-r1440x721cds/",
+                   "tasmax":"monthly_mean/tasmax_f1h-r1440x721cds/",
+                   "tasmin":"monthly_mean/tasmin_f1h-r1440x721cds/",
+                   "ta300":"monthly_mean/ta300_f1h-r1440x721cds/",
+                   "ta500":"monthly_mean/ta500_f1h-r1440x721cds/",
+                   "ta850":"monthly_mean/ta850_f1h-r1440x721cds/",
+                   "tos":"monthly_mean/tos_f1h-r1440x721cds/",
+                   "sic":"monthly_mean/sic_f1h-r1440x721cds/"}
     calendar: "gregorian"
     reference_grid: "/esarchive/recon/ecmwf/era5/monthly_mean/tas_f1h-r1440x721cds/tas_201805.nc"
@@ -406,8 +430,11 @@ esarchive:
     name: "JRA-55"
     institution: "European Centre for Medium-Range Weather Forecasts"
     src: "recon/jma/jra55/"
-    monthly_mean: {"tas":"_f6h", "psl":"_f6h", "tos":"", "pr":"_s0-3h", "prlr":"_s0-3h"}
-    daily_mean: {"tas":"_f6h", "psl":"_f6h", "prlr":"_s0-3h", "sfcWind":"_f6h"}
+    monthly_mean: {"tas":"monthly_mean/tas_f6h", "psl":"monthly_mean/psl_f6h",
+                   "tos":"", "pr":"monthly_mean/pr_s0-3h",
+                   "prlr":"monthly_mean/prlr_s0-3h"}
+    daily_mean: {"tas":"daily_mean/tas_f6h", "psl":"daily_mean/psl_f6h",
+                 "prlr":"daily_mean/prlr_s0-3h", "sfcWind":"daily_mean/sfcWind_f6h"}
     calendar: "proleptic_gregorian"
     reference_grid: "/esarchive/recon/jma/jra55/monthly_mean/tas_f6h/tas_200811.nc"
-- 
GitLab


From 4fefe80c955dff194aa149b8d71949c8a7efad60 Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 17 Jul 2024 17:07:54 +0200
Subject: [PATCH 5/6] Clean code; modifications for daily case and multivariable case

---
 modules/Loading/R/helper_loading_decadal.R | 11 +++--
 modules/Loading/R/load_decadal.R | 57 ++++++++++------------
 2 files changed, 32 insertions(+), 36 deletions(-)

diff --git 
a/modules/Loading/R/helper_loading_decadal.R b/modules/Loading/R/helper_loading_decadal.R index 4533100f..9b71c94b 100644 --- a/modules/Loading/R/helper_loading_decadal.R +++ b/modules/Loading/R/helper_loading_decadal.R @@ -12,7 +12,7 @@ get_daily_time_ind <- function(ftimemin, ftimemax, initial_month, sdates, calendar) { #NOTE: "sdates" is not needed if leap year is not considered - if (!calendar %in% c('360-day', '365_day', 'noleap', 'standard', 'proleptic_gregorian', + if (!calendar %in% c('360_day', '365_day', 'noleap', 'standard', 'proleptic_gregorian', 'gregorian')) stop("The calendar is not recognized. Please contact maintainers.") @@ -109,12 +109,13 @@ correct_daily_for_leap <- function(data = NULL, time_attr, return_time = TRUE) { # table, grid, version: A list with variables as name. E.g., list(tas = 'Amon') get_dcpp_path <- function(archive, exp.name, table, grid, version, sdates) { - - if (length(table) == 1) { # only one variable + if (length(table) == 1) { + # only one variable, or the variables share a common path pattern fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$startR, - '$ensemble$', table, '$var$', grid, version) + '$ensemble$', table, '$var$', grid, 'v*') fcst.files <- paste0('$var$_', table, '_*_$dcpp$_s$syear$-$ensemble$_', grid, '_$chunk$.nc') - } else { # multiple vars + } else { + # path pattern depends on the variable fcst.path <- file.path(archive$src, archive$System[[exp.name]]$src$startR, '$ensemble$', '$table$', '$var$', '$grid$', '$version$') fcst.files <- paste0('$var$_', '$table$', '_*_$dcpp$_s$syear$-$ensemble$_', '$grid$', '_$chunk$.nc') diff --git a/modules/Loading/R/load_decadal.R b/modules/Loading/R/load_decadal.R index d3ce1e1e..bfd28979 100644 --- a/modules/Loading/R/load_decadal.R +++ b/modules/Loading/R/load_decadal.R @@ -39,14 +39,14 @@ load_decadal <- function(recipe) { sdates_fcst <- recipe$Analysis$Time$fcst if (store.freq == "monthly_mean") { - time_ind <- (as.numeric(recipe$Analysis$Time$ftime_min):as.numeric(recipe$Analysis$Time$ftime_max)) - - } else if (store.freq == "daily_mean") { + time_ind <- (as.numeric(recipe$Analysis$Time$ftime_min):as.numeric(recipe$Analysis$Time$ftime_max)) + } else if (store.freq %in% c("daily", "daily_mean")) { time_ind <- get_daily_time_ind(ftimemin = as.numeric(recipe$Analysis$Time$ftime_min), ftimemax = as.numeric(recipe$Analysis$Time$ftime_max), initial_month = archive$System[[exp.name]]$initial_month, sdates = sdates_hcst, calendar = archive$System[[exp.name]]$calendar) + store.freq <- "daily_mean" } #NOTE: May be used in the future @@ -57,10 +57,13 @@ load_decadal <- function(recipe) { #------------------------- if (store.freq == "monthly_mean") { table <- archive$System[[exp.name]][[store.freq]]$table[variable] #list(tas = 'Amon') + grid <- archive$System[[exp.name]][[store.freq]]$grid[variable] #list(tas = 'gr') } else { table <- 'day' + # For grid, get first element as they are all the same? + grid <- archive$System[[exp.name]][[store.freq]]$grid[variable][[1]] } - grid <- archive$System[[exp.name]][[store.freq]]$grid[variable] #list(tas = 'gr') + # grid <- archive$System[[exp.name]][[store.freq]]$grid[variable] #list(tas = 'gr') version <- archive$System[[exp.name]][[store.freq]]$version[variable] #list(tas = 'v20210910') if (identical(member, 'all')) { member <- strsplit(archive$System[[exp.name]]$member, ',')[[1]] @@ -78,9 +81,6 @@ load_decadal <- function(recipe) { # Only if the time length in each chunk may differ that we need largest_dims_length to be TRUE. 
Otherwise, set FALSE to increase efficiency. need_largest_dims_length <- ifelse(exp.name %in% c('HadGEM3-GC31-MM', 'EC-Earth3-i2'), TRUE, FALSE) - ## TODO: Remove if possible - multi_path <- FALSE - #------------------------------------------- # Step 1: Load the hcst #------------------------------------------- @@ -90,11 +90,6 @@ load_decadal <- function(recipe) { path_list <- tmp$path_list dcpp_list <- tmp$dcpp_list - #TODO: test this case - if (multi_path & length(variable) > 1) { - stop("The recipe requests multiple variables and start dates from both dpccA-hindcast and dcppB-forecast. This case is not available for now.") - } - Start_default_arg_list <- list( dat = path_list, var = variable, @@ -112,12 +107,12 @@ load_decadal <- function(recipe) { longitude = values(list(lons.min, lons.max)), longitude_reorder = circularsort, ensemble = member, + path_glob_permissive = 2, # for version transform = regrid_params$fcst.transform, transform_extra_cells = 2, transform_params = list(grid = regrid_params$fcst.gridtype, method = regrid_params$fcst.gridmethod), transform_vars = c('latitude', 'longitude'), - # path_glob_permissive = 2, # for version synonims = list(longitude = c('lon', 'longitude'), latitude = c('lat', 'latitude')), return_vars = list(latitude = NULL, longitude = NULL, @@ -125,13 +120,14 @@ load_decadal <- function(recipe) { silent = !DEBUG, retrieve = T) - if (length(variable) > 1) { + if (length(table) > 1) { Start_default_arg_list <- c(Start_default_arg_list, list(table = table, grid = grid, version = version, table_depends = 'var', grid_depends = 'var', version_depends = 'var', metadata_dims = 'var')) + Start_default_arg_list[["path_glob_permissive"]] <- FALSE } - + Start_hcst_arg_list <- Start_default_arg_list hcst <- do.call(Start, Start_hcst_arg_list) @@ -148,8 +144,13 @@ load_decadal <- function(recipe) { } dim(attr(hcst, 'Variables')$common$time) <- c(sday = 1, sweek = 1, dim(tmp_time_attr)) - # Remove 'dcpp' dimension: - hcst <- Subset(hcst, along = "dcpp", indices = 1, drop = "selected") + # Remove 'dcpp' and other extra dimensions: + if (length(table) > 1) { + hcst <- Subset(hcst, along = c("dcpp", "table", "grid", "version"), + indices = list(1, 1, 1, 1), drop = "selected") + } else { + hcst <- Subset(hcst, along = "dcpp", indices = 1, drop = "selected") + } # Change class from startR_array to s2dv_cube suppressWarnings( hcst <- as.s2dv_cube(hcst) @@ -165,12 +166,6 @@ load_decadal <- function(recipe) { path_list <- tmp$path_list dcpp_list <- tmp$dcpp_list - ## TODO: Try this case - if (multi_path & length(variable) > 1) { - stop("The recipe requests multiple variables and start dates from both", - "dpccA-hindcast and dcppB-forecast. This case is not available for now.") - } - # monthly & daily Start_fcst_arg_list <- Start_default_arg_list Start_fcst_arg_list[['dat']] <- path_list @@ -190,8 +185,13 @@ load_decadal <- function(recipe) { } dim(attr(fcst, 'Variables')$common$time) <- c(sday = 1, sweek = 1, dim(tmp_time_attr)) - # Remove 'dcpp' dimension: - fcst <- Subset(fcst, along = "dcpp", indices = 1, drop = "selected") + # Remove 'dcpp' and any other extra dimensions: + if (length(table) > 1) { + fcst <- Subset(fcst, along = c("dcpp", "table", "grid", "version"), + indices = list(1, 1, 1, 1), drop = "selected") + } else { + fcst <- Subset(fcst, along = "dcpp", indices = 1, drop = "selected") + } # Change class from startR_array to s2dv_cube suppressWarnings( @@ -214,7 +214,7 @@ load_decadal <- function(recipe) { # Step 3. 
Load the reference #------------------------------------------- obs.path <- file.path(archive$src, archive$Reference[[ref.name]]$src, - store.freq, "$var$$var_dir$", "$var$_$file_date$.nc") + "$var_dir$", "$var$_$file_date$.nc") var_dir_obs <- archive$Reference[[ref.name]][[store.freq]][variable] # list(tas = "_f1h-r1440x721cds", tos = "_f1h-r1440x721cds") # obs.path <- file.path(archive$src, archive$Reference[[ref.name]]$src, store.freq, @@ -293,11 +293,6 @@ load_decadal <- function(recipe) { retrieve = TRUE) } - - #dim(attr(obs, 'Variables')$common$time) - # sday sweek syear time - # 1 1 2 14 - # Remove var_dir dimension obs <- Subset(obs, along = "var_dir", indices = 1, drop = "selected") -- GitLab From 2b269e7a5996c487c5e72da85d395f62c7fe7ce2 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 29 Jul 2024 16:49:27 +0200 Subject: [PATCH 6/6] Add modified version of startR --- modules/Loading/Loading.R | 4 +- modules/Loading/tmp/startR/R/AddStep.R | 149 + .../tmp/startR/R/ByChunks_autosubmit.R | 663 +++ .../Loading/tmp/startR/R/ByChunks_ecflow.R | 1022 ++++ modules/Loading/tmp/startR/R/CDORemapper.R | 123 + modules/Loading/tmp/startR/R/Collect.R | 461 ++ modules/Loading/tmp/startR/R/Compute.R | 197 + modules/Loading/tmp/startR/R/NcCloser.R | 25 + modules/Loading/tmp/startR/R/NcDataReader.R | 395 ++ modules/Loading/tmp/startR/R/NcDimReader.R | 118 + modules/Loading/tmp/startR/R/NcOpener.R | 27 + modules/Loading/tmp/startR/R/NcVarReader.R | 70 + .../Loading/tmp/startR/R/SelectorChecker.R | 308 ++ modules/Loading/tmp/startR/R/Sort.R | 85 + modules/Loading/tmp/startR/R/Start.R | 4518 +++++++++++++++++ modules/Loading/tmp/startR/R/Step.R | 141 + modules/Loading/tmp/startR/R/Utils.R | 1017 ++++ modules/Loading/tmp/startR/R/indices.R | 34 + modules/Loading/tmp/startR/R/values.R | 35 + modules/Loading/tmp/startR/R/zzz.R | 1579 ++++++ 20 files changed, 10969 insertions(+), 2 deletions(-) create mode 100644 modules/Loading/tmp/startR/R/AddStep.R create mode 100644 modules/Loading/tmp/startR/R/ByChunks_autosubmit.R create mode 100644 modules/Loading/tmp/startR/R/ByChunks_ecflow.R create mode 100644 modules/Loading/tmp/startR/R/CDORemapper.R create mode 100644 modules/Loading/tmp/startR/R/Collect.R create mode 100644 modules/Loading/tmp/startR/R/Compute.R create mode 100644 modules/Loading/tmp/startR/R/NcCloser.R create mode 100644 modules/Loading/tmp/startR/R/NcDataReader.R create mode 100644 modules/Loading/tmp/startR/R/NcDimReader.R create mode 100644 modules/Loading/tmp/startR/R/NcOpener.R create mode 100644 modules/Loading/tmp/startR/R/NcVarReader.R create mode 100644 modules/Loading/tmp/startR/R/SelectorChecker.R create mode 100644 modules/Loading/tmp/startR/R/Sort.R create mode 100644 modules/Loading/tmp/startR/R/Start.R create mode 100644 modules/Loading/tmp/startR/R/Step.R create mode 100644 modules/Loading/tmp/startR/R/Utils.R create mode 100644 modules/Loading/tmp/startR/R/indices.R create mode 100644 modules/Loading/tmp/startR/R/values.R create mode 100644 modules/Loading/tmp/startR/R/zzz.R diff --git a/modules/Loading/Loading.R b/modules/Loading/Loading.R index 9736c115..2a465c95 100644 --- a/modules/Loading/Loading.R +++ b/modules/Loading/Loading.R @@ -1,8 +1,8 @@ source("tools/libs.R") Loading <- function(recipe) { - ## TODO: remove - path <- "/esarchive/scratch/vagudets/repos/startR/R/" + ## TODO: remove with new release of startR + path <- "modules/Loading/tmp/startR/R/" ff <- lapply(list.files(path), function(x) paste0(path, x)) invisible(lapply(ff, source)) # Source correct 
function depending on filesystem and time horizon diff --git a/modules/Loading/tmp/startR/R/AddStep.R b/modules/Loading/tmp/startR/R/AddStep.R new file mode 100644 index 00000000..00af3ab7 --- /dev/null +++ b/modules/Loading/tmp/startR/R/AddStep.R @@ -0,0 +1,149 @@ +#'Create the workflow with the previous defined operation and data. +#' +#'The step that combines the previous declared data and operation together to +#'create the complete workflow. It is the final step before data processing. +#' +#'@param inputs One or a list of objects of the class 'startR_cube' returned by +#' Start(), indicating the data to be processed. +#'@param step_fun A startR step function as returned by Step(). +#'@param \dots Additional parameters for the inputs of function defined in +#' 'step_fun' by Step(). +#' +#'@return A list of the class 'startR_workflow' containing all the objects +#' needed for the data operation. +#'@examples +#' data_path <- system.file('extdata', package = 'startR') +#' path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc') +#' sdates <- c('200011', '200012') +#' data <- Start(dat = list(list(path = path_obs)), +#' var = 'tos', +#' sdate = sdates, +#' time = 'all', +#' latitude = 'all', +#' longitude = 'all', +#' return_vars = list(latitude = 'dat', +#' longitude = 'dat', +#' time = 'sdate'), +#' retrieve = FALSE) +#' pi_short <- 3.14 +#' fun <- function(x, pi_val) { +#' lat = attributes(x)$Variables$dat1$latitude +#' weight = sqrt(cos(lat * pi_val / 180)) +#' corrected = Apply(list(x), target_dims = "latitude", +#' fun = function(x) {x * weight}) +#' } +#' +#' +#' step <- Step(fun = fun, +#' target_dims = 'latitude', +#' output_dims = 'latitude', +#' use_libraries = c('multiApply'), +#' use_attributes = list(data = "Variables")) +#' wf <- AddStep(data, step, pi_val = pi_short) +#' +#'@importFrom methods is +#'@export +AddStep <- function(inputs, step_fun, ...) { + # Check step_fun + if (!is(step_fun, 'startR_step_fun')) { + stop("Parameter 'step_fun' must be a startR step function as returned by Step.") + } + + # Check inputs + if (is(inputs, 'startR_cube') | is(inputs, 'startR_workflow')) { + inputs <- list(inputs) + names(inputs) <- 'input1' + } + else if (is.list(inputs)) { + if (any(!sapply(inputs, + function(x) is(x, 'startR_cube') | is(x, 'startR_workflow')))) { + stop("Parameter 'inputs' must be one or a list of objects of the class ", + "'startR_cube' or 'startR_workflow'.") + } + } else { + stop("Parameter 'inputs' must be one or a list of objects of the class ", + "'startR_cube' or 'startR_workflow'.") + } + + # Consistency checks + if (!is.null(attr(step_fun, "UseAttributes"))) { + if (!all(names(inputs) == names(attr(step_fun, "UseAttributes")))) { + names(inputs) <- names(attr(step_fun, "UseAttributes")) + .warning(paste("The name of inputs is not assigned or differs from", + "name of use_attributes list in Step(). 
Force inputs", + "name to be consistent with use_attributes list")) + } + } + + if (length(inputs) != length(attr(step_fun, 'TargetDims'))) { + stop("The number of provided 'inputs' (", length(inputs), ") does not ", + "match the number of expected inputs by the provided 'step_fun' (", + length(attr(step_fun, 'TargetDims')), ").") + } + + # Work out the total target dims of the step + previous_target_dims <- NULL + all_input_dims <- NULL + for (input in 1:length(inputs)) { + dims_to_compare <- names(attr(inputs[[input]], 'Dimensions')) + if (!all(attr(step_fun, 'TargetDims')[[input]] %in% dims_to_compare)) { + stop("The target dimensions required by 'step_fun' for the input ", input, + " are not present in the corresponding provided object in 'inputs'.") + } + if (is(inputs[[input]], 'startR_workflow')) { + if (is.null(previous_target_dims)) { + previous_target_dims <- attr(inputs[[input]], 'TargetDims') + } else { + dims1 <- rep(1, length(previous_target_dims)) + names(dims1) <- previous_target_dims + dims2 <- rep(1, length(attr(inputs[[input]], 'TargetDims'))) + names(dims2) <- attr(inputs[[input]], 'TargetDims') + previous_target_dims <- names(.MergeArrayDims(dims1, dims2)[[1]]) + } + } + new_input_dims <- attr(inputs[[input]], 'Dimensions') + if (any(is.na(new_input_dims))) { + new_input_dims[which(is.na(new_input_dims))] <- rep(1, length(which(is.na(new_input_dims)))) + } + if (is.null(all_input_dims)) { + all_input_dims <- new_input_dims + } else { + all_input_dims <- .MergeArrayDims(all_input_dims, new_input_dims)[[1]] + } + } + + new_target_dims <- unique(unlist(attr(step_fun, 'TargetDims'))) + result <- list() + dims1 <- rep(1, length(previous_target_dims)) + names(dims1) <- previous_target_dims + dims2 <- rep(1, length(new_target_dims)) + names(dims2) <- new_target_dims + target_dims <- names(.MergeArrayDims(dims1, dims2)[[1]]) + for (output in 1:length(attr(step_fun, 'OutputDims'))) { + workflow <- list(inputs = inputs, + fun = step_fun, + params = list(...)) + if (!is.null(attr(step_fun, 'OutputDims')[[output]])) { + dimensions <- rep(NA, length(attr(step_fun, 'OutputDims')[[output]])) + names(dimensions) <- attr(step_fun, 'OutputDims')[[output]] + } else { + dimensions <- NULL + } + in_dims_to_remove <- which(names(all_input_dims) %in% new_target_dims) + if (length(in_dims_to_remove) > 0) { + dimensions <- c(dimensions, all_input_dims[-in_dims_to_remove]) + } else { + dimensions <- c(dimensions, all_input_dims) + } + attr(workflow, 'Dimensions') <- dimensions + attr(workflow, 'AllTargetDims') <- target_dims + class(workflow) <- 'startR_workflow' + result[[names(attr(step_fun, 'OutputDims'))[output]]] <- workflow + } + + if (length(result) == 1) { + result[[1]] + } else { + result + } +} diff --git a/modules/Loading/tmp/startR/R/ByChunks_autosubmit.R b/modules/Loading/tmp/startR/R/ByChunks_autosubmit.R new file mode 100644 index 00000000..ec336329 --- /dev/null +++ b/modules/Loading/tmp/startR/R/ByChunks_autosubmit.R @@ -0,0 +1,663 @@ +#'Execute the operation by chunks +#' +#'This is an internal function used in Compute(), executing the operation by +#'the chunks specified in Compute(). It also returns the configuration details +#'and profiling information. It is used when the workflow manager is +#'Autosubmit. +#' +#'@param step_fun A function with the class 'startR_step_fun' containing the +#' details of operation. +#'@param cube_headers A list with the class 'startR_cube' returned by Start(). +#' It contains the details of data to be operated. 
+#'@param \dots Additional parameters for the inputs of 'step_fun'.
+#'@param chunks A named list of the dimensions along which to split the data
+#' and the number of chunks to make for each. The chunked dimensions can only
+#' be those not required as target dimensions in function Step(). The default
+#' value is 'auto', which lists all the non-target dimensions and each one has
+#' one chunk.
+#'@param threads_load An integer indicating the number of execution threads to
+#' use for the data retrieval stage. The default value is 1.
+#'@param threads_compute An integer indicating the number of execution threads
+#' to use for the computation. The default value is 1.
+#'@param cluster A list of components that define the configuration of the
+#' machine to be run on. The components vary across different machines. Check
+#' \href{https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/practical_guide.md}{practical guide}
+#' for more details and examples.
+#'@param autosubmit_suite_dir A character string indicating the path to a folder
+#' where to store temporary files generated for the automatic management of the
+#' workflow manager. This path should be available on the local workstation as
+#' well as on the Autosubmit machine. The default value is NULL, and a temporary
+#' folder under the current working folder will be created.
+#'@param autosubmit_server A character vector indicating the login node of the
+#' autosubmit machine. It can be "bscesautosubmit01" or "bscesautosubmit02".
+#' The default value is NULL, and the node will be randomly chosen.
+#'@param silent A logical value deciding whether to print the computation
+#' progress (FALSE) on the R session or not (TRUE). It only works when the
+#' execution runs locally or the parameter 'wait' is TRUE. The default value
+#' is FALSE.
+#'@param debug A logical value deciding whether to return detailed messages on
+#' the progress and operations in a Compute() call (TRUE) or not (FALSE).
+#' Automatically changed to FALSE if parameter 'silent' is TRUE. The default
+#' value is FALSE.
+#'@param wait A logical value deciding whether the R session waits for the
+#' Compute() call to finish (TRUE) or not (FALSE). If FALSE, it will return an
+#' object with all the information of the startR execution that can be stored
+#' on your disk. After that, the R session can be closed and the results can
+#' be collected later with the Collect() function. The default value is TRUE.
+#'
+#'@return A list of data arrays for the output returned by the last step in the
+#' specified workflow. The configuration details and profiling information are
+#' attached as attributes to the returned list of arrays.
+#'
+#'@examples
+#' # ByChunks_autosubmit() is internally used in Compute(), not intended to be
+#' # used by users. The example just illustrates the inputs of
+#' # ByChunks_autosubmit(). 
+#' # data_path <- system.file('extdata', package = 'startR') +#' # path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc') +#' # sdates <- c('200011', '200012') +#' # data <- Start(dat = list(list(path = path_obs)), +#' # var = 'tos', +#' # sdate = sdates, +#' # time = 'all', +#' # latitude = 'all', +#' # longitude = 'all', +#' # return_vars = list(latitude = 'dat', +#' # longitude = 'dat', +#' # time = 'sdate'), +#' # retrieve = FALSE) +#' # fun <- function(x) { +#' # lat = attributes(x)$Variables$dat1$latitude +#' # weight = sqrt(cos(lat * pi / 180)) +#' # corrected = Apply(list(x), target_dims = "latitude", +#' # fun = function(x) {x * weight}) +#' # } +#' # step <- Step(fun = fun, +#' # target_dims = 'latitude', +#' # output_dims = 'latitude', +#' # use_libraries = c('multiApply'), +#' # use_attributes = list(data = "Variables")) +#' #ByChunks_autosubmit(step, data) +#' +#'@import multiApply +#'@importFrom methods is +#'@noRd +ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', + threads_load = 1, threads_compute = 1, + cluster = NULL, + autosubmit_suite_dir = NULL, autosubmit_server = NULL, + silent = FALSE, debug = FALSE, wait = TRUE) { + + #NOTE: + #autosubmit_suite_dir: /home/Earth/aho/startR_local_autosubmit/ + #autosubmit_suite_dir_suite: /home/Earth/aho/startR_local_autosubmit/STARTR_CHUNKING_a68h/ + #remote_autosubmit_suite_dir: /esarchive/autosubmit/a68h/proj/ + #remote_autosubmit_suite_dir_suite: /esarchive/autosubmit/a68h/proj/STARTR_CHUNKING_a68h/ + + # Build object to store profiling timings + t_begin_total <- Sys.time() + t_begin_bychunks_setup <- t_begin_total + timings <- list(nchunks = NULL, + concurrent_chunks = NULL, + cores_per_job = NULL, + threads_load = NULL, + threads_compute = NULL, + bychunks_setup = NULL, + transfer = NULL, + queue = NULL, + job_setup = NULL, + load = NULL, + compute = NULL, + transfer_back = NULL, + merge = NULL, + total = NULL) + + MergeArrays <- .MergeArrays + + # Sanity checks + ## step_fun + if (!is(step_fun, 'startR_step_fun')) { + stop("Parameter 'step_fun' must be of the class 'startR_step_fun', as returned ", + "by the function Step.") + } + + ## cube_headers + if (is(cube_headers, 'startR_cube')) { + cube_headers <- list(cube_headers) + } + if (!all(sapply(lapply(cube_headers, class), + function(x) 'startR_cube' %in% x))) { + stop("All objects passed in 'cube_headers' must be of class 'startR_cube', ", + "as returned by Start().") + } + if (length(cube_headers) != length(attr(step_fun, 'TargetDims'))) { + stop("Number of inputs in parameter 'cube_headers' must be equal to the ", + "number of inputs expected by the function 'step_fun'.") + } + + ## threads_load and threads_compute + if (!is.numeric(threads_load)) { + stop("Parameter 'threads_load' must be a numeric value.") + } + threads_load <- round(threads_load) + if (!is.numeric(threads_compute)) { + stop("Parameter 'threads_compute' must be a numeric value.") + } + threads_compute <- round(threads_compute) + timings[['threads_load']] <- threads_load + timings[['threads_compute']] <- threads_compute + + ## autosubmit_suite_dir + if (is.null(autosubmit_suite_dir)) { + # Create a tmp folder as autosubmit_suite_dir + autosubmit_suite_dir <- file.path(getwd(), "startR_autosubmit_temp") + if (!dir.exists(autosubmit_suite_dir)) { + dir.create("startR_autosubmit_temp", recursive = FALSE) + } + .warning(paste0("Parameter 'autosubmit_suite_dir' is not specified. 
Create a temporary ", + "folder under current directory: ", autosubmit_suite_dir, "/. Make sure ", + "that Autosubmit machine can find this path.")) + } + if (!is.character(autosubmit_suite_dir)) { + stop("Parameter 'autosubmit_suite_dir' must be a character string.") + } + + ## autosubmit_server + if (!is.null(autosubmit_server)) { + if (!autosubmit_server %in% c('bscesautosubmit01', 'bscesautosubmit02')) { + stop("Parameter 'autosubmit_server' must be one existing Autosubmit machine login node, 'bscesautosubmit01' or 'bscesautosubmit02'.") + } + } else { + autosubmit_server <- paste0('bscesautosubmit0', sample(1:2, 1)) + } + + ## silent + if (!is.logical(silent)) { + stop("Parameter 'silent' must be logical.") + } + + ## debug + if (!is.logical(debug)) { + stop("Parameter 'debug' must be logical.") + } + if (silent) { + debug <- FALSE + } + + ## wait + if (!is.logical(wait)) { + stop("Parameter 'wait' must be logical.") + } + + ## cluster + default_cluster <- list(queue_host = NULL, +# queue_type = 'slurm', + data_dir = NULL, +# temp_dir = NULL, + lib_dir = NULL, + init_commands = list(''), + r_module = 'R', + CDO_module = NULL, + autosubmit_module = 'autosubmit', + node_memory = NULL, # not used + cores_per_job = NULL, + job_wallclock = '01:00:00', + max_jobs = 6, + extra_queue_params = list(''), +# bidirectional = TRUE, + polling_period = 10, + special_setup = 'none', + expid = NULL, + hpc_user = NULL, + run_dir = NULL) + if (!is.list(cluster) || is.null(names(cluster))) { + stop("Parameter 'cluster' must be a named list.") + } + if (any(!(names(cluster) %in% c('queue_host', 'queue_type', 'data_dir', + 'temp_dir', 'lib_dir', 'init_commands', + 'r_module', 'CDO_module', 'autosubmit_module', + 'ecflow_module', 'node_memory', + 'cores_per_job', 'job_wallclock', 'max_jobs', + 'extra_queue_params', 'bidirectional', + 'polling_period', 'special_setup', 'expid', 'hpc_user', + 'run_dir' + )))) { + stop("Found invalid component names in parameter 'cluster'.") + } + # Remove ecFlow components + redundant_components <- c('queue_type', 'temp_dir', 'ecflow_module', 'bidirectional') + if (any(redundant_components %in% names(cluster))) { + tmp <- redundant_components[which(redundant_components %in% names(cluster))] + .warning(paste0("Cluster component ", paste(tmp, collapse = ','), + " not used when Autosubmit is the workflow manager.")) + cluster[[tmp]] <- NULL + } + default_cluster[names(cluster)] <- cluster + cluster <- default_cluster + + ### queue_host + support_hpcs <- c('local', 'nord3') # names in platforms.yml + if (is.null(cluster$queue_host) || !cluster$queue_host %in% support_hpcs) { + stop("Cluster component 'queue_host' must be one of the follows: ", + paste(support_hpcs, collapse = ','), '.') + } + + ### data_dir + is_data_dir_shared <- FALSE + if (is.null(cluster[['data_dir']])) { + is_data_dir_shared <- TRUE + } else { + if (!is.character(cluster[['data_dir']])) { + stop("The component 'data_dir' of the parameter 'cluster' must be a character string.") + } + remote_data_dir <- cluster[['data_dir']] + } + ### lib_dir + if (!is.null(cluster[['lib_dir']])) { + if (!is.character(cluster[['lib_dir']])) { + stop("The component 'lib_dir', of the parameter 'cluster' must be NULL or ", + "a character string.") + } + } + ### init_commands + if (!is.list(cluster[['init_commands']]) || + !all(sapply(cluster[['init_commands']], is.character))) { + stop("The component 'init_commands' of the parameter 'cluster' must be a list of ", + "character strings.") + } + ### r_module + if 
(!is.character(cluster[['r_module']])) {
+    stop("The component 'r_module' of the parameter 'cluster' must be a character string.")
+  }
+  if ((nchar(cluster[['r_module']]) < 1) || (grepl(' ', cluster[['r_module']]))) {
+    stop("The component 'r_module' of the parameter 'cluster' must have at least one character ",
+         "and contain no blank spaces.")
+  }
+  ### CDO_module
+  if (!is.null(cluster[['CDO_module']])) {
+    if (!is.character(cluster[['CDO_module']])) {
+      stop("The component 'CDO_module' of the parameter 'cluster' must be a character string.")
+    }
+    if (nchar(cluster[['CDO_module']]) < 1 || grepl(' ', cluster[['CDO_module']])) {
+      warning("The component 'CDO_module' of parameter 'cluster' must have at least one ",
+              "character and contain no blank spaces; only the first module will be used.")
+    }
+    cluster[['r_module']] <- paste(cluster[['r_module']], cluster[['CDO_module']])
+  }
+  ### autosubmit_module
+  if (!is.character(cluster[['autosubmit_module']])) {
+    stop("The component 'autosubmit_module' of the parameter 'cluster' must be a character string.")
+  }
+  ### cores_per_job
+  if (is.null(cluster[['cores_per_job']])) {
+    cluster[['cores_per_job']] <- threads_compute
+  }
+  if (!is.numeric(cluster[['cores_per_job']])) {
+    stop("The component 'cores_per_job' of the parameter 'cluster' must be numeric.")
+  }
+  cluster[['cores_per_job']] <- round(cluster[['cores_per_job']])
+# NOTE: Why do we have this condition?
+#  if (cluster[['cores_per_job']] > threads_compute) {
+#    .message("WARNING: 'threads_compute' should be >= cluster[['cores_per_job']].")
+#  }
+  ### job_wallclock
+  tmp <- strsplit(cluster[['job_wallclock']], ':')[[1]]
+  if (!length(tmp) %in% c(2, 3) | any(!grepl("^[0-9]+$", tmp)) | any(nchar(tmp) != 2)) {
+    stop("The component 'job_wallclock' should be in the format HH:MM or HH:MM:SS.")
+  }
+  ### max_jobs
+  if (!is.numeric(cluster[['max_jobs']])) {
+    stop("The component 'max_jobs' of the parameter 'cluster' must be numeric.")
+  }
+  cluster[['max_jobs']] <- round(cluster[['max_jobs']])
+  ### extra_queue_params
+  if (!is.list(cluster[['extra_queue_params']]) ||
+      !all(sapply(cluster[['extra_queue_params']], is.character))) {
+    stop("The component 'extra_queue_params' of the parameter 'cluster' must be a list of ",
+         "character strings.")
+  }
+  ### polling_period
+  if (!is.numeric(cluster[['polling_period']])) {
+    stop("The component 'polling_period' of the parameter 'cluster' must be numeric.")
+  }
+  cluster[['polling_period']] <- round(cluster[['polling_period']])
+  ### special_setup
+  if (!(cluster[['special_setup']] %in% c('none', 'marenostrum4'))) {
+    stop("The value provided for the component 'special_setup' of the parameter ",
+         "'cluster' is not recognized.")
+  }
+  ### expid
+  as_module <- cluster[['autosubmit_module']]
+  if (is.null(cluster[['expid']])) {
+    text <- system(
+      paste0("module load ", as_module, "; ",
+             "autosubmit expid -H local -d 'startR computation'"),
+      intern = T)
+    cluster[['expid']] <- strsplit(
+      text[grep("The new experiment", text)],
+      "\"")[[1]][2]
+    message(paste0("ATTENTION: The new experiment '", cluster[['expid']],
+                   "' is created. 
Please note it down.")) + } else { + if (!is.character(cluster[['expid']]) | length(cluster[['expid']]) != 1) { + stop("The component 'expid' of the parameter 'cluster' must be a character string.") + } + if (!dir.exists(file.path("/esarchive/autosubmit", cluster[['expid']]))) { + stop("Cluster component 'expid' is not found under /esarchive/autosubmit/.") + } + } + suite_id <- cluster[['expid']] + + ### hpc_user + if (!is.null(cluster$hpc_user) && (!is.character(cluster$hpc_user) | length(cluster$hpc_user) != 1)) { + stop("Cluster component 'hpc_user' must be a character string.") + } + ### run_dir + if (!is.null(cluster$run_dir)) { + if (!dir.exists(cluster$run_dir)) { + stop("Cluster component 'run_dir' ", cluster$run_dir," is not found.") + } + } + +#============================================== + + autosubmit_suite_dir_suite <- paste0(autosubmit_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') + if (!dir.exists(autosubmit_suite_dir_suite)) { + dir.create(autosubmit_suite_dir_suite, recursive = TRUE) + } + if (!dir.exists(autosubmit_suite_dir_suite)) { + stop("Could not find or create the directory in parameter 'autosubmit_suite_dir'.") + } + + remote_autosubmit_suite_dir <- file.path("/esarchive/autosubmit/", suite_id, 'proj') + remote_autosubmit_suite_dir_suite <- paste0(remote_autosubmit_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') + + + # Work out chunked dimensions and target dimensions + all_dims <- lapply(cube_headers, attr, 'Dimensions') + all_dims_merged <- NULL + for (i in all_dims) { + if (is.null(all_dims_merged)) { + all_dims_merged <- i + } else { + all_dims_merged <- .MergeArrayDims(all_dims_merged, i)[[3]] + } + } + all_dimnames <- names(all_dims_merged) + + target_dims_indices <- which(all_dimnames %in% unlist(attr(step_fun, 'TargetDims'))) + target_dims <- NULL + if (length(target_dims_indices) > 0) { + target_dims <- all_dimnames[target_dims_indices] + } + + chunked_dims <- all_dimnames + if (length(target_dims_indices) > 0) { + chunked_dims <- chunked_dims[-target_dims_indices] + } + if (length(chunked_dims) < 1) { + stop("Not possible to process input by chunks. 
All input dimensions are ", + "target dimensions.") + } + + # Check all input headers have matching dimensions + cube_index <- 1 + for (cube_header in cube_headers) { + + # Check if all the margin dims are consistent among datasets + if (!all(chunked_dims %in% names(attr(cube_header, "Dimensions")))) { + trouble_dim_name <- chunked_dims[which(!chunked_dims %in% + names(attr(cube_header, "Dimensions")))] + stop(paste0("Found margin dimension, ", toString(trouble_dim_name), + ", is not in input data ", cube_index, ".")) + } + + # Only check margin dimensions (i.e., chunked_dims) + if (!all(attr(cube_header, 'Dimensions')[chunked_dims] == all_dims_merged[names(attr(cube_header, 'Dimensions'))][chunked_dims])) { + stop("All provided 'cube_headers' must have matching dimension lengths ", + "with each other.") + } + if (!all(attr(step_fun, 'TargetDims')[[cube_index]] %in% names(attr(cube_header, 'Dimensions')))) { + stop("All provided 'cube_headers' must contain at least the target dimensions ", + "expected by 'step_fun'.") + } + cube_index <- cube_index + 1 + # work out expected result dimensions + } + + # Check chunks + default_chunks <- as.list(rep(1, length(chunked_dims))) + names(default_chunks) <- chunked_dims + if (length(chunks) == 1 && chunks == 'auto') { + chunks <- default_chunks + } + if (!is.list(chunks)) { + stop("Parameter 'chunks' must be a named list or 'auto'.") + } + if (is.null(names(chunks))) { + stop("Parameter 'chunks' must be a named list or 'auto'.") + } + if (any(!(names(chunks) %in% chunked_dims))) { + stop("All names in parameter 'chunks' must be one of the non-target dimensions ", + "present in the cubes in 'cube_headers'. The target dimensions are ", + paste(paste0("'", target_dims, "'"), collapse = ', '), ". The non-target ", + "dimensions (margins) are ", paste(paste0("'", chunked_dims, "'"), collapse = ', '), ".") + } + if (any(!(((unlist(chunks) %% 1) == 0) | (unlist(chunks) == 'all')))) { + stop("All values in parameter 'chunks' must take a numeric value or 'all'.") + } + if (any(unlist(chunks) < 1)) { + stop("All values in parameter 'chunks' must be >= 1.") + } + for (chunk_spec in 1:length(chunks)) { + if (chunks[[chunk_spec]] > all_dims_merged[names(chunks)[chunk_spec]]) { + stop("Too many chunks requested for the dimension ", names(chunks)[chunk_spec], + ". 
Maximum allowed is ", all_dims_merged[names(chunks)[chunk_spec]]) + } + } + default_chunks[names(chunks)] <- chunks + #NOTE: chunks here has all the margin dims, not only the chunked ones + chunks <- default_chunks + timings[['nchunks']] <- prod(unlist(chunks)) + + # Replace 'all's + chunks_all <- which(unlist(chunks) == 'all') + if (length(chunks_all) > 0) { + chunks[chunks_all] <- all_dims[names(chunks)[chunks_all]] + } + + # Copy load_process_save_chunk_autosubmit.R into local folder + chunk_script <- file(system.file('chunking/Autosubmit/load_process_save_chunk_autosubmit.R', + package = 'startR')) + chunk_script_lines <- readLines(chunk_script) + close(chunk_script) + chunk_script_lines <- gsub('^lib_dir <- *', paste0('lib_dir <- ', + paste(deparse(cluster[['lib_dir']]), collapse = '\n')), + chunk_script_lines) + #TODO: Change out_dir to somewhere else like expid/outputs/ + chunk_script_lines <- gsub('^out_dir <- *', paste0('out_dir <- ', + paste(deparse(remote_autosubmit_suite_dir_suite), collapse = '\n')), chunk_script_lines) + chunk_script_lines <- gsub('^debug <- *', paste0('debug <- ', paste(deparse(debug), collapse = '\n')), + chunk_script_lines) + deparsed_calls <- paste0('start_calls <- list(') + extra_path <- '' + if (cluster[['special_setup']] == 'marenostrum4') { + extra_path <- '/gpfs/archive/bsc32/' + } + for (cube_header in 1:length(cube_headers)) { + pattern_dim <- attr(cube_headers[[cube_header]], 'PatternDim') + bk_pattern_dim <- cube_headers[[cube_header]][[pattern_dim]] + bk_expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') + if (!is_data_dir_shared) { + cube_headers[[cube_header]][[pattern_dim]] <- paste0(remote_data_dir, '/', + extra_path, '/', cube_headers[[cube_header]][[pattern_dim]]) + for (file_n in 1:length(bk_expected_files)) { + attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n] <- paste0(remote_data_dir, '/', + extra_path, '/', attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n]) + } + } + deparsed_calls <- paste0(deparsed_calls, '\nquote(', + paste(deparse(cube_headers[[cube_header]]), collapse = '\n'), + ')') + cube_headers[[cube_header]][[pattern_dim]] <- bk_pattern_dim + attr(cube_headers[[cube_header]], 'ExpectedFiles') <- bk_expected_files + if (cube_header < length(cube_headers)) { + deparsed_calls <- paste0(deparsed_calls, ', ') + } + } + deparsed_calls <- paste0(deparsed_calls, '\n)') + chunk_script_lines <- gsub('^start_calls <- *', deparsed_calls, chunk_script_lines) + chunk_script_lines <- gsub('^start_calls_attrs <- *', + paste0('start_calls_attrs <- ', + paste(deparse(lapply(cube_headers, attributes)), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^param_dimnames <- *', + paste0('param_dimnames <- ', + paste(deparse(chunked_dims), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^threads_load <- *', paste0('threads_load <- ', threads_load), + chunk_script_lines) + chunk_script_lines <- gsub('^threads_compute <- *', paste0('threads_compute <- ', threads_compute), + chunk_script_lines) + chunk_script_lines <- gsub('^fun <- *', paste0('fun <- ', paste(deparse(step_fun), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^params <- *', paste0('params <- ', paste(deparse(list(...)), collapse = '\n')), + chunk_script_lines) + writeLines(chunk_script_lines, paste0(autosubmit_suite_dir_suite, '/load_process_save_chunk_autosubmit.R')) + + # Write and copy startR_autosubmit.sh into local folder + write_autosubmit_bash(chunks, cluster, 
autosubmit_suite_dir = autosubmit_suite_dir) + + # Modify conf files from template and rewrite to /esarchive/autosubmit/expid/conf/ + write_autosubmit_confs(chunks, cluster, autosubmit_suite_dir) + + # Iterate through chunks + chunk_array <- array(1:prod(unlist(chunks)), dim = (unlist(chunks))) + arrays_of_results <- vector('list', length(attr(step_fun, 'OutputDims'))) + names(arrays_of_results) <- names(attr(step_fun, 'OutputDims')) + for (component in 1:length(arrays_of_results)) { + arrays_of_results[[component]] <- vector('list', prod((unlist(chunks)))) + dim(arrays_of_results[[component]]) <- (unlist(chunks)) + } + found_first_result <- FALSE + for (i in 1:length(chunk_array)) { + chunk_indices <- which(chunk_array == i, arr.ind = TRUE)[1, ] + names(chunk_indices) <- names(dim(chunk_array)) + } + + + timings[['cores_per_job']] <- cluster[['cores_per_job']] + timings[['concurrent_chunks']] <- cluster[['max_jobs']] + + t_end_bychunks_setup <- Sys.time() + timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, + t_begin_bychunks_setup, units = 'secs')) + if (!is_data_dir_shared) { + #NOTE: Not consider this part yet + t_begin_transfer <- Sys.time() + .message("Sending involved files to the cluster file system...") + files_to_send <- NULL + #files_to_check <- NULL + for (cube_header in 1:length(cube_headers)) { + expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') + #files_to_check <- c(files_to_check, expected_files) + #if (cluster[['special_setup']] == 'marenostrum4') { + # expected_files <- paste0('/gpfs/archive/bsc32/', expected_files) + #} + files_to_send <- c(files_to_send, expected_files) + } + #which_files_exist <- sapply(files_to_check, file.exists) + which_files_exist <- sapply(files_to_send, file.exists) + files_to_send <- files_to_send[which_files_exist] + if (cluster[['special_setup']] == 'marenostrum4') { + file_spec <- paste(paste0("/gpfs/archive/bsc32/", + files_to_send), collapse = ' ') + system(paste0("ssh ", cluster[['queue_host']], " 'mkdir -p ", remote_data_dir, + ' ; module load transfer ; cd ', remote_autosubmit_suite_dir_suite, + ' ; dtrsync -Rrav ', '\'', file_spec, '\' "', remote_data_dir, '/"', + " ; sleep 1 ; ", + "while [[ ! $(ls dtrsync_*.out 2>/dev/null | wc -l) -ge 1 ]] ; ", + "do sleep 2 ; done", + " ; sleep 1 ; ", + 'while [[ ! $(grep "total size is" dtrsync_*.out | ', + "wc -l) -ge 1 ]] ; ", + "do sleep 5 ; done", "'")) + } else { + file_spec <- paste(files_to_send, collapse = ' :') + system(paste0("ssh ", cluster[['queue_host']], ' "mkdir -p ', + remote_data_dir, '"')) + system(paste0("rsync -Rrav '", file_spec, "' '", + cluster[['queue_host']], ":", remote_data_dir, "/'")) + } + .message("Files sent successfully.") + t_end_transfer <- Sys.time() + timings[['transfer']] <- as.numeric(difftime(t_end_transfer, t_begin_transfer, units = 'secs')) + } else { + timings[['transfer']] <- 0 + } + if (!silent) { + .message(paste0("Processing chunks... ")) + } + time_begin_first_chunk <- Sys.time() + sys_commands <- paste0("module load ", as_module, "; ", + "autosubmit create ", suite_id, " -np; ", + "autosubmit refresh ", suite_id, "; ") + if (wait) { + sys_commands <- paste0(sys_commands, "autosubmit run ", suite_id) + } else { + sys_commands <- paste0(sys_commands, "nohup autosubmit run ", suite_id, " >/dev/null 2>&1 &") # disown? 
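+    # NOTE: Illustrative only (not in the original source): with the default
+    # 'autosubmit_module', a hypothetical expid 'a001' and wait = FALSE,
+    # 'sys_commands' at this point reads:
+    #   module load autosubmit; autosubmit create a001 -np; autosubmit refresh a001; nohup autosubmit run a001 >/dev/null 2>&1 &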
+ } + if (gsub('[[:digit:]]', "", Sys.getenv('HOSTNAME')) == 'bscesautosubmit') { + #NOTE: If we ssh to AS VM and run everything there, we don't need to ssh here + system(sys_commands) + + } else { +# } else if (gsub("[[:digit:]]", "", Sys.getenv("HOSTNAME")) == "bscearth") { + # ssh from WS to AS VM to run exp + as_login <- paste0(Sys.getenv("USER"), '@', autosubmit_server, '.bsc.es') + sys_commands <- paste0('ssh ', as_login, ' "', sys_commands, '"') #'; exit"') + system(sys_commands) + +# } else { +# stop("Cannot identify host", Sys.getenv("HOSTNAME"), ". Where to run AS exp?") + } + + # Check the size of tmp/ASLOGS/jobs_failed_status.log. If it is not 0, the jobs failed. + failed_file_size <- system(paste0("du /esarchive/autosubmit/", suite_id, "/tmp/ASLOGS/jobs_failed_status.log"), intern = T) + if (substr(failed_file_size, 1, 1) != 0) { + # Remove bigmemory objects (e.g., a68h_1_1 and a68h_1_1.desc) if they exist + # If run_dir is specified, the files are under run_dir; if not, files are under proj/STARTR_CHUNKING_xxxx/ + if (!is.null(cluster[['run_dir']])) { + file.remove( + file.path(cluster[['run_dir']], + list.files(cluster[['run_dir']])[grepl(paste0("^", suite_id, "_.*"), list.files(cluster[['run_dir']]))]) + ) + } else { + file.remove( + file.path(remote_autosubmit_suite_dir_suite, + list.files(remote_autosubmit_suite_dir_suite)[grepl(paste0("^", suite_id, "_.*"), list.files(remote_autosubmit_suite_dir_suite))]) + ) + } + + stop("Some Autosubmit jobs failed. Check GUI and logs.") + } + + timings[['total']] <- t_begin_total + startr_exec <- list(cluster = cluster, workflow_manager = 'autosubmit', + suite_id = suite_id, chunks = chunks, + num_outputs = length(arrays_of_results), + autosubmit_suite_dir = autosubmit_suite_dir, #ecflow_server = ecflow_server, + timings = timings) + class(startr_exec) <- 'startR_exec' + + if (wait) { + result <- Collect(startr_exec, wait = TRUE, remove = T) + .message("Computation ended successfully.") + return(result) + + } else { + # if wait = F, return startr_exec and merge chunks in Collect(). + return(startr_exec) + } + +} diff --git a/modules/Loading/tmp/startR/R/ByChunks_ecflow.R b/modules/Loading/tmp/startR/R/ByChunks_ecflow.R new file mode 100644 index 00000000..6292448c --- /dev/null +++ b/modules/Loading/tmp/startR/R/ByChunks_ecflow.R @@ -0,0 +1,1022 @@ +#'Execute the operation by chunks +#' +#'This is an internal function used in Compute(), executing the operation by +#'the chunks specified in Compute(). It also returns the configuration details +#'and profiling information. +#' +#'@param step_fun A function with the class 'startR_step_fun' containing the +#' details of operation. +#'@param cube_headers A list with the class 'startR_cube' returned by Start(). +#' It contains the details of data to be operated. +#'@param \dots Additional parameters for the inputs of 'step_fun'. +#'@param chunks A named list of dimensions which to split the data along and +#' the number of chunks to make for each. The chunked dimension can only be +#' those not required as the target dimension in function Step(). The default +#' value is 'auto', which lists all the non-target dimensions and each one has +#' one chunk. +#'@param threads_load An integer indicating the number of execution threads to +#' use for the data retrieval stage. The default value is 1. +#'@param threads_compute An integer indicating the number of execution threads +#' to use for the computation. The default value is 1. 
+#'@param cluster A list of components that define the configuration of the
+#'  machine to be run on. The components vary for different machines. Check the
+#'  \href{https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/practical_guide.md}{practical guide}
+#'  for more examples.
+#'  Only needed when the computation is not run locally. The default value is
+#'  NULL.
+#'@param ecflow_suite_dir A character string indicating the path to a folder in
+#'  the local workstation in which to store temporary files generated for the
+#'  automatic management of the workflow. Only needed when the execution is run
+#'  remotely. The default value is NULL.
+#'@param ecflow_server A named vector indicating the host and port of the
+#'  EC-Flow server. The vector form should be
+#'  \code{c(host = 'hostname', port = port_number)}. Only needed when the
+#'  execution is run remotely. The default value is NULL.
+#'@param silent A logical value deciding whether to print the computation
+#'  progress (FALSE) on the R session or not (TRUE). It only works when the
+#'  execution runs locally or the parameter 'wait' is TRUE. The default value
+#'  is FALSE.
+#'@param debug A logical value deciding whether to return detailed messages on
+#'  the progress and operations in a Compute() call (TRUE) or not (FALSE).
+#'  Automatically changed to FALSE if parameter 'silent' is TRUE. The default
+#'  value is FALSE.
+#'@param wait A logical value deciding whether the R session waits for the
+#'  Compute() call to finish (TRUE) or not (FALSE). If FALSE, it will return an
+#'  object with all the information of the startR execution that can be stored
+#'  on your disk. After that, the R session can be closed and the results can
+#'  be collected later with the Collect() function. The default value is TRUE.
+#'
+#'@return A list of data arrays for the output returned by the last step in the
+#'  specified workflow. The configuration details and profiling information are
+#'  attached as attributes to the returned list of arrays.
+#'
+#'@examples
+#' # ByChunks_ecflow() is internally used in Compute(), not intended to be used
+#' # by users. The example just illustrates the inputs of ByChunks_ecflow().
+#' # data_path <- system.file('extdata', package = 'startR') +#' # path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc') +#' # sdates <- c('200011', '200012') +#' # data <- Start(dat = list(list(path = path_obs)), +#' # var = 'tos', +#' # sdate = sdates, +#' # time = 'all', +#' # latitude = 'all', +#' # longitude = 'all', +#' # return_vars = list(latitude = 'dat', +#' # longitude = 'dat', +#' # time = 'sdate'), +#' # retrieve = FALSE) +#' # fun <- function(x) { +#' # lat = attributes(x)$Variables$dat1$latitude +#' # weight = sqrt(cos(lat * pi / 180)) +#' # corrected = Apply(list(x), target_dims = "latitude", +#' # fun = function(x) {x * weight}) +#' # } +#' # step <- Step(fun = fun, +#' # target_dims = 'latitude', +#' # output_dims = 'latitude', +#' # use_libraries = c('multiApply'), +#' # use_attributes = list(data = "Variables")) +#' #ByChunks_ecflow(step, data) +#' +#'@import multiApply +#'@importFrom methods is +#'@noRd +ByChunks_ecflow <- function(step_fun, cube_headers, ..., chunks = 'auto', + threads_load = 1, threads_compute = 1, + cluster = NULL, + ecflow_suite_dir = NULL, + ecflow_server = NULL, + silent = FALSE, debug = FALSE, + wait = TRUE) { + # Build object to store profiling timings + t_begin_total <- Sys.time() + t_begin_bychunks_setup <- t_begin_total + timings <- list(nchunks = NULL, + concurrent_chunks = NULL, + cores_per_job = NULL, + threads_load = NULL, + threads_compute = NULL, + bychunks_setup = NULL, + transfer = NULL, + queue = NULL, + job_setup = NULL, + load = NULL, + compute = NULL, + transfer_back = NULL, + merge = NULL, + total = NULL) + + MergeArrays <- .MergeArrays + + # Check input headers + if (is(cube_headers, 'startR_cube')) { + cube_headers <- list(cube_headers) + } + if (!all(sapply(lapply(cube_headers, class), + function(x) 'startR_cube' %in% x))) { + stop("All objects passed in 'cube_headers' must be of class 'startR_cube', ", + "as returned by Start().") + } + + # Check step_fun + if (!is.function(step_fun)) { + stop("Parameter 'step_fun' must be a function.") + } + + # Check cores + if (!is.numeric(threads_load)) { + stop("Parameter 'threads_load' must be a numeric value.") + } + threads_load <- round(threads_load) + if (!is.numeric(threads_compute)) { + stop("Parameter 'threads_compute' must be a numeric value.") + } + threads_compute <- round(threads_compute) + timings[['threads_load']] <- threads_load + timings[['threads_compute']] <- threads_compute + + on_cluster <- !is.null(cluster) + + # Check ecflow_suite_dir + suite_id <- sample(10 ^ 10, 1) + ecflow_suite_dir_suite <- '' + if (on_cluster) { + if (is.null(ecflow_suite_dir)) { + stop("Parameter 'ecflow_suite_dir' must be specified when dispatching on a cluster.") + } + if (!is.character(ecflow_suite_dir)) { + stop("Parameter 'ecflow_suite_dir' must be a character string.") + } + ecflow_suite_dir_suite <- paste0(ecflow_suite_dir, '/STARTR_CHUNKING_', suite_id, '/') + dir.create(ecflow_suite_dir_suite, recursive = TRUE) + if (!dir.exists(ecflow_suite_dir_suite)) { + stop("Could not find or create the directory in ", + "parameter 'ecflow_suite_dir'.") + } + } + + # Check cluster + default_cluster <- list(queue_host = NULL, + queue_type = 'slurm', + data_dir = NULL, + temp_dir = NULL, + lib_dir = NULL, + init_commands = list(''), + r_module = 'R', + CDO_module = NULL, + ecflow_module = 'ecFlow', + node_memory = NULL, + cores_per_job = NULL, + job_wallclock = '01:00:00', + max_jobs = 6, + extra_queue_params = list(''), + bidirectional = TRUE, + polling_period = 10, + 
special_setup = 'none') + if (on_cluster) { + if (!is.list(cluster)) { + stop("Parameter 'cluster' must be a named list.") + } + if (is.null(names(cluster))) { + stop("Parameter 'cluster' must be a named list.") + } + if (any(!(names(cluster) %in% c('queue_host', 'queue_type', 'data_dir', + 'temp_dir', 'lib_dir', 'init_commands', + 'r_module', 'CDO_module', 'autosubmit_module', + 'ecflow_module', 'node_memory', + 'cores_per_job', 'job_wallclock', 'max_jobs', + 'extra_queue_params', 'bidirectional', + 'polling_period', 'special_setup', 'expid', 'hpc_user')))) { + + stop("Found invalid component names in parameter 'cluster'.") + } + # Remove ecFlow components + redundant_components <- c('autosubmit_module', 'expid', 'hpc_user') + if (any(redundant_components %in% names(cluster))) { + tmp <- redundant_components[which(redundant_components %in% names(cluster))] + .warning(paste0("Cluster component ", paste(tmp, collapse = ','), " not used when ecFlow is the workflow manager.")) + cluster[[tmp]] <- NULL + } + default_cluster[names(cluster)] <- cluster + } + localhost_name <- NULL + cluster <- default_cluster + remote_ecflow_suite_dir <- ecflow_suite_dir + is_data_dir_shared <- FALSE + is_ecflow_suite_dir_shared <- FALSE + if (on_cluster) { + #localhost_name <- Sys.info()[['nodename']] + localhost_name <- system('hostname -f', intern = TRUE) + if (Sys.which('ecflow_client') == '') { + stop("ecFlow must be installed in order to run the computation on clusters.") + } + if (is.null(cluster[['queue_host']])) { + queue_host <- localhost_name + } else if ((cluster[['queue_host']] %in% c('localhost', '127.0.0.1', localhost_name)) || + grepl(paste0('^', localhost_name), cluster[['queue_host']])) { + queue_host <- localhost_name + } + if (!(cluster[['queue_type']] %in% c('slurm', 'pbs', 'lsf', 'host'))) { + stop("The only supported 'queue_type's are 'slurm', 'pbs', 'lsf' and 'host'.") + } + if (is.null(cluster[['data_dir']])) { + is_data_dir_shared <- TRUE + } else { + if (!is.character(cluster[['data_dir']])) { + stop("The component 'data_dir' of the parameter 'cluster' must be a character string.") + } + remote_data_dir <- cluster[['data_dir']] + } + if (is.null(cluster[['temp_dir']])) { + is_ecflow_suite_dir_shared <- TRUE + } else { + if (!is.character(cluster[['temp_dir']])) { + stop("The component 'temp_dir' of the parameter 'cluster' must be a character string.") + } + remote_ecflow_suite_dir <- cluster[['temp_dir']] + } + if (!is.null(cluster[['lib_dir']])) { + if (!is.character(cluster[['lib_dir']])) { + stop("The component 'lib_dir', of the parameter 'cluster' must be NULL or ", + "a character string.") + } + } + if (!is.logical(cluster[['bidirectional']])) { + stop("The component 'bidirectional' of the parameter 'cluster' must be a logical value.") + } + if (cluster[['bidirectional']]) { + cluster[['init_commands']] <- c(cluster[['init_commands']], + list(paste('module load', cluster[['ecflow_module']]))) + } + if (!is.list(cluster[['init_commands']]) || + !all(sapply(cluster[['init_commands']], is.character))) { + stop("The component 'init_commands' of the parameter 'cluster' must be a list of ", + "character strings.") + } + if (!is.character(cluster[['r_module']])) { + stop("The component 'r_module' of the parameter 'cluster' must be a character string.") + } + if ((nchar(cluster[['r_module']]) < 1) || (grepl(' ', cluster[['r_module']]))) { + stop("The component 'r_module' of the parameter 'cluster' must have at least one character ", + "and contain no blank spaces.") + } + if 
(!is.null(cluster[['CDO_module']])) {
+      if (!is.character(cluster[['CDO_module']])) {
+        stop("The component 'CDO_module' of the parameter 'cluster' must be a character string.")
+      }
+      if (nchar(cluster[['CDO_module']]) < 1 || grepl(' ', cluster[['CDO_module']])) {
+        .warning(paste0("The component 'CDO_module' of the parameter 'cluster' must have ",
+                        "at least one character and contain no blank spaces."))
+      }
+      cluster[['r_module']] <- paste(cluster[['r_module']], cluster[['CDO_module']])
+    }
+    if (!is.character(cluster[['ecflow_module']])) {
+      stop("The component 'ecflow_module' of the parameter 'cluster' must be a character string.")
+    }
+    if ((nchar(cluster[['ecflow_module']]) < 1) ||
+        (grepl(' ', cluster[['ecflow_module']]))) {
+      stop("The component 'ecflow_module' of the parameter 'cluster' must have at least ",
+           "one character and contain no blank spaces.")
+    }
+    if (is.null(cluster[['cores_per_job']])) {
+      cluster[['cores_per_job']] <- threads_compute
+    }
+    if (!is.numeric(cluster[['cores_per_job']])) {
+      stop("The component 'cores_per_job' of the parameter 'cluster' must be numeric.")
+    }
+    cluster[['cores_per_job']] <- round(cluster[['cores_per_job']])
+    if (cluster[['cores_per_job']] > threads_compute) {
+      .message("WARNING: 'threads_compute' should be >= cluster[['cores_per_job']].")
+    }
+    if (!is.list(cluster[['extra_queue_params']]) ||
+        !all(sapply(cluster[['extra_queue_params']], is.character))) {
+      stop("The component 'extra_queue_params' of the parameter 'cluster' must be a list of ",
+           "character strings.")
+    }
+    if (!(cluster[['special_setup']] %in% c('none', 'marenostrum4'))) {
+      stop("The value provided for the component 'special_setup' of the parameter ",
+           "'cluster' is not recognized.")
+    }
+  }
+
+  # Check ecflow_suite_dir
+  remote_ecflow_suite_dir_suite <- ''
+  if (on_cluster) {
+    remote_ecflow_suite_dir_suite <- paste0(remote_ecflow_suite_dir, '/STARTR_CHUNKING_', suite_id, '/')
+  }
+
+  # Check ecflow_server
+  if (!is.null(ecflow_server) && !(is.character(ecflow_server))) {
+    stop("Parameter 'ecflow_server' must be a character string if specified.")
+  }
+
+  # Check silent
+  if (!is.logical(silent)) {
+    stop("Parameter 'silent' must be logical.")
+  }
+
+  # Check debug
+  if (!is.logical(debug)) {
+    stop("Parameter 'debug' must be logical.")
+  }
+  if (silent) {
+    debug <- FALSE
+  }
+
+  # Check wait
+  if (!is.logical(wait)) {
+    stop("Parameter 'wait' must be logical.")
+  }
+
+  # Work out chunked dimensions and target dimensions
+  all_dims <- lapply(cube_headers, attr, 'Dimensions')
+  all_dims_merged <- NULL
+  for (i in all_dims) {
+    if (is.null(all_dims_merged)) {
+      all_dims_merged <- i
+    } else {
+      all_dims_merged <- .MergeArrayDims(all_dims_merged, i)[[3]]
+    }
+  }
+  all_dimnames <- names(all_dims_merged)
+
+  target_dims_indices <- which(all_dimnames %in% unlist(attr(step_fun, 'TargetDims')))
+  target_dims <- NULL
+  if (length(target_dims_indices) > 0) {
+    target_dims <- all_dimnames[target_dims_indices]
+  }
+
+  chunked_dims <- all_dimnames
+  if (length(target_dims_indices) > 0) {
+    chunked_dims <- chunked_dims[-target_dims_indices]
+  }
+  if (length(chunked_dims) < 1) {
+    stop("Not possible to process input by chunks.
All input dimensions are ", + "target dimensions.") + } + + if (length(cube_headers) != length(attr(step_fun, 'TargetDims'))) { + stop("Number of inputs in parameter 'cube_headers' must be equal to the ", + "number of inputs expected by the function 'step_fun'.") + } + # Check all input headers have matching dimensions + cube_index <- 1 + for (cube_header in cube_headers) { + + # Check if all the margin dims are consistent among datasets + if (!all(chunked_dims %in% names(attr(cube_header, "Dimensions")))) { + trouble_dim_name <- chunked_dims[which(!chunked_dims %in% + names(attr(cube_header, "Dimensions")))] + stop(paste0("Found margin dimension, ", toString(trouble_dim_name), + ", is not in input data ", cube_index, ".")) + } + + # Only check margin dimensions (i.e., chunked_dims) + if (!all(attr(cube_header, 'Dimensions')[chunked_dims] == all_dims_merged[names(attr(cube_header, 'Dimensions'))][chunked_dims])) { + stop("All provided 'cube_headers' must have matching dimension lengths ", + "with each other.") + } + if (!all(attr(step_fun, 'TargetDims')[[cube_index]] %in% names(attr(cube_header, 'Dimensions')))) { + stop("All provided 'cube_headers' must contain at least the target dimensions ", + "expected by 'step_fun'.") + } + cube_index <- cube_index + 1 + # work out expected result dimensions + } + + # Check chunks + default_chunks <- as.list(rep(1, length(chunked_dims))) + names(default_chunks) <- chunked_dims + if (length(chunks) == 1 && chunks == 'auto') { + chunks <- default_chunks + } + if (!is.list(chunks)) { + stop("Parameter 'chunks' must be a named list or 'auto'.") + } + if (is.null(names(chunks))) { + stop("Parameter 'chunks' must be a named list or 'auto'.") + } + if (any(!(names(chunks) %in% chunked_dims))) { + stop("All names in parameter 'chunks' must be one of the non-target dimensions ", + "present in the cubes in 'cube_headers'. The target dimensions are ", + paste(paste0("'", target_dims, "'"), collapse = ', '), ". The non-target ", + "dimensions (margins) are ", paste(paste0("'", chunked_dims, "'"), collapse = ', '), ".") + } + if (any(!(((unlist(chunks) %% 1) == 0) | (unlist(chunks) == 'all')))) { + stop("All values in parameter 'chunks' must take a numeric value or 'all'.") + } + if (any(unlist(chunks) < 1)) { + stop("All values in parameter 'chunks' must be >= 1.") + } + for (chunk_spec in 1:length(chunks)) { + if (chunks[[chunk_spec]] > all_dims_merged[names(chunks)[chunk_spec]]) { + stop("Too many chunks requested for the dimension ", names(chunks)[chunk_spec], + ". Maximum allowed is ", all_dims_merged[names(chunks)[chunk_spec]]) + } + } + default_chunks[names(chunks)] <- chunks + chunks <- default_chunks + timings[['nchunks']] <- prod(unlist(chunks)) + + # Check step_fun + if (!is(step_fun, 'startR_step_fun')) { + stop("Parameter 'step_fun' must be of the class 'startR_step_fun', as returned ", + "by the function Step.") + } + + # Replace 'all's + chunks_all <- which(unlist(chunks) == 'all') + if (length(chunks_all) > 0) { + chunks[chunks_all] <- all_dims[names(chunks)[chunks_all]] + } + # Mount the ecFlow suite + if (on_cluster) { + .message(paste0("ATTENTION: Dispatching chunks on a remote cluster", + ". 
Make sure passwordless ", + "access is properly set in both directions.")) + + # Copy load_process_save_chunk_ecflow.R into shared folder + chunk_script <- file(system.file('chunking/ecFlow/load_process_save_chunk_ecflow.R', + package = 'startR')) + chunk_script_lines <- readLines(chunk_script) + close(chunk_script) + chunk_script_lines <- gsub('^lib_dir <- *', paste0('lib_dir <- ', + paste(deparse(cluster[['lib_dir']]), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^out_dir <- *', paste0('out_dir <- ', + paste(deparse(remote_ecflow_suite_dir_suite), collapse = '\n')), chunk_script_lines) + chunk_script_lines <- gsub('^debug <- *', paste0('debug <- ', paste(deparse(debug), collapse = '\n')), + chunk_script_lines) + deparsed_calls <- paste0('start_calls <- list(') + extra_path <- '' + if (cluster[['special_setup']] == 'marenostrum4') { + extra_path <- '/gpfs/archive/bsc32/' + } + for (cube_header in 1:length(cube_headers)) { + pattern_dim <- attr(cube_headers[[cube_header]], 'PatternDim') + bk_pattern_dim <- cube_headers[[cube_header]][[pattern_dim]] + bk_expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') + if (!is_data_dir_shared) { + cube_headers[[cube_header]][[pattern_dim]] <- paste0(remote_data_dir, '/', + extra_path, '/', cube_headers[[cube_header]][[pattern_dim]]) + for (file_n in 1:length(bk_expected_files)) { + attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n] <- paste0(remote_data_dir, '/', + extra_path, '/', attr(cube_headers[[cube_header]], 'ExpectedFiles')[file_n]) + } + } + deparsed_calls <- paste0(deparsed_calls, '\nquote(', + paste(deparse(cube_headers[[cube_header]]), collapse = '\n'), + ')') + cube_headers[[cube_header]][[pattern_dim]] <- bk_pattern_dim + attr(cube_headers[[cube_header]], 'ExpectedFiles') <- bk_expected_files + if (cube_header < length(cube_headers)) { + deparsed_calls <- paste0(deparsed_calls, ', ') + } + } + deparsed_calls <- paste0(deparsed_calls, '\n)') + chunk_script_lines <- gsub('^start_calls <- *', deparsed_calls, chunk_script_lines) + chunk_script_lines <- gsub('^start_calls_attrs <- *', paste0('start_calls_attrs <- ', paste(deparse(lapply(cube_headers, attributes)), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^param_dimnames <- *', paste0('param_dimnames <- ', paste(deparse(chunked_dims), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^threads_load <- *', paste0('threads_load <- ', threads_load), + chunk_script_lines) + chunk_script_lines <- gsub('^threads_compute <- *', paste0('threads_compute <- ', threads_compute), + chunk_script_lines) + chunk_script_lines <- gsub('^fun <- *', paste0('fun <- ', paste(deparse(step_fun), collapse = '\n')), + chunk_script_lines) + chunk_script_lines <- gsub('^params <- *', paste0('params <- ', paste(deparse(list(...)), collapse = '\n')), + chunk_script_lines) + writeLines(chunk_script_lines, paste0(ecflow_suite_dir_suite, '/load_process_save_chunk_ecflow.R')) + + # Copy Chunk.ecf into shared folder + chunk_ecf_script <- file(system.file('chunking/ecFlow/Chunk.ecf', + package = 'startR')) + chunk_ecf_script_lines <- readLines(chunk_ecf_script) + close(chunk_ecf_script) + if (cluster[['queue_type']] == 'host') { + chunk_ecf_script_lines <- gsub('^include_queue_header', + '', + chunk_ecf_script_lines) + } else { + chunk_ecf_script_lines <- gsub('^include_queue_header', + paste0('%include "./', cluster[['queue_type']], '.h"'), + chunk_ecf_script_lines) + } + chunk_ecf_script_lines <- gsub('^include_init_commands', 
+ paste0(paste0(cluster[['init_commands']], collapse = '\n'), '\n'), + chunk_ecf_script_lines) + chunk_ecf_script_lines <- gsub('^include_module_load', + paste0('module load ', cluster[['r_module']]), + chunk_ecf_script_lines) + ecf_vars <- paste0('%', as.vector(sapply(chunked_dims, + function(x) { + c(toupper(x), paste0(toupper(x), '_N')) + })), '%') + # if (!is_ecflow_suite_dir_shared && (cluster[['queue_host']] != localhost_name)) { + # #transfer_back_line <- paste0('rsync -rav %REMOTE_ECF_HOME% ', localhost_name, + # # ':%ECF_HOME%\nrm -f %ECF_HOME%/', + # # paste0('*', paste(ecf_vars[((1:(length(ecf_vars) / 2)) * 2) - 1], collapse = '*'), '*.Rds')) + result_file_id <- paste0('*', + paste(paste0('_', ecf_vars[((1:(length(ecf_vars) / 2)) * 2) - 1], '__'), + collapse = '*'), '*') + # transfer_back_line <- paste0('rsync -rav %REMOTE_ECF_HOME%/%SUITE%/ ', + # localhost_name, + # ':%ECF_HOME%/%SUITE%/\nscp %REMOTE_ECF_HOME%/', + # result_file_id, ' ', localhost_name, + # ':%ECF_HOME%\nrm -f %REMOTE_ECF_HOME%/', + # result_file_id) + # } else { + # transfer_back_line <- '' + # } + chunk_ecf_script_lines <- gsub('^Rscript load_process_save_chunk_ecflow.R --args \\$task_path insert_indices', + paste0('Rscript load_process_save_chunk_ecflow.R --args $task_path ', paste(ecf_vars, collapse = ' ')), + chunk_ecf_script_lines) + #chunk_ecf_script_lines <- gsub('^include_transfer_back_and_rm', transfer_back_line, chunk_ecf_script_lines) + writeLines(chunk_ecf_script_lines, paste0(ecflow_suite_dir_suite, '/Chunk.ecf')) + + # Copy merge_chunks.R into tmp folder + # merge_script <- file(system.file('chunking/merge_chunks.R', + # package = 'startR')) + # merge_script_lines <- readLines(merge_script) + # close(merge_script) + # merge_script_lines <- gsub('^shared_dir <- *', paste0('shared_dir <- ', + # paste(deparse(shared_dir_suite), collapse = '\n')), merge_script_lines) + # writeLines(merge_script_lines, paste0(shared_dir_suite, '/merge_chunks.R')) + + # Copy Merge.ecf into tmp folder + #TODO: Modify chain of parameters sent to r script when merging + #chunks progressively + # merge_ecf_script <- file(system.file('chunking/Merge.ecf', + # package = 'startR')) + # merge_ecf_script_lines <- readLines(merge_ecf_script) + # close(merge_ecf_script) + # writeLines(merge_ecf_script_lines, paste0(shared_dir_suite, '/Merge.ecf')) + + # Copy queue header into shared folder + #file.copy(system.file(paste0('chunking/', cluster[['queue_type']], '.h'), package = 'startR'), + # ecflow_suite_dir_suite) + chunk_queue_header <- file(system.file(paste0('chunking/ecFlow/', cluster[['queue_type']], '.h'), package = 'startR')) + chunk_queue_header_lines <- readLines(chunk_queue_header) + close(chunk_queue_header) + chunk_queue_header_lines <- gsub('^include_extra_queue_params', + paste0(paste0(cluster[['extra_queue_params']], collapse = '\n'), '\n'), + chunk_queue_header_lines) + writeLines(chunk_queue_header_lines, paste0(ecflow_suite_dir_suite, '/', cluster[['queue_type']], '.h')) + + # Copy headers + file.copy(system.file('chunking/ecFlow/head.h', package = 'startR'), + ecflow_suite_dir_suite) + file.copy(system.file('chunking/ecFlow/tail.h', package = 'startR'), + ecflow_suite_dir_suite) + } + + add_line <- function(suite, line, tabs) { + c(suite, paste0(paste(rep(' ', tabs), collapse = ''), line)) + } + suite <- NULL + tabs <- 0 + suite <- add_line(suite, paste0('suite STARTR_CHUNKING_', suite_id), tabs) + tabs <- tabs + 2 + submit_command <- '' + if (cluster[['queue_type']] == 'slurm') { + submit_command <- 'sbatch' + 
} else if (cluster[['queue_type']] == 'pbs') { + submit_command <- 'qsub' + } else if (cluster[['queue_type']] == 'lsf') { + submit_command <- 'bsub <' + } else if (cluster[['queue_type']] == 'host') { + submit_command <- 'bash' + } + if (on_cluster) { + suite <- add_line(suite, paste0("edit BIDIRECTIONAL '", cluster[['bidirectional']], "'"), tabs) + suite <- add_line(suite, paste0("edit QUEUE_HOST '", cluster[['queue_host']], "'"), tabs) + suite <- add_line(suite, paste0("edit ECF_HOST '", localhost_name, "'"), tabs) + suite <- add_line(suite, paste0("edit EC_HOST_FULL '", localhost_name, "'"), tabs) + suite <- add_line(suite, paste0("edit RESULT_FILE_ID '", result_file_id, "'"), tabs) + #} else { + # suite <- add_line(suite, paste0("edit ECF_JOB_CMD '", submit_command, " %ECF_JOB% > %ECF_JOBOUT% 2>&1 &'"), tabs) + } + suite <- add_line(suite, paste0("edit ECF_HOME '", ecflow_suite_dir_suite, "'"), tabs) + suite <- add_line(suite, paste0("edit REMOTE_ECF_HOME '", remote_ecflow_suite_dir_suite, "'"), tabs) + suite <- add_line(suite, paste0("edit CORES_PER_JOB ", cluster[['cores_per_job']], ""), tabs) + suite <- add_line(suite, paste0("edit JOB_WALLCLOCK '", cluster[['job_wallclock']], "'"), tabs) + suite <- add_line(suite, paste0("limit max_jobs ", cluster[['max_jobs']]), tabs) + suite <- add_line(suite, paste0("inlimit max_jobs"), tabs) + suite <- add_line(suite, "family computation", tabs) + tabs <- tabs + 2 + + if (on_cluster) { + # source $HOME/.profile ; + sync_command <- '' + if (!is_ecflow_suite_dir_shared) { + sync_command <- paste0("rsync -rav ", + "%ECF_HOME%/ ", + "%QUEUE_HOST%:%REMOTE_ECF_HOME%/ ; ") + } + suite <- add_line(suite, paste0("edit ECF_JOB_CMD '", + #"mkdir -p %REMOTE_ECF_HOME%/%SUITE%/ ; ", + sync_command, + "ssh %QUEUE_HOST% \"", + "date --rfc-3339=seconds > %REMOTE_ECF_HOME%/%ECF_NAME%.submit_time ; ", + submit_command, + " %REMOTE_ECF_HOME%/%ECF_NAME%.job%ECF_TRYNO% > ", + "%REMOTE_ECF_HOME%/%ECF_NAME%.%ECF_TRYNO% 2>&1 &\" ", + "2>&1'"), tabs) + if (is_ecflow_suite_dir_shared) { + suite <- add_line(suite, paste0("edit REPORT_BACK 'FALSE'"), tabs) + } else { + suite <- add_line(suite, paste0("edit REPORT_BACK 'TRUE'"), tabs) + } + } + + # Open nested ecFlow families + for (i in length(chunked_dims):1) { + suite <- add_line(suite, paste0('family ', chunked_dims[i], '_CHUNK_', 1), tabs) + tabs <- tabs + 2 + suite <- add_line(suite, paste0('edit ', toupper(chunked_dims[i]), ' ', 1), tabs) + suite <- add_line(suite, paste0('edit ', toupper(chunked_dims[i]), '_N ', chunks[[chunked_dims[i]]]), tabs) + } + + # Iterate through chunks + chunk_array <- array(1:prod(unlist(chunks)), dim = (unlist(chunks))) + arrays_of_results <- vector('list', length(attr(step_fun, 'OutputDims'))) + names(arrays_of_results) <- names(attr(step_fun, 'OutputDims')) + for (component in 1:length(arrays_of_results)) { + arrays_of_results[[component]] <- vector('list', prod((unlist(chunks)))) + dim(arrays_of_results[[component]]) <- (unlist(chunks)) + } + if (!on_cluster) { + t_end_bychunks_setup <- Sys.time() + timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, + t_begin_bychunks_setup, units = 'secs')) + timings[['transfer']] <- 0 + timings[['queue']] <- 0 + timings[['job_setup']] <- 0 + timings[['transfer_back']] <- 0 + if (!silent) { + .message(paste0("Processing chunks... 
", + "remaining time estimate soon...")) + } + time_before_first_chunk <- Sys.time() + time_after_first_chunk <- NULL + } + previous_chunk_indices <- rep(1, length(chunks)) + found_first_result <- FALSE + for (i in 1:length(chunk_array)) { + chunk_indices <- which(chunk_array == i, arr.ind = TRUE)[1, ] + names(chunk_indices) <- names(dim(chunk_array)) + # ADD CHUNK SCRIPT TO SUITE + families_to_jump <- which(chunk_indices != previous_chunk_indices) + if (length(families_to_jump) > 0) { + families_to_jump <- max(families_to_jump) + # Close ecFlow families + for (j in 1:families_to_jump) { + tabs <- tabs - 2 + suite <- add_line(suite, paste0('endfamily'), tabs) + } + # Open ecFlow families + for (j in families_to_jump:1) { + suite <- add_line(suite, paste0('family ', (chunked_dims)[j], '_CHUNK_', chunk_indices[j]), tabs) + tabs <- tabs + 2 + suite <- add_line(suite, paste0('edit ', toupper((chunked_dims)[j]), ' ', chunk_indices[j]), tabs) + suite <- add_line(suite, paste0('edit ', toupper((chunked_dims)[j]), '_N ', chunks[[(chunked_dims)[j]]]), tabs) + } + } + suite <- add_line(suite, "task Chunk", tabs) + + if (!on_cluster) { + if (!silent) { + .message(paste("Loading chunk", i, + "out of", length(chunk_array), "...")) + } + data <- vector('list', length(cube_headers)) + t_begin_load <- Sys.time() + for (input in 1:length(data)) { + start_call <- cube_headers[[input]] + dims_to_alter <- which(names(attr(start_call, 'Dimensions')) %in% names(chunks)) + names_dims_to_alter <- names(attr(start_call, 'Dimensions'))[dims_to_alter] + # If any dimension comes from split dimensions + split_dims <- attr(start_call, 'SplitDims') + + if (length(split_dims) != 0){ + + for (k in 1:length(split_dims)) { + if (any(names(split_dims[[k]]) %in% names_dims_to_alter)) { + chunks_split_dims <- rep(1, length(split_dims[[k]])) + names(chunks_split_dims) <- names(split_dims[[k]]) + chunks_indices_split_dims <- chunks_split_dims + split_dims_to_alter <- which(names(split_dims[[k]]) %in% names_dims_to_alter) + chunks_split_dims[split_dims_to_alter] <- unlist(chunks[names(split_dims[[k]])[split_dims_to_alter]]) + chunks_indices_split_dims[split_dims_to_alter] <- chunk_indices[names(split_dims[[k]])[split_dims_to_alter]] + start_call[[names(split_dims)[k]]] <- .chunk(chunks_indices_split_dims, chunks_split_dims, + eval(start_call[[names(split_dims)[k]]])) + dims_to_alter_to_remove <- which(names_dims_to_alter %in% names(split_dims[[k]])) + if (length(dims_to_alter_to_remove) > 0) { + dims_to_alter <- dims_to_alter[-dims_to_alter_to_remove] + names_dims_to_alter <- names_dims_to_alter[-dims_to_alter_to_remove] + } + } + } + } + + if (length(dims_to_alter) > 0) { + for (call_dim in names(attr(start_call, 'Dimensions'))[dims_to_alter]) { + start_call[[call_dim]] <- .chunk(chunk_indices[call_dim], chunks[[call_dim]], + eval(start_call[[call_dim]])) + } + } + start_call[['silent']] <- !debug + if (!('num_procs' %in% names(start_call))) { + start_call[['num_procs']] <- threads_load + } + data[[input]] <- eval(start_call) + } + t_end_load <- Sys.time() + timings[['load']] <- c(timings[['load']], + as.numeric(difftime(t_end_load, t_begin_load, units = 'secs'))) + if (!silent) { + .message(paste("Processing...")) + } + #TODO: Find a better way to assign the names of data. When multiple steps for Compute is available, this way may fail. 
+ names(data) <- names(cube_headers) + t_begin_compute <- Sys.time() + result <- multiApply::Apply(data, + target_dims = attr(step_fun, 'TargetDims'), + fun = step_fun, ..., + output_dims = attr(step_fun, 'OutputDims'), + use_attributes = attr(step_fun, 'UseAttributes'), + ncores = threads_compute) + if (!found_first_result) { + names(arrays_of_results) <- names(result) + found_first_result <- TRUE + } + for (component in 1:length(result)) { + arrays_of_results[[component]][[i]] <- result[[component]] + } + rm(data) + gc() + t_end_compute <- Sys.time() + timings[['compute']] <- c(timings[['compute']], + as.numeric(difftime(t_end_compute, + t_begin_compute, units = 'secs'))) + } + + # Time estimate + if (!on_cluster) { + if (is.null(time_after_first_chunk)) { + time_after_first_chunk <- Sys.time() + if (!silent) { + estimate <- (time_after_first_chunk - + time_before_first_chunk) * + (length(chunk_array) - 1) + units(estimate) <- 'mins' + .message( + paste0("Remaining time estimate (at ", format(time_after_first_chunk), ") ", + "(neglecting merge time): ", format(estimate)) + ) + } + } + } + previous_chunk_indices <- chunk_indices + } + + # Close nested ecFlow families + for (i in length(chunked_dims):1) { + tabs <- tabs - 2 + suite <- add_line(suite, paste0('endfamily'), tabs) + } + + # Close the ecFlow suite + tabs <- tabs - 2 + suite <- add_line(suite, paste0('endfamily'), tabs) + # suite <- add_line(suite, "family merge", tabs) + # tabs <- tabs + 2 + # suite <- add_line(suite, "trigger computation == complete", tabs) + # suite <- add_line(suite, "edit ECF_JOB_CMD 'bash %ECF_JOB% > %ECF_JOBOUT% 2>&1 &'", tabs) + # suite <- add_line(suite, "task Merge", tabs) + # tabs <- tabs - 2 + # suite <- add_line(suite, paste0('endfamily'), tabs) + + tabs <- tabs - 2 + suite <- add_line(suite, "endsuite", tabs) + + # Run ecFlow suite if needed + if (on_cluster) { + timings[['cores_per_job']] <- cluster[['cores_per_job']] + timings[['concurrent_chunks']] <- cluster[['max_jobs']] + suite_file <- paste0(ecflow_suite_dir_suite, '/startR_chunking.def') + suite_file_o <- file(suite_file) + writeLines(suite, suite_file_o) + close(suite_file_o) + + default_ecflow_server <- list(host = localhost_name, port = '5678') + if (is.null(ecflow_server)) { + .warning("Parameter 'ecflow_server' has not been specified but execution on ", + "cluster has been requested. 
An ecFlow server instance will ", + "be created on localhost:5678.") + } else { + if ('host' %in% names(ecflow_server)) { + stop("A host has been specified for the 'ecflow_server', but this option is not available yet.") + } + default_ecflow_server[names(ecflow_server)] <- ecflow_server + } + ecflow_server <- default_ecflow_server + system(paste0("ecflow_start.sh -p ", ecflow_server[['port']])) + system(paste0("ecflow_client --load=", suite_file, " --host=", + ecflow_server[['host']], " --port=", ecflow_server[['port']])) + if (!is_ecflow_suite_dir_shared) { + system(paste0('ssh ', cluster[['queue_host']], ' "mkdir -p ', + remote_ecflow_suite_dir_suite, '"')) + system(paste0('rsync -ra ', ecflow_suite_dir_suite, + ' ', cluster[['queue_host']], ':', + remote_ecflow_suite_dir_suite)) + } + t_end_bychunks_setup <- Sys.time() + timings[['bychunks_setup']] <- as.numeric(difftime(t_end_bychunks_setup, + t_begin_bychunks_setup, units = 'secs')) + if (!is_data_dir_shared) { + t_begin_transfer <- Sys.time() + .message("Sending involved files to the cluster file system...") + files_to_send <- NULL + #files_to_check <- NULL + for (cube_header in 1:length(cube_headers)) { + expected_files <- attr(cube_headers[[cube_header]], 'ExpectedFiles') + #files_to_check <- c(files_to_check, expected_files) + #if (cluster[['special_setup']] == 'marenostrum4') { + # expected_files <- paste0('/gpfs/archive/bsc32/', expected_files) + #} + files_to_send <- c(files_to_send, expected_files) + } + #which_files_exist <- sapply(files_to_check, file.exists) + which_files_exist <- sapply(files_to_send, file.exists) + files_to_send <- files_to_send[which_files_exist] + if (cluster[['special_setup']] == 'marenostrum4') { + file_spec <- paste(paste0("/gpfs/archive/bsc32/", + files_to_send), collapse = ' ') + system(paste0("ssh ", cluster[['queue_host']], " 'mkdir -p ", remote_data_dir, + ' ; module load transfer ; cd ', remote_ecflow_suite_dir_suite, + ' ; dtrsync -Rrav ', '\'', file_spec, '\' "', remote_data_dir, '/"', + " ; sleep 1 ; ", + "while [[ ! $(ls dtrsync_*.out 2>/dev/null | wc -l) -ge 1 ]] ; ", + "do sleep 2 ; done", + " ; sleep 1 ; ", + 'while [[ ! $(grep "total size is" dtrsync_*.out | ', + "wc -l) -ge 1 ]] ; ", + "do sleep 5 ; done", "'")) + } else { + file_spec <- paste(files_to_send, collapse = ' :') + system(paste0("ssh ", cluster[['queue_host']], ' "mkdir -p ', + remote_data_dir, '"')) + system(paste0("rsync -Rrav '", file_spec, "' '", + cluster[['queue_host']], ":", remote_data_dir, "/'")) + } + .message("Files sent successfully.") + t_end_transfer <- Sys.time() + timings[['transfer']] <- as.numeric(difftime(t_end_transfer, t_begin_transfer, units = 'secs')) + } else { + timings[['transfer']] <- 0 + } + if (!silent) { + .message(paste0("Processing chunks... ")) + } + time_begin_first_chunk <- Sys.time() + # time_after_first_chunk <- NULL + system(paste0("ecflow_client --begin=STARTR_CHUNKING_", suite_id, + " --host=", ecflow_server[['host']], " --port=", + ecflow_server[['port']])) + + timings[['total']] <- t_begin_total + startr_exec <- list(cluster = cluster, ecflow_server = ecflow_server, + workflow_manager = 'ecFlow', + suite_id = suite_id, chunks = chunks, + num_outputs = length(arrays_of_results), + ecflow_suite_dir = ecflow_suite_dir, + timings = timings) + class(startr_exec) <- 'startR_exec' + if (wait) { + if (!silent) { + .message(paste0("Remaining time estimate soon... 
")) + # while (is.null(time_after_first_chunk)) { + # if (any(grepl('.*\\.Rds$', list.files(ecflow_suite_dir_suite)))) { + # time_after_first_chunk <- Sys.time() + # estimate <- (time_after_first_chunk - + # time_before_first_chunk) * + # ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / + # cluster[['max_jobs']]) + # units(estimate) <- 'mins' + # .message( + # paste0('Remaining time estimate (neglecting queue and ', + # 'merge time) (at ', format(time_after_first_chunk), + # '): ', format(estimate), ' (', + # format(time_after_first_chunk - + # time_before_first_chunk), ' per chunk)') + # ) + # } else if (!cluster[['bidirectional']]) { + # rsync_output <- tryCatch({ + # system(paste0("rsync -ra --ignore-missing-args ", + # cluster[['queue_host']], ":", + # remote_ecflow_suite_dir_suite, "/*.Rds ", + # ecflow_suite_dir_suite, "/"), intern = TRUE) + # }, error = function(e) { + # message("Warning: rsync from remote server to collect results failed. ", + # "Retrying soon.") + # failed <- TRUE + # }) + # Sys.sleep(cluster[['polling_period']]) + # } + # } + startr_exec[['t_begin_first_chunk']] <- time_begin_first_chunk + } + result <- Collect(startr_exec, wait = TRUE) + .message("Computation ended successfully.") + result + } else { + startr_exec + } + } else { + timings[['cores_per_job']] <- NA + timings[['concurrent_chunks']] <- 1 + t_begin_merge <- Sys.time() + for (component in 1:length(arrays_of_results)) { + arrays_of_results[[component]] <- .MergeArrayOfArrays(arrays_of_results[[component]]) + } + t_end_merge <- Sys.time() + timings[['merge']] <- as.numeric(difftime(t_end_merge, t_begin_merge, units = 'secs')) + t_end_total <- t_end_merge + timings[['total']] <- as.numeric(difftime(t_end_total, t_begin_total, units = 'secs')) + message(paste0("* Computation ended successfully.")) + message(paste0("* Number of chunks: ", + timings[['nchunks']])) + message(paste0("* Max. 
number of concurrent chunks (jobs): ",
+                   timings[['concurrent_chunks']]))
+    message(paste0("* Requested cores per job: ",
+                   timings[['cores_per_job']]))
+    message(paste0("* Load threads per chunk: ",
+                   timings[['threads_load']]))
+    message(paste0("* Compute threads per chunk: ",
+                   timings[['threads_compute']]))
+    message(paste0("* Total time (s): ",
+                   timings[['total']]))
+    message(paste0("* Chunking setup: ",
+                   timings[['bychunks_setup']]))
+    message(paste0("* Data upload to cluster: ",
+                   timings[['transfer']]))
+    message(paste0("* All chunks: ",
+                   timings[['total']] -
+                   timings[['bychunks_setup']] -
+                   timings[['transfer']] -
+                   timings[['transfer_back']] -
+                   timings[['merge']]))
+    message(paste0("* Transfer results from cluster: ",
+                   timings[['transfer_back']]))
+    message(paste0("* Merge: ",
+                   timings[['merge']]))
+    message(paste0("* Each chunk: "))
+    message(paste0("* queue: "))
+    message(paste0("* mean: ",
+                   mean(timings[['queue']])))
+    message(paste0("* min: ",
+                   min(timings[['queue']])))
+    message(paste0("* max: ",
+                   max(timings[['queue']])))
+    message(paste0("* job setup: "))
+    message(paste0("* mean: ",
+                   mean(timings[['job_setup']])))
+    message(paste0("* min: ",
+                   min(timings[['job_setup']])))
+    message(paste0("* max: ",
+                   max(timings[['job_setup']])))
+    message(paste0("* load: "))
+    message(paste0("* mean: ",
+                   mean(timings[['load']])))
+    message(paste0("* min: ",
+                   min(timings[['load']])))
+    message(paste0("* max: ",
+                   max(timings[['load']])))
+    message(paste0("* compute: "))
+    message(paste0("* mean: ",
+                   mean(timings[['compute']])))
+    message(paste0("* min: ",
+                   min(timings[['compute']])))
+    message(paste0("* max: ",
+                   max(timings[['compute']])))
+    attr(arrays_of_results, 'startR_compute_profiling') <- timings
+    arrays_of_results
+  }
+  #TODO: check result dimensions match expected dimensions
+}
+
+ByChunks <- function(step_fun, cube_headers, ..., chunks = 'auto',
+                     threads_load = 2, threads_compute = 1,
+                     cluster = NULL,
+                     ecflow_suite_dir = NULL,
+                     ecflow_server = NULL,
+                     silent = FALSE, debug = FALSE,
+                     wait = TRUE) {
+
+  stop(.Deprecated("ByChunks_ecflow"))
+}
+
diff --git a/modules/Loading/tmp/startR/R/CDORemapper.R b/modules/Loading/tmp/startR/R/CDORemapper.R
new file mode 100644
index 00000000..60aa0e2d
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/CDORemapper.R
@@ -0,0 +1,123 @@
+#'CDO Remap Data Transformation for 'startR'
+#'
+#'This is a transform function that uses CDO software to remap longitude-latitude
+#'data subsets onto a specified target grid, intended for use as parameter
+#''transform' in a Start() call. This function complies with the input/output
+#'interface required by Start() defined in the documentation for the parameter
+#''transform' of function Start().\cr\cr
+#'This function uses the function CDORemap() in the package 's2dv' to
+#'perform the interpolation, hence CDO is required to be installed.
+#'
+#'@param data_array A data array to be transformed. See details in the
+#'  documentation of the parameter 'transform' of the function Start().
+#'@param variables A list of auxiliary variables required for the transformation,
+#'  automatically provided by Start(). See details in the documentation of the
+#'  parameter 'transform' of the function Start().
+#'@param file_selectors A character vector indicating the information of the path
+#'  of the file that the parameter 'data_array' comes from. See details in the
+#'  documentation of the parameter 'transform' of the function Start(). The
+#'  default value is NULL.
+#'@param crop_domain A list of the transformed domain of each transform
+#'  variable, automatically provided by Start().
+#'@param \dots A list of additional parameters to adjust the transform process,
+#'  as provided in the parameter 'transform_params' in a Start() call. See details
+#'  in the documentation of the parameter 'transform' of the function Start().
+#'
+#'@return An array with the same amount of dimensions as the input data array,
+#'  potentially with different sizes, and potentially with the attribute
+#'  'variables' with additional auxiliary data. See details in the documentation
+#'  of the parameter 'transform' of the function Start().
+#'@seealso \code{\link[s2dv]{CDORemap}}
+#'
+#'@examples
+#'# Used in Start():
+#'  data_path <- system.file('extdata', package = 'startR')
+#'  path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc')
+#'  sdates <- c('200011')
+#'  \dontrun{
+#'  data <- Start(dat = list(list(path = path_obs)),
+#'                var = 'tos',
+#'                sdate = sdates,
+#'                time = 'all',
+#'                latitude = values(list(-60, 60)),
+#'                latitude_reorder = Sort(decreasing = TRUE),
+#'                longitude = values(list(-120, 120)),
+#'                longitude_reorder = CircularSort(-180, 180),
+#'                transform = CDORemapper,
+#'                transform_params = list(grid = 'r360x181',
+#'                                        method = 'conservative'),
+#'                transform_vars = c('latitude', 'longitude'),
+#'                return_vars = list(latitude = 'dat',
+#'                                   longitude = 'dat',
+#'                                   time = 'sdate'),
+#'                retrieve = FALSE)
+#'  }
+#'@importFrom s2dv CDORemap
+#'@importFrom utils getFromNamespace
+#'@export
+CDORemapper <- function(data_array, variables, file_selectors = NULL,
+                        crop_domain = NULL, ...) {
+  file_dims <- names(file_selectors)
+  known_lon_names <- getFromNamespace('.KnownLonNames', 'startR')()
+  known_lat_names <- getFromNamespace('.KnownLatNames', 'startR')()
+  if (!any(known_lon_names %in% names(variables)) ||
+      !any(known_lat_names %in% names(variables))) {
+    stop("The longitude and latitude variables must be requested in ",
+         "'return_vars' and specified in 'transform_vars' for the ",
+         "CDORemapper to work.")
+  }
+  lon_name <- names(variables)[which(names(variables) %in% known_lon_names)[1]]
+  lons <- variables[[lon_name]]
+  if (!is.null(dim(lons))) {
+    dims_to_subset <- which(names(dim(lons)) %in% file_dims)
+    if (length(dims_to_subset) > 0) {
+      lons_to_use <- as.list(rep(TRUE, length(dim(lons))))
+      names(lons_to_use) <- names(dim(lons))
+      lons_to_use[dims_to_subset] <- as.list(rep(1, length(dims_to_subset)))
+      attr_bk <- attributes(lons)
+      lons <- do.call('[', c(list(x = lons), lons_to_use, list(drop = TRUE)))
+      attributes(lons) <- attr_bk
+    }
+  }
+  lat_name <- names(variables)[which(names(variables) %in% known_lat_names)[1]]
+  lats <- variables[[lat_name]]
+  if (!is.null(dim(lats))) {
+    dims_to_subset <- which(names(dim(lats)) %in% file_dims)
+    if (length(dims_to_subset) > 0) {
+      lats_to_use <- as.list(rep(TRUE, length(dim(lats))))
+      names(lats_to_use) <- names(dim(lats))
+      lats_to_use[dims_to_subset] <- as.list(rep(1, length(dims_to_subset)))
+      # Back up and restore the attributes of 'lats' (not 'lons') here
+      attr_bk <- attributes(lats)
+      lats <- do.call('[', c(list(x = lats), lats_to_use, list(drop = TRUE)))
+      attributes(lats) <- attr_bk
+    }
+  }
+  extra_params <- list(...)
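+  # NOTE: Illustrative only (not in the original source): 'extra_params'
+  # collects what was passed as 'transform_params' in the Start() call, e.g.
+  # from the example in the documentation above:
+  #   extra_params <- list(grid = 'r360x181', method = 'conservative')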
+
+  if (!all(c('grid', 'method') %in% names(extra_params))) {
+    stop("Parameters 'grid' and 'method' must be specified for the ",
+         "CDORemapper, via the 'transform_params' argument.")
+  }
+
+  # Use crop_domain to get 'crop'
+  if (!is.null(crop_domain)) {
+    ## lon
+    lon_name <- names(crop_domain)[which(names(crop_domain) %in% known_lon_names)]
+    crop_lon <- unlist(crop_domain[[lon_name]])
+    ## lat
+    lat_name <- names(crop_domain)[which(names(crop_domain) %in% known_lat_names)]
+    crop_lat <- unlist(crop_domain[[lat_name]])
+    crop_values <- c(crop_lon, crop_lat)
+
+    if ('crop' %in% names(extra_params)) {
+      warning("Argument 'crop' in 'transform_params' for CDORemapper() is ",
+              "deprecated. It is automatically assigned as the selected domain ",
+              "in the Start() call.")
+    }
+    extra_params[['crop']] <- crop_values
+  }
+
+  result <- do.call(s2dv::CDORemap, c(list(data_array, lons, lats), extra_params))
+  return_variables <- list(result$lons, result$lats)
+  names(return_variables) <- c(lon_name, lat_name)
+  list(data_array = result$data_array, variables = return_variables)
+}
diff --git a/modules/Loading/tmp/startR/R/Collect.R b/modules/Loading/tmp/startR/R/Collect.R
new file mode 100644
index 00000000..5ae8b150
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/Collect.R
@@ -0,0 +1,461 @@
+#'Collect and merge the computation results
+#'
+#'The final step of the startR workflow after the data operation. It is used when
+#'the parameter 'wait' of Compute() is FALSE. It combines all the chunks of the
+#'results into one data array when the execution is done. See more details in the
+#'\href{https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/practical_guide.md}{practical guide}.
+#'Collect() calls Collect_ecflow() or Collect_autosubmit() according to the
+#'chosen workflow manager.
+#'@param startr_exec An R object returned by Compute() when the parameter 'wait'
+#'  of Compute() is FALSE. It can come directly from a Compute() call or be read
+#'  from an RDS file.
+#'@param wait A logical value deciding whether the R session waits for the
+#'  Collect() call to finish (TRUE) or not (FALSE). If TRUE, it will be a
+#'  blocking call, in which Collect() will retrieve information from the HPC,
+#'  including signals and outputs, every polling_period seconds. The status
+#'  can be monitored on the workflow manager GUI. Collect() will not return
+#'  until the results of all the chunks have been received. If FALSE, Collect()
+#'  returns an error if the execution has not finished yet; otherwise it returns
+#'  the merged array. The default value is TRUE.
+#'@param remove A logical value deciding whether to remove all the chunk results
+#'  received from the HPC after the data have been collected, as well as the
+#'  local job folder under 'ecflow_suite_dir' or 'autosubmit_suite_dir'. To
+#'  preserve the data and Collect() them as many times as desired, set remove to
+#'  FALSE. The default value is TRUE.
+#'@param on_remote A logical value deciding whether the function is run locally,
+#'  syncing the outputs back from the HPC (FALSE, default), or run directly on
+#'  the HPC (TRUE).
+#'@return A list of merged data arrays.
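+#'  The attribute 'startR_compute_profiling' is attached to the result,
+#'  carrying the timing information that is also printed once the collection
+#'  finishes.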
+#' +#'@examples +#' data_path <- system.file('extdata', package = 'startR') +#' path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc') +#' sdates <- c('200011', '200012') +#' data <- Start(dat = list(list(path = path_obs)), +#' var = 'tos', +#' sdate = sdates, +#' time = 'all', +#' latitude = 'all', +#' longitude = 'all', +#' return_vars = list(latitude = 'dat', +#' longitude = 'dat', +#' time = 'sdate'), +#' retrieve = FALSE) +#' fun <- function(x) { +#' lat = attributes(x)$Variables$dat1$latitude +#' weight = sqrt(cos(lat * pi / 180)) +#' corrected = Apply(list(x), target_dims = "latitude", +#' fun = function(x) {x * weight}) +#' } +#' step <- Step(fun = fun, +#' target_dims = 'latitude', +#' output_dims = 'latitude', +#' use_libraries = c('multiApply'), +#' use_attributes = list(data = "Variables")) +#' wf <- AddStep(data, step) +#' \dontrun{ +#' res <- Compute(wf, chunks = list(longitude = 2, sdate = 2), +#' threads_load = 1, +#' threads_compute = 4, +#' cluster = list(queue_host = 'nord3', +#' queue_type = 'lsf', +#' temp_dir = '/on_hpc/tmp_dir/', +#' cores_per_job = 2, +#' job_wallclock = '05:00', +#' max_jobs = 4, +#' extra_queue_params = list('#BSUB -q bsc_es'), +#' bidirectional = FALSE, +#' polling_period = 10 +#' ), +#' ecflow_suite_dir = '/on_local_machine/username/ecflow_dir/', +#' wait = FALSE) +#' saveRDS(res, file = 'test_collect.Rds') +#' collect_info <- readRDS('test_collect.Rds') +#' result <- Collect(collect_info, wait = TRUE) +#' } +#' +#'@export +Collect <- function(startr_exec, wait = TRUE, remove = TRUE, on_remote = FALSE) { + # Parameter checks + if (!is(startr_exec, 'startR_exec')) { + stop("Parameter 'startr_exec' must be an object of the class ", + "'startR_exec', as returned by Compute(..., wait = FALSE).") + } + if (!tolower(startr_exec$workflow_manager) %in% c('ecflow', 'autosubmit')) { + stop("Cannot identify the workflow manager. Check the value of 'startr_exec$workflow_manager', which should be 'ecFlow' or 'Autosubmit'.") + } + if (!is.logical(wait)) { + stop("Parameter 'wait' must be logical.") + } + if (!is.logical(remove)) { + stop("Parameter 'remove' must be logical.") + } + if (!is.logical(on_remote)) { + stop("Parameter 'on_remote' must be logical.") + } + + if (tolower(startr_exec$workflow_manager) == 'ecflow') { + res <- Collect_ecflow(startr_exec, wait = wait, remove = remove, on_remote = on_remote) + } else if (tolower(startr_exec$workflow_manager) == 'autosubmit') { + res <- Collect_autosubmit(startr_exec, wait = wait, remove = remove, on_remote = on_remote) + } + + return(res) +} + +Collect_ecflow <- function(startr_exec, wait = TRUE, remove = TRUE, on_remote = FALSE) { + + if (!on_remote && Sys.which('ecflow_client') == '') { + stop("ecFlow must be installed in order to collect results from a ", + "Compute() execution.") + } + cluster <- startr_exec[['cluster']] + ecflow_server <- startr_exec[['ecflow_server']] + suite_id <- startr_exec[['suite_id']] + chunks <- startr_exec[['chunks']] + num_outputs <- startr_exec[['num_outputs']] + ecflow_suite_dir <- startr_exec[['ecflow_suite_dir']] + timings <- startr_exec[['timings']] + ecflow_suite_dir_suite <- paste0(ecflow_suite_dir, '/STARTR_CHUNKING_', + suite_id, '/') + if (!is.null(cluster[['temp_dir']])) { #NOTE: Which case doesn't have temp_dir? 
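+    # The remote tree mirrors the local one: each chunk leaves its result as an
+    # .Rds file under <temp_dir>/STARTR_CHUNKING_<suite_id>/ on the HPC, which
+    # is what gets rsync'ed back further below when the cluster is not
+    # bidirectional.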
+ remote_ecflow_suite_dir <- cluster[['temp_dir']] + remote_ecflow_suite_dir_suite <- paste0(cluster[['temp_dir']], + '/STARTR_CHUNKING_', + suite_id, '/') + } + find_task_name <- function(received_file) { + file_name <- received_file + parts <- strsplit(file_name, '__')[[1]] + parts <- parts[c(2:(length(parts) - 1))] + chunk_indices <- rev(sapply(parts, function(x) { + as.numeric(strsplit(x, '_')[[1]][2]) + })) + task_pattern <- paste(paste0('*_', chunk_indices, '/'), + collapse = '') + task_glob <- paste0(ecflow_suite_dir_suite, '/*/*/', + task_pattern) + task_path <- Sys.glob(task_glob) + if (length(task_path) != 1) { + stop("Unexpected error while receiving results.") + } + task_name <- strsplit(task_path, 'computation')[[1]][2] + task_name <- paste0('/STARTR_CHUNKING_', suite_id, + '/computation', task_name) + task_name + } + done <- FALSE + attempt <- 1 + if (!on_remote) { + sum_received_chunks <- sum(grepl('.*\\.Rds$', + list.files(ecflow_suite_dir_suite))) + } + + if (cluster[['bidirectional']]) { + t_transfer_back <- NA + } else { + t_transfer_back <- 0 + } + time_before_first_chunk <- startr_exec[['t_begin_first_chunk']] + first_chunk_received <- FALSE + rsync_petition_file_lines <- c('+ *.Rds', '+ *.timings', '+ *.crashed', + '+ *.running', '- *') + rsync_petition_file <- tempfile() + writeLines(rsync_petition_file_lines, rsync_petition_file) + Sys.sleep(2) + while (!done) { + if (!on_remote) { + if (cluster[['bidirectional']]) { + status <- system(paste0("ecflow_client --get_state=STARTR_CHUNKING_", + suite_id, " --host=", + ecflow_server[['host']], " --port=", ecflow_server[['port']]), + intern = TRUE) + if (any(grepl(paste0("suite STARTR_CHUNKING_", suite_id, " #.* state:complete"), status))) { + done <- TRUE + } else if (!wait) { + stop("Computation in progress...") + } + if (!first_chunk_received) { + if (any(grepl('state:complete', status))) { + if (!is.null(time_before_first_chunk)) { + time_after_first_chunk <- Sys.time() + estimate <- (time_after_first_chunk - + time_before_first_chunk) * + ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / + cluster[['max_jobs']]) + units(estimate) <- 'mins' + .message( + paste0('Remaining time estimate (neglecting queue and ', + 'merge time) (at ', format(time_after_first_chunk), + '): ', format(estimate), ' (', + format(time_after_first_chunk - + time_before_first_chunk), ' per chunk)') + ) + } + first_chunk_received <- TRUE + } + } + Sys.sleep(min(sqrt(attempt), 5)) + } else { + #if (sum_received_chunks == 0) { + # # Accounting for the fist chunk received in ByChunks and + # # setting it to complete + # # ByChunks needs the first chunk to calculate remaining time + # received_files <- list.files(ecflow_suite_dir_suite) + # received_chunks <- received_files[grepl('Rds$', + # received_files)] + #} + failed <- FALSE + t_begin_transfer_back <- Sys.time() + rsync_output <- tryCatch({ + system(paste0("rsync -rav --include-from=", rsync_petition_file, " '", + cluster[['queue_host']], ":", remote_ecflow_suite_dir_suite, "' ", + ecflow_suite_dir_suite, "/"), intern = TRUE) + }, error = function(e) { + message("Warning: rsync from remote server to collect results failed. 
", + "Retrying soon.") + failed <- TRUE + }) + t_end_transfer_back <- Sys.time() + t_transfer_back <- t_transfer_back + as.numeric(difftime(t_end_transfer_back, + t_begin_transfer_back, units = 'secs')) + if (!failed) { + #if (sum_received_chunks == 0) { + # rsync_output <- c(rsync_output, received_chunks) + #} + received_running <- grepl('running$', rsync_output) + for (received_chunk_index in which(received_running)) { + file_name <- rsync_output[received_chunk_index] + task_name <- find_task_name(file_name) + system(paste0('ecflow_client --force=active recursive ', + task_name, + " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + } + received_crashed <- grepl('crashed$', rsync_output) + for (received_chunk_index in which(received_crashed)) { + file_name <- rsync_output[received_chunk_index] + task_name <- find_task_name(file_name) + system(paste0('ecflow_client --force=aborted recursive ', + task_name, + " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + } + received_chunks <- grepl('Rds$', rsync_output) + for (received_chunk_index in which(received_chunks)) { + file_name <- rsync_output[received_chunk_index] + task_name <- find_task_name(file_name) + system(paste0('ecflow_client --force=complete recursive ', + task_name, + " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + sum_received_chunks <- sum_received_chunks + 1 + if (!first_chunk_received) { + if (!is.null(time_before_first_chunk)) { + time_after_first_chunk <- Sys.time() + estimate <- (time_after_first_chunk - + time_before_first_chunk) * + ceiling((prod(unlist(chunks)) - cluster[['max_jobs']]) / + cluster[['max_jobs']]) + units(estimate) <- 'mins' + .message( + paste0('Remaining time estimate (neglecting queue and ', + 'merge time) (at ', format(time_after_first_chunk), + '): ', format(estimate), ' (', + format(time_after_first_chunk - + time_before_first_chunk), ' per chunk)') + ) + } + first_chunk_received <- TRUE + } + } + if (sum_received_chunks / num_outputs == prod(unlist(chunks))) { + done <- TRUE + } else if (!wait) { + stop("Computation in progress...") + } + } + Sys.sleep(cluster[['polling_period']]) + } + + } else { # on_remote + + sum_received_chunks <- sum(grepl('.*\\.Rds$', list.files(remote_ecflow_suite_dir_suite ))) + + if (sum_received_chunks / num_outputs == prod(unlist(chunks))) { + done <- TRUE + } else if (!wait) { + stop("Computation in progress...") + } else { + message("Computation in progress, ", sum_received_chunks, " of ", prod(unlist(chunks)), " chunks are done.") + message("Will try again after polling_period...") + Sys.sleep(cluster[['polling_period']]) + } + + } + attempt <- attempt + 1 + } + file.remove(rsync_petition_file) + timings[['transfer_back']] <- t_transfer_back + if (!on_remote && !is.null(cluster[['temp_dir']])) { + system(paste0('ssh ', cluster[['queue_host']], ' "rm -rf ', + remote_ecflow_suite_dir_suite, '"')) + } + if (remove) { + .warning("ATTENTION: The source chunks will be removed from the ", + "system. 
Store the result after Collect() ends if needed.") + } + if (!on_remote) { + target_folder <- ecflow_suite_dir + target_folder_suite <- ecflow_suite_dir_suite + } else { + target_folder <- remote_ecflow_suite_dir + target_folder_suite <- remote_ecflow_suite_dir_suite + } + t_begin_merge <- Sys.time() + result <- .MergeChunks(target_folder, suite_id, remove) + t_end_merge <- Sys.time() + timings[['merge']] <- as.numeric(difftime(t_end_merge, t_begin_merge, units = 'secs')) + received_files <- list.files(target_folder_suite, full.names = TRUE) + received_timings_files <- received_files[grepl('timings$', received_files)] + for (timings_file in received_timings_files) { + times <- readRDS(timings_file) + timings[['queue']] <- c(timings[['queue']], times['queue']) + timings[['job_setup']] <- c(timings[['job_setup']], times['job_setup']) + timings[['load']] <- c(timings[['load']], times['load']) + timings[['compute']] <- c(timings[['compute']], times['compute']) + } + if (remove) { + if (!on_remote) { + system(paste0("ecflow_client --delete=force yes /STARTR_CHUNKING_", + suite_id, " --host=", ecflow_server[['host']], + " --port=", ecflow_server[['port']])) + } + unlink(target_folder_suite, recursive = TRUE) + } + if (attempt > 2) { + t_end_total <- Sys.time() + timings[['total']] <- as.numeric(difftime(t_end_total, timings[['total']], units = 'secs')) + } else { + # When attempt <= 2, it means all results were ready possibly from + # long ago, so is not straightfowrard to work out total time. + timings[['total']] <- NA + } + message(paste0("* Computation ended successfully.")) + message(paste0("* Number of chunks: ", + timings[['nchunks']])) + message(paste0("* Max. number of concurrent chunks (jobs): ", + timings[['concurrent_chunks']])) + message(paste0("* Requested cores per job: ", + timings[['cores_per_job']])) + message(paste0("* Load threads per chunk: ", + timings[['threads_load']])) + message(paste0("* Compute threads per chunk: ", + timings[['threads_compute']])) + message(paste0("* Total time (s): ", + timings[['total']])) + message(paste0("* Chunking setup: ", + timings[['bychunks_setup']])) + message(paste0("* Data upload to cluster: ", + timings[['transfer']])) + message(paste0("* All chunks: ", + timings[['total']] - + timings[['bychunks_setup']] - + timings[['transfer']] - + timings[['transfer_back']] - + timings[['merge']])) + message(paste0("* Transfer results from cluster: ", + timings[['transfer_back']])) + message(paste0("* Merge: ", + timings[['merge']])) + message(paste0("* Each chunk: ")) + message(paste0("* queue: ")) + message(paste0("* mean: ", + mean(timings[['queue']]))) + message(paste0("* min: ", + min(timings[['queue']]))) + message(paste0("* max: ", + max(timings[['queue']]))) + message(paste0("* job setup: ")) + message(paste0("* mean: ", + mean(timings[['job_setup']]))) + message(paste0("* min: ", + min(timings[['job_setup']]))) + message(paste0("* max: ", + max(timings[['job_setup']]))) + message(paste0("* load: ")) + message(paste0("* mean: ", + mean(timings[['load']]))) + message(paste0("* min: ", + min(timings[['load']]))) + message(paste0("* max: ", + max(timings[['load']]))) + message(paste0("* compute: ")) + message(paste0("* mean: ", + mean(timings[['compute']]))) + message(paste0("* min: ", + min(timings[['compute']]))) + message(paste0("* max: ", + max(timings[['compute']]))) + #system("ecflow_client --shutdown --port=5678") + #system("ecflow_stop.sh -p 5678") + #result <- readRDS(paste0(ecflow_output_dir, '/result.Rds')) + 
+    #file.remove(paste0(ecflow_output_dir, '/result.Rds'))
+  attr(result, 'startR_compute_profiling') <- timings
+  result
+}
+
+
+
+Collect_autosubmit <- function(startr_exec, wait = TRUE, remove = TRUE, on_remote = FALSE) {
+
+  suite_id <- startr_exec[['suite_id']]
+  chunks <- startr_exec[['chunks']]
+  num_outputs <- startr_exec[['num_outputs']]
+  autosubmit_suite_dir <- startr_exec[['autosubmit_suite_dir']]
+  autosubmit_suite_dir_suite <- paste0(autosubmit_suite_dir, '/STARTR_CHUNKING_', suite_id, '/')
+  remote_autosubmit_suite_dir <- file.path("/esarchive/autosubmit/", suite_id, 'proj')
+  remote_autosubmit_suite_dir_suite <- paste0(remote_autosubmit_suite_dir, '/STARTR_CHUNKING_', suite_id, '/')
+  run_dir <- startr_exec$cluster[['run_dir']]
+
+  done <- FALSE
+
+  while (!done) { # If wait, try until it is done
+    sum_received_chunks <- sum(grepl('.*\\.Rds$', list.files(remote_autosubmit_suite_dir_suite)))
+    if (sum_received_chunks / num_outputs == prod(unlist(chunks))) {
+      done <- TRUE
+
+    } else if (!wait) {
+      stop("Computation in progress...")
+    } else {
+      message("Computation in progress, ", sum_received_chunks, " of ", prod(unlist(chunks)), " chunks are done...\n",
+              "Check status on Autosubmit GUI: https://earth.bsc.es/autosubmitapp/experiment/", suite_id)
+      Sys.sleep(startr_exec$cluster[['polling_period']])
+    }
+
+  } # while !done
+
+  result <- .MergeChunks(remote_autosubmit_suite_dir, suite_id, remove = remove)
+  if (remove) {
+    .warning("ATTENTION: The source chunks will be removed from the ",
+             "system. Store the result after Collect() ends if needed.")
+    unlink(paste0(autosubmit_suite_dir_suite),
+           recursive = TRUE)
+  }
+
+  # Remove bigmemory objects (e.g., a68h_1_1_1_1_1 and a68h_1_1_1_1_1.desc)
+  # If run_dir is specified, the files are under run_dir; if not, files are under proj/STARTR_CHUNKING_xxxx/
+  if (!is.null(run_dir)) {
+    file.remove(
+      file.path(run_dir,
+                list.files(run_dir)[grepl(paste0("^", suite_id, "_.*"), list.files(run_dir))])
+    )
+  } else {
+    file.remove(
+      file.path(remote_autosubmit_suite_dir_suite,
+                list.files(remote_autosubmit_suite_dir_suite)[grepl(paste0("^", suite_id, "_.*"), list.files(remote_autosubmit_suite_dir_suite))])
+    )
+  }
+
+  return(result)
+}
diff --git a/modules/Loading/tmp/startR/R/Compute.R b/modules/Loading/tmp/startR/R/Compute.R
new file mode 100644
index 00000000..5a58abd9
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/Compute.R
@@ -0,0 +1,197 @@
+#'Specify the execution parameters and trigger the execution
+#'
+#'The step of the startR workflow after the complete workflow is defined by
+#'AddStep(). This function specifies the execution parameters and triggers the
+#'execution. The execution can be operated locally or on a remote machine. In
+#'the latter case, the configuration of the machine needs to be specified in
+#'the function, and the EC-Flow server is required to be installed.\cr\cr
+#'The execution can be operated by chunks to avoid overloading the RAM.
+#'After all the chunks are finished, Compute() will gather and merge them, and
+#'return a single data object, including one or multiple multidimensional data
+#'arrays and additional metadata.
+#'
+#'@param workflow A list of the class 'startR_workflow' returned by function
+#'  AddStep() or of class 'startR_cube' returned by function Start(). It
+#'  contains all the objects needed for the execution.
+#'@param chunks A named list of the dimensions along which to split the data,
+#'  and the number of chunks to make for each. The chunked dimensions can only
+#'  be those not required as target dimensions in function Step(). The default
+#'  value is 'auto', which lists all the non-target dimensions, with one chunk
+#'  each.
+#'@param threads_load An integer indicating the number of parallel execution
+#'  cores to use for the data retrieval stage. The default value is 1.
+#'@param threads_compute An integer indicating the number of parallel execution
+#'  cores to use for the computation. The default value is 1.
+#'@param cluster A list of components that define the configuration of the
+#'  machine to be run on. The components vary for the different machines.
+#'  Check the \href{https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/practical_guide.md}{practical guide on GitLab} for more
+#'  details and examples. Only needed when the computation is not run locally.
+#'  The default value is NULL.
+#'@param workflow_manager Can be NULL, 'ecFlow' or 'Autosubmit'. The default is
+#'  'ecFlow'.
+#'@param ecflow_suite_dir A character string indicating the path to a folder on
+#'  the local workstation where to store temporary files generated for the
+#'  automatic management of the workflow. Only needed when the execution is run
+#'  remotely. The default value is NULL.
+#'@param ecflow_server A named vector indicating the host and port of the
+#'  EC-Flow server. The vector form should be
+#'  \code{c(host = 'hostname', port = port_number)}. Only needed when the
+#'  execution is run remotely. The default value is NULL.
+#'@param autosubmit_suite_dir A character string indicating the path to a folder
+#'  where to store temporary files generated for the automatic management of the
+#'  workflow manager. This path should be available on the local workstation as
+#'  well as on the Autosubmit machine. The default value is NULL, and a
+#'  temporary folder under the current working folder will be created.
+#'@param autosubmit_server A character vector indicating the login node of the
+#'  Autosubmit machine. It can be "bscesautosubmit01" or "bscesautosubmit02".
+#'  The default value is NULL, and the node will be randomly chosen.
+#'@param silent A logical value deciding whether to print the computation
+#'  progress (FALSE) on the R session or not (TRUE). It only works when the
+#'  execution runs locally or the parameter 'wait' is TRUE. The default value
+#'  is FALSE.
+#'@param debug A logical value deciding whether to return detailed messages on
+#'  the progress and operations in a Compute() call (TRUE) or not (FALSE).
+#'  Automatically changed to FALSE if parameter 'silent' is TRUE. The default
+#'  value is FALSE.
+#'@param wait A logical value deciding whether the R session waits for the
+#'  Compute() call to finish (TRUE) or not (FALSE). If FALSE, it will return an
+#'  object with all the information of the startR execution that can be stored
+#'  on your disk. After that, the R session can be closed and the results can
+#'  be collected later with the Collect() function. The default value is TRUE.
+#'
+#'@return A list of data arrays for the output returned by the last step in the
+#'  specified workflow (wait = TRUE), or an object with information about the
+#'  startR execution (wait = FALSE). The configuration details and profiling
+#'  information are attached as attributes to the returned list of arrays.
+#'@examples +#' data_path <- system.file('extdata', package = 'startR') +#' path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc') +#' sdates <- c('200011', '200012') +#' data <- Start(dat = list(list(path = path_obs)), +#' var = 'tos', +#' sdate = sdates, +#' time = 'all', +#' latitude = 'all', +#' longitude = 'all', +#' return_vars = list(latitude = 'dat', +#' longitude = 'dat', +#' time = 'sdate'), +#' retrieve = FALSE) +#' fun <- function(x) { +#' lat = attributes(x)$Variables$dat1$latitude +#' weight = sqrt(cos(lat * pi / 180)) +#' corrected = Apply(list(x), target_dims = "latitude", +#' fun = function(x) {x * weight}) +#' } +#' step <- Step(fun = fun, +#' target_dims = 'latitude', +#' output_dims = 'latitude', +#' use_libraries = c('multiApply'), +#' use_attributes = list(data = "Variables")) +#' wf <- AddStep(data, step) +#' res <- Compute(wf, chunks = list(longitude = 4, sdate = 2)) +#' +#'@importFrom methods is +#'@export +Compute <- function(workflow, chunks = 'auto', workflow_manager = 'ecFlow', + threads_load = 1, threads_compute = 1, + cluster = NULL, ecflow_suite_dir = NULL, ecflow_server = NULL, + autosubmit_suite_dir = NULL, autosubmit_server = NULL, + silent = FALSE, debug = FALSE, wait = TRUE) { + # Check workflow + if (!is(workflow, 'startR_cube') & !is(workflow, 'startR_workflow')) { + stop("Parameter 'workflow' must be an object of class 'startR_cube' as ", + "returned by Start or of class 'startR_workflow' as returned by ", + "AddStep.") + } + + if (is(workflow, 'startR_cube')) { + #machine_free_ram <- 1000000000 + #max_ram_ratio <- 0.5 + #data_size <- prod(c(attr(workflow, 'Dimensions'), 8)) + #if (data_size > (machine_free_ram * max_ram_ratio)) { + # stop("It is not possible to fit the requested data (", data_size, + # " bytes) into the maximum allowed free ram (", max_ram_ratio, + # " x ", machine_free_ram, ").") + #} + eval(workflow) + } else { + # TODO: + #explore tree of operations and identify set of operations that reduce dimensionality as much as possible + # while being able to fit in (cluster and to exploit number of available nodes) | (machine) + #combine set of operations into a single function + #Goal: to build manually a function following this pattern: + #operation <- function(input1, input2) { + # fun1 <- workflow$fun + # fun1(input1, input2, names(workflow$params)[1] = workflow$params[[1]]) + #} + op_text <- "function(" + op_text <- paste0(op_text, + paste(paste0('input', 1:length(workflow$inputs)), + collapse = ', ')) + op_text <- paste0(op_text, ") {") + op_text <- paste0(op_text, "\n fun1 <- ", paste(deparse(workflow$fun), collapse = '\n')) + op_text <- paste0(op_text, "\n res <- fun1(", + paste(paste0('input', 1:length(workflow$inputs)), + collapse = ", ")) + if (length(workflow$params) > 0) { + for (j in 1:length(workflow$params)) { + op_text <- paste0(op_text, ", ") + op_text <- paste0(op_text, names(workflow$params)[j], " = ", + paste(deparse(workflow$params[[j]]), collapse = '\n')) + } + } + op_text <- paste0(op_text, ")") + op_text <- paste0(op_text, "\n}") + operation <- eval(parse(text = op_text)) + operation <- Step(operation, + attr(workflow$fun, 'TargetDims'), + attr(workflow$fun, 'OutputDims'), + attr(workflow$fun, 'UseLibraries'), + attr(workflow$fun, 'UseAttributes')) + + if (!all(sapply(workflow$inputs, class) == 'startR_cube')) { + stop("Workflows with only one step supported by now.") + } + + # Run ByChunks with the chosen operation + if (!is.null(cluster)) { + if (is.null(workflow_manager)) { + stop("Specify 
parameter 'workflow_manager' as 'ecFlow' or 'Autosubmit'.") + } else if (!tolower(workflow_manager) %in% c('ecflow', 'autosubmit')) { + stop("Parameter 'workflow_manager' can only be 'ecFlow' or 'Autosubmit'.") + } + } else { # run locally + workflow_manager <- 'ecflow' + } + + if (tolower(workflow_manager) == 'ecflow') { + # ecFlow or run locally + res <- ByChunks_ecflow(step_fun = operation, + cube_headers = workflow$inputs, + chunks = chunks, + threads_load = threads_load, + threads_compute = threads_compute, + cluster = cluster, + ecflow_suite_dir = ecflow_suite_dir, + ecflow_server = ecflow_server, + silent = silent, debug = debug, wait = wait) + } else { + res <- ByChunks_autosubmit(step_fun = operation, + cube_headers = workflow$inputs, + chunks = chunks, + threads_load = threads_load, + threads_compute = threads_compute, + cluster = cluster, + autosubmit_suite_dir = autosubmit_suite_dir, + autosubmit_server = autosubmit_server, + silent = silent, debug = debug, wait = wait) + + } + + # TODO: carry out remaining steps locally, using multiApply + # Return results + res + } +} diff --git a/modules/Loading/tmp/startR/R/NcCloser.R b/modules/Loading/tmp/startR/R/NcCloser.R new file mode 100644 index 00000000..476592ee --- /dev/null +++ b/modules/Loading/tmp/startR/R/NcCloser.R @@ -0,0 +1,25 @@ +#'NetCDF file closer for 'startR' +#' +#'This is a file closer function for NetCDF files, intended for use as +#'parameter 'file_closer' in a Start() call. This function complies with the +#'input/output interface required by Start() defined in the documentation for +#'the parameter 'file_closer'.\cr\cr +#'This function uses the function NcClose() in the package 'easyNCDF', +#'which in turn uses nc_close() in the package 'ncdf4'. +#' +#'@param file_object An open connection to a NetCDF file, optionally with +#' additional header information. See details in the documentation of the +#' parameter 'file_closer' of the function Start(). +#'@return This function returns NULL. +#'@examples +#'data_path <- system.file('extdata', package = 'startR') +#'path_obs <- file.path(data_path, 'obs/monthly_mean/tos/tos_200011.nc') +#'connection <- NcOpener(path_obs) +#'NcCloser(connection) +#'@seealso \code{\link{NcOpener}} \code{\link{NcDataReader}} +#' \code{\link{NcDimReader}} \code{\link{NcVarReader}} +#'@import easyNCDF +#'@export +NcCloser <- function(file_object) { + easyNCDF::NcClose(file_object) +} diff --git a/modules/Loading/tmp/startR/R/NcDataReader.R b/modules/Loading/tmp/startR/R/NcDataReader.R new file mode 100644 index 00000000..47817c65 --- /dev/null +++ b/modules/Loading/tmp/startR/R/NcDataReader.R @@ -0,0 +1,395 @@ +#'NetCDF file data reader for 'startR' +#' +#'This is a data reader function for NetCDF files, intended for use as parameter +#'file_data_reader in a Start() call. This function complies with the +#'input/output interface required by Start() defined in the documentation for +#'the parameter 'file_data_reader'.\cr\cr +#'This function uses the function NcToArray() in the package 'easyNCDF', which +#'in turn uses nc_var_get() in the package 'ncdf4'. +#' +#'@param file_path A character string indicating the path to the data file to +#' read. See details in the documentation of the parameter 'file_data_reader' +#' of the function Start(). The default value is NULL. +#'@param file_object An open connection to a NetCDF file, optionally with +#' additional header information. See details in the documentation of the +#' parameter 'file_data_reader' of the function Start(). 
The default value is +#' NULL. +#'@param file_selectors A named list containing the information of the path of +#' the file to read data from. It is automatically provided by Start(). See +#' details in the documentation of the parameter 'file_data_reader' of the +#' function Start(). The default value is NULL. +#'@param inner_indices A named list of numeric vectors indicating the indices +#' to take from each of the inner dimensions in the requested file. It is +#' automatically provided by Start(). See details in the documentation of the +#' parameter 'file_data_reader' of the function Start(). The default value is +#' NULL. +#'@param synonims A named list indicating the synonims for the dimension names +#' to look for in the requested file, exactly as provided in the parameter +#' 'synonims' in a Start() call. See details in the documentation of the +#' parameter 'file_data_reader' of the function Start(). +#' +#'@return A multidimensional data array with the named dimensions and indices +#' requested in 'inner_indices', potentially with the attribute 'variables' +#' with additional auxiliary data. See details in the documentation of the +#' parameter 'file_data_reader' of the function Start(). +#'@examples +#' data_path <- system.file('extdata', package = 'startR', mustWork = TRUE) +#' file_to_open <- file.path(data_path, 'obs/monthly_mean/tos/tos_200011.nc') +#' file_selectors <- c(dat = 'dat1', var = 'tos', sdate = '200011') +#' first_round_indices <- list(time = 1, latitude = 1:8, longitude = 1:16) +#' synonims <- list(dat = 'dat', var = 'var', sdate = 'sdate', time = 'time', +#' latitude = 'latitude', longitude = 'longitude') +#' sub_array <- NcDataReader(file_to_open, NULL, file_selectors, +#' first_round_indices, synonims) +#'@seealso \code{\link{NcOpener}} \code{\link{NcDimReader}} +#' \code{\link{NcCloser}} \code{\link{NcVarReader}} +#'@import easyNCDF PCICt +#'@export +NcDataReader <- function(file_path = NULL, file_object = NULL, + file_selectors = NULL, inner_indices = NULL, + synonims) { + close <- FALSE + if (!is.null(file_object)) { + file_to_read <- file_object + file_path <- file_object$filename + } else if (!is.null(file_path)) { + file_to_read <- NcOpener(file_path) + close <- TRUE + } else { + stop("Either 'file_path' or 'file_object' must be provided.") + } + + if (is.null(file_to_read)) { + return(NULL) + } + var_requested <- is.null(inner_indices) + + drop_var_dim <- FALSE + if (any(c('var', 'variable') %in% names(file_selectors))) { + if (!any(c('var', 'variable') %in% names(inner_indices))) { + inner_indices <- c(inner_indices, + list(var = file_selectors[[which(names(file_selectors) %in% + c('var', 'variable'))[1]]])) + drop_var_dim <- TRUE + } + } + + vars_in_file <- easyNCDF::NcReadVarNames(file_to_read) + if (any(names(inner_indices) %in% c('var', 'variable'))) { + position_of_var <- which(names(inner_indices) %in% c('var', 'variable'))[1] + } else if (length(vars_in_file) == 1) { + inner_indices <- c(inner_indices, + list(var = vars_in_file)) + drop_var_dim <- TRUE + position_of_var <- length(inner_indices) + } else { + stop("A 'var'/'variable' file dimension or inner dimension must be ", + "requested for NcDataReader() to read NetCDF files with more than ", + "one variable.") + } + + inner_indices[[position_of_var]] <- sapply(inner_indices[[position_of_var]], + function(x) { + if (x %in% names(synonims)) { + x_in_file <- which(synonims[[x]] %in% vars_in_file) + if (length(x_in_file) < 1) { + stop("Could not find variable '", x, "' (or its synonims if ", + 
"specified) in the file ", file_path) + } + if (length(x_in_file) > 1) { + stop("Found more than one matches for the synonims of the ", + "variable '", x, "' in the file ", file_path) + } + synonims[[x]][x_in_file] + } else { + if (is.character(x) && !(x %in% c('all', 'first', 'last'))) { + if (!(x %in% vars_in_file)) { + stop("Could not find variable '", x, "' (or its synonims if ", + "specified) in the file ", file_path) + } + } + x + } + }) + #inner_indices[[position_of_var]] <- SelectorChecker(inner_indices[[position_of_var]], vars_in_file) + dims_in_file <- NcDimReader(NULL, file_to_read, NULL, + inner_indices[position_of_var], synonims) + names(inner_indices) <- sapply(names(inner_indices), + function(x) { + if (x %in% names(synonims)) { + x_in_file <- which(synonims[[x]] %in% names(dims_in_file)) + if (length(x_in_file) < 1) { + stop("Could not find dimension '", x, "' (or its synonims if ", + "specified) in the file ", file_path) + } + if (length(x_in_file) > 1) { + stop("Found more than one matches for the synonims of the ", + "dimension '", x, "' in the file ", file_path) + } + synonims[[x]][x_in_file] + } else { + if (!(x %in% names(dims_in_file))) { + stop("Could not find dimension '", x, "' (or its synonims if ", + "specified) in the file ", file_path) + } + x + } + }) + if (drop_var_dim) { + dims_in_file <- dims_in_file[-which(names(dims_in_file) %in% c('var', 'variable'))] + } + singleton_unspecified_dims <- which((dims_in_file == 1) & + !(names(dims_in_file) %in% names(inner_indices))) + if (length(singleton_unspecified_dims) > 0) { + dims_in_file <- dims_in_file[-singleton_unspecified_dims] + } + if (var_requested) { + result <- easyNCDF::NcToArray(file_to_read, inner_indices, drop_var_dim = drop_var_dim, + expect_all_indices = FALSE, allow_out_of_range = TRUE) + } else { + if (any(!(names(dims_in_file) %in% names(inner_indices)))) { + expected_dim_names <- names(inner_indices) + if (drop_var_dim) { + expected_dim_names <- expected_dim_names[-position_of_var] + } + stop("Unexpected extra dimensions (of length > 1) in the file.\nExpected: ", + paste(expected_dim_names, collapse = ', '), "\n", + "Found: ", paste(names(dims_in_file), collapse = ', '), "\n", + file_path) + } + result <- easyNCDF::NcToArray(file_to_read, inner_indices, drop_var_dim = drop_var_dim, + expect_all_indices = TRUE, allow_out_of_range = TRUE) + } + if (!is.null(dim(result))) { + names(dim(result)) <- sapply(names(dim(result)), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + } + if (!is.null(result)) { + names(attr(result, 'variables')) <- sapply(names(attr(result, 'variables')), + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + + if (length(names(attr(result, 'variables'))) == 1) { + # The 1st condition is for implicit time dim (if time length = 1, it is + # allowed to not be defined in Start call. Therefore, it is not in the list + # of synonims); + # the 2nd condition is for the normal case; the 3rd one is that if return_vars + # has a variable that is not 'time'. The only way to know if it should be time + # is to check calendar. + # All these conditions are to prevent the variables with time-like units but + # actually not a time variable, e.g., drought period [days]. 
+ if (names(attr(result, 'variables')) == 'time' | + 'time' %in% synonims[[names(attr(result, 'variables'))]] | + 'calendar' %in% names(attr(result, 'variables')[[1]])) { + var_name <- names(attr(result, 'variables')) + units <- attr(result, 'variables')[[var_name]][['units']] + + if (units %in% c('seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years')) { + if (units == 'seconds') { +# units <- 'secs' + } else if (units == 'minutes') { +# units <- 'mins' + result <- result * 60 # min to sec + } + result[] <- paste(result[], units) + + } else if (grepl(' since ', units)) { + # Find the calendar + calendar <- attr(result, 'variables')[[var_name]]$calendar + # Calendar types recognized by as.PCICt() + cal.list <- c("365_day", "365", "noleap", "360_day", "360", "gregorian", "standard", "proleptic_gregorian") + + if (is.null(calendar)) { + warning("Calendar is missing. Use the standard calendar to calculate time values.") + calendar <- 'gregorian' + } else if (!calendar %in% cal.list) { + # if calendar is not recognized by as.PCICt(), forced it to be standard + warning("The calendar type '", calendar, "' is not recognized by NcDataReader(). It is forced to be standard type.") + calendar <- 'gregorian' + } + if (calendar == 'standard') calendar <- 'gregorian' + + parts <- strsplit(units, ' since ')[[1]] + units <- parts[1] + + if (units %in% c('second', 'seconds')) { +# units <- 'secs' + } else if (units %in% c('minute', 'minutes')) { +# units <- 'mins' + result <- result * 60 # min to sec + } else if (units %in% c('hour', 'hours')) { + result <- result * 60 * 60 # hour to sec + } else if (units %in% c('day', 'days')) { +# units <- 'days' + result <- result * 24 * 60 * 60 # day to sec + } else if (units %in% c('month', 'months')) { + # define day in each month + leap_month_day <- c(31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + no_leap_month_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + + # If calendar is gregorian, we get the result date directly instead of calculating how many seconds we have. + # The other calendar type can also do this but then we need to calculate each date in for loop. + #TODO: Try to use 'clock' to calculate the date (but dependency will be added) + if (calendar == 'gregorian') { + # Origin year and month and day + ori_year <- as.numeric(substr(parts[2], 1, 4)) + ori_month <- as.numeric(substr(parts[2], 6, 7)) + ori_day <- as.numeric(substr(parts[2], 9, 10)) + if (is.na(ori_month)) { + ori_month <- as.numeric(substr(parts[2], 6, 6)) + ori_day <- as.numeric(substr(parts[2], 8, 8)) + } + if (!is.numeric(ori_year) | !is.numeric(ori_month) | !is.numeric(ori_day)) { + stop(paste0("The time unit attribute format is not 'YYYY-MM-DD' or 'YYYY-M-D'. 
", + "Check the file or contact the maintainer.")) + } + result_vec <- rep(NA, length = length(result)) + for (result_i in 1:length(result)) { + yr_num <- floor(result[result_i] / 12) + month_left <- result[result_i] - yr_num * 12 + result_year <- ori_year + yr_num + result_month <- ori_month + floor(month_left) + result_day <- ori_day + #NOTE: Assumption that hour down is 00 + result_hour <- 0 + if (result_month > 12) { + result_year <- result_year + 1 + result_month <- result_month - 12 + } + if (month_left %% 1 != 0) { + if (result_month == 2) { + day_in_month <- ifelse(s2dv::LeapYear(result_year), 29, 28) + } else { + day_in_month <- no_leap_month_day[result_month] + } + result_day <- ori_day + (month_left - floor(month_left)) * day_in_month + if (result_day > day_in_month) { + result_month <- result_month + 1 + result_day <- result_day - day_in_month + } + if (result_month > 12) { + result_year <- result_year + 1 + result_month <- result_month - 12 + } + # if there is hour left + if (result_day %% 1 != 0) { + result_hour <- (result_day - floor(result_day)) * 24 + result_day <- floor(result_day) + } + if (result_hour %% 1 != 0) { + warning("The time value is not correct below 'hour'.") + result_hour <- round(result_hour) + } + } + result_month <- sprintf("%02d", result_month) + result_day <- sprintf("%02d", result_day) + result_hour <- sprintf("%02d", result_hour) + # Combine all the parts into one string + tmp <- paste(result_year, result_month, result_day, sep = '-') + tmp <- paste0(tmp, ' ', result_hour, ':00:00') + result_vec[result_i] <- tmp + } + # Transfer the strings to time class + new_array <- PCICt::as.PCICt(result_vec, cal = 'gregorian') + new_array <- suppressWarnings(PCICt::as.POSIXct.PCICt(new_array, tz = "UTC")) + +# if (calendar == 'gregorian') { +# # Find how many years + months +# yr_num <- floor(result / 12) +# month_left <- result - yr_num * 12 +# # Find the leap years we care +# if (ori_month <= 2) { +# leap_num <- length(which(sapply(ori_year:(ori_year + yr_num - 1), s2dv::LeapYear))) +# } else { +# leap_num <- length(which(sapply((ori_year + 1):(ori_year + yr_num), s2dv::LeapYear))) +# } +# total_days <- leap_num * 366 + (yr_num - leap_num) * 365 # not include month_left yet +# +# if (month_left != 0) { +##TODO: This part until result <- total_days* 24 ... is not correct. It doesn't consider ori_day +# if ((ori_month + month_left - 1) <= 12) { # the last month is still in the same last yr +# # Is the last year a leap year? +# last_leap <- s2dv::LeapYear(ori_year + yr_num) +# if (last_leap) { +# month_day_vector <- leap_month_day +# } else { +# month_day_vector <- no_leap_month_day +# } +# if (month_left >= 1) { # Only a few days in Jan. 
only, directly go to the next "if"
+#            total_days <- total_days + sum(month_day_vector[ori_month:(ori_month + month_left - 1)])
+#          }
+#          if ((month_left %% 1) != 0) {
+#            # month_left has decimal point like 11.5
+#            total_days <- total_days + (month_left - floor(month_left)) * month_day_vector[ceiling(month_left)]
+#          }
+#        } else { # the last month ends in the next yr
+#          if (ori_month == 2) { # e.g., 2005-02-16 + 11mth = 2006-01-16
+#            last_leap <- s2dv::LeapYear(ori_year + yr_num) # still consider 2005
+#            if (last_leap) {
+#              total_days <- total_days + sum(leap_month_day[2:12])
+#            } else {
+#              total_days <- total_days + sum(no_leap_month_day[2:12])
+#            }
+#          } else { # e.g., 2005-04-16 + 11mth = 2006-03-16
+#            last_leap <- s2dv::LeapYear(ori_year + yr_num + 1)
+#            needed_month <- c(ori_month:12, 1:(ori_month + month_left - 12 - 1))
+#            if (last_leap) {
+#              total_days <- total_days + sum(leap_month_day[needed_month])
+#            } else {
+#              total_days <- total_days + sum(no_leap_month_day[needed_month])
+#            }
+#          }
+#        }
+#      }
+#      result <- total_days * 24 * 60 * 60 # day to sec
+
+          } else if (calendar %in% c('365_day', '365', 'noleap')) {
+            yr_num <- floor(result / 12)
+            month_left <- result - yr_num * 12
+            total_days <- 365 * yr_num + sum(no_leap_month_day[ori_month:(month_left - 1)])
+            result <- total_days * 24 * 60 * 60 # day to sec
+
+          } else if (calendar %in% c('360_day', '360')) {
+            result <- result * 30 * 24 * 60 * 60 # day to sec
+
+          } else { #old code. The calendar is not in any of the above.
+            #NOTE: Should not have a chance to be used because the calendar types are forced to be standard above already.
+            result <- result * 30.5
+            result <- result * 24 * 60 * 60 # day to sec
+          }
+        }
+
+        if (!(units %in% c('month', 'months') & calendar == 'gregorian')) {
+          new_array <- PCICt::as.PCICt(result, cal = calendar, origin = parts[2])[]
+          new_array <- suppressWarnings(PCICt::as.POSIXct.PCICt(new_array, tz = "UTC"))
+        }
+        #new_array <- seq(as.POSIXct(parts[2]),
+        #                 length = max(result, na.rm = TRUE) + 1,
+        #                 by = units)[result[] + 1]
+        dim(new_array) <- dim(result)
+        attr(new_array, 'variables') <- attr(result, 'variables')
+        result <- new_array
+      }
+    }
+  }
+
+  if (close) {
+    NcCloser(file_to_read)
+  }
+
+  result
+}
diff --git a/modules/Loading/tmp/startR/R/NcDimReader.R b/modules/Loading/tmp/startR/R/NcDimReader.R
new file mode 100644
index 00000000..9b11a599
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/NcDimReader.R
@@ -0,0 +1,118 @@
+#'NetCDF dimension reader for 'startR'
+#'
+#'A dimension reader function for NetCDF files, intended for use as parameter
+#''file_dim_reader' in a Start() call. It complies with the input/output
+#'interface required by Start() defined in the documentation for the parameter
+#''file_dim_reader' of that function.\cr\cr
+#'This function uses the function NcReadDims() in the package 'easyNCDF'.
+#'
+#'@param file_path A character string indicating the path to the data file to
+#'  read. See details in the documentation of the parameter 'file_dim_reader'
+#'  of the function Start(). The default value is NULL.
+#'@param file_object An open connection to a NetCDF file, optionally with
+#'  additional header information. See details in the documentation of the
+#'  parameter 'file_dim_reader' of the function Start(). The default value is
+#'  NULL.
+#'@param file_selectors A named list containing the information of the path of
+#'  the file to read data from. It is automatically provided by Start(). 
See +#' details in the documentation of the parameter 'file_dim_reader' of the +#' function Start(). The default value is NULL. +#'@param inner_indices A named list of numeric vectors indicating the indices +#' to take from each of the inner dimensions in the requested file. It is +#' automatically provided by Start(). See details in the documentation of the +#' parameter 'file_dim_reader' of the function Start(). The default value is +#' NULL. +#'@param synonims A named list indicating the synonims for the dimension names +#' to look for in the requested file, exactly as provided in the parameter +#' 'synonims' in a Start() call. See details in the documentation of the +#' parameter 'file_dim_reader' of the function Start(). +#' +#'@return A named numeric vector with the names and sizes of the dimensions of +#' the requested file. +#'@examples +#' data_path <- system.file('extdata', package = 'startR') +#' file_to_open <- file.path(data_path, 'obs/monthly_mean/tos/tos_200011.nc') +#' file_selectors <- c(dat = 'dat1', var = 'tos', sdate = '200011') +#' first_round_indices <- list(time = 1, latitude = 1:8, longitude = 1:16) +#' synonims <- list(dat = 'dat', var = 'var', sdate = 'sdate', time = 'time', +#' latitude = 'latitude', longitude = 'longitude') +#' dim_of_file <- NcDimReader(file_to_open, NULL, file_selectors, +#' first_round_indices, synonims) +#'@seealso \code{\link{NcOpener}} \code{\link{NcDataReader}} +#' \code{\link{NcCloser}} \code{\link{NcVarReader}} +#'@import easyNCDF +#'@importFrom stats setNames +#'@export +NcDimReader <- function(file_path = NULL, file_object = NULL, + file_selectors = NULL, inner_indices = NULL, + synonims) { + close <- FALSE + if (!is.null(file_object)) { + file_to_read <- file_object + file_path <- file_object$filename + } else if (!is.null(file_path)) { + file_to_read <- NcOpener(file_path) + close <- TRUE + } else { + stop("Either 'file_path' or 'file_object' must be provided.") + } + + vars_in_file <- easyNCDF::NcReadVarNames(file_to_read) + if (any(c('var', 'variable') %in% names(inner_indices))) { + vars_to_read <- inner_indices[[which(names(inner_indices) %in% c('var', 'variable'))[1]]] + var_tag <- names(inner_indices)[[which(names(inner_indices) %in% c('var', 'variable'))[1]]] + } else if (any(c('var', 'variable') %in% names(file_selectors))) { + vars_to_read <- file_selectors[[which(names(file_selectors) %in% c('var', 'variable'))[1]]] + var_tag <- names(file_selectors)[[which(names(file_selectors) %in% c('var', 'variable'))[1]]] + } else if (length(vars_in_file) == 1) { + vars_to_read <- vars_in_file + file_selectors <- c(file_selectors, list(var = vars_in_file)) + var_tag <- 'var' + } else { + stop("NcDimReader expected to find a requested 'var' or 'variable' in 'file_selectors'.") + } + + if ((length(vars_to_read) == 1) && (vars_to_read[1] == 'var_names')) { + result <- setNames(length(vars_in_file), var_tag) + } else { + vars_to_read <- sapply(vars_to_read, + function(x) { + if (x %in% names(synonims)) { + x_in_file <- which(synonims[[x]] %in% vars_in_file) + if (length(x_in_file) < 1) { + stop("Could not find variable '", x, "' (or its synonims if ", + "specified) in the file ", file_path) + } + if (length(x_in_file) > 1) { + stop("Found more than one matches for the synonims of the ", + "variable '", x, "' in the file ", file_path) + } + synonims[[x]][x_in_file] + } else { + if (is.character(x) && !(x %in% c('all', 'last', 'first'))) { + if (!(x %in% vars_in_file)) { + stop("Could not find variable '", x, "' (or its synonims if ", + 
"specified) in the file ", file_path) + } + } + x + } + }) + vars_to_read <- SelectorChecker(vars_to_read, vars_in_file, + return_indices = FALSE) + read_dims <- easyNCDF::NcReadDims(file_to_read, vars_to_read) + if (any(c('var', 'variable') %in% names(inner_indices))) { + names(read_dims)[which(names(read_dims) == 'var')] <- var_tag + read_dims[var_tag] <- length(vars_in_file) + } else { + read_dims <- read_dims[-which(names(read_dims) == 'var')] + } + result <- read_dims + } + + if (close) { + NcCloser(file_to_read) + } + + result +} diff --git a/modules/Loading/tmp/startR/R/NcOpener.R b/modules/Loading/tmp/startR/R/NcOpener.R new file mode 100644 index 00000000..a301432b --- /dev/null +++ b/modules/Loading/tmp/startR/R/NcOpener.R @@ -0,0 +1,27 @@ +#'NetCDF file opener for 'startR' +#' +#'This is a file opener function for NetCDF files, intended for use as parameter +#''file_opener' in a Start() call. This function complies with the input/output +#'interface required by Start() defined in the documentation for the parameter +#''file_opener'.\cr\cr +#'This function uses the function NcOpen() in the package 'easyNCDF', which in +#'turn uses nc_open() in the package 'ncdf4'. +#' +#'@param file_path A character string indicating the path to the data file to +#' read. See details in the documentation of the parameter 'file_opener' of the +#' function Start(). +#'@return An open connection to a NetCDF file with additional header +#' information as returned by nc_open() in the package 'ncdf4'. See details in +#' the documentation of the parameter 'file_opener' of the function Start(). +#'@examples +#'data_path <- system.file('extdata', package = 'startR') +#'path_obs <- file.path(data_path, 'obs/monthly_mean/tos/tos_200011.nc') +#'connection <- NcOpener(path_obs) +#'NcCloser(connection) +#'@seealso \code{\link{NcDimReader}} \code{\link{NcDataReader}} +#' \code{\link{NcCloser}} \code{\link{NcVarReader}} +#'@import easyNCDF +#'@export +NcOpener <- function(file_path) { + easyNCDF::NcOpen(file_path) +} diff --git a/modules/Loading/tmp/startR/R/NcVarReader.R b/modules/Loading/tmp/startR/R/NcVarReader.R new file mode 100644 index 00000000..b78e89eb --- /dev/null +++ b/modules/Loading/tmp/startR/R/NcVarReader.R @@ -0,0 +1,70 @@ +#'NetCDF variable reader for 'startR' +#' +#'This is an auxiliary variable reader function for NetCDF files, intended for +#'use as parameter 'file_var_reader' in a Start() call. It complies with the +#'input/output interface required by Start() defined in the documentation for +#'the parameter 'file_var_reader' of that function.\cr\cr +#'This function uses the function NcDataReader() in the package 'startR', +#'which in turn uses NcToArray() in the package 'easyNCDF', which in turn uses +#'nc_var_get() in the package 'ncdf4'. +#' +#'@param file_path A character string indicating the path to the data file to +#' read the variable from. See details in the documentation of the parameter +#' 'file_var_reader' of the function Start(). The default value is NULL. +#'@param file_object An open connection to a NetCDF file, optionally with +#' additional header information. See details in the documentation of the +#' parameter 'file_var_reader' of the function Start(). The default value is +#' NULL. +#'@param file_selectors A named list containing the information of the path of +#' the file to read data from. It is automatically provided by Start(). See +#' details in the documentation of the parameter 'file_var_reader' of the +#' function Start(). The default value is NULL. 
+#'@param var_name A character string with the name of the variable to be read. +#' The default value is NULL. +#'@param synonims A named list indicating the synonims for the dimension names +#' to look for in the requested file, exactly as provided in the parameter +#' 'synonims' in a Start() call. See details in the documentation of the +#' parameter 'file_var_reader' of the function Start(). +#' +#'@return A multidimensional data array with the named dimensions, potentially +#' with the attribute 'variables' with additional auxiliary data. See details +#' in the documentation of the parameter 'file_var_reader' of the function +#' Start(). +#'@examples +#' data_path <- system.file('extdata', package = 'startR') +#' file_to_open <- file.path(data_path, 'obs/monthly_mean/tos/tos_200011.nc') +#' file_selectors <- c(dat = 'dat1', var = 'tos', sdate = '200011') +#' synonims <- list(dat = 'dat', var = 'var', sdate = 'sdate', time = 'time', +#' latitude = 'latitude', longitude = 'longitude') +#' var <- NcVarReader(file_to_open, NULL, file_selectors, +#' 'tos', synonims) +#'@seealso \code{\link{NcOpener}} \code{\link{NcDataReader}} +#' \code{\link{NcCloser}} \code{\link{NcDimReader}} +#'@export +NcVarReader <- function(file_path = NULL, file_object = NULL, + file_selectors = NULL, var_name = NULL, + synonims) { + if (!is.null(file_object)) { + file_to_read <- file_object + } else if (!is.null(file_path)) { + file_to_read <- file_path + } else { + stop("Either 'file_path' or 'file_object' must be provided.") + } + if (var_name %in% c('var_names')) { + vars_in_file <- easyNCDF::NcReadVarNames(file_to_read) + vars_in_file <- sapply(vars_in_file, + function(x) { + which_entry <- which(sapply(synonims, function(y) x %in% y)) + if (length(which_entry) > 0) { + names(synonims)[which_entry] + } else { + x + } + }) + dim(vars_in_file) <- c(var_names = length(vars_in_file)) + vars_in_file + } else { + NcDataReader(file_path, file_object, list(var = var_name), NULL, synonims) + } +} diff --git a/modules/Loading/tmp/startR/R/SelectorChecker.R b/modules/Loading/tmp/startR/R/SelectorChecker.R new file mode 100644 index 00000000..92e1d1b9 --- /dev/null +++ b/modules/Loading/tmp/startR/R/SelectorChecker.R @@ -0,0 +1,308 @@ +#'Translate a set of selectors into a set of numeric indices +#' +#'This is a selector checker function intended for use as parameter +#''selector_checker' in a Start() call. It translates a set of selectors which +#'is the value for one dimension into a set of numeric indices corresponding to +#'the coordinate variable. The function complies with the input/output interface +#'required by Start() defined in the documentation for the parameter +#''selector_checker' of Start(). +#' +#'@param selectors A vector or a list of two of numeric indices or variable +#' values to be retrieved for a dimension, automatically provided by Start(). +#' See details in the documentation of the parameters 'selector_checker' and +#' '\dots' of the function Start(). +#'@param var A vector of values of a coordinate variable for which to search +#' matches with the provided indices or values in the parameter 'selectors', +#' automatically provided by Start(). See details in the documentation of the +#' parameters 'selector_checker' and '\dots' of the function Start(). The +#' default value is NULL. When not specified, SelectorChecker() simply returns +#' the input indices. 
+#'@param return_indices A logical value automatically configured by Start(),
+#'  telling whether to return the numeric indices or coordinate variable values
+#'  after the matching. The default value is TRUE.
+#'@param tolerance A numeric value indicating a tolerance value to be used in
+#'  the matching of 'selectors' and 'var'. See documentation on
+#'  '<dim_name>_tolerance' in \code{\dots} of the function
+#'  Start(). The default value is NULL.
+#'
+#'@return A vector of either the indices of the matching values (if
+#'  return_indices = TRUE) or the matching values themselves (if return_indices
+#'  = FALSE).
+#'@examples
+#'# Get the latitudes from 10 to 20 degrees
+#'sub_array_of_selectors <- list(10, 20)
+#'# The latitude values from the original file
+#'sub_array_of_values <- seq(90, -90, length.out = 258)[2:257]
+#'SelectorChecker(sub_array_of_selectors, sub_array_of_values)
+#'
+#'@importFrom methods is
+#'@export
+SelectorChecker <- function(selectors, var = NULL, return_indices = TRUE,
+                            tolerance = NULL) {
+  if (length(selectors) == 0) {
+    stop("No selectors provided in 'selectors'.")
+  }
+  if (return_indices) {
+    if (is.list(selectors)) {
+      if (length(selectors) != 2) {
+        stop("'selectors' provided in a wrong format.")
+      }
+      crescent_selectors <- TRUE
+      if (all(sapply(selectors,
+                     function(x) {
+                       any(c("numeric", "integer", "POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(x))
+                     }))) {
+        if (selectors[[2]] < selectors[[1]]) {
+          crescent_selectors <- FALSE
+        }
+      }
+      for (i in 1:length(selectors)) {
+        if (is.null(var)) {
+          if (!is.numeric(selectors[[i]])) {
+            stop("No selector values provided in 'var'.")
+          } else {
+            selectors[[i]] <- round(selectors[[i]])
+          }
+        } else if (is.na(selectors[[i]])) {
+          if (i == 1) {
+            if (crescent_selectors) {
+              selectors[[i]] <- 1
+            } else {
+              selectors[[i]] <- length(var)
+            }
+          }
+          else {
+            if (crescent_selectors) {
+              selectors[[i]] <- length(var)
+            } else {
+              selectors[[i]] <- 1
+            }
+          }
+        } else if (is.character(selectors[[i]])) {
+          if (is.character(var)) {
+            candidate <- which(var == selectors[[i]])
+            if (length(candidate) > 0) {
+              selectors[[i]] <- candidate[1]
+            } else {
+              stop("Selector value not found in 'var'.")
+            }
+          } else {
+            stop("Character selectors provided but possible values in 'var' are not character.")
+          }
+        } else if (is.numeric(selectors[[i]])) {
+          if (is.numeric(var)) {
+
+            tol <- 0
+            if (!is.null(tolerance)) {
+              if (!is(tolerance, "numeric")) {
+                stop("Expected a numeric *_tolerance.")
+              }
+              tol <- tolerance
+            }
+
+            val <- selectors[[i]]
+
+            if (i == 1) {
+              if (crescent_selectors) {
+                val <- val - tol
+                if (var[1] < var[2]) {
+                  selectors[[i]] <- which(var >= val)[1]
+                } else if (var[1] > var[2]) {
+                  selectors[[i]] <- rev(which(var >= val))[1]
+                }
+
+              } else {
+                val <- val + tol
+                if (var[1] < var[2]) {
+                  selectors[[i]] <- rev(which(var <= val))[1]
+                } else if (var[1] > var[2]) {
+                  selectors[[i]] <- which(var <= val)[1]
+                }
+              }
+            }
+            else if (i == 2) {
+              if (crescent_selectors) {
+                val <- val + tol
+                if (var[1] < var[2]) {
+                  selectors[[i]] <- rev(which(var <= val))[1]
+                } else if (var[1] > var[2]) {
+                  selectors[[i]] <- which(var <= val)[1]
+                }
+
+              } else {
+                val <- val - tol
+                if (var[1] < var[2]) {
+                  selectors[[i]] <- which(var >= val)[1]
+                } else if (var[1] > var[2]) {
+                  selectors[[i]] <- rev(which(var >= val))[1]
+                }
+              }
+            }
+
+
+          } else {
+            stop("Numeric selectors provided but possible values in 'var' are not numeric.")
+          }
+        } else if (any(c("POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(selectors[[i]]))) {
+          # TODO: Here, change to as above (numeric part).
+          if (any(c("POSIXct", "POSIXlt", "POSIXt", "Date") %in% class(var))) {
+            val <- selectors[[i]]
+            tol <- 0
+            if (!is.null(tolerance)) {
+              if (!is(tolerance, "difftime")) {
+                stop("Expected a difftime *_tolerance.")
+              }
+              tol <- tolerance
+            }
+            if (i == 1) {
+              if (crescent_selectors) {
+                val <- val - tol
+                selectors[[i]] <- which(var >= val)[1]
+              } else {
+                val <- val + tol
+                selectors[[i]] <- rev(which(var <= val))[1]
+              }
+            }
+            else {
+              if (crescent_selectors) {
+                val <- val + tol
+                selectors[[i]] <- rev(which(var <= val))[1]
+              } else {
+                val <- val - tol
+                selectors[[i]] <- which(var >= val)[1]
+              }
+            }
+          } else {
+            stop("Datetime selectors provided but possible values in 'var' are not datetime.")
+          }
+        }
+      }
+
+      # The checker is returning a list of two indices.
+      ##selectors[[1]]:selectors[[2]]
+      selectors
+    } else if (is.numeric(selectors)) {
+      if (is.null(var)) {
+        ## TODO: Crash if negative indices?
+        round(selectors)
+      } else {
+        if (is.numeric(var)) {
+          if (!all(selectors %in% var)) {
+            .warning(paste0("Numeric selectors have been ",
+                            "provided for a dimension defined along a ",
+                            "numeric variable, but no exact match ",
+                            "found for all the selectors. Taking the index of the ",
+                            "nearest values."))
+          }
+          if (!is.null(tolerance)) {
+            if (!is(tolerance, 'numeric')) {
+              stop("Expected a numeric *_tolerance.")
+            }
+          }
+          sapply(selectors, function(x) {
+            dif <- abs(var - x)
+            res <- which.min(dif)[1]
+            if (!is.null(tolerance)) {
+              if (dif[res] > tolerance) {
+                stop("Could not find a value in 'var' close ",
+                     "enough to one of the 'selectors', ",
+                     "according to 'tolerance'.")
+              }
+            }
+            res
+          })
+        } else {
+          stop("Numeric selectors provided but possible values in 'var' are not numeric.")
+        }
+      }
+    } else if (any(c('POSIXct', 'POSIXlt', 'POSIXt', 'Date') %in% class(selectors))) {
+      if (is.null(var)) {
+        stop("Date selectors have been provided for a dimension ",
+             "defined along a date variable, but no possible values ",
+             "provided in 'var'.")
+      }
+      if (!all(selectors %in% var)) {
+        .warning(paste0("Date selectors have been ",
+                        "provided for a dimension defined along a ",
+                        "date variable, but no exact match ",
+                        "found for all the selectors. Taking the index of the ",
+                        "nearest values."))
+      }
+      if (!is.null(tolerance)) {
+        if (!is(tolerance, 'difftime')) {
+          stop("Expected a difftime *_tolerance.")
+        }
+      }
+      sapply(selectors, function(x) {
+        dif <- abs(var - x)
+        res <- which.min(dif)[1]
+        if (!is.null(tolerance)) {
+          if (dif[res] > tolerance) {
+            res <- NA
+            #stop("Could not find a value in 'var' close ",
+            #     "enough to one of the 'selectors', ",
+            #     "according to 'tolerance'.")
+          }
+        }
+        res
+      })
+    } else {
+      if (is.null(var)) {
+        stop("No selector values provided in 'var'.")
+      } else {
+        if ((length(selectors) == 1) &&
+            (selectors %in% c('all', 'first', 'last'))) {
+          if (selectors == 'all') {
+            1:length(var)
+          } else if (selectors == 'first') {
+            1
+          } else {
+            length(var)
+          }
+        } else {
+          if (!identical(class(var), class(selectors))) {
+            stop("Class of provided selectors does not match class of 'var'.")
+          }
+          candidates <- match(as.vector(selectors), as.vector(var))
+          if (length(candidates) == 0 | any(is.na(candidates))) {
+            stop("Selectors do not match values in 'var'.")
+          } else if (length(candidates) != length(selectors)) {
+            stop("Some selectors do not match values in 'var'.")
+          }
+          candidates
+        }
+      }
+    }
+  } else {
+    if (!is.null(var)) {
+      if (is.list(selectors)) {
+        if (length(selectors) != 2) {
+          stop("'selectors' provided in a wrong format.")
+        } else {
+          var[selectors[[1]]:selectors[[2]]]
+        }
+      } else if (is.numeric(selectors)) {
+        if (length(selectors) > 0) {
+          var[selectors]
+        } else {
+          stop("No selectors provided.")
+        }
+      } else {
+        if ((length(selectors) == 1) &&
+            (selectors %in% c('all', 'first', 'last'))) {
+          if (selectors == 'all') {
+            var
+          } else if (selectors == 'first') {
+            head(var, 1)
+          } else {
+            tail(var, 1)
+          }
+        } else {
+          selectors
+        }
+      }
+    } else {
+      selectors
+    }
+  }
+}
diff --git a/modules/Loading/tmp/startR/R/Sort.R b/modules/Loading/tmp/startR/R/Sort.R
new file mode 100644
index 00000000..4f74d68e
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/Sort.R
@@ -0,0 +1,85 @@
+#'Sort the coordinate variable values in a Start() call
+#'
+#'The reorder functions are intended for use as parameter '<dim_name>_reorder'
+#'in a call to the function Start(). They comply with the
+#'input/output interface required by Start() defined in the documentation
+#'for the parameter \code{\dots} of that function.\cr\cr
+#'Sort() applies an increasing or decreasing sort to the coordinate values;
+#'it is useful for adjusting the latitude order.\cr\cr
+#'CircularSort() applies a circular sort to the values, where any value
+#'beyond the limits specified in the parameters 'start' and 'end' is mapped
+#'by a modulus operation into the specified range. This is useful for circular
+#'coordinates such as the Earth longitudes.
+#'@name Sort
+#'@aliases CircularSort
+#'@param start A numeric indicating the lower bound of the circular range.
+#'@param end A numeric indicating the upper bound of the circular range.
+#'@param \dots Additional parameters to adjust the reordering. See function
+#'  sort() for more details.
+#'
+#'@return
+#'A list of 2 containing:
+#'\item{$x}{
+#'  The reordered values.
+#'}
+#'\item{$ix}{
+#'  The permutation indices of $x in the original coordinate.
+#'}
+#'@examples
+#'  # Used in Start():
+#'  data_path <- system.file('extdata', package = 'startR')
+#'  path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc')
+#'  sdates <- c('200011', '200012')
+#'  data <- Start(dat = list(list(path = path_obs)),
+#'                var = 'tos',
+#'                sdate = sdates,
+#'                time = 'all',
+#'                latitude = values(list(-60, 60)),
+#'                latitude_reorder = Sort(decreasing = TRUE),
+#'                longitude = values(list(-120, 120)),
+#'                longitude_reorder = CircularSort(-180, 180),
+#'                return_vars = list(latitude = 'dat',
+#'                                   longitude = 'dat',
+#'                                   time = 'sdate'),
+#'                retrieve = FALSE)
+#'
+#'@rdname Sort
+#'@export
+Sort <- function(...) {
+  params <- list(...)
+  f <- "function(x) {
+          dim_bk <- dim(x)
+          x <- do.call(sort, c(list(x, index.return = TRUE),
+                       PARAMS))
+          dim(x$x) <- dim_bk
+          dim(x$ix) <- dim_bk
+          x
+        }"
+  f <- gsub("PARAMS", deparse(params), f)
+  r <- eval(parse(text = f))
+  attr(r, 'circular') <- FALSE
+  r
+}
+
+#'@rdname Sort
+#'@export
+CircularSort <- function(start, end, ...) {
+  params <- list(...)
+  f <- "function (x) {
+          start <- START
+          end <- END
+          dim_bk <- dim(x)
+          x <- do.call(sort, c(list((x - start) %% (end - start) + start,
+                                    index.return = TRUE),
+                               PARAMS))
+          dim(x$x) <- dim_bk
+          dim(x$ix) <- dim_bk
+          x
+        }"
+  f <- gsub("START", deparse(start), f)
+  f <- gsub("END", deparse(end), f)
+  f <- gsub("PARAMS", deparse(params), f)
+  r <- eval(parse(text = f))
+  attr(r, 'circular') <- TRUE
+  r
+}
diff --git a/modules/Loading/tmp/startR/R/Start.R b/modules/Loading/tmp/startR/R/Start.R
new file mode 100644
index 00000000..3616f232
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/Start.R
@@ -0,0 +1,4518 @@
+#'Declare, discover, subset and retrieve multidimensional distributed data sets
+#'
+#'See the \href{https://earth.bsc.es/gitlab/es/startR}{startR documentation and
+#'tutorial} for a step-by-step explanation on how to use Start().\cr\cr
+#'Nowadays, in the era of big data, large multidimensional data sets from
+#'diverse sources need to be combined and processed. Analysis of big data in any
+#'field is often highly complex and time-consuming. Taking subsets of these data
+#'sets and processing them efficiently has become an indispensable practice. This
+#'technique is also known as Domain Decomposition, Map Reduce or, more commonly,
+#''chunking'.\cr\cr
+#'startR (Subset, TrAnsform, ReTrieve, arrange and process large
+#'multidimensional data sets in R) is an R project started at BSC with the aim
+#'of developing a tool that allows the user to automatically process large
+#'multidimensional distributed data sets. It is an open source project that is
+#'open to external collaboration and funding, and will continuously evolve to
+#'support as many data set formats as possible while maximizing its efficiency.\cr\cr
+#'startR provides a framework under which a data set (a collection of one
+#'or multiple data files, potentially distributed over various remote servers)
+#'is perceived as if it were a single large multidimensional
+#'array. Once such a multidimensional array is declared, any user-defined function
+#'can be applied to the data in an \code{apply}-like fashion, where startR
+#'transparently implements the Map Reduce paradigm. The steps to follow in order
+#'to process a collection of big data sets are as follows:\cr
+#'\itemize{
+#'  \item{
+#'Declaring the data set, i.e. declaring the distribution of the data files
+#'involved, the dimensions and shape of the multidimensional array, and the
+#'boundaries of the target data.
This step can be performed with the
+#'Start() function. Numeric indices or coordinate values can be used when
+#'fixing the boundaries. It is common to need to apply transformations,
+#'pre-processing or reordering to the data. Start() accepts user-defined
+#'transformation or reordering functions to be applied for such purposes. Once a
+#'data set is declared, a list of involved files, dimension lengths, memory size
+#'and other metadata is made available. Optionally, the data set can be
+#'retrieved and loaded onto the current R session if it is small enough.
+#'  }
+#'  \item{
+#'Declaring the workflow of operations to perform on the involved data set(s).
+#'This step can be performed with the Step() and AddStep() functions.
+#'  }
+#'  \item{
+#'Defining the computation settings. The mandatory settings include a) how many
+#'subsets to divide the data sets into and along which dimensions; b) which
+#'platform to perform the workflow of operations on (local machine or remote
+#'machine/HPC?), how to communicate with it (unidirectional or bidirectional
+#'connection? shared or separate file systems?), which queuing system it uses
+#'(slurm, PBS, LSF, none?); and c) how many parallel jobs and execution threads
+#'per job to use when running the calculations. This step can be performed when
+#'building up the call to the Compute() function.
+#'  }
+#'  \item{
+#'Running the computation. startR transparently implements the Map Reduce
+#'paradigm, according to the settings in the previous steps. The progress can
+#'optionally be monitored with the EC-Flow workflow management tool. When the
+#'computation ends, a report of performance timings is displayed. This step can
+#'be triggered with the Compute() function.
+#'  }
+#'}
+#'startR is not bound to a specific file format. Interface functions to
+#'custom file formats can be provided for Start() to read them. As of this
+#'version, startR includes interface functions to the following file formats:
+#'\itemize{
+#'  \item{
+#'NetCDF
+#'  }
+#'}
+#'Metadata and auxiliary data are also preserved and arranged by Start(),
+#'to the extent that they are retrieved by the interface functions for a specific
+#'file format.
+#'
+#'@param \dots A selection of customized parameters depending on the data
+#'format. When we retrieve data from one or a collection of data sets,
+#'the involved data can be perceived as belonging to a large multi-dimensional
+#'array. For instance, let us consider an example case. We want to retrieve data
+#'from a source, which contains data for the number of monthly sales of various
+#'items, and also for their retail price each month. The data on the source is
+#'stored as follows:\cr\cr
+#'\command{
+#'\cr # /data/
+#'\cr #  |-> sales/
+#'\cr #  |    |-> electronics
+#'\cr #  |    |    |-> item_a.data
+#'\cr #  |    |    |-> item_b.data
+#'\cr #  |    |    |-> item_c.data
+#'\cr #  |    |-> clothing
+#'\cr #  |         |-> item_d.data
+#'\cr #  |         |-> item_e.data
+#'\cr #  |         |-> item_f.data
+#'\cr #  |-> prices/
+#'\cr #       |-> electronics
+#'\cr #       |    |-> item_a.data
+#'\cr #       |    |-> item_b.data
+#'\cr #       |    |-> item_c.data
+#'\cr #       |-> clothing
+#'\cr #            |-> item_d.data
+#'\cr #            |-> item_e.data
+#'\cr #            |-> item_f.data
+#'}\cr\cr
+#'Each item file contains data, stored in whichever format, for the sales or
+#'prices over a time period, e.g. for the past 24 months, registered at 100
+#'different stores over the world. Whichever format it is stored in, each
+#'file can be perceived as a container of a data array of 2 dimensions, time and
+#'store.
Let us assume the '.data' format allows keeping a name for each of
+#'these dimensions, and the actual names are 'time' and 'store'.\cr\cr
+#'The different item files for sales or prices can be perceived as belonging to
+#'an 'item' dimension of length 3, and the two groups of three items to a
+#''section' dimension of length 2, and the two groups of two sections (one with
+#'the sales and the other with the prices) can be perceived as belonging also to
+#'another dimension 'variable' of length 2. Even the source can be perceived as
+#'belonging to a dimension 'source' of length 1.\cr\cr
+#'All in all, in this example, the whole data could be perceived as belonging to
+#'a multidimensional 'large array' of dimensions\cr
+#'\command{
+#'\cr # source variable section item store month
+#'\cr #      1        2       2    3   100    24
+#'}
+#'\cr\cr
+#'The dimensions of this 'large array' can be classified into two types: the ones
+#'that group actual files (the file dimensions) and the ones that group data
+#'values inside the files (the inner dimensions). In the example, the file
+#'dimensions are 'source', 'variable', 'section' and 'item', whereas the inner
+#'dimensions are 'store' and 'month'.
+#'\cr\cr
+#'Having the dimensions of our target sources in mind, the parameter \code{\dots}
+#'expects to receive information on:
+#'  \itemize{
+#'    \item{
+#'The names of the expected dimensions of the 'large dataset' we want to
+#'retrieve data from
+#'    }
+#'    \item{
+#'The indices to take from each dimension (and other constraints)
+#'    }
+#'    \item{
+#'How to reorder the dimension if needed
+#'    }
+#'    \item{
+#'The location and organization of the files of the data sets
+#'    }
+#'  }
+#'For each dimension, the first three information items can be specified with a set
+#'of parameters to be provided through \code{\dots}. For a given dimension
+#''dimname', six parameters can be specified:\cr
+#'\command{
+#'\cr # dimname = <indices_or_values>, # 'all' / 'first' / 'last' /
+#'\cr #                                # indices(c(1, 10, 20)) /
+#'\cr #                                # indices(c(1:20)) /
+#'\cr #                                # indices(list(1, 20)) /
+#'\cr #                                # c(1, 10, 20) / c(1:20) /
+#'\cr #                                # list(1, 20)
+#'\cr # dimname_var = <variable_name>,
+#'\cr # dimname_tolerance = <tolerance_value>,
+#'\cr # dimname_reorder = <reorder_function>,
+#'\cr # dimname_depends = <file_dimension_name>,
+#'\cr # dimname_across = <file_dimension_name>
+#'}
+#'\cr\cr
+#'The \bold{indices to take} can be specified in three possible formats (see
+#'code comments above for examples). The first format consists in using
+#'character tags, such as 'all' (take all the indices available for that
+#'dimension), 'first' (take only the first) and 'last' (only the last). The
+#'second format consists in using numeric indices, which have to be wrapped in a
+#'call to the indices() helper function. For the second format, either a
+#'vector of numeric indices can be provided, or a list with two numeric indices
+#'can be provided to take all the indices in the range between the two specified
+#'indices (both extremes inclusive). The third format consists in providing a
+#'vector of character strings (for file dimensions) or of values of whichever type
+#'(for inner dimensions). For the file dimensions, the provided character
+#'strings in the third format will be used as components to build up the final
+#'path to the files (read further). For inner dimensions, the provided values in
+#'the third format will be compared to the values of an associated coordinate
+#'variable (must be specified in '<dim_name>_reorder', read further), and the
+#'indices of the closest values will be retrieved.
When using the third format,
+#'a list with two values can also be provided to take all the indices of the
+#'values within the specified range.
+#'\cr\cr
+#'The \bold{name of the associated coordinate variable} must be a character
+#'string with the name of an associated coordinate variable to be found in the
+#'data files (in all* of them). For this to work, a 'file_var_reader'
+#'function must be specified when calling Start() (see parameter
+#''file_var_reader'). The coordinate variable must also be requested in the
+#'parameter 'return_vars' (see its section for details). This feature only
+#'works for inner dimensions.
+#'\cr\cr
+#'The \bold{tolerance value} is useful when indices for an inner dimension are
+#'specified in the third format (values of whichever type). In that case, the
+#'indices of the closest values in the coordinate variable are sought. However,
+#'the closest value might be too distant, and we may want to consider that no real
+#'match exists for such a provided value. This is possible via the tolerance,
+#'which allows specifying a threshold beyond which matches are not sought and
+#'the corresponding index is marked as a missing value.
+#'\cr\cr
+#'The \bold{reorder_function} is useful when indices for an inner dimension are
+#'specified in the third format, and the retrieved indices need to be reordered
+#'as a function of their provided associated variable values. A function can be
+#'provided, which receives as input a vector of values, and returns as outputs a
+#'list with the components \code{$x} with the reordered values, and \code{$ix}
+#'with the permutation indices. Two reordering functions are included in
+#'startR, Sort() and CircularSort().
+#'\cr\cr
+#'The \bold{name of another dimension} to be specified in '<dim_name>_depends',
+#'only available for file dimensions, must be a character string with the name
+#'of another requested \bold{file dimension} in \code{\dots}, and will make
+#'Start() aware that the path components of a file dimension can vary as a
+#'function of the path component of another file dimension. For instance, in the
+#'example above, specifying \code{item_depends = 'section'} will make
+#'Start() aware that the item names vary as a function of the section, i.e.
+#'section 'electronics' has items 'a', 'b' and 'c' but section 'clothing' has
+#'items 'd', 'e', 'f'. Otherwise Start() would expect to find the same
+#'item names in all the sections.
+#'If values() is used to define dimensions, it is possible to provide different
+#'values of the depending dimension for each of the depended dimension values. For
+#'example, if \code{section = c('electronics', 'clothing')}, we can use
+#'\code{item = list(electronics = c('a', 'b', 'c'), clothing = c('d', 'e', 'f'))}.
+#'\cr\cr
+#'The \bold{name of another dimension} to be specified in '<dim_name>_across',
+#'only available for inner dimensions, must be a character string with the name
+#'of another requested \bold{inner dimension} in \code{\dots}, and will make
+#'Start() aware that an inner dimension extends along multiple files. For
+#'instance, let us imagine that in the example above, the records for each item
+#'are so large that it becomes necessary to split them in multiple files, each
+#'one containing the registers for a different period of time, e.g. in 10 files
+#'with 100 months each ('item_a_period1.data', 'item_a_period2.data', and so on).
+#'In that case, the data can be perceived as having an extra file dimension, the
+#''period' dimension.
The inner dimension 'month' would extend across multiple
+#'files, and providing the parameter \code{month = indices(1, 300)} would make
+#'Start() crash because it would perceive we have made a request out of
+#'bounds (each file contains 100 'month' indices, but we requested 1 to 300).
+#'This can be solved by specifying the parameter \code{month_across = period}
+#'(along with the full specification of the dimension 'period').
+#'\cr\cr
+#'\bold{Defining the path pattern}
+#'\cr
+#'As mentioned above, the parameter \dots also expects to receive information
+#'with the location of the data files. In order to do this, a special dimension
+#'must be defined. In that special dimension, in place of specifying indices to
+#'take, a path pattern must be provided. The path pattern is a character string
+#'that encodes the way the files are organized in their source. It must be a
+#'path to one of the data set files in an accessible local or remote file system,
+#'or a URL to one of the files provided by a local or remote server. The regions
+#'of this path that vary across files (along the file dimensions) must be
+#'replaced by wildcards. The wildcards must match any of the defined file
+#'dimensions in the call to Start() and must be delimited with leading
+#'and trailing '$'. Shell globbing expressions can be used in the path pattern.
+#'See the next code snippet for an example of a path pattern.
+#'\cr\cr
+#'All in all, the call to Start() to load the entire data set in the
+#'example of store item sales would look as follows:
+#'\cr
+#'\command{
+#'\cr # data <- Start(source = paste0('/data/$variable$/',
+#'\cr #                               '$section$/$item$.data'),
+#'\cr #               variable = 'all',
+#'\cr #               section = 'all',
+#'\cr #               item = 'all',
+#'\cr #               item_depends = 'section',
+#'\cr #               store = 'all',
+#'\cr #               month = 'all')
+#'}
+#'\cr\cr
+#'Note that in this example it would still be pending to properly define the
+#'parameters 'file_opener', 'file_closer', 'file_dim_reader',
+#''file_var_reader' and 'file_data_reader' for the '.data' file format
+#'(see the corresponding sections).
+#'\cr\cr
+#'The call to Start() will return a multidimensional R array with the
+#'following dimensions:
+#'\cr
+#'\command{
+#'\cr # source variable section item store month
+#'\cr #      1        2       2    3   100    24
+#'}
+#'\cr
+#'The dimension specifications in the \code{\dots} do not have to follow any
+#'particular order. The returned array will have the dimensions in the same order
+#'as they have been specified in the call.
For example, the following call:
+#'\cr
+#'\command{
+#'\cr # data <- Start(source = paste0('/data/$variable$/',
+#'\cr #                               '$section$/$item$.data'),
+#'\cr #               month = 'all',
+#'\cr #               store = 'all',
+#'\cr #               item = 'all',
+#'\cr #               item_depends = 'section',
+#'\cr #               section = 'all',
+#'\cr #               variable = 'all')
+#'}
+#'\cr\cr
+#'would return an array with the following dimensions:
+#'\cr
+#'\command{
+#'\cr # source month store item section variable
+#'\cr #      1    24   100    3       2        2
+#'}
+#'\cr\cr
+#'Next, a more advanced example to retrieve data for only the sales records, for
+#'the first section ('electronics'), for the 1st and 3rd items and for the
+#'stores located in Barcelona (assuming the files contain the variable
+#''store_location' with the name of the city each of the 100 stores is located
+#'in):
+#'\cr
+#'\command{
+#'\cr # data <- Start(source = paste0('/data/$variable$/',
+#'\cr #                               '$section$/$item$.data'),
+#'\cr #               variable = 'sales',
+#'\cr #               section = 'first',
+#'\cr #               item = indices(c(1, 3)),
+#'\cr #               item_depends = 'section',
+#'\cr #               store = 'Barcelona',
+#'\cr #               store_var = 'store_location',
+#'\cr #               month = 'all',
+#'\cr #               return_vars = list(store_location = NULL))
+#'}
+#'\cr\cr
+#'The defined names for the dimensions do not necessarily have to match the
+#'names of the dimensions inside the file. Lists of alternative names to be
+#'sought can be defined in the parameter 'synonims'.
+#'\cr\cr
+#'If data from multiple sources (not necessarily following the same structure)
+#'has to be retrieved, it can be done by providing a vector of character strings
+#'with path pattern specifications, or, in the extended form, by providing a
+#'list of lists with the components 'name' and 'path', and the name of the
+#'dataset and path pattern as values, respectively. For example:
+#'\cr
+#'\command{
+#'\cr # data <- Start(source = list(
+#'\cr #                 list(name = 'sourceA',
+#'\cr #                      path = paste0('/sourceA/$variable$/',
+#'\cr #                                    '$section$/$item$.data')),
+#'\cr #                 list(name = 'sourceB',
+#'\cr #                      path = paste0('/sourceB/$section$/',
+#'\cr #                                    '$variable$/$item$.data'))
+#'\cr #               ),
+#'\cr #               variable = 'sales',
+#'\cr #               section = 'first',
+#'\cr #               item = indices(c(1, 3)),
+#'\cr #               item_depends = 'section',
+#'\cr #               store = 'Barcelona',
+#'\cr #               store_var = 'store_location',
+#'\cr #               month = 'all',
+#'\cr #               return_vars = list(store_location = NULL))
+#'}
+#'\cr
+#'
+#'@param return_vars A named list where the names are the names of the
+#'variables to be fetched in the files, and the values are vectors of
+#'character strings with the names of the file dimensions for which to retrieve
+#'each variable, or NULL if the variable has to be retrieved only once
+#'from any (the first) of the involved files.\cr\cr
+#'Apart from retrieving a multidimensional data array, retrieving auxiliary
+#'variables inside the files can also be needed. The parameter
+#''return_vars' allows for requesting such variables, as long as a
+#''file_var_reader' function is also specified in the call to
+#'Start() (see documentation on the corresponding parameter).
+#'\cr\cr
+#'In the case of the item sales example (see documentation on parameter
+#'\code{\dots}), the store location variable is requested with the parameter\cr
+#'\code{return_vars = list(store_location = NULL)}.\cr This will cause
+#'Start() to fetch once the variable 'store_location' and return it in
+#'the component\cr \code{$Variables$common$store_location},\cr which will be an
+#'array of character strings with the location names, with the dimensions
+#'\code{c('store' = 100)}. Although useless in this example, we could ask
+#'Start() to fetch and return such variable for each file along the
+#'items dimension as follows: \cr
+#'\code{return_vars = list(store_location = c('item'))}.\cr In that case, the
+#'variable will be fetched once from a file of each of the items, and will be
+#'returned as an array with the dimensions \code{c('item' = 3, 'store' = 100)}.
+#'\cr\cr
+#'If a variable is requested along a file dimension that contains path pattern
+#'specifications ('source' in the example), the fetched variable values will be
+#'returned in the component\cr \code{$Variables$<dataset_name>$}.\cr
+#'For example:
+#'\cr
+#'\command{
+#'\cr # data <- Start(source = list(
+#'\cr #                 list(name = 'sourceA',
+#'\cr #                      path = paste0('/sourceA/$variable$/',
+#'\cr #                                    '$section$/$item$.data')),
+#'\cr #                 list(name = 'sourceB',
+#'\cr #                      path = paste0('/sourceB/$section$/',
+#'\cr #                                    '$variable$/$item$.data'))
+#'\cr #               ),
+#'\cr #               variable = 'sales',
+#'\cr #               section = 'first',
+#'\cr #               item = indices(c(1, 3)),
+#'\cr #               item_depends = 'section',
+#'\cr #               store = 'Barcelona',
+#'\cr #               store_var = 'store_location',
+#'\cr #               month = 'all',
+#'\cr #               return_vars = list(store_location = c('source',
+#'\cr #                                                     'item')))
+#'\cr # # Checking the structure of the returned variables
+#'\cr # str(data$Variables)
+#'\cr # Named list
+#'\cr # ..$common: NULL
+#'\cr # ..$sourceA: Named list
+#'\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+#'\cr # ..$sourceB: Named list
+#'\cr # .. ..$store_location: char[1:18(3d)] 'Barcelona' 'Barcelona' ...
+#'\cr # # Checking the dimensions of the returned variable
+#'\cr # # for the source A
+#'\cr # dim(data$Variables$sourceA)
+#'\cr # item store
+#'\cr #    3     3
+#'}
+#'\cr\cr
+#'The names of the requested variables do not necessarily have to match the
+#'actual variable names inside the files. A list of alternative names to be
+#'sought can be specified via the parameter 'synonims'.
+#'
+#'@param synonims A named list where the names are the requested variable or
+#'dimension names, and the values are vectors of character strings with
+#'alternative names to seek for such dimension or variable.\cr\cr
+#'In some requests, data from different sources may follow different naming
+#'conventions for the dimensions or variables, or even files in the same source
+#'could have varying names. This parameter allows Start() to
+#'properly identify the dimensions or variables with different names.
+#'\cr\cr
+#'In the example used in parameter 'return_vars', it may be the case that
+#'the two involved data sources follow slightly different naming conventions.
+#'For example, source A uses 'sect' as name for the sections dimension, whereas
+#'source B uses 'section'; source A uses 'store_loc' as variable name for the
+#'store locations, whereas source B uses 'store_location'.
This can be taken
+#'into account as follows:
+#'\cr
+#'\command{
+#'\cr # data <- Start(source = list(
+#'\cr #                 list(name = 'sourceA',
+#'\cr #                      path = paste0('/sourceA/$variable$/',
+#'\cr #                                    '$section$/$item$.data')),
+#'\cr #                 list(name = 'sourceB',
+#'\cr #                      path = paste0('/sourceB/$section$/',
+#'\cr #                                    '$variable$/$item$.data'))
+#'\cr #               ),
+#'\cr #               variable = 'sales',
+#'\cr #               section = 'first',
+#'\cr #               item = indices(c(1, 3)),
+#'\cr #               item_depends = 'section',
+#'\cr #               store = 'Barcelona',
+#'\cr #               store_var = 'store_location',
+#'\cr #               month = 'all',
+#'\cr #               return_vars = list(store_location = c('source',
+#'\cr #                                                     'item')),
+#'\cr #               synonims = list(
+#'\cr #                 section = c('sect', 'section'),
+#'\cr #                 store_location = c('store_loc',
+#'\cr #                                    'store_location')
+#'\cr #               ))
+#'}
+#'\cr
+#'
+#'@param file_opener A function that receives as a single parameter
+#'  'file_path' a character string with the path to a file to be opened,
+#'  and returns an object with an open connection to the file (optionally with
+#'  header information) on success, or returns NULL on failure.
+#'\cr\cr
+#'This parameter takes by default NcOpener() (an opener function for NetCDF
+#'files).
+#'\cr\cr
+#'See NcOpener() for a template to build a file opener for your own file
+#'format.
+#'
+#'@param file_var_reader A function with the header \code{file_path = NULL},
+#'  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{var_name},
+#'  \code{synonims} that returns an array with auxiliary data (i.e. data from a
+#'  variable) inside a file. Start() will provide automatically either a
+#'  'file_path' or a 'file_object' to the 'file_var_reader'
+#'  function (the function has to be ready to work with whichever of these two is
+#'  provided). The parameter 'file_selectors' will also be provided
+#'  automatically to the variable reader, containing a named list where the
+#'  names are the names of the file dimensions of the queried data set (see
+#'  documentation on \code{\dots}) and the values are single character strings
+#'  with the components used to build the path to the file being read (the one
+#'  provided in 'file_path' or 'file_object'). The parameter 'var_name'
+#'  will also be filled in automatically by Start(), with the name of one
+#'  of the variables to be read. The parameter 'synonims' will be filled in
+#'  with exactly the same value as provided in the parameter 'synonims' in
+#'  the call to Start(), and has to be used in the code of the variable
+#'  reader to check for alternative variable names inside the target file. The
+#'  'file_var_reader' must return a (multi)dimensional array with named
+#'  dimensions, and optionally with the attribute 'variables' with other
+#'  additional metadata on the retrieved variable.
+#'\cr\cr
+#'Usually, the 'file_var_reader' should be a degenerate case of the
+#''file_data_reader' (see documentation on the corresponding parameter),
+#'so it is recommended to code the 'file_data_reader' first.
+#'\cr\cr
+#'This parameter takes by default NcVarReader() (a variable reader function
+#'for NetCDF files).
+#'\cr\cr
+#'See NcVarReader() for a template to build a variable reader for your own
+#'file format.
+#'
+#'@param file_dim_reader A function with the header \code{file_path = NULL},
+#'  \code{file_object = NULL}, \code{file_selectors = NULL}, \code{synonims}
+#'  that returns a named numeric vector where the names are the names of the
+#'  dimensions of the multidimensional data array in the file and the values are
+#'  the sizes of such dimensions.
Start() will provide automatically
+#'  either a 'file_path' or a 'file_object' to the
+#'  'file_dim_reader' function (the function has to be ready to work with
+#'  whichever of these two is provided). The parameter 'file_selectors'
+#'  will also be provided automatically to the dimension reader, containing a
+#'  named list where the names are the names of the file dimensions of the
+#'  queried data set (see documentation on \code{\dots}) and the values are
+#'  single character strings with the components used to build the path to the
+#'  file being read (the one provided in 'file_path' or 'file_object').
+#'  The parameter 'synonims' will be filled in with exactly the same value
+#'  as provided in the parameter 'synonims' in the call to Start(),
+#'  and can optionally be used in advanced configurations.
+#'\cr\cr
+#'This parameter takes by default NcDimReader() (a dimension reader
+#'function for NetCDF files).
+#'\cr\cr
+#'See NcDimReader() for (an advanced) template to build a dimension reader
+#'for your own file format.
+#'
+#'@param file_data_reader A function with the header \code{file_path = NULL},
+#'  \code{file_object = NULL}, \code{file_selectors = NULL},
+#'  \code{inner_indices = NULL}, \code{synonims} that returns a subset of the
+#'  multidimensional data array inside a file (even if internally it is not an
+#'  array). Start() will provide automatically either a 'file_path'
+#'  or a 'file_object' to the 'file_data_reader' function (the
+#'  function has to be ready to work with whichever of these two is provided). The
+#'  parameter 'file_selectors' will also be provided automatically to the
+#'  data reader, containing a named list where the names are the names of the
+#'  file dimensions of the queried data set (see documentation on \code{\dots})
+#'  and the values are single character strings with the components used to
+#'  build the path to the file being read (the one provided in 'file_path' or
+#'  'file_object'). The parameter 'inner_indices' will also be filled in
+#'  automatically by Start(), with a named list of numeric vectors,
+#'  where the names are the names of all the expected inner dimensions in a file
+#'  to be read, and the numeric vectors are the indices to be taken from the
+#'  corresponding dimension (the indices may not be consecutive nor in order).
+#'  The parameter 'synonims' will be filled in with exactly the same value
+#'  as provided in the parameter 'synonims' in the call to Start(),
+#'  and has to be used in the code of the data reader to check for alternative
+#'  dimension names inside the target file. The 'file_data_reader' must
+#'  return a (multi)dimensional array with named dimensions, and optionally with
+#'  the attribute 'variables' with other additional metadata on the retrieved
+#'  data.
+#'\cr\cr
+#'Usually, 'file_data_reader' should use 'file_dim_reader'
+#'(see documentation on the corresponding parameter), so it is recommended to
+#'code 'file_dim_reader' first.
+#'\cr\cr
+#'This parameter takes by default NcDataReader() (a data reader function
+#'for NetCDF files).
+#'\cr\cr
+#'See NcDataReader() for a template to build a data reader for your own
+#'file format.
+#'
+#'@param file_closer A function that receives as a single parameter
+#'  'file_object' an open connection (as returned by 'file_opener')
+#'  to one of the files to be read, optionally with header information, and
+#'  closes the open connection. Always returns NULL.
+#'\cr\cr
+#'This parameter takes by default NcCloser() (a closer function for NetCDF
+#'files).
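+#'\cr\cr
+#'As a minimal sketch of this interface (assuming, as the default NetCDF
+#'helpers do, that the open connection is an object accepted by
+#'easyNCDF::NcClose(); 'my_file_closer' is a hypothetical name used for
+#'illustration only), a custom file closer could look as follows:
+#'\cr
+#'\command{
+#'\cr # my_file_closer <- function(file_object) {
+#'\cr #   # Close the connection opened by the file opener and return NULL,
+#'\cr #   # as required by the 'file_closer' interface described above.
+#'\cr #   easyNCDF::NcClose(file_object)
+#'\cr #   NULL
+#'\cr # }
+#'}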
+#'\cr\cr
+#'See NcCloser() for a template to build a file closer for your own file
+#'format.
+#'
+#'@param transform A function with the header \code{data_array},
+#'  \code{variables}, \code{file_selectors = NULL}, \code{\dots}. It receives as
+#'  input, through the parameter \code{data_array}, a subset of a
+#'  multidimensional array (as returned by 'file_data_reader'), applies a
+#'  transformation to it and returns it, preserving the number of dimensions but
+#'  potentially modifying their size. This transformation may require data from
+#'  other auxiliary variables, automatically provided to 'transform'
+#'  through the parameter 'variables', in the form of a named list where
+#'  the names are the variable names and the values are (multi)dimensional
+#'  arrays. Which variables need to be sent to 'transform' can be specified
+#'  with the parameter 'transform_vars' in Start(). The parameter
+#'  'file_selectors' will also be provided automatically to
+#'  'transform', containing a named list where the names are the names of
+#'  the file dimensions of the queried data set (see documentation on
+#'  \code{\dots}) and the values are single character strings with the
+#'  components used to build the path to the file the subset being processed
+#'  belongs to. The parameter \code{\dots} will be filled in with other
+#'  additional parameters to adjust the transformation, exactly as provided in
+#'  the call to Start() via the parameter 'transform_params'.
+#'@param transform_params A named list with additional parameters to be sent to
+#'  the 'transform' function (if specified). See documentation on parameter
+#'  'transform' for details.
+#'@param transform_vars A vector of character strings with the names of
+#'  auxiliary variables to be sent to the 'transform' function (if
+#'  specified). All the variables to be sent to 'transform' must also
+#'  have been requested as return variables in the parameter 'return_vars'
+#'  of Start().
+#'@param transform_extra_cells An integer indicating the number of extra indices
+#'  to retrieve from the data set, beyond the requested indices in \code{\dots},
+#'  so that 'transform' has additional information available to properly apply
+#'  whichever transformation (if needed). As many as
+#'  'transform_extra_cells' will be retrieved beyond each of the limits for
+#'  each of those inner dimensions associated to a coordinate variable and sent
+#'  to 'transform' (i.e. present in 'transform_vars'). After
+#'  'transform' has finished, Start() will take again and return a
+#'  subset of the result, for the returned data to fall within the specified
+#'  bounds in \code{\dots}. The default value is 2.
+#'@param apply_indices_after_transform A logical value indicating, when a
+#'  'transform' is specified in Start() and numeric indices are
+#'  provided for any of the inner dimensions that depend on coordinate variables,
+#'  whether these numeric indices should be made effective (retrieved) before
+#'  applying the transformation or after. This flag allows adjusting this
+#'  behaviour. It takes FALSE by default (numeric indices are applied before
+#'  sending data to 'transform').
+#'@param pattern_dims A character string indicating the name of the dimension
+#'  with path pattern specifications (see \code{\dots} for details). If not
+#'  specified, Start() assumes the first provided dimension is the pattern
+#'  dimension, with a warning.
+#'@param metadata_dims A vector of character strings with the names of the file
+#'  dimensions for which to return metadata.
As noted in 'file_data_reader',
+#'  the data reader can optionally return auxiliary data via the attribute
+#'  'variables' of the returned array. Start() by default returns the
+#'  auxiliary data read for only the first file of each source (or data set) in
+#'  the pattern dimension (see \code{\dots} for info on what the pattern
+#'  dimension is). However, it can be configured to return the metadata for all
+#'  the files along any set of file dimensions. The default value is NULL, and
+#'  it will be assigned automatically as parameter 'pattern_dims'.
+#'@param selector_checker A function used internally by Start() to
+#'  translate a set of selectors (values for a dimension associated to a
+#'  coordinate variable) into a set of numeric indices. It takes by default
+#'  SelectorChecker() and, in principle, it should not be required to
+#'  change it for customized file formats. The option to replace it is left open
+#'  for more versatility. See the code of SelectorChecker() for details on
+#'  the inputs, functioning and outputs of a selector checker.
+#'@param merge_across_dims A logical value indicating whether to merge
+#'  dimensions across which another dimension extends (according to the
+#'  '<dim_name>_across' parameters). Takes the value FALSE by default. For
+#'  example, if the dimension 'time' extends across the dimension 'chunk' and
+#'  \code{merge_across_dims = TRUE}, the resulting data array will contain
+#'  only the dimension 'time', with length equal to all the chunks together.
+#'@param merge_across_dims_narm A logical value indicating whether to remove
+#'  the additional NAs from data when parameter 'merge_across_dims' is TRUE.
+#'  It is helpful when the length of the to-be-merged dimension is different
+#'  across another dimension. For example, if the dimension 'time' extends
+#'  across dimension 'chunk', and the time length along the first chunk is 2
+#'  while it is 10 along the second chunk, setting this parameter to TRUE can
+#'  remove the additional 8 NAs at positions 3 to 10. The default value is TRUE,
+#'  but will be automatically turned to FALSE if 'merge_across_dims = FALSE'.
+#'@param split_multiselected_dims A logical value indicating whether to split a
+#'  dimension that has been selected with a multidimensional array of selectors
+#'  into as many dimensions as present in the selector array. The default value
+#'  is FALSE.
+#'@param path_glob_permissive A logical value or an integer specifying for how
+#'  many folder levels in the path pattern, beginning from the end, the shell glob
+#'  expressions must be preserved and worked out for each file. The default
+#'  value is FALSE, which is equivalent to 0. TRUE is equivalent to 1.\cr\cr
+#'When specifying a path pattern for a dataset, it might contain shell glob
+#'expressions. For each dataset, the first file matching the path pattern is
+#'found, and the found file is used to work out fixed values for the glob
+#'expressions that will be used for all the files of the dataset. However, in
+#'some cases, the values of the shell glob expressions may not be constant for
+#'all files in a dataset, and they need to be worked out for each file
+#'involved.\cr\cr
+#'For example, a path pattern could be as follows: \cr
+#'\code{'/path/to/dataset/$var$_*/$date$_*_foo.nc'}. \cr Leaving
+#'\code{path_glob_permissive = FALSE} will trigger automatic seek of the
+#'  contents to replace the asterisks (e.g. the first asterisk matches with
+#'  \code{'bar'} and the second with \code{'baz'}).
The found contents will be
+#'  used for all files in the dataset (in the example, the path pattern will be
+#'  fixed to\cr \code{'/path/to/dataset/$var$_bar/$date$_baz_foo.nc'}). However, if
+#'  any of the files in the dataset have other contents in the position of the
+#'  asterisks, Start() will not find them (in the example, a file like \cr
+#'  \code{'/path/to/dataset/precipitation_bar/19901101_bin_foo.nc'} would not be
+#'  found). Setting \code{path_glob_permissive = 1} would preserve glob
+#'  expressions in the last level (in the example, the fixed path pattern
+#'  would be\cr \code{'/path/to/dataset/$var$_bar/$date$_*_foo.nc'}, and the
+#'  problematic file mentioned before would be found), but of course this would
+#'  slow down the Start() call if the dataset involves a large number of
+#'  files. Setting \code{path_glob_permissive = 2} would leave the original path
+#'  pattern with the original glob expressions in the 1st and 2nd levels (in the
+#'  example, both asterisks would be preserved, thus allowing Start()
+#'  to recognize files such as \cr
+#'  \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr
+#'Note that each glob expression can only represent one possibility (Start()
+#'chooses the first), because \code{*} is not a tag and thus cannot be a
+#'dimension of the output array. Therefore, only one possibility can be
+#'adopted. For example, if \cr
+#'\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr
+#'has two matches:\cr
+#'\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr
+#'\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr
+#'only the first found file will be used.
+#'@param largest_dims_length A logical value or a named integer vector
+#'  indicating if Start() should examine all the files to get the largest
+#'  length of the inner dimensions (TRUE) or use the first valid file of each
+#'  dataset as the returned dimension length (FALSE). Since examining all the
+#'  files could be time-consuming, a vector can be used to explicitly specify
+#'  the expected length of the inner dimensions. For those inner dimensions not
+#'  specified, the first valid file will be used. The default value is FALSE.\cr\cr
+#'  This parameter is useful when the required files don't have consistent
+#'  inner dimension lengths. For example, there are 10 required experimental data
+#'  files of a series of start dates. The data only contain 25 members for the
+#'  first 2 years but 51 members for the later years. If \code{'largest_dims_length = FALSE'},
+#'  the returned member dimension length will be 25 only. The 26th to 51st
+#'  members in the later 8 years will be discarded. If \code{'largest_dims_length = TRUE'},
+#'  the returned member dimension length will be 51. To save resources,
+#'  \code{'largest_dims_length = c(member = 51)'} can also be used.
+#'@param retrieve A logical value indicating whether to retrieve the data
+#'  defined in the Start() call or to explore only its dimension lengths
+#'  and names, and the values for the file and inner dimensions. The default
+#'  value is FALSE.
+#'@param num_procs An integer of number of processes to be created for the
+#'  parallel execution of the retrieval/transformation/arrangement of the
+#'  multiple involved files in a call to Start(). If set to NULL,
+#'  takes the number of available cores (as detected by future::availableCores).
+#'  The default value is 1 (no parallel execution).
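+#'  As an illustration only (a hypothetical call, reusing the 'path_obs' and
+#'  'sdates' objects defined in the examples below), retrieval with 4 parallel
+#'  processes could be requested as follows:\cr
+#'\command{
+#'\cr # data <- Start(dat = list(list(path = path_obs)),
+#'\cr #               var = 'tos',
+#'\cr #               sdate = sdates,
+#'\cr #               retrieve = TRUE,
+#'\cr #               num_procs = 4)
+#'}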
+#'@param ObjectBigmemory A character string to be included as part of the
+#'  bigmemory object name. This parameter is thought to be used internally by the
+#'  chunking capabilities of startR.
+#'@param silent A logical value of whether to display progress messages (FALSE)
+#'  or not (TRUE). The default value is FALSE.
+#'@param debug A logical value of whether to return detailed messages on the
+#'  progress and operations in a Start() call (TRUE) or not (FALSE). The
+#'  default value is FALSE.
+#'
+#'@return If \code{retrieve = TRUE} the involved data is loaded into RAM memory
+#'  and an object of the class 'startR_cube' with the following components is
+#'  returned:\cr
+#'  \item{Data}{
+#'    Multidimensional data array with named dimensions, with the data values
+#'    requested via \code{\dots} and other parameters. This array can potentially
+#'    contain metadata in the attribute 'variables'.
+#'  }
+#'  \item{Variables}{
+#'    Named list of 1 + N components, containing lists of retrieved variables (as
+#'    requested in 'return_vars') common to all the data sources (in the 1st
+#'    component, \code{$common}), and for each of the N data sources (named after
+#'    the source name, as specified in \dots, or, if not specified, \code{$dat1},
+#'    \code{$dat2}, ..., \code{$datN}). Each of the variables is contained in a
+#'    multidimensional array with named dimensions, and potentially with the
+#'    attribute 'variables' with additional auxiliary data.
+#'  }
+#'  \item{Files}{
+#'    Multidimensional character string array with named dimensions. Its dimensions
+#'    are the file dimensions (as requested in \code{\dots}). Each cell in this
+#'    array contains a path to a retrieved file, or NULL if the corresponding
+#'    file was not found.
+#'  }
+#'  \item{NotFoundFiles}{
+#'    Array with the same shape as \code{$Files} but with NULL in the
+#'    positions for which the corresponding file was found, and a path to the
+#'    expected file in the positions for which the corresponding file was not
+#'    found.
+#'  }
+#'  \item{FileSelectors}{
+#'    Multidimensional character string array with named dimensions, with the same
+#'    shape as \code{$Files} and \code{$NotFoundFiles}, which contains the
+#'    components used to build up the paths to each of the files in the data
+#'    sources.
+#'  }
+#'  \item{PatternDim}{
+#'    Character string containing the name of the file pattern dimension.
+#'  }
+#'If \code{retrieve = FALSE} the involved data is not loaded into RAM memory and
+#'an object of the class 'startR_header' with the following components is
+#'  returned:\cr
+#'  \item{Dimensions}{
+#'    Named vector with the dimension lengths and names of the data involved in
+#'    the Start() call.
+#'  }
+#'  \item{Variables}{
+#'    Named list of 1 + N components, containing lists of retrieved variables (as
+#'    requested in 'return_vars') common to all the data sources (in the 1st
+#'    component, \code{$common}), and for each of the N data sources (named after
+#'    the source name, as specified in \dots, or, if not specified, \code{$dat1},
+#'    \code{$dat2}, ..., \code{$datN}). Each of the variables is contained in a
+#'    multidimensional array with named dimensions, and potentially with the
+#'    attribute 'variables' with additional auxiliary data.
+#'  }
+#'  \item{ExpectedFiles}{
+#'    Multidimensional character string array with named dimensions. Its dimensions
+#'    are the file dimensions (as requested in \dots). Each cell in this array
+#'    contains a path to a file to be retrieved (which may exist or not).
+#'  }
+#'  \item{FileSelectors}{
+#'    Multidimensional character string array with named dimensions, with the same
+#'    shape as \code{$Files} and \code{$NotFoundFiles}, which contains the
+#'    components used to build up the paths to each of the files in the data
+#'    sources.
+#'  }
+#'  \item{PatternDim}{
+#'    Character string containing the name of the file pattern dimension.
+#'  }
+#'  \item{StartRCall}{
+#'    List of parameters sent to the Start() call, with the parameter
+#'    'retrieve' set to TRUE. Intended to be called with do.call() in order to
+#'    retrieve the associated data a posteriori.
+#'  }
+#'
+#'@examples
+#'  data_path <- system.file('extdata', package = 'startR')
+#'  path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc')
+#'  sdates <- c('200011', '200012')
+#'  data <- Start(dat = list(list(path = path_obs)),
+#'                var = 'tos',
+#'                sdate = sdates,
+#'                time = 'all',
+#'                latitude = 'all',
+#'                longitude = 'all',
+#'                return_vars = list(latitude = 'dat',
+#'                                   longitude = 'dat',
+#'                                   time = 'sdate'),
+#'                retrieve = FALSE)
+#'
+#'@import bigmemory multiApply parallel abind future
+#'@importFrom utils str
+#'@importFrom stats na.omit setNames
+#'@importFrom ClimProjDiags Subset
+#'@importFrom methods is
+#'@export
+Start <- function(..., # dim = indices/selectors,
+                       # dim_var = 'var',
+                       # dim_reorder = Sort/CircularSort,
+                       # dim_tolerance = number,
+                       # dim_depends = 'file_dim',
+                       # dim_across = 'file_dim',
+                  return_vars = NULL,
+                  synonims = NULL,
+                  file_opener = NcOpener,
+                  file_var_reader = NcVarReader,
+                  file_dim_reader = NcDimReader,
+                  file_data_reader = NcDataReader,
+                  file_closer = NcCloser,
+                  transform = NULL,
+                  transform_params = NULL,
+                  transform_vars = NULL,
+                  transform_extra_cells = 2,
+                  apply_indices_after_transform = FALSE,
+                  pattern_dims = NULL,
+                  metadata_dims = NULL,
+                  selector_checker = SelectorChecker,
+                  merge_across_dims = FALSE,
+                  merge_across_dims_narm = TRUE,
+                  split_multiselected_dims = FALSE,
+                  path_glob_permissive = FALSE,
+                  largest_dims_length = FALSE,
+                  retrieve = FALSE,
+                  num_procs = 1,
+                  ObjectBigmemory = NULL,
+                  silent = FALSE, debug = FALSE) {
+  #, config_file = NULL
+  #dictionary_dim_names = ,
+  #dictionary_var_names =
+
+  # Specify Subset() is from ClimProjDiags
+  Subset <- ClimProjDiags::Subset
+
+  dim_params <- list(...)
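+  # At this point 'dim_params' holds every argument passed through \dots:
+  # the dimension selectors themselves plus their companion specifications
+  # (*_var, *_reorder, *_tolerance, *_depends, *_across). The take_* helpers
+  # below split these companions off one by one before the dimension list is
+  # rebuilt with rebuild_dim_params().
+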
+  # Take *_var parameters apart
+  var_params <- take_var_params(dim_params)
+
+  # Take *_reorder parameters apart
+  dim_reorder_params <- take_var_reorder(dim_params)
+
+  # Take *_tolerance parameters apart
+  tolerance_params_ind <- grep('_tolerance$', names(dim_params))
+  tolerance_params <- dim_params[tolerance_params_ind]
+
+  # Take *_depends parameters apart
+  depending_file_dims <- take_var_depends(dim_params)
+
+  # Take *_across parameters apart
+  inner_dims_across_files <- take_var_across(dim_params)
+
+  # Check merge_across_dims
+  if (!is.logical(merge_across_dims)) {
+    stop("Parameter 'merge_across_dims' must be TRUE or FALSE.")
+  }
+  if (merge_across_dims & is.null(inner_dims_across_files)) {
+    merge_across_dims <- FALSE
+    .warning("Parameter 'merge_across_dims' is changed to FALSE because there is no *_across argument.")
+  }
+
+  # Check merge_across_dims_narm
+  if (!is.logical(merge_across_dims_narm)) {
+    stop("Parameter 'merge_across_dims_narm' must be TRUE or FALSE.")
+  }
+  if (!merge_across_dims & merge_across_dims_narm) {
+    merge_across_dims_narm <- FALSE
+  }
+  # Keep only the dimension parameters in the variable dim_params
+  dim_params <- rebuild_dim_params(dim_params, merge_across_dims,
+                                   inner_dims_across_files)
+  dim_names <- names(dim_params)
+  # Look for chunked dims
+  chunks <- look_for_chunks(dim_params, dim_names)
+
+  # Check pattern_dims
+  # Function found_pattern_dims may change pattern_dims in the .GlobalEnv
+  found_pattern_dim <- found_pattern_dims(pattern_dims, dim_names, var_params,
+                                          dim_params, dim_reorder_params)
+
+  # Check that all *_reorder are NULL or functions, and that they all have
+  # a matching dimension param.
+  i <- 1
+  for (dim_reorder_param in dim_reorder_params) {
+    if (!is.function(dim_reorder_param)) {
+      stop("All '*_reorder' parameters must be functions.")
+    } else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i],
+                                               '_reorder$')[[1]][1], '$'),
+                          names(dim_params)))) {
+      stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter. Found parameter '",
+                  names(dim_reorder_params)[i], "' but no parameter '",
+                  strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "'."))
+      #} else if (!any(grepl(paste0('^', strsplit(names(dim_reorder_params)[i],
+      #                                           '_reorder$')[[1]][1], '$'),
+      #                      names(var_params)))) {
+      #  stop(paste0("All '*_reorder' parameters must be associated to a dimension parameter associated to a ",
+      #              "variable. Found parameter '", names(dim_reorder_params)[i], "' and dimension parameter '",
+      #              strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "' but did not find variable ",
+      #              "parameter '", strsplit(names(dim_reorder_params)[i], '_reorder$')[[1]][1], "_var'."))
+    }
+    i <- i + 1
+  }
+
+  # Check that all *_tolerance are NULL or vectors of character strings, and
+  # that they all have a matching dimension param.
+  i <- 1
+  for (tolerance_param in tolerance_params) {
+    if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i],
+                                        '_tolerance$')[[1]][1], '$'),
+                   names(dim_params)))) {
+      stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter. Found parameter '",
+                  names(tolerance_params)[i], "' but no parameter '",
+                  strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "'."))
+      #} else if (!any(grepl(paste0('^', strsplit(names(tolerance_params)[i],
+      #                                           '_tolerance$')[[1]][1], '$'),
+      #                      names(var_params)))) {
+      #  stop(paste0("All '*_tolerance' parameters must be associated to a dimension parameter associated to a ",
+      #              "variable. Found parameter '", names(tolerance_params)[i], "' and dimension parameter '",
+      #              strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "' but did not find variable ",
+      #              "parameter '", strsplit(names(tolerance_params)[i], '_tolerance$')[[1]][1], "_var'."))
+    }
+    i <- i + 1
+  }
+  # Make the keys of 'tolerance_params' be the name of
+  # the corresponding dimension.
+  if (length(tolerance_params) < 1) {
+    tolerance_params <- NULL
+  } else {
+    names(tolerance_params) <- gsub('_tolerance$', '', names(tolerance_params))
+  }
+
+  # Check metadata_dims
+  if (!is.null(metadata_dims)) {
+    if (any(is.na(metadata_dims))) {
+      metadata_dims <- NULL
+    } else if (!is.character(metadata_dims) || (length(metadata_dims) < 1)) {
+      stop("Parameter 'metadata_dims' must be a vector of at least one character string.")
+    }
+  } else {
+    metadata_dims <- pattern_dims
+  }
+
+  # Check if pattern_dims is the first item in metadata_dims
+  if ((pattern_dims %in% metadata_dims) & metadata_dims[1] != pattern_dims) {
+    metadata_dims <- c(pattern_dims, metadata_dims[-which(metadata_dims == pattern_dims)])
+  }
+  # Check if metadata_dims has more than 2 elements
+  if ((metadata_dims[1] == pattern_dims & length(metadata_dims) > 2)) {
+    .warning(paste0("Parameter 'metadata_dims' has too many elements, which are ",
+                    "redundant. Keeping '", metadata_dims[1], "' and '", metadata_dims[2], "' only."))
+    metadata_dims <- metadata_dims[1:2]
+  } else if (!(pattern_dims %in% metadata_dims) & length(metadata_dims) > 1) {
+    .warning(paste0("Parameter 'metadata_dims' has too many elements, which are ",
+                    "redundant. Keeping '", metadata_dims[1], "' only."))
+    metadata_dims <- metadata_dims[1]
+  }
+
+  # Once the pattern dimension with dataset specifications is found,
+  # the variable 'dat' is mounted with the information of each
+  # dataset.
+  # Take only the datasets for the requested chunk
+  dats_to_take <- get_chunk_indices(length(dim_params[[found_pattern_dim]]),
+                                    chunks[[found_pattern_dim]]['chunk'],
+                                    chunks[[found_pattern_dim]]['n_chunks'],
+                                    found_pattern_dim)
+  dim_params[[found_pattern_dim]] <- dim_params[[found_pattern_dim]][dats_to_take]
+  dat <- dim_params[[found_pattern_dim]]
+  #NOTE: This function creates the object 'dat_names'
+  dat_names <- c()
+  dat <- mount_dat(dat, pattern_dims, found_pattern_dim, dat_names)
+
+  dim_params[[found_pattern_dim]] <- dat_names
+
+  # Reorder inner_dims_across_files (to make the keys be the file dimensions,
+  # and the values be the inner dimensions that go across them).
+ if (!is.null(inner_dims_across_files)) { + # Reorder: example, convert list(ftime = 'chunk', ensemble = 'member', xx = 'chunk') + # to list(chunk = c('ftime', 'xx'), member = 'ensemble') + new_idaf <- list() + for (i in names(inner_dims_across_files)) { + if (!(inner_dims_across_files[[i]] %in% names(new_idaf))) { + new_idaf[[inner_dims_across_files[[i]]]] <- i + } else { + new_idaf[[inner_dims_across_files[[i]]]] <- c(new_idaf[[inner_dims_across_files[[i]]]], i) + } + } + inner_dims_across_files <- new_idaf + } + + # Check return_vars + if (is.null(return_vars)) { + return_vars <- list() + # if (length(var_params) > 0) { + # return_vars <- as.list(paste0(names(var_params), '_var')) + # } else { + # return_vars <- list() + # } + } + if (!is.list(return_vars)) { + stop("Parameter 'return_vars' must be a list or NULL.") + } + if (length(return_vars) > 0 && is.null(names(return_vars))) { + # names(return_vars) <- rep('', length(return_vars)) + stop("Parameter 'return_vars' must be a named list.") + } + i <- 1 + while (i <= length(return_vars)) { + # if (names(return_vars)[i] == '') { + # if (!(is.character(return_vars[[i]]) && (length(return_vars[[i]]) == 1))) { + # stop("The ", i, "th specification in 'return_vars' is malformed.") + # } + # if (!grepl('_var$', return_vars[[i]])) { + # stop("The ", i, "th specification in 'return_vars' is malformed.") + # } + # dim_name <- strsplit(return_vars[[i]], '_var$')[[1]][1] + # if (!(dim_name %in% names(var_params))) { + # stop("'", dim_name, "_var' requested in 'return_vars' but ", + # "no '", dim_name, "_var' specified in the .Load call.") + # } + # names(return_vars)[i] <- var_params[[dim_name]] + # return_vars[[i]] <- found_pattern_dim + # } else + if (length(return_vars[[i]]) > 0) { + if (!is.character(return_vars[[i]])) { + stop("The ", i, "th specification in 'return_vars' is malformed. It ", + "must be a vector of character strings of valid file dimension ", + "names.") + } + } + i <- i + 1 + } + + # Check synonims + if (!is.null(synonims)) { + error <- FALSE + if (!is.list(synonims)) { + error <- TRUE + } + for (synonim_entry in names(synonims)) { + if (!(synonim_entry %in% names(dim_params)) && + !(synonim_entry %in% names(return_vars))) { + error <- TRUE + } + if (!is.character(synonims[[synonim_entry]]) || + length(synonims[[synonim_entry]]) < 1) { + error <- TRUE + } + } + if (error) { + stop("Parameter 'synonims' must be a named list, where the names are ", + "a name of a requested dimension or variable and the values are ", + "vectors of character strings with at least one alternative name ", + " for each dimension or variable in 'synonims'.") + } + } + if (length(unique(names(synonims))) < length(names(synonims))) { + stop("There must not be repeated entries in 'synonims'.") + } + if (length(unique(unlist(synonims))) < length(unlist(synonims))) { + stop("There must not be repeated values in 'synonims'.") + } + # Make that all dims and vars have an entry in synonims, even if only dim_name = dim_name + dim_entries_to_add <- which(!(names(dim_params) %in% names(synonims))) + if (length(dim_entries_to_add) > 0) { + synonims[names(dim_params)[dim_entries_to_add]] <- as.list(names(dim_params)[dim_entries_to_add]) + } + var_entries_to_add <- which(!(names(var_params) %in% names(synonims))) + if (length(var_entries_to_add) > 0) { + synonims[names(var_params)[var_entries_to_add]] <- as.list(names(var_params)[var_entries_to_add]) + } + + # Check if return_vars name is inner dim name. 
If it is synonim, change back to inner dim name + # and return a warning. + use_syn_names <- which(names(return_vars) %in% unlist(synonims) & + !names(return_vars) %in% names(synonims)) + if (!identical(use_syn_names, integer(0))) { + for (use_syn_name in use_syn_names) { + wrong_name <- names(return_vars)[use_syn_name] + names(return_vars)[use_syn_name] <- names(unlist( + lapply(lapply(synonims, '%in%', + names(return_vars)[use_syn_name]), + which))) + .warning(paste0("The name '", wrong_name, "' in parameter 'return_vars' in synonim. ", + "Change it back to the inner dimension name, '", + names(return_vars)[use_syn_name], "'.")) + } + } + + # Check selector_checker + if (is.null(selector_checker) || !is.function(selector_checker)) { + stop("Parameter 'selector_checker' must be a function.") + } + + # Check file_opener + if (is.null(file_opener) || !is.function(file_opener)) { + stop("Parameter 'file_opener' must be a function.") + } + + # Check file_var_reader + if (!is.null(file_var_reader) && !is.function(file_var_reader)) { + stop("Parameter 'file_var_reader' must be a function.") + } + + # Check file_dim_reader + if (!is.null(file_dim_reader) && !is.function(file_dim_reader)) { + stop("Parameter 'file_dim_reader' must be a function.") + } + + # Check file_data_reader + if (is.null(file_data_reader) || !is.function(file_data_reader)) { + stop("Parameter 'file_data_reader' must be a function.") + } + + # Check file_closer + if (is.null(file_closer) || !is.function(file_closer)) { + stop("Parameter 'file_closer' must be a function.") + } + + # Check transform + if (!is.null(transform)) { + if (!is.function(transform)) { + stop("Parameter 'transform' must be a function.") + } + } + + # Check transform_params + if (!is.null(transform_params)) { + if (!is.list(transform_params)) { + stop("Parameter 'transform_params' must be a list.") + } + if (is.null(names(transform_params))) { + stop("Parameter 'transform_params' must be a named list.") + } + } + + # Check transform_vars + if (!is.null(transform_vars)) { + if (!is.character(transform_vars)) { + stop("Parameter 'transform_vars' must be a vector of character strings.") + } + } + if (any(!(transform_vars %in% names(return_vars)))) { + stop("All the variables specified in 'transform_vars' must also be specified in 'return_vars'.") + } + + # Check apply_indices_after_transform + if (!is.logical(apply_indices_after_transform)) { + stop("Parameter 'apply_indices_after_transform' must be either TRUE or FALSE.") + } + aiat <- apply_indices_after_transform + + # Check transform_extra_cells + if (!is.numeric(transform_extra_cells)) { + stop("Parameter 'transform_extra_cells' must be numeric.") + } + transform_extra_cells <- round(transform_extra_cells) + + # Check split_multiselected_dims + if (!is.logical(split_multiselected_dims)) { + stop("Parameter 'split_multiselected_dims' must be TRUE or FALSE.") + } + + # Check path_glob_permissive + if (!is.numeric(path_glob_permissive) && !is.logical(path_glob_permissive)) { + stop("Parameter 'path_glob_permissive' must be TRUE, FALSE or an integer.") + } + if (length(path_glob_permissive) != 1) { + stop("Parameter 'path_glob_permissive' must be of length 1.") + } + + # Check largest_dims_length + if (!is.numeric(largest_dims_length) && !is.logical(largest_dims_length)) { + stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.") + } + if (is.numeric(largest_dims_length)) { + if (any(largest_dims_length %% 1 != 0) | any(largest_dims_length < 0) | 
is.null(names(largest_dims_length))) {
+      stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
+    }
+  }
+  if (is.logical(largest_dims_length) && length(largest_dims_length) != 1) {
+    stop("Parameter 'largest_dims_length' must be TRUE, FALSE or a named integer vector.")
+  }
+
+  # Check retrieve
+  if (!is.logical(retrieve)) {
+    stop("Parameter 'retrieve' must be TRUE or FALSE.")
+  }
+
+  # Check num_procs
+  if (!is.null(num_procs)) {
+    if (!is.numeric(num_procs)) {
+      stop("Parameter 'num_procs' must be numeric.")
+    } else {
+      num_procs <- round(num_procs)
+    }
+  }
+
+  # Check silent
+  if (!is.logical(silent)) {
+    stop("Parameter 'silent' must be logical.")
+  }
+
+  if (!silent) {
+    .message(paste0("Exploring files... This will take a variable amount ",
+                    "of time depending on the issued request and the ",
+                    "performance of the file server..."))
+  }
+
+  if (!is.character(debug)) {
+    dims_to_check <- c('time')
+  } else {
+    dims_to_check <- debug
+    debug <- TRUE
+  }
+
+  ############################## READING FILE DIMS ############################
+  # Check that no unrecognized variables are present in the path patterns
+  # and also that no file dimensions are requested from THREDDS catalogs.
+  # In the meantime, build all the work pieces and look for the
+  # first available file of each dataset.
+  array_of_files_to_load <- NULL
+  array_of_not_found_files <- NULL
+  indices_of_first_files_with_data <- vector('list', length(dat))
+  selectors_of_first_files_with_data <- vector('list', length(dat))
+  dataset_has_files <- rep(FALSE, length(dat))
+  found_file_dims <- vector('list', length(dat))
+  expected_inner_dims <- vector('list', length(dat))
+
+  #print("A")
+  for (i in 1:length(dat)) {
+    #print("B")
+    dat_selectors <- dim_params
+    dat_selectors[[found_pattern_dim]] <- dat_selectors[[found_pattern_dim]][i]
+    dim_vars <- paste0('$', dim_names, '$')
+    file_dims <- which(sapply(dim_vars, grepl, dat[[i]][['path']], fixed = TRUE))
+    if (length(file_dims) > 0) {
+      file_dims <- dim_names[file_dims]
+    }
+    file_dims <- unique(c(pattern_dims, file_dims))
+    found_file_dims[[i]] <- file_dims
+    expected_inner_dims[[i]] <- dim_names[which(!(dim_names %in% file_dims))]
+    # (Check the depending_file_dims).
+    if (any(c(names(depending_file_dims), unlist(depending_file_dims)) %in%
+            expected_inner_dims[[i]])) {
+      stop(paste0("The dimension dependencies specified in ",
+                  "'depending_file_dims' can only be between file ",
+                  "dimensions, but some inner dimensions were found in the ",
+                  "dependencies for '", dat[[i]][['name']], "', which ",
+                  "has the following file dimensions: ",
+                  paste(paste0("'", file_dims, "'"), collapse = ', '), "."))
+    } else {
+      a <- names(depending_file_dims) %in% file_dims
+      b <- unlist(depending_file_dims) %in% file_dims
+      ab <- a & b
+      if (any(!ab)) {
+        .warning(paste0("Detected some dependencies in 'depending_file_dims' with ",
+                        "non-existing dimension names. These will be disregarded."))
+        depending_file_dims <- depending_file_dims[-which(!ab)]
+      }
+      if (any(names(depending_file_dims) == unlist(depending_file_dims))) {
+        depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == unlist(depending_file_dims))]
+      }
+    }
+    # (Check the inner_dims_across_files).
+    if (any(!(names(inner_dims_across_files) %in% file_dims)) ||
+        any(!(unlist(inner_dims_across_files) %in% expected_inner_dims[[i]]))) {
+      stop(paste0("All relationships specified in ",
+                  "'_across' parameters must be between an inner ",
+                  "dimension and a file dimension. 
Found wrong ", + "specification for '", dat[[i]][['name']], "', which ", + "has the following file dimensions: ", + paste(paste0("'", file_dims, "'"), collapse = ', '), + ", and the following inner dimensions: ", + paste(paste0("'", expected_inner_dims[[i]], "'"), + collapse = ', '), ".")) + } + # (Check the return_vars). + j <- 1 + while (j <= length(return_vars)) { + if (any(!(return_vars[[j]] %in% file_dims))) { + if (any(return_vars[[j]] %in% expected_inner_dims[[i]])) { + stop("Found variables in 'return_vars' requested ", + "for some inner dimensions (for dataset '", + dat[[i]][['name']], "'), but variables can only be ", + "requested for file dimensions.") + } else { + stop("Found variables in 'return_vars' requested ", + "for non-existing dimensions.") + } + } + j <- j + 1 + } + # (Check the metadata_dims). + if (!is.null(metadata_dims)) { + if (any(!(metadata_dims %in% file_dims))) { + stop("All dimensions in 'metadata_dims' must be file dimensions.") + } + } + + # Add attributes indicating whether this dimension selector is value or indice + tmp <- lapply(dat_selectors[which(dim_names != pattern_dims)], add_value_indices_flag) + dat_selectors <- c(dat_selectors[pattern_dims], tmp) + + ## Look for _var params that should be requested automatically. + for (dim_name in dim_names[-which(dim_names == pattern_dims)]) { + ## The following code 'rewrites' var_params for all datasets. If providing different + ## path pattern repositories with different file/inner dimensions, var_params might + ## have to be handled for each dataset separately. + if ((attr(dat_selectors[[dim_name]], 'values') || (dim_name %in% c('var', 'variable'))) && + !(dim_name %in% names(var_params)) && !(dim_name %in% file_dims)) { + if (dim_name %in% c('var', 'variable')) { + var_params <- c(var_params, setNames(list('var_names'), dim_name)) + .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", + dim_name, "_var' provided. ", '"', dim_name, "_var = '", + 'var_names', "'", '"', " has been automatically added to ", + "the Start call.")) + } else { + var_params <- c(var_params, setNames(list(dim_name), dim_name)) + .warning(paste0("Found specified values for dimension '", dim_name, "' but no '", + dim_name, "_var' requested. ", '"', dim_name, "_var = '", + dim_name, "'", '"', " has been automatically added to ", + "the Start call.")) + } + } + + if (attr(dat_selectors[[dim_name]], 'indices') & !(dim_name %in% names(var_params))) { + if (dim_name %in% transform_vars) { + var_params <- c(var_params, setNames(list(dim_name), dim_name)) + .warning(paste0("Found dimension '", dim_name, "' is required to transform but no '", + dim_name, "_var' provided. ", '"', dim_name, "_var = '", + dim_name, "'", '"', " has been automatically added to ", + "the Start call.")) + } else if (dim_name %in% names(dim_reorder_params)) { + var_params <- c(var_params, setNames(list(dim_name), dim_name)) + .warning(paste0("Found dimension '", dim_name, "' is required to reorder but no '", + dim_name, "_var' provided. ", '"', dim_name, "_var = '", + dim_name, "'", '"', " has been automatically added to ", + "the Start call.")) + } + } + } + + ## (Check the *_var parameters). 
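+    ## (Hypothetical illustration, not from the original patch: if 'lat_var = "latitude"'
+    ## was requested but 'latitude' is absent from 'return_vars', the check below appends
+    ## it to 'return_vars' and throws a warning.)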
+ if (any(!(unlist(var_params) %in% names(return_vars)))) { + vars_to_add <- which(!(unlist(var_params) %in% names(return_vars))) + new_return_vars <- vector('list', length(vars_to_add)) + names(new_return_vars) <- unlist(var_params)[vars_to_add] + return_vars <- c(return_vars, new_return_vars) + .warning(paste0("All '*_var' params must associate a dimension to one of the ", + "requested variables in 'return_vars'. The following variables", + " have been added to 'return_vars': ", + paste(paste0("'", unlist(var_params), "'"), collapse = ', '))) + } + + # Examine the selectors of file dim and create 'replace_values', which uses the first + # explicit selector (i.e., character) for all file dimensions. + replace_values <- vector('list', length = length(file_dims)) + names(replace_values) <- file_dims + for (file_dim in file_dims) { + if (file_dim %in% names(var_params)) { + .warning(paste0("The '", file_dim, "_var' param will be ignored since '", + file_dim, "' is a file dimension (for the dataset with pattern ", + dat[[i]][['path']], ").")) + } + # If the selector is a vector or a list of 2 without names (represent the value range) + if (!is.list(dat_selectors[[file_dim]]) || + (is.list(dat_selectors[[file_dim]]) && + length(dat_selectors[[file_dim]]) == 2 && + is.null(names(dat_selectors[[file_dim]])))) { + dat_selectors[[file_dim]] <- list(dat_selectors[[file_dim]]) + } + first_class <- class(dat_selectors[[file_dim]][[1]]) + first_length <- length(dat_selectors[[file_dim]][[1]]) + + # Length will be > 1 if it is list since beginning, e.g., depending dim is a list with + # names as depended dim. + for (j in 1:length(dat_selectors[[file_dim]])) { + sv <- selector_vector <- dat_selectors[[file_dim]][[j]] + if (!inherits(sv, first_class) || + !identical(first_length, length(sv))) { + stop("All provided selectors for depending dimensions must ", + "be vectors of the same length and of the same class.") + } + if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) { + #NOTE: ???? It doesn't make any changes. + dat_selectors[[file_dim]][[j]] <- selector_checker(selectors = sv, + return_indices = FALSE) + # Take chunk if needed (only defined dim; undefined dims will be chunked later in + # find_ufd_value(). 
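+          # (Illustration with assumed numbers: for 10 character sdate selectors and
+          # chunks[['sdate']] = c(chunk = 1, n_chunks = 2), get_chunk_indices() would
+          # keep the first 5 sdates for this chunk.)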
+        if (chunks[[file_dim]]['n_chunks'] > 1) {
+          desired_chunk_indices <- get_chunk_indices(
+            length(dat_selectors[[file_dim]][[j]]),
+            chunks[[file_dim]]['chunk'],
+            chunks[[file_dim]]['n_chunks'],
+            file_dim)
+          dat_selectors[[file_dim]][[j]] <- dat_selectors[[file_dim]][[j]][desired_chunk_indices]
+          # chunk the depending dim as well
+          if (file_dim %in% depending_file_dims) {
+            depending_dim_name <- names(which(file_dim == depending_file_dims))
+            # Chunk it only if it is a defined dim (i.e., a list of character vectors named after the depended dim)
+            if (!(length(dat_selectors[[depending_dim_name]]) == 1 &&
+                  dat_selectors[[depending_dim_name]] %in% c('all', 'first', 'last'))) {
+              if (any(sapply(dat_selectors[[depending_dim_name]], is.character))) {
+                dat_selectors[[depending_dim_name]] <-
+                  dat_selectors[[depending_dim_name]][desired_chunk_indices]
+              }
+            }
+          }
+        }
+      } else if (!(is.numeric(sv) ||
+                   (is.character(sv) && (length(sv) == 1) && (sv %in% c('all', 'first', 'last'))) ||
+                   (is.list(sv) && (length(sv) == 2) && (all(sapply(sv, is.character)) ||
+                                                         all(sapply(sv, is.numeric)))))) {
+        stop("All explicitly provided selectors for file dimensions must be character strings.")
+      }
+    }
+    sv <- dat_selectors[[file_dim]][[1]]
+    # 'replace_values' has the first selector (if it's character) or NULL (if it's not explicitly
+    # defined) for each file dimension.
+    if (is.character(sv) && !((length(sv) == 1) && (sv[1] %in% c('all', 'first', 'last')))) {
+      replace_values[[file_dim]] <- sv[1]
+    }
+  }
+  #print("C")
+  # Now we know which dimensions have selectors provided non-explicitly.
+  undefined_file_dims <- file_dims[which(sapply(replace_values, is.null))]
+  defined_file_dims <- file_dims[which(!(file_dims %in% undefined_file_dims))]
+  # Quickly check if the depending dimensions are provided properly. The check only applies
+  # if the depending and depended file dims are both explicitly defined.
+  for (file_dim in file_dims) {
+    if (file_dim %in% names(depending_file_dims)) {
+
+      # Return an error if the depended dim is a list of values while the depending dim is not
+      # defined (i.e., indices or 'all')
+      if (file_dim %in% defined_file_dims &
+          !(depending_file_dims[[file_dim]] %in% defined_file_dims)) {
+        stop(paste0("The depended dimension, ", file_dim, ", is explicitly defined ",
+                    "by a list of values, while the depending dimension, ",
+                    depending_file_dims[[file_dim]], ", is not explicitly defined. ",
+                    "Specify ", depending_file_dims[[file_dim]], " with character strings."))
+      }
+
+      ## TODO: Detect multi-dependencies and forbid.
+      #NOTE: The if statement below is tricky. It tries to distinguish whether the depending dim
+      #      has the depended dim as the names of the list. However, if the depending dim
+      #      doesn't have list names and its length is 2 (i.e., list( , )), Start() thinks
+      #      it means the range, just like `lat = values(list(10, 20))`. Because of this,
+      #      we won't enter the following if statement, and the error will occur later in
+      #      SelectorChecker(). Need to find a way to distinguish if list( , ) means a range or
+      #      just the values.
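+      # (For illustration, assumed names: chunk = list('2000' = c(...), '2001' = c(...))
+      # gives one selector vector per depended sdate, while lat = values(list(10, 20))
+      # is read as a range; without list names the two cases are currently
+      # indistinguishable, as noted above.)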
+      if (all(c(file_dim, depending_file_dims[[file_dim]]) %in% defined_file_dims)) {
+        if (length(dat_selectors[[file_dim]]) != length(dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) {
+          stop(paste0("If providing selectors for the depending ",
+                      "dimension '", file_dim, "', a ",
+                      "vector of selectors must be provided for ",
+                      "each selector of the dimension it depends on, '",
+                      depending_file_dims[[file_dim]], "'."))
+        } else if (!all(names(dat_selectors[[file_dim]]) == dat_selectors[[depending_file_dims[[file_dim]]]][[1]])) {
+          stop(paste0("If providing selectors for the depending ",
+                      "dimension '", file_dim, "', the names of the ",
+                      "provided vectors of selectors must match ",
+                      "exactly the selectors of the dimension it ",
+                      "depends on, '", depending_file_dims[[file_dim]], "'."))
+        } else if (is.null(names(dat_selectors[[file_dim]]))) {
+          .warning(paste0("The selectors for the depending dimension '", file_dim, "' do not ",
+                          "have list names. Assuming that the order of the selectors matches ",
+                          "the order of the depended dimension '", depending_file_dims[[file_dim]], "'."))
+        }
+      }
+    }
+  }
+
+  # Find the possible values for the selectors that are provided as
+  # indices. If the requested files are on a remote server, this operation is impossible.
+  if (length(grep("^http", dat[[i]][['path']])) > 0) {
+    if (length(undefined_file_dims) > 0) {
+      stop(paste0("All selectors for the file dimensions must be ",
+                  "character strings if requesting data from a remote ",
+                  "server. Found invalid selectors for the file dimensions ",
+                  paste(paste0("'", undefined_file_dims, "'"), collapse = ', '), "."))
+    }
+    dataset_has_files[i] <- TRUE
+  } else {
+    dat[[i]][['path']] <- path.expand(dat[[i]][['path']])
+    # Iterate over the known dimensions to find the first existing file.
+    # The path to the first existing file will be used to find the
+    # values for the non-explicitly defined selectors.
+    first_file <- NULL
+    first_file_selectors <- NULL
+    if (length(undefined_file_dims) > 0) {
+      replace_values[undefined_file_dims] <- '*'
+    }
+    ## TODO: What if length of defined_file_dims is 0? The code might crash (in practice it worked for an example case)
+    files_to_check <- sapply(dat_selectors[defined_file_dims], function(x) length(x[[1]]))
+    sub_array_of_files_to_check <- array(1:prod(files_to_check), dim = files_to_check)
+    j <- 1
+    #print("D")
+    while (j <= prod(files_to_check) && is.null(first_file)) {
+      selector_indices <- which(sub_array_of_files_to_check == j, arr.ind = TRUE)[1, ]
+      selectors <- sapply(1:length(defined_file_dims),
+                          function (x) {
+                            vector_to_pick <- 1
+                            if (defined_file_dims[x] %in% names(depending_file_dims)) {
+                              vector_to_pick <- selector_indices[which(defined_file_dims == depending_file_dims[[defined_file_dims[x]]])]
+                            }
+                            dat_selectors[defined_file_dims][[x]][[vector_to_pick]][selector_indices[x]]
+                          })
+      replace_values[defined_file_dims] <- selectors
+      file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
+      file_path <- Sys.glob(file_path)
+      if (length(file_path) > 0) {
+        first_file <- file_path[1]
+        first_file_selectors <- selectors
+      }
+      j <- j + 1
+    }
+    #print("E")
+    # Start looking for values for the non-explicitly defined selectors.
+    if (is.null(first_file)) {
+      .warning(paste0("No files found for the dataset '", dat[[i]][['name']],
+                      "'. 
Provide existing selectors for the file dimensions ", + " or check and correct its path pattern: ", dat[[i]][['path']])) + } else { + dataset_has_files[i] <- TRUE + ## TODO: Improve message here if no variable found: + if (length(undefined_file_dims) > 0) { + # Note: "dat[[i]][['path']]" is changed by the function below. + dat_selectors <- find_ufd_value(undefined_file_dims, dat, i, replace_values, + first_file, file_dims, path_glob_permissive, + depending_file_dims, dat_selectors, selector_checker, + chunks) + #print("I") + } else { + #NOTE: If there is no non-explicitly defined dim, use the first found file + # to modify. Problem: '*' doesn't catch all the possible file. Only use + # the first file. + dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values, + defined_file_dims, dat[[i]][['name']], path_glob_permissive) + } + } + } + dat[[i]][['selectors']] <- dat_selectors + + # Now fetch for the first available file + if (dataset_has_files[i]) { + known_dims <- file_dims + } else { + known_dims <- defined_file_dims + } + replace_values <- vector('list', length = length(known_dims)) + names(replace_values) <- known_dims + files_to_load <- sapply(dat_selectors[known_dims], function(x) length(x[[1]])) + files_to_load[found_pattern_dim] <- 1 + sub_array_of_files_to_load <- array(1:prod(files_to_load), + dim = files_to_load) + names(dim(sub_array_of_files_to_load)) <- known_dims + sub_array_of_not_found_files <- array(!dataset_has_files[i], + dim = files_to_load) + names(dim(sub_array_of_not_found_files)) <- known_dims + + if (largest_dims_length) { + if (!exists('selector_indices_save')) { + selector_indices_save <- vector('list', length = length(dat)) + } + if (!exists('selectors_total_list')) { + selectors_total_list <- vector('list', length = length(dat)) + } + selector_indices_save[[i]] <- vector('list', length = prod(files_to_load)) + selectors_total_list[[i]] <- vector('list', length = prod(files_to_load)) + } + + j <- 1 + # NOTE: This while loop has these objects that are used afterward: 'sub_array_of_files_to_load', + # 'sub_array_of_not_found_files', 'indices_of_first_files_with_data', 'selectors_of_first_files_with_data'; + # 'selector_indices_save' and 'selectors_total_list' are used if 'largest_dims_length = T'. 
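+    # (Sketch of one iteration, illustrative dims: with known_dims = c('dat', 'sdate'),
+    # each j corresponds to one cell of 'sub_array_of_files_to_load'; one selector per
+    # known dim is picked and substituted into the path pattern tags to build that
+    # file's path.)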
+ while (j <= prod(files_to_load)) { + selector_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ] + names(selector_indices) <- known_dims + + if (largest_dims_length) { + tmp <- selector_indices + tmp[which(known_dims == found_pattern_dim)] <- i + selector_indices_save[[i]][[j]] <- tmp + } + + # This 'selectors' is only used in this while loop + selectors <- sapply(1:length(known_dims), + function (x) { + vector_to_pick <- 1 + if (known_dims[x] %in% names(depending_file_dims)) { + vector_to_pick <- selector_indices[which(known_dims == depending_file_dims[[known_dims[x]]])] + } + dat_selectors[known_dims][[x]][[vector_to_pick]][selector_indices[x]] + }) + names(selectors) <- known_dims + + if (largest_dims_length) { + selectors_total_list[[i]][[j]] <- selectors + names(selectors_total_list[[i]][[j]]) <- known_dims + } + + # 'replace_values' and 'file_path' are only used in this while loop + replace_values[known_dims] <- selectors + if (!dataset_has_files[i]) { + if (any(is.na(selectors))) { + replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))] + } + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE) + sub_array_of_files_to_load[j] <- file_path + #sub_array_of_not_found_files[j] <- TRUE??? + } else { + if (any(is.na(selectors))) { + replace_values <- replace_values[-which(names(replace_values) %in% names(selectors[which(is.na(selectors))]))] + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values, TRUE) + sub_array_of_files_to_load[j] <- file_path + sub_array_of_not_found_files[j] <- TRUE + } else { + file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + + #NOTE: After replacing tags, there is still * if path_glob_permissive is not FALSE. + # Find the possible value to substitute *. + if (grepl('\\*', file_path)) { + found_files <- Sys.glob(file_path) + file_path <- found_files[1] # choose only the first file. + #NOTE: Above line chooses only the first found file. Because * is not tags, which means + # it is not a dimension. So it cannot store more than one item. If use * to define + # the path, that * should only represent one possibility. + if (length(found_files) > 1) { + .warning("Using glob expression * to define the path, but more ", + "than one match is found. 
Choosing the first match only.")
+            }
+          }
+
+          if (!(length(grep("^http", file_path)) > 0)) {
+            # grepl() takes (pattern, x); check whether the path still contains a glob '*'.
+            if (grepl('*', file_path, fixed = TRUE)) {
+              file_path_full <- Sys.glob(file_path)[1]
+              if (nchar(file_path_full) > 0) {
+                file_path <- file_path_full
+              }
+            }
+          }
+          sub_array_of_files_to_load[j] <- file_path
+          if (is.null(indices_of_first_files_with_data[[i]])) {
+            if (!(length(grep("^http", file_path)) > 0)) {
+              if (!file.exists(file_path)) {
+                file_path <- NULL
+              }
+            }
+            if (!is.null(file_path)) {
+              test_file <- NULL
+              ## TODO: suppress error messages
+              test_file <- file_opener(file_path)
+              if (!is.null(test_file)) {
+                selector_indices[which(known_dims == found_pattern_dim)] <- i
+                indices_of_first_files_with_data[[i]] <- selector_indices
+                selectors_of_first_files_with_data[[i]] <- selectors
+                file_closer(test_file)
+              }
+            }
+          }
+        }
+      }
+      j <- j + 1
+    }
+    # Extend array as needed progressively
+    if (is.null(array_of_files_to_load)) {
+      array_of_files_to_load <- sub_array_of_files_to_load
+      array_of_not_found_files <- sub_array_of_not_found_files
+    } else {
+      array_of_files_to_load <- .MergeArrays(array_of_files_to_load, sub_array_of_files_to_load,
+                                             along = found_pattern_dim)
+      ## TODO: file_dims, and variables like that... are they still ok now? I don't think so
+      array_of_not_found_files <- .MergeArrays(array_of_not_found_files, sub_array_of_not_found_files,
+                                               along = found_pattern_dim)
+    }
+  }
+  if (all(sapply(indices_of_first_files_with_data, is.null))) {
+    stop("No data files found for any of the specified datasets.")
+  }
+
+  ########################### READING INNER DIMS. #############################
+  #print("J")
+  ## TODO: To be run in parallel (local multi-core)
+  # Now time to work out the inner file dimensions.
+  # First pick the requested variables.
+
+#//// This part is moved below the new code////
+# NOTE: 'dims_to_iterate' only considers common_return_vars. Moving it below can save some work
+# and use the revised common_return_vars if it is changed.
+#  dims_to_iterate <- NULL
+#  for (return_var in names(return_vars)) {
+#    dims_to_iterate <- unique(c(dims_to_iterate, return_vars[[return_var]]))
+#  }
+#  if (found_pattern_dim %in% dims_to_iterate) {
+#    dims_to_iterate <- dims_to_iterate[-which(dims_to_iterate == found_pattern_dim)]
+#  }
+#//////////////////////////////////////////////
+
+  # Separate 'return_vars' into 'common_return_vars' and 'return_vars' (those = 'dat').
+  common_return_vars <- NULL
+  common_first_found_file <- NULL
+  common_return_vars_pos <- NULL
+  if (length(return_vars) > 0) {
+    common_return_vars_pos <- which(sapply(return_vars, function(x) !(found_pattern_dim %in% x)))
+  }
+  if (length(common_return_vars_pos) > 0) {
+    common_return_vars <- return_vars[common_return_vars_pos]
+    return_vars <- return_vars[-common_return_vars_pos]
+    common_first_found_file <- rep(FALSE, length(which(sapply(common_return_vars, length) == 0)))
+    names(common_first_found_file) <- names(common_return_vars[which(sapply(common_return_vars, length) == 0)])
+  }
+
+#NOTE: Check here. The common entries have been removed from 'return_vars', and here the
+#      'dat' value is removed as well (why?). It seems to provide some benefit to later parts.
+  return_vars <- lapply(return_vars,
+                        function(x) {
+                          if (found_pattern_dim %in% x) {
+                            x[-which(x == found_pattern_dim)]
+                          } else {
+                            x
+                          }
+                        })
+#////////////////////////////////////////////
+  # Force return_vars = (time = NULL) to (time = 'sdate') if (1) selector = [sdate = 2, time = 4] or
+  # (2) time_across = 'sdate'.
+  # NOTE: Not sure if the loop over dat is needed here. 
In theory, all the dat + # should have the same dimensions (?) so expected_inner_dims and + # found_file_dims are the same. The selector_array may possible be + # different, but then the attribute will be correct? If it's different, + # it should depend on 'dat' (but here we only consider common_return_vars) + for (i in 1:length(dat)) { + for (inner_dim in expected_inner_dims[[i]]) { + # The selectors for the inner dimension are taken. + selector_array <- dat[[i]][['selectors']][[inner_dim]] + file_dim_as_selector_array_dim <- 1 + + if (any(found_file_dims[[i]] %in% names(dim(selector_array)))) { + file_dim_as_selector_array_dim <- + found_file_dims[[i]][which(found_file_dims[[i]] %in% names(dim(selector_array)))] + } + if (inner_dim %in% inner_dims_across_files | + is.character(file_dim_as_selector_array_dim)) { #(2) or (1) + # inner_dim is not in return_vars or is NULL + need_correct <- FALSE + if (((!inner_dim %in% names(common_return_vars)) & + (!inner_dim %in% names(return_vars))) | + (inner_dim %in% names(common_return_vars) & + is.null(common_return_vars[[inner_dim]]))) { + need_correct <- TRUE + } else if (inner_dim %in% names(common_return_vars) & + (inner_dim %in% inner_dims_across_files) & + !is.null(names(inner_dims_across_files))) { #(2) + if (!names(inner_dims_across_files) %in% common_return_vars[[inner_dim]]) need_correct <- TRUE + + } else if (inner_dim %in% names(common_return_vars) & + is.character(file_dim_as_selector_array_dim)) { #(1) + if (!all(file_dim_as_selector_array_dim %in% common_return_vars[[inner_dim]])) { + need_correct <- TRUE + file_dim_as_selector_array_dim <- file_dim_as_selector_array_dim[which(!file_dim_as_selector_array_dim %in% common_return_vars[[inner_dim]])] + } + } + if (need_correct) { + common_return_vars[[inner_dim]] <- + c(common_return_vars[[inner_dim]], + correct_return_vars(inner_dim, inner_dims_across_files, + found_pattern_dim, file_dim_as_selector_array_dim)) + } + } + } + } + + # Return info about return_vars when dat > 1 + if (length(dat) > 1 & length(common_return_vars) > 0) { + .message("\n", "[ATTENTION]", + paste0("According to parameter 'return_vars', the inner dimensions: ", + paste(names(common_return_vars), collapse = ', '), + ", are common among all the datasets. Please be sure that ", + "this is expected to avoid potential wrong results, and ", + "verify the outputs carefully."), + "\n", indent = 1) + } + +#//////////////////////////////////////////// + +# This part was above where return_vars is seperated into return_vars and common_return_vars +# NOTE: 'dims_to_iterate' only consider common_return_vars. Move to here can save some work +# and get the revised common_return_vars if it is changed in the part right above. + dims_to_iterate <- NULL + for (common_return_var in names(common_return_vars)) { + dims_to_iterate <- unique(c(dims_to_iterate, common_return_vars[[common_return_var]])) + } +#//////////////////////////////////////////// + + # Change the structure of 'dat'. If the selector is not list or it is list of 2 that represents + # range, make it as list. The dimensions that go across files will later be extended to have + # lists of lists/vectors of selectors. 
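+  # (E.g., an unnamed selector vector such as time = indices(1:12) is wrapped below as
+  # list(indices(1:12)), so every inner dimension selector becomes a list of vectors.)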
+ for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + for (inner_dim in expected_inner_dims[[i]]) { + if (!is.list(dat[[i]][['selectors']][[inner_dim]]) || # not list, or + (is.list(dat[[i]][['selectors']][[inner_dim]]) && # list of 2 that represents range + length(dat[[i]][['selectors']][[inner_dim]]) == 2 && + is.null(names(dat[[i]][['selectors']][[inner_dim]])))) { + dat[[i]][['selectors']][[inner_dim]] <- list(dat[[i]][['selectors']][[inner_dim]]) + } + } + } + } + + + # Use 'common_return_vars' and 'return_vars' to generate the initial picked(_common)_vars, + # picked(_common)_vars_ordered, and picked(_common)_vars_unorder_indices. + ## Create 'picked_common_vars' + if (length(common_return_vars) > 0) { + picked_common_vars <- vector('list', length = length(common_return_vars)) + names(picked_common_vars) <- names(common_return_vars) + } else { + picked_common_vars <- NULL + } + picked_common_vars_ordered <- picked_common_vars + picked_common_vars_unorder_indices <- picked_common_vars + + ## Create 'picked_vars' + picked_vars <- vector('list', length = length(dat)) + names(picked_vars) <- dat_names + if (length(return_vars) > 0) { + picked_vars <- lapply(picked_vars, function(x) { + x <- vector('list', length = length(return_vars))} ) + picked_vars <- lapply(picked_vars, setNames, names(return_vars)) + } + picked_vars_ordered <- picked_vars + + picked_vars_unorder_indices <- picked_vars + + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + indices_of_first_file <- as.list(indices_of_first_files_with_data[[i]]) + array_file_dims <- sapply(dat[[i]][['selectors']][found_file_dims[[i]]], function(x) length(x[[1]])) + names(array_file_dims) <- found_file_dims[[i]] + if (length(dims_to_iterate) > 0) { + indices_of_first_file[dims_to_iterate] <- lapply(array_file_dims[dims_to_iterate], function(x) 1:x) + } + array_of_var_files <- do.call('[', c(list(x = array_of_files_to_load), indices_of_first_file, list(drop = FALSE))) + array_of_var_indices <- array(1:length(array_of_var_files), dim = dim(array_of_var_files)) + array_of_not_found_var_files <- do.call('[', c(list(x = array_of_not_found_files), indices_of_first_file, list(drop = FALSE))) + # Create previous_indices. The initial value is -1 because there is no 'previous' before the + # 1st current_indices. + previous_indices <- rep(-1, length(indices_of_first_file)) + names(previous_indices) <- names(indices_of_first_file) + # Create first_found_file for vars_to_read defining. It is for the dim value in return_vars + # that is NULL or character(0). Because these dims only need to be read once, so + # first_found_file indicates if these dims have been read or not. + # If read, it turns to TRUE and won't be included in vars_to_read again in the next + # 'for j loop'. 
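+      # (Illustrative case: a 'return_vars' entry such as lon = NULL (length 0) gets a
+      # 'first_found_file' flag and is read from a single file, while an entry like
+      # time = 'sdate' is re-read whenever its 'sdate' index changes.)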
+ first_found_file <- NULL + if (length(return_vars) > 0) { + first_found_file <- rep(FALSE, length(which(sapply(return_vars, length) == 0))) + names(first_found_file) <- names(return_vars[which(sapply(return_vars, length) == 0)]) + } + + for (j in 1:length(array_of_var_files)) { + current_indices <- which(array_of_var_indices == j, arr.ind = TRUE)[1, ] + names(current_indices) <- names(indices_of_first_file) + if (!is.na(array_of_var_files[j]) && !array_of_not_found_var_files[j]) { + changed_dims <- which(current_indices != previous_indices) + # Prepare vars_to_read for this dataset (i loop) and this file (j loop) + vars_to_read <- generate_vars_to_read(return_vars, changed_dims, first_found_file, + common_return_vars, common_first_found_file, i) + + file_object <- file_opener(array_of_var_files[j]) + if (!is.null(file_object)) { + for (var_to_read in vars_to_read) { + if (var_to_read %in% unlist(var_params)) { + associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)] + } + var_name_to_reader <- var_to_read + names(var_name_to_reader) <- 'var' + var_dims <- file_dim_reader(NULL, file_object, var_name_to_reader, NULL, + synonims) + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(var_dims) <- replace_with_synonmins(var_dims, synonims) + if (!is.null(var_dims)) { + + ## (1) common_return_vars + if (var_to_read %in% names(common_return_vars)) { + var_to_check <- common_return_vars[[var_to_read]] + list_picked_var_of_read <- generate_picked_var_of_read( + var_to_read, var_to_check, array_of_files_to_load, var_dims, + array_of_var_files = array_of_var_files[j], file_var_reader, + file_object, synonims, associated_dim_name, dim_reorder_params, + aiat, current_indices, var_params, + either_picked_vars = picked_common_vars[[var_to_read]], + either_picked_vars_ordered = picked_common_vars_ordered[[var_to_read]], + either_picked_vars_unorder_indices = picked_common_vars_unorder_indices[[var_to_read]] + ) + picked_common_vars[[var_to_read]] <- list_picked_var_of_read$either_picked_vars + picked_common_vars_ordered[[var_to_read]] <- + list_picked_var_of_read$either_picked_vars_ordered + picked_common_vars_unorder_indices[[var_to_read]] <- + list_picked_var_of_read$either_picked_vars_unorder_indices + + ## (2) return_vars + } else { + var_to_check <- return_vars[[var_to_read]] + list_picked_var_of_read <- generate_picked_var_of_read( + var_to_read, var_to_check, array_of_files_to_load, var_dims, + array_of_var_files = array_of_var_files[j], file_var_reader, + file_object, synonims, associated_dim_name, dim_reorder_params, + aiat, current_indices, var_params, + either_picked_vars = picked_vars[[i]][[var_to_read]], + either_picked_vars_ordered = picked_vars_ordered[[i]][[var_to_read]], + either_picked_vars_unorder_indices = picked_vars_unorder_indices[[i]][[var_to_read]] + ) + picked_vars[[i]][[var_to_read]] <- list_picked_var_of_read$either_picked_vars + picked_vars_ordered[[i]][[var_to_read]] <- + list_picked_var_of_read$either_picked_vars_ordered + picked_vars_unorder_indices[[i]][[var_to_read]] <- + list_picked_var_of_read$either_picked_vars_unorder_indices + } + if (var_to_read %in% names(first_found_file)) { + first_found_file[var_to_read] <- TRUE + } + if (var_to_read %in% names(common_first_found_file)) { + common_first_found_file[var_to_read] <- TRUE + } + } else { + stop("Could not find variable '", var_to_read, + "' in the file ", array_of_var_files[j]) + } + } + file_closer(file_object) + } + } + 
previous_indices <- current_indices + } + } + } + # Once we have the variable values, we can work out the indices + # for the implicitly defined selectors. + + beta <- transform_extra_cells + dims_to_crop <- vector('list') + transformed_vars <- vector('list', length = length(dat)) + names(transformed_vars) <- dat_names + transformed_vars_ordered <- transformed_vars + transformed_vars_unorder_indices <- transformed_vars + transformed_common_vars <- NULL + transformed_common_vars_ordered <- NULL + transformed_common_vars_unorder_indices <- NULL + transform_crop_domain <- NULL + + # store warning messages from transform + warnings1 <- NULL + warnings2 <- NULL + + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + indices <- indices_of_first_files_with_data[[i]] + if (!is.null(indices)) { + #////////////////////////////////////////////////// + # Find data_dims + ## If largest_dims_length is a number & !merge_across_dims, + ## directly assign this dim as the number; + ## If largest_dims_length is a number & this dim is across files, find the dim length of each file + find_largest_dims_length_by_files <- FALSE + if (is.numeric(largest_dims_length)) { + if (names(largest_dims_length) %in% inner_dims_across_files) { + find_largest_dims_length_by_files <- TRUE + } + } else if (largest_dims_length) { + find_largest_dims_length_by_files <- TRUE + } + + if (!find_largest_dims_length_by_files) { # old code + file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) + # The following 5 lines should go several lines below, but were moved + # here for better performance. + # If any of the dimensions comes without defining variable, then we read + # the data dimensions. + data_dims <- NULL +# if (length(unlist(var_params[expected_inner_dims[[i]]])) < length(expected_inner_dims[[i]])) { + file_to_open <- file_path + data_dims <- file_dim_reader(file_to_open, NULL, selectors_of_first_files_with_data[[i]], + lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1), + synonims) + # file_dim_reader returns dimension names as found in the file. + # Need to translate accoridng to synonims: + names(data_dims) <- replace_with_synonmins(data_dims, synonims) +# } + + if (is.numeric(largest_dims_length)) { # largest_dims_length is a named vector + # Check if the names fit the inner dimension names + if (!all(names(largest_dims_length) %in% names(data_dims))) { + #NOTE: stop or warning? + stop("Parameter 'largest_dims_length' has inconsistent names with inner dimensions.") + } else { + match_ind <- match(names(largest_dims_length), names(data_dims)) + data_dims[match_ind] <- largest_dims_length + } + } + + } else { + ## largest_dims_length = TRUE, or is a number & merge_across_dims is across this dim + tmp <- find_largest_dims_length( + selectors_total_list[[i]], array_of_files_to_load, + selector_indices_save[[i]], dat[[i]], expected_inner_dims[[i]], + synonims, file_dim_reader) + data_dims <- tmp$largest_data_dims + # 'data_dims_each_file' is used when merge_across_dims = TRUE & + # the files have different length of inner dim. + data_dims_each_file <- tmp$data_dims_all_files + + # file_dim_reader returns dimension names as found in the file. 
+ # Need to translate accoridng to synonims: + names(data_dims) <- replace_with_synonmins(data_dims, synonims) + + } # end if (largest_dims_length == TRUE) + #////////////////////////////////////////////////// + + # Some dimension is defined in Start() call but doesn't exist in data + if (!all(expected_inner_dims[[i]] %in% names(data_dims))) { + tmp <- expected_inner_dims[[i]][which(!expected_inner_dims[[i]] %in% names(data_dims))] + stop("Could not find the dimension '", tmp, "' in the file. Either ", + "change the dimension name in your request, adjust the ", + "parameter 'dim_names_in_files' or fix the dimension name in ", + "the file.\n", file_path) + } + # Not all the inner dims are defined in Start() call + if (!all(names(data_dims) %in% expected_inner_dims[[i]])) { + tmp <- names(data_dims)[which(!names(data_dims) %in% expected_inner_dims[[i]])] + if (data_dims[tmp] != 1) { + stop("The dimension '", tmp, "' is found in the file ", file_path, + " but not defined in the Start call.") + } + } + + + #/////////////////////////////////////////////////////////////////// + # Transform the variables if needed and keep them apart. + if (!is.null(transform) && (length(transform_vars) > 0)) { + if (!all(transform_vars %in% c(names(picked_vars[[i]]), names(picked_common_vars)))) { + stop("Could not find all the required variables in 'transform_vars' ", + "for the dataset '", dat[[i]][['name']], "'.") + } + + vars_to_transform <- NULL + # picked_vars[[i]] + vars_to_transform <- generate_vars_to_transform(vars_to_transform, picked_vars[[i]], transform_vars, picked_vars_ordered[[i]]) + # picked_common_vars + vars_to_transform <- generate_vars_to_transform(vars_to_transform, picked_common_vars, transform_vars, picked_common_vars_ordered) + + # Save the crop domain from selectors of transformed vars + # PROB: It doesn't consider aiat. If aiat, the indices are for + # after transformed data; we don't know the corresponding + # values yet. 
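+        # (Illustration, assumed selector: lat = values(list(-30, 30)) would yield
+        # transform_crop_domain$lat = c(-30, 30), later handed to the transform
+        # (e.g., CDORemapper) through its 'crop_domain' argument.)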
+ transform_crop_domain <- vector('list') + for (transform_var in transform_vars) { + transform_crop_domain[[transform_var]] <- dat[[i]][['selectors']][[transform_var]][[1]] + # Turn indices into values + if (attr(transform_crop_domain[[transform_var]], 'indices')) { + if (transform_var %in% names(common_return_vars)) { + if (transform_var %in% names(dim_reorder_params)) { + transform_crop_domain[[transform_var]] <- + generate_transform_crop_domain_values( + transform_crop_domain[[transform_var]], + picked_vars = picked_common_vars_ordered[[transform_var]], + transform_var) + } else { + transform_crop_domain[[transform_var]] <- + generate_transform_crop_domain_values( + transform_crop_domain[[transform_var]], + picked_vars = picked_common_vars[[transform_var]], + transform_var) + } + } else { # return_vars + if (transform_var %in% names(dim_reorder_params)) { + transform_crop_domain[[transform_var]] <- + generate_transform_crop_domain_values( + transform_crop_domain[[transform_var]], + picked_vars = picked_vars_ordered[[i]][[transform_var]], + transform_var) + } else { + transform_crop_domain[[transform_var]] <- + generate_transform_crop_domain_values( + transform_crop_domain[[transform_var]], + picked_vars = picked_vars[[i]][[transform_var]], + transform_var) + } + } + } else if (is.atomic(transform_crop_domain[[transform_var]])) { + # if it is values but vector + transform_crop_domain[[transform_var]] <- + c(transform_crop_domain[[transform_var]][1], + tail(transform_crop_domain[[transform_var]], 1)) + } + + # For CDORemapper (not sure if it's also suitable for other transform functions): + # If lon_reorder is not used + lon selector is from big to small, + # lonmax and lonmin need to be exchanged. The ideal way is to + # exchange in CDORemapper(), but lon_reorder is used or not is not + # known by CDORemapper(). + # NOTE: lat's order doesn't matter, big to small and small to big + # both work. Since we shouldn't assume transform_var in Start(), + # e.g., transform_var can be anything transformable in the assigned transform function, + # we exchange whichever parameter here anyway. + if (!transform_var %in% names(dim_reorder_params) & + diff(unlist(transform_crop_domain[[transform_var]])) < 0) { + transform_crop_domain[[transform_var]] <- rev(transform_crop_domain[[transform_var]]) + } + + } + + # Transform the variables + tmp <- .withWarnings( + do.call(transform, c(list(data_array = NULL, + variables = vars_to_transform, + file_selectors = selectors_of_first_files_with_data[[i]], + crop_domain = transform_crop_domain), + transform_params)) + ) + transformed_data <- tmp$value + warnings1 <- c(warnings1, tmp$warnings) + + # Discard the common transformed variables if already transformed before + if (!is.null(transformed_common_vars)) { + common_ones <- which(names(picked_common_vars) %in% names(transformed_data$variables)) + if (length(common_ones) > 0) { + transformed_data$variables <- transformed_data$variables[-common_ones] + } + } + transformed_vars[[i]] <- list() + transformed_vars_ordered[[i]] <- list() + transformed_vars_unorder_indices[[i]] <- list() + # Order the transformed variables if needed + # 'var_to_read' should be 'transformed_var', but is kept to reuse the same code as above. + for (var_to_read in names(transformed_data$variables)) { + if (var_to_read %in% unlist(var_params)) { + associated_dim_name <- names(var_params)[which(unlist(var_params) == var_to_read)] + if ((associated_dim_name %in% names(dim_reorder_params))) { + ## Is this check really needed? 
+ if (length(dim(transformed_data$variables[[associated_dim_name]])) > 1) { + stop("Requested a '", associated_dim_name, "_reorder' for a dimension ", + "whose coordinate variable that has more than 1 dimension (after ", + "transform). This is not supported.") + } + ordered_var_values <- dim_reorder_params[[associated_dim_name]](transformed_data$variables[[associated_dim_name]]) + attr(ordered_var_values, 'variables') <- attr(transformed_data$variables[[associated_dim_name]], 'variables') + if (!all(c('x', 'ix') %in% names(ordered_var_values))) { + stop("All the dimension reorder functions must return a list with the components 'x' and 'ix'.") + } + # Save the indices to reorder back the ordered variable values. + # This will be used to define the first round indices (if aiat) or second round + # indices (if !aiat). + unorder <- sort(ordered_var_values$ix, index.return = TRUE)$ix + if (var_to_read %in% names(picked_common_vars)) { + transformed_common_vars_ordered[[var_to_read]] <- ordered_var_values$x + transformed_common_vars_unorder_indices[[var_to_read]] <- unorder + } else { + transformed_vars_ordered[[i]][[var_to_read]] <- ordered_var_values$x + transformed_vars_unorder_indices[[i]][[var_to_read]] <- unorder + } + } + } + } + transformed_picked_vars_names <- which(names(picked_vars[[i]]) %in% names(transformed_data$variables)) + if (length(transformed_picked_vars_names) > 0) { + transformed_picked_vars_names <- names(picked_vars[[i]])[transformed_picked_vars_names] + transformed_vars[[i]][transformed_picked_vars_names] <- transformed_data$variables[transformed_picked_vars_names] + } + if (is.null(transformed_common_vars)) { + transformed_picked_common_vars_names <- which(names(picked_common_vars) %in% names(transformed_data$variables)) + if (length(transformed_picked_common_vars_names) > 0) { + transformed_picked_common_vars_names <- names(picked_common_vars)[transformed_picked_common_vars_names] + transformed_common_vars <- transformed_data$variables[transformed_picked_common_vars_names] + } + } + } + # Once the variables are transformed, we compute the indices to be + # taken for each inner dimension. + # In all cases, indices will have to be computed to know which data + # values to take from the original data for each dimension (if a + # variable is specified for that dimension, it will be used to + # convert the provided selectors into indices). These indices are + # referred to as 'first round of indices'. + # The taken data will then be transformed if needed, together with + # the dimension variable if specified, and, in that case, indices + # will have to be computed again to know which values to take from the + # transformed data. These are the 'second round of indices'. In the + # case there is no transformation, the second round of indices will + # be all the available indices, i.e. from 1 to the number of taken + # values with the first round of indices. 
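+      # (Toy example of the two rounds, assumed numbers: if a file holds 100 latitudes
+      # and the request matches indices 40:60, the first round takes those 21 values;
+      # after interpolation to a coarser grid, the second round locates the requested
+      # values inside the transformed axis. Without transform, it is simply 1:21.)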
+ for (inner_dim in expected_inner_dims[[i]]) { + if (debug) { + print("-> DEFINING INDICES FOR INNER DIMENSION:") + print(inner_dim) + } + crossed_file_dim <- NULL + if (inner_dim %in% unlist(inner_dims_across_files)) { + crossed_file_dim <- names(inner_dims_across_files)[which(sapply(inner_dims_across_files, function(x) inner_dim %in% x))[1]] + chunk_amount <- length(dat[[i]][['selectors']][[crossed_file_dim]][[1]]) + names(chunk_amount) <- crossed_file_dim + } else if (!is.null(names(dim(dat[[i]][['selectors']][[inner_dim]][[1]]))) & + inner_dim %in% names(dim(dat[[i]][['selectors']][[inner_dim]][[1]])) & + any(found_file_dims[[i]] %in% names(dim(dat[[i]][['selectors']][[inner_dim]][[1]])))) { + # inner dim is dependent on file dim in the form of selector array (e.g., time = [sdate = 2, time = 4]) + crossed_file_dim <- found_file_dims[[i]][which(found_file_dims[[i]] %in% + names(dim(dat[[i]][['selectors']][[inner_dim]][[1]])))] + if (length(crossed_file_dim) == 1) { + chunk_amount <- length(dat[[i]][['selectors']][[crossed_file_dim]][[1]]) + names(chunk_amount) <- crossed_file_dim + } else { + # e.g., region = [memb = 2, sdate = 3, region = 1] + chunk_amount <- prod( + sapply(lapply( + dat[[i]][['selectors']][crossed_file_dim], "[[", 1), length)) + names(chunk_amount) <- paste(crossed_file_dim, collapse = '.') + } + } else { + chunk_amount <- 1 + } + # In the special case that the selectors for a dimension are 'all', 'first', ... + # and chunking (dividing in more than 1 chunk) is requested, the selectors are + # replaced for equivalent indices. + if ((any(dat[[i]][['selectors']][[inner_dim]][[1]] %in% c('all', 'first', 'last'))) && + (chunks[[inner_dim]]['n_chunks'] != 1)) { + dat[[i]][['selectors']][[inner_dim]][[1]] <- + replace_character_with_indices(selectors = dat[[i]][['selectors']][[inner_dim]][[1]], data_dims = data_dims[[inner_dim]], chunk_amount) + } + + # The selectors for the inner dimension are taken. + selector_array <- dat[[i]][['selectors']][[inner_dim]][[1]] + if (debug) { + if (inner_dim %in% dims_to_check) { + print(paste0("-> DEBUG MESSAGES FOR THE DATASET", i, " AND INNER DIMENSION '", inner_dim, "':")) + print("-> STRUCTURE OF SELECTOR ARRAY:") + print(str(selector_array)) + print("-> PICKED VARS:") + print(picked_vars) + print("-> TRANSFORMED VARS:") + print(transformed_vars) + } + } + if (is.null(dim(selector_array))) { + dim(selector_array) <- length(selector_array) + } + if (is.null(names(dim(selector_array)))) { + if (length(dim(selector_array)) == 1) { + names(dim(selector_array)) <- inner_dim + } else { + stop("Provided selector arrays must be provided with dimension ", + "names. Found an array of selectors without dimension names ", + "for the dimension '", inner_dim, "'.") + } + } + selectors_are_indices <- FALSE + if (!is.null(attr(selector_array, 'indices'))) { + if (!is.logical(attr(selector_array, 'indices'))) { + stop("The atribute 'indices' for the selectors for the dimension '", + inner_dim, "' must be TRUE or FALSE.") + } + selectors_are_indices <- attr(selector_array, 'indices') + } + taken_chunks <- rep(FALSE, chunk_amount) + selector_file_dims <- 1 + + #NOTE: Change 'selector_file_dims' (from 1) if selector is an array with a file_dim dimname. 
+        # I.e., If time = [sdate = 2, time = 4], selector_file_dims <- c(sdate = 2)
+        if (any(found_file_dims[[i]] %in% names(dim(selector_array)))) {
+          selector_file_dims <- dim(selector_array)[which(names(dim(selector_array)) %in% found_file_dims[[i]])]
+        }
+
+        selector_inner_dims <- dim(selector_array)[which(!(names(dim(selector_array)) %in% found_file_dims[[i]]))]
+        var_with_selectors <- NULL
+        var_with_selectors_name <- var_params[[inner_dim]]
+        var_ordered <- NULL
+        var_unorder_indices <- NULL
+        with_transform <- FALSE
+        #////////////////////////////////////////////////////////////////////
+        # If the selectors come with an associated variable
+        if (!is.null(var_with_selectors_name)) {
+          if ((var_with_selectors_name %in% transform_vars) && (!is.null(transform))) {
+            with_transform <- TRUE
+            if (!is.null(crossed_file_dim)) {
+              stop("Requested a transformation over the dimension '",
+                   inner_dim, "', which goes across files. This feature ",
+                   "is not supported. Either do the request without the ",
+                   "transformation or request it over dimensions that do ",
+                   "not go across files.")
+            }
+          }
+          if (debug) {
+            if (inner_dim %in% dims_to_check) {
+              print("-> NAME OF THE VARIABLE WITH THE SELECTOR VALUES FOR THE CURRENT INNER DIMENSION:")
+              print(var_with_selectors_name)
+              print("-> NAMES OF THE VARIABLES TO BE TRANSFORMED:")
+              print(transform_vars)
+              print("-> STRUCTURE OF THE TRANSFORMATION FUNCTION:")
+              print(str(transform))
+            }
+          }
+          # For fri (first round indices)
+          if (var_with_selectors_name %in% names(picked_vars[[i]])) {
+            var_with_selectors <- picked_vars[[i]][[var_with_selectors_name]]
+            var_ordered <- picked_vars_ordered[[i]][[var_with_selectors_name]]
+            var_unorder_indices <- picked_vars_unorder_indices[[i]][[var_with_selectors_name]]
+          } else if (var_with_selectors_name %in% names(picked_common_vars)) {
+            var_with_selectors <- picked_common_vars[[var_with_selectors_name]]
+            var_ordered <- picked_common_vars_ordered[[var_with_selectors_name]]
+            var_unorder_indices <- picked_common_vars_unorder_indices[[var_with_selectors_name]]
+          }
+          n <- prod(dim(var_with_selectors))
+          # if no _reorder, var_unorder_indices is NULL
+          if (is.null(var_unorder_indices)) {
+            var_unorder_indices <- 1:n
+          }
+          # For sri (second round indices, used with transform)
+          if (with_transform) {
+            ## var in 'dat'
+            if (var_with_selectors_name %in% names(transformed_vars[[i]])) {
+              m <- prod(dim(transformed_vars[[i]][[var_with_selectors_name]]))
+              if (aiat) {
+                var_with_selectors <- transformed_vars[[i]][[var_with_selectors_name]]
+                var_ordered <- transformed_vars_ordered[[i]][[var_with_selectors_name]]
+                var_unorder_indices <- transformed_vars_unorder_indices[[i]][[var_with_selectors_name]]
+              }
+              # For making sri ordered later
+              transformed_var_unordered_indices <- transformed_vars_unorder_indices[[i]][[var_with_selectors_name]]
+              if (is.null(transformed_var_unordered_indices)) {
+                transformed_var_unordered_indices <- 1:m
+              }
+              transformed_var_with_selectors <- transformed_vars[[i]][transformed_picked_vars_names][[var_with_selectors_name]][transformed_var_unordered_indices]
+              # Sorting the transformed variable and working out the indices again after transform. 
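+            # (E.g., with lon_reorder = CircularSort(0, 360), the call below returns $x,
+            # the sorted values, and $ix, the permutation applied; $ix is kept to restore
+            # the original order later.)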
+ if (!is.null(dim_reorder_params[[var_with_selectors_name]])) { + transformed_var_with_selectors_reorder <- dim_reorder_params[[var_with_selectors_name]](transformed_var_with_selectors) + transformed_var_with_selectors <- transformed_var_with_selectors_reorder$x + transformed_var_with_selectors_unorder <- transformed_var_with_selectors_reorder$ix + } else { + transformed_var_with_selectors_unorder <- 1:length(transformed_var_with_selectors) + } + + ## var in common + } else if (var_with_selectors_name %in% names(transformed_common_vars)) { + m <- prod(dim(transformed_common_vars[[var_with_selectors_name]])) + if (aiat) { + var_with_selectors <- transformed_common_vars[[var_with_selectors_name]] + var_ordered <- transformed_common_vars_ordered[[var_with_selectors_name]] + var_unorder_indices <- transformed_common_vars_unorder_indices[[var_with_selectors_name]] + } + # For making sri ordered later + transformed_var_unordered_indices <- transformed_common_vars_unorder_indices[[var_with_selectors_name]] + if (is.null(transformed_var_unordered_indices)) { + transformed_var_unordered_indices <- 1:m + } + transformed_var_with_selectors <- transformed_common_vars[[var_with_selectors_name]][transformed_var_unordered_indices] + # Sorting the transformed variable and working out the indices again after transform. + if (!is.null(dim_reorder_params[[var_with_selectors_name]])) { + transformed_var_with_selectors_reorder <- dim_reorder_params[[var_with_selectors_name]](transformed_var_with_selectors) + transformed_var_with_selectors <- transformed_var_with_selectors_reorder$x + transformed_var_with_selectors_unorder <- transformed_var_with_selectors_reorder$ix + } else { + transformed_var_with_selectors_unorder <- 1:length(transformed_var_with_selectors) + } + } + if (is.null(var_unorder_indices)) { + var_unorder_indices <- 1:m + } + } + + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> SIZE OF ORIGINAL VARIABLE:") + print(n) + print("-> SIZE OF TRANSFORMED VARIABLE:") + if (with_transform) print(m) + print("-> STRUCTURE OF ORDERED VAR:") + print(str(var_ordered)) + print("-> UNORDER INDICES:") + print(var_unorder_indices) + } + } + var_dims <- var_full_dims <- dim(var_with_selectors) + var_file_dims <- 1 + + # If this inner dim's selector (var_with_selectors) is an array + # that has file dim as dimension (e.g., across or depend relationship) + if (any(names(var_dims) %in% found_file_dims[[i]])) { + if (with_transform) { + stop("Requested transformation for inner dimension '", + inner_dim, "' but provided selectors for such dimension ", + "over one or more file dimensions. This is not ", + "supported. Either request no transformation for the ", + "dimension '", inner_dim, "' or specify the ", + "selectors for this dimension without the file dimensions.") + } + var_file_dims <- var_dims[which(names(var_dims) %in% found_file_dims[[i]])] + var_dims <- var_dims[-which(names(var_dims) %in% found_file_dims[[i]])] + } + ## # Keep the selectors if they correspond to a variable that will be transformed. 
+ ## if (with_transform) {
+ ## if (var_with_selectors_name %in% names(picked_vars[[i]])) {
+ ## transformed_var_with_selectors <- transformed_vars[[i]][[var_with_selectors_name]]
+ ## } else if (var_with_selectors_name %in% names(picked_common_vars)) {
+ ## transformed_var_with_selectors <- transformed_common_vars[[var_with_selectors_name]]
+ ## }
+ ## transformed_var_dims <- dim(transformed_var_with_selectors)
+ ## transformed_var_file_dims <- 1
+ ## if (any(names(transformed_var_dims) %in% found_file_dims[[i]])) {
+ ## transformed_var_file_dims <- transformed_var_dims[which(names(transformed_var_dims) %in% found_file_dims[[i]])]
+ ## transformed_var_dims <- transformed_var_dims[-which(names(transformed_var_dims) %in% found_file_dims[[i]])]
+ ## }
+ ##if (inner_dim %in% dims_to_check) {
+ ##print("111m")
+ ##print(str(transformed_var_dims))
+ ##}
+ ##
+ ## m <- prod(transformed_var_dims)
+ ## }
+ # Work out var file dims and inner dims.
+ if (inner_dim %in% unlist(inner_dims_across_files)) {
+ #TODO: if (chunk_amount != number of chunks in selector_file_dims), crash
+ if (length(var_dims) > 1) {
+ stop("Specified a '", inner_dim, "_var' for the dimension '",
+ inner_dim, "', which goes across files (across '", crossed_file_dim,
+ "'). The specified variable, '", var_with_selectors_name, "', has more ",
+ "than one dimension and cannot be used as a selector variable. ",
+ "Select another variable or fix it in the files.")
+ }
+ }
+ ## TODO HERE::
+ #- indices_of_first_files_with_data may change, because array is now extended
+ var_full_dims <- dim(var_with_selectors)
+ } else if (((is.numeric(selector_array) || is.list(selector_array)) && selectors_are_indices) ||
+ (is.character(selector_array) && (length(selector_array) == 1) &&
+ (selector_array %in% c('all', 'first', 'last')) &&
+ !is.null(file_dim_reader))) {
+ #### TODO HERE::
+ ###- indices_of_first_files_with_data may change, because array is now extended
+ # Lines moved above for better performance.
+ ##data_dims <- file_dim_reader(file_path, NULL, selectors_of_first_files_with_data[[i]],
+ ## lapply(dat[[i]][['selectors']][expected_inner_dims[[i]]], '[[', 1))
+ } else {
+ stop(paste0("Cannot translate the provided selectors for '", inner_dim,
+ "' to numeric indices. Provide numeric indices and a ",
+ "'file_dim_reader' function, or a '", inner_dim,
+ "_var' in order to calculate the indices."))
+ }
+ # At this point, if no selector variable was provided, the variable
+ # data_dims has been populated. If a selector variable was provided,
+ # the variables var_dims, var_file_dims and var_full_dims have been
+ # populated instead.
+ #////////////////////////////////////////////////////////////////////
+
+ # If the inner dim lengths differ among files,
+ # need to know each length to create the indices for each file later.
+ # Record 'inner_dim_lengths' here for later usage.
+ inner_dim_lengths <- NULL
+ if (largest_dims_length & !is.null(crossed_file_dim)) {
+ # inner_dim_lengths here includes all the files, but we only want
+ # the files of fyear for certain "sdate". We will categorize it later.
+ inner_dim_lengths <- tryCatch({
+ sapply(data_dims_each_file, '[[', inner_dim)
+ }, error = function(x) {
+ sapply(data_dims_each_file, '[[',
+ synonims[[inner_dim]][which(synonims[[inner_dim]] != inner_dim)])
+ })
+
+ # Use other file dims as the factors to categorize.
+ other_file_dims <- dim(array_of_files_to_load)[which(!found_file_dims[[i]] %in% crossed_file_dim)]
+ other_file_dims <- lapply(lapply(other_file_dims, seq, 1), rev)
+ other_file_dims_factor <- expand.grid(other_file_dims)
+ selector_indices_save_subset <-
+ lapply(selector_indices_save[[i]], '[', which(!found_file_dims[[i]] %in% crossed_file_dim))
+
+ # Put the fyears with the same other file dims (sdate, etc.) together, and find the largest length (in theory all of them should be the same)
+ inner_dim_lengths_cat <- vector('list', dim(other_file_dims_factor)[1])
+ for (i_factor in 1:length(inner_dim_lengths_cat)) {
+
+ inner_dim_lengths_cat[[i_factor]] <-
+ inner_dim_lengths[which(sapply(lapply(
+ selector_indices_save_subset, '==',
+ other_file_dims_factor[i_factor, ]), all))]
+ }
+ # Find the largest length of each time step
+ inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat)
+ }
+
+ fri <- first_round_indices <- NULL
+ sri <- second_round_indices <- NULL
+ # This variable will keep the indices needed to crop the transformed
+ # variable (the one that has been transformed without being subset
+ # with the first round indices).
+ tvi <- transformed_variable_indices <- NULL
+ ordered_fri <- NULL
+ ordered_sri <- NULL
+ if ((length(selector_array) == 1) && is.character(selector_array) &&
+ (selector_array %in% c('all', 'first', 'last')) &&
+ (chunks[[inner_dim]]['n_chunks'] == 1)) {
+ if (is.null(var_with_selectors_name)) {
+ fri <- vector('list', length = chunk_amount)
+ dim(fri) <- c(chunk_amount)
+ sri <- vector('list', length = chunk_amount)
+ dim(sri) <- c(chunk_amount)
+ if (selector_array == 'all') {
+ if (is.null(inner_dim_lengths) | length(unique(inner_dim_lengths)) <= 1) { #old code
+ fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim])))
+ } else { # files have different inner dim length
+ for (i_chunk in 1:length(fri)) {
+ fri[[i_chunk]] <- 1:inner_dim_lengths[i_chunk]
+ }
+ }
+ taken_chunks <- rep(TRUE, chunk_amount)
+ #sri <- NULL
+ } else if (selector_array == 'first') {
+ fri[[1]] <- 1
+ taken_chunks[1] <- TRUE
+ #sri <- NULL
+ } else if (selector_array == 'last') {
+ fri[[chunk_amount]] <- data_dims[inner_dim]
+ taken_chunks[length(taken_chunks)] <- TRUE
+ #sri <- NULL
+ }
+ } else {
+ if (!is.null(crossed_file_dim) & any(!(crossed_file_dim %in% names(var_file_dims)))) {
+ stop("The variable '", var_with_selectors_name, "' must also be ",
+ "requested for the file dimension '", crossed_file_dim, "' in ",
+ "this configuration.")
+ }
+ fri <- vector('list', length = prod(var_file_dims))
+ dim(fri) <- var_file_dims
+ ordered_fri <- fri
+ sri <- vector('list', length = prod(var_file_dims))
+ dim(sri) <- var_file_dims
+ ordered_sri <- sri
+ if (selector_array == 'all') {
+ # TODO: Populate ordered_fri
+ ordered_fri[] <- replicate(prod(var_file_dims), list(1:n))
+ fri[] <- replicate(prod(var_file_dims), list(var_unorder_indices[1:n]))
+ taken_chunks <- rep(TRUE, chunk_amount)
+ if (!with_transform) {
+ #fri[] <- replicate(prod(var_file_dims), list(1:n))
+ #taken_chunks <- rep(TRUE, chunk_amount)
+ #sri <- NULL
+ } else {
+ ordered_sri[] <- replicate(prod(var_file_dims), list(1:m))
+ if (inner_dim %in% names(dim_reorder_params)) {
+ sri[] <- replicate(prod(var_file_dims), list(transformed_var_unordered_indices[1:m]))
+ } else {
+ sri[] <- replicate(prod(var_file_dims), list(1:m))
+ }
+ ## var_file_dims instead??
+ #if (!aiat) { + #fri[] <- replicate(prod(var_file_dims), list(1:n)) + #taken_chunks <- rep(TRUE, chunk_amount) + #sri[] <- replicate(prod(transformed_var_file_dims), list(1:m)) + #} else { + #fri[] <- replicate(prod(var_file_dims), list(1:n)) + #taken_chunks <- rep(TRUE, chunk_amount) + #sri[] <- replicate(prod(transformed_var_file_dims), list(1:m)) + #} + tvi <- 1:m + } + } else if (selector_array == 'first') { + taken_chunks[1] <- TRUE + if (!with_transform) { + ordered_fri[[1]] <- 1 + fri[[1]] <- var_unorder_indices[1] + #taken_chunks[1] <- TRUE + #sri <- NULL + } else { + if (!aiat) { + ordered_fri[[1]] <- 1 + fri[[1]] <- var_unorder_indices[1] + # TODO: TO BE IMPROVED + #taken_chunks[1] <- TRUE + ordered_sri[[1]] <- 1:ceiling(m / n) + sri[[1]] <- 1:ceiling(m / n) + tvi <- 1:ceiling(m / n) + } else { + ordered_fri[[1]] <- 1:ceiling(m / n) + fri[[1]] <- var_unorder_indices[1:ceiling(m / n)] + #taken_chunks[1] <- TRUE + ordered_sri[[1]] <- 1 + sri[[1]] <- 1 + tvi <- 1 + } + } + } else if (selector_array == 'last') { + taken_chunks[length(taken_chunks)] <- TRUE + if (!with_transform) { + ordered_fri[[prod(var_file_dims)]] <- n + fri[[prod(var_file_dims)]] <- var_unorder_indices[n] + #taken_chunks[length(taken_chunks)] <- TRUE + #sri <- NULL + } else { + if (!aiat) { + ordered_fri[[prod(var_file_dims)]] <- prod(var_dims) + fri[[prod(var_file_dims)]] <- var_unorder_indices[prod(var_dims)] + #taken_chunks[length(taken_chunks)] <- TRUE + ordered_sri[[prod(var_file_dims)]] <- 1:ceiling(m / n) + sri[[prod(var_file_dims)]] <- 1:ceiling(m / n) + # TODO: TO BE IMPROVED. THE TVI MAY BE WRONG IF THERE'S BEEN A REORDERING. + tvi <- 1:ceiling(m / n) + } else { + ordered_fri[[prod(var_file_dims)]] <- (n - ceiling(m / n) + 1):n + fri[[prod(var_file_dims)]] <- var_unorder_indices[(n - ceiling(m / n) + 1):n] + #taken_chunks[length(taken_chunks)] <- TRUE + ordered_sri[[prod(var_file_dims)]] <- 1 + sri[[prod(var_file_dims)]] <- 1 + tvi <- 1 + } + } + } + } + # If the selectors are not 'all', 'first', 'last', ... 
+ } else { + if (!is.null(var_with_selectors_name)) { + unmatching_file_dims <- which(!(names(var_file_dims) %in% names(selector_file_dims))) + if ((length(unmatching_file_dims) > 0)) { + raise_error <- FALSE + if (is.null(crossed_file_dim)) { + raise_error <- TRUE + } else { + if (!(length(unmatching_file_dims) == 1 & + names(var_file_dims)[unmatching_file_dims] %in% crossed_file_dim & + inner_dim %in% names(selector_inner_dims))) { + raise_error <- TRUE + } + } + if (raise_error) { + stop("Provided selectors for the dimension '", inner_dim, "' must have as many ", + "file dimensions as the variable the dimension is defined along, '", + var_with_selectors_name, "', with the exceptions of the file pattern dimension ('", + found_pattern_dim, "') and any depended file dimension (if specified as ", + "depended dimension in parameter 'inner_dims_across_files' and the ", + "depending file dimension is present in the provided selector array).") + } + } + if (any(names(selector_file_dims) %in% names(dim(var_with_selectors)))) { + if (any(dim(var_with_selectors)[names(selector_file_dims)] != selector_file_dims)) { + stop("Size of selector file dimensions must match size of the corresponding ", + "variable dimensions.") + } + } + } + ## TODO: If var dimensions are not in the same order as selector dimensions, reorder + if (is.null(names(selector_file_dims))) { + if (is.null(crossed_file_dim)) { + fri_dims <- 1 + } else { + fri_dims <- chunk_amount + names(fri_dims) <- crossed_file_dim + } + } else { + fri_dim_names <- names(selector_file_dims) + if (!is.null(crossed_file_dim)) { + fri_dim_names <- c(fri_dim_names, crossed_file_dim) + } + fri_dim_names <- found_file_dims[[i]][which(found_file_dims[[i]] %in% fri_dim_names)] + fri_dims <- rep(NA, length(fri_dim_names)) + names(fri_dims) <- fri_dim_names + fri_dims[names(selector_file_dims)] <- selector_file_dims + #NOTE: Not sure how it works here, but "chunk_amount" is the same as + # "selector_file_dims" above in the cases we've seen so far, + # and it causes problem when crossed_file_dim is more than one. +# if (!is.null(crossed_file_dim)) { +# fri_dims[crossed_file_dim] <- chunk_amount +# } + } + fri <- vector('list', length = prod(fri_dims)) + dim(fri) <- fri_dims + sri <- vector('list', length = prod(fri_dims)) + dim(sri) <- fri_dims + selector_file_dim_array <- array(1:prod(selector_file_dims), dim = selector_file_dims) + selector_store_position <- fri_dims + for (j in 1:prod(dim(selector_file_dim_array))) { + selector_indices_to_take <- which(selector_file_dim_array == j, arr.ind = TRUE)[1, ] + names(selector_indices_to_take) <- names(selector_file_dims) + selector_store_position[names(selector_indices_to_take)] <- selector_indices_to_take + # "selector_indices_to_take" is an array if "selector_file_dims" is not 1 (if + # selector is an array with a file_dim dimname, i.e., time = [sdate = 2, time = 4]. 
+ if (!is.null(names(selector_indices_to_take))) { + sub_array_of_selectors <- Subset(selector_array, names(selector_indices_to_take), + as.list(selector_indices_to_take), drop = 'selected') + } else { + sub_array_of_selectors <- selector_array + } + + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> ITERATING OVER FILE DIMENSIONS OF THE SELECTORS.") + print("-> STRUCTURE OF A SUB ARRAY:") + print(str(sub_array_of_selectors)) + print("-> STRUCTURE OF THE VARIABLE WITH SELECTORS:") + print(str(var_with_selectors)) + print(dim(var_with_selectors)) + } + } + if (selectors_are_indices) { + sub_array_of_values <- NULL + #} else if (!is.null(var_ordered)) { + # sub_array_of_values <- var_ordered + } else { + if (length(names(var_file_dims)) > 0) { + var_indices_to_take <- selector_indices_to_take[which(names(selector_indices_to_take) %in% names(var_file_dims))] + if (!is.null(names(var_indices_to_take))) { + sub_array_of_values <- Subset(var_with_selectors, names(var_indices_to_take), + as.list(var_indices_to_take), drop = 'selected') + } else { + # time across some file dim (e.g., "file_date") but doesn't have + # this file dim as dimension (e.g., time: [sdate, time]) + sub_array_of_values <- var_with_selectors + } + } else { + sub_array_of_values <- var_with_selectors + } + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> STRUCTURE OF THE SUB ARRAY FROM THE VARIABLE CORRESPONDING TO THE SUB ARRAY OF SELECTORS") + print(str(sub_array_of_values)) + print(dim(sub_array_of_values)) + print("-> NAME OF THE FILE DIMENSION THE CURRENT INNER DIMENSION EXTENDS ALONG:") + print(crossed_file_dim) + } + } + + # The inner dim selector is an array in which have file dim (e.g., time = [sdate = 2, time = 4], + # or the inner dim doesn't go across any file dim (e.g., no time_across = 'sdate') + if ((!is.null(crossed_file_dim) & (any(crossed_file_dim %in% names(selector_file_dims)))) || is.null(crossed_file_dim)) { + if (length(sub_array_of_selectors) > 0) { + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> THE INNER DIMENSION DOES NOT GO ACROSS ANY FILE DIMENSION OR IT DOES BUT IS IN THE PROVIDED SELECTOR ARRAY.") + } + } + if (selectors_are_indices) { + if (!is.null(var_with_selectors_name)) { + max_allowed <- ifelse(aiat, m, n) + } else { + max_allowed <- data_dims[inner_dim] + } + if (any(na.omit(unlist(sub_array_of_selectors)) > max_allowed) || + any(na.omit(unlist(sub_array_of_selectors)) < 1)) { + stop("Provided indices out of range for dimension '", inner_dim, "' ", + "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", + max_allowed, ").") + } + } + + # The selector_checker will return either a vector of indices or a list + # with the first and last desired indices. + #NOTE: goes_across_prime_meridian may be TRUE only if the selector is list of values + goes_across_prime_meridian <- FALSE + is_circular_dim <- FALSE + # If selectors are indices and _reorder = CircularSort() is used, change + # is_circular_dim to TRUE. + if (!is.null(var_ordered) & selectors_are_indices & + !is.null(dim_reorder_params[[inner_dim]])) { + if ('circular' %in% names(attributes(dim_reorder_params[[inner_dim]]))) { + is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular") + if (is_circular_dim & is.list(sub_array_of_selectors)) { + tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix + goes_across_prime_meridian <- tmp[1] > tmp[2] + } + } + } + + # If selectors are values and _reorder is defined. 
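+ # (Editor's illustration, not from the original code: each <dim>_reorder
+ # entry is assumed to be a function returning list(x = reordered values,
+ # ix = ordering indices), as used below. A minimal Sort-like sketch:)
+ # sort_reorder <- function(v) { ix <- order(v); list(x = v[ix], ix = ix) }
+ # sort_reorder(c(30, 10, 20)) # $x = c(10, 20, 30), $ix = c(2, 3, 1)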
+ if (!is.null(var_ordered) && !selectors_are_indices) {
+ if (!is.null(dim_reorder_params[[inner_dim]])) {
+ if ('circular' %in% names(attributes(dim_reorder_params[[inner_dim]]))) {
+ is_circular_dim <- attr(dim_reorder_params[[inner_dim]], "circular")
+ }
+ if (is.list(sub_array_of_selectors)) {
+ ## NOTE: The check of 'goes_across_prime_meridian' is moved forward to here.
+ if (is_circular_dim) {
+ # NOTE: Use CircularSort() to put the values in the assigned range, and get the order.
+ # For example, [-10, 20] in CircularSort(0, 360) is [350, 20]. The $ix list is [2, 1].
+ # 'goes_across_prime_meridian' means the selector range crosses the border. For example,
+ # CircularSort(-180, 180) with selector [170, 190] -> goes_across_prime_meridian = TRUE.
+ # dim_reorder_params is a list of reorder functions, i.e.,
+ # Sort() or CircularSort().
+ tmp <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$ix
+ goes_across_prime_meridian <- tmp[1] > tmp[2]
+ }
+
+ #NOTE: HERE change to the same code as below (under 'else'). Not sure why originally
+ # it used additional lines, which made reorder not work.
+ # If "_reorder" is used, here 'sub_array_of_selectors' is adjusted to
+ # follow the reorder rule. E.g., if lat = values(list(-90, 90)) and
+ # lat_reorder = Sort(decreasing = T), 'sub_array_of_selectors' changes
+ # from list(-90, 90) to list(90, -90).
+ sub_array_of_selectors <- as.list(dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))$x)
+ #sub_array_reordered <- dim_reorder_params[[inner_dim]](unlist(sub_array_of_selectors))
+ #sub_array_unorder <- sort(sub_array_reordered$ix, index.return = TRUE)$ix
+ #sub_array_of_selectors <- as.list(sub_array_reordered$x[sub_array_unorder])
+
+ # Add warning if the boundary is out of range
+ if (min(unlist(sub_array_of_selectors)) < range(var_ordered)[1]) {
+ show_out_of_range_warning(inner_dim, range = range(var_ordered),
+ bound = 'lower')
+ }
+ if (max(unlist(sub_array_of_selectors)) > range(var_ordered)[2]) {
+ show_out_of_range_warning(inner_dim, range = range(var_ordered), bound = 'upper')
+ }
+
+
+ } else {
+ sub_array_of_selectors <- dim_reorder_params[[inner_dim]](sub_array_of_selectors)$x
+ }
+ }
+
+ # NOTE: The ideal solution for selecting indices in the goes_across_prime_meridian case
+ # is to modify SelectorChecker.R. But for now SelectorChecker() doesn't know about
+ # goes_across_prime_meridian, so the adjustment is done after calling SelectorChecker().
+ sub_array_of_indices <- selector_checker(sub_array_of_selectors, var_ordered,
+ tolerance = if (aiat) {
+ NULL
+ } else {
+ tolerance_params[[inner_dim]]
+ })
+
+ if (goes_across_prime_meridian & sub_array_of_indices[[1]] < sub_array_of_indices[[2]]) {
+ if (!(sub_array_of_selectors[[1]] %in% var_ordered)) {
+ sub_array_of_indices[[1]] <- sub_array_of_indices[[1]] - 1
+ }
+
+ if (!(sub_array_of_selectors[[2]] %in% var_ordered)) {
+ sub_array_of_indices[[2]] <- sub_array_of_indices[[2]] + 1
+ }
+ }
+
+ #NOTE: the possible case?
+ if (goes_across_prime_meridian & sub_array_of_indices[[1]] > sub_array_of_indices[[2]]) {
+ stop("The case is goes_across_prime_meridian but no adjustment was made to the indices!")
+ }
+
+ if (any(is.na(sub_array_of_indices))) {
+
+ stop(paste0("The selectors of ", inner_dim,
+ " are out of range [", min(var_ordered),
+ ", ", max(var_ordered), "]."))
+ }
+
+ } else {
+
+ # Add warning if the boundary is out of range
+ if (is.list(sub_array_of_selectors) & !selectors_are_indices) {
+ if (min(unlist(sub_array_of_selectors)) < min(sub_array_of_values)) {
+ show_out_of_range_warning(inner_dim, range = range(sub_array_of_values),
+ bound = 'lower')
+ }
+ if (max(unlist(sub_array_of_selectors)) > max(sub_array_of_values)) {
+ show_out_of_range_warning(inner_dim, range = range(sub_array_of_values),
+ bound = 'upper')
+ }
+ }
+
+ # sub_array_of_values here is NULL if selectors are indices, and
+ # 'sub_array_of_indices' will be sub_array_of_selectors, i.e., the indices
+ # assigned (but rounded).
+ sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values,
+ tolerance = if (aiat) {
+ NULL
+ } else {
+ tolerance_params[[inner_dim]]
+ })
+
+ if (any(is.na(sub_array_of_indices))) {
+
+ stop(paste0("The selectors of ", inner_dim,
+ " are out of range [", min(sub_array_of_values),
+ ", ", max(sub_array_of_values), "]."))
+ }
+
+ }
+
+ #////////////////////////////////////////////////////////////
+ # If chunking along this inner dim, this part creates the indices for each chunk.
+ # For example, if 'sub_array_of_indices' is c(5:10) and chunked into 2,
+ # 'sub_array_of_indices' becomes c(5:7) for chunk = 1, c(8:10) for chunk = 2.
+ # If 'sub_array_of_indices' is list(55, 62) and chunked into 2, it becomes
+ # list(55, 58) for chunk = 1 and list(59, 62) for chunk = 2.
+ #TODO: Could the list be turned into a vector here? Then afterwards there would
+ # be no need to check whether it is a list or a vector.
+ #NOTE: chunking cannot be done if goes_across_prime_meridian = TRUE.
+ #TODO: Change the algorithm to make chunking work for goes_across_prime_meridian = TRUE.
+ # If goes_across_prime_meridian = TRUE, "sub_array_of_indices" is not a
+ # continuous sequence. For example, list(37, 1243) means the sub_array_of_fri
+ # that will be generated from sub_array_of_indices later is c(1:37, 1243:1296).
+ # The longitudes are separated into 2 parts and therefore cannot be chunked here.
+ if (chunks[[inner_dim]]["n_chunks"] > 1) {
+ if (goes_across_prime_meridian) {
+ stop(paste0("Chunking over ", inner_dim, " that goes across the circular border assigned by '", inner_dim, "_reorder' is not currently supported by startR. Adjust the ", inner_dim, " selector to be within the border or change the borders."))
+ }
+ if (!is.list(sub_array_of_indices)) {
+ sub_array_of_indices <-
+ sub_array_of_indices[get_chunk_indices(length(sub_array_of_indices),
+ chunks[[inner_dim]]["chunk"],
+ chunks[[inner_dim]]["n_chunks"],
+ inner_dim)]
+ } else {
+ tmp <-
+ get_chunk_indices(length(sub_array_of_indices[[1]]:sub_array_of_indices[[2]]),
+ chunks[[inner_dim]]["chunk"], chunks[[inner_dim]]["n_chunks"],
+ inner_dim)
+ vect <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]]
+ sub_array_of_indices[[1]] <- vect[tmp[1]]
+ sub_array_of_indices[[2]] <- vect[tmp[length(tmp)]]
+ }
+ }
+ # The sub_array_of_indices now contains numeric indices of the values to be taken by each chunk.
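+ # (Editor's sketch of the chunking rule above, using a hypothetical helper
+ # equivalent in spirit to get_chunk_indices(): consecutive, near-equal pieces.)
+ # split_chunk <- function(indices, chunk, n_chunks) {
+ # split(indices, cut(seq_along(indices), n_chunks, labels = FALSE))[[chunk]]
+ # }
+ # split_chunk(5:10, chunk = 1, n_chunks = 2) # c(5, 6, 7)
+ # split_chunk(5:10, chunk = 2, n_chunks = 2) # c(8, 9, 10)
+ # For the list form, the same rule is applied to the expanded range and only
+ # the new endpoints are kept, e.g., list(55, 62) -> list(55, 58) and list(59, 62).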
+ #//////////////////////////////////////////////////////////// + + + #---------------------------------------------------------- + # 'sub_sub_array_of_values' is for sri chunking. If this inner dim is chunked, + # the sri has to follow the chunking of fri. Therefore, we save the original + # value of this chunk here for later use. We'll find the corresponding + # transformed value within 'sub_sub_array_of_values' and chunk sri. + if (with_transform & chunks[[inner_dim]]["n_chunks"] > 1) { + if (!is.null(var_ordered)) { #var_ordered + input_array_of_values <- var_ordered + } else { + if (is.null(sub_array_of_values)) { # selectors are indices + #NOTE: Not sure if 'vars_to_transform' is the correct one to use. + input_array_of_values <- vars_to_transform[[var_with_selectors_name]] + } else { + input_array_of_values <- sub_array_of_values + } + } + tmp <- generate_sub_sub_array_of_values( + input_array_of_values, sub_array_of_indices, + number_of_chunk = chunks[[inner_dim]]["chunk"]) + sub_sub_array_of_values <- tmp$sub_sub_array_of_values + previous_sub_sub_array_of_values <- tmp$previous_sub_sub_array_of_values + } + #---------------------------------------------------------- + + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> TRANSFORMATION REQUESTED?") + print(with_transform) + print("-> BETA:") + print(beta) + } + } + if (with_transform) { + # If there is a transformation and selector values are provided, these + # selectors will be processed in the same way either if aiat = TRUE or + # aiat = FALSE. + ## TODO: If sub_array_of_selectors was integer and aiat then... do what's commented 50 lines below. + ## otherwise, do what's coded. + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> SELECTORS REQUESTED BEFORE TRANSFORM.") + } + } + # Generate sub_array_of_fri + sub_array_of_fri <- generate_sub_array_of_fri( + with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta, + is_circular_dim) + # May be useful for crop = T. 'subset_vars_to_transform' may not need + # to include extra cells, but currently it shows mistake if not include. + sub_array_of_fri_no_beta <- generate_sub_array_of_fri( + with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta, + is_circular_dim, add_beta = FALSE) + + subset_vars_to_transform <- vars_to_transform + if (!is.null(var_ordered)) { + + #NOTE: If var_ordered is common_vars, it doesn't have attributes and it is a vector. + # Turn it into array and add dimension name. + if (!is.array(var_ordered)) { + var_ordered <- as.array(var_ordered) + names(dim(var_ordered)) <- inner_dim + } + + subset_vars_to_transform[[var_with_selectors_name]] <- Subset(var_ordered, inner_dim, sub_array_of_fri) + } else { + if (!selectors_are_indices) { # selectors are values + #NOTE: It should be redundant because without reordering the var should remain array + ## But just stay same with above... 
+ if (!is.array(sub_array_of_values)) { + sub_array_of_values <- as.array(sub_array_of_values) + names(dim(sub_array_of_values)) <- inner_dim + } + + subset_vars_to_transform[[var_with_selectors_name]] <- Subset(sub_array_of_values, inner_dim, sub_array_of_fri) + + } else { # selectors are indices + subset_vars_to_transform[[var_with_selectors_name]] <- + Subset(subset_vars_to_transform[[var_with_selectors_name]], + inner_dim, sub_array_of_fri) + } + } + tmp <- .withWarnings( + do.call(transform, c(list(data_array = NULL, + variables = subset_vars_to_transform, + file_selectors = selectors_of_first_files_with_data[[i]], + crop_domain = transform_crop_domain), + transform_params))$variables[[var_with_selectors_name]] + ) + transformed_subset_var <- tmp$value + warnings2 <- c(warnings2, tmp$warnings) + + # Sorting the transformed variable and working out the indices again after transform. + if (!is.null(dim_reorder_params[[inner_dim]])) { + transformed_subset_var_reorder <- dim_reorder_params[[inner_dim]](transformed_subset_var) + transformed_subset_var <- transformed_subset_var_reorder$x + #NOTE: The fix here solves the mis-ordered lon when across_meridian. + transformed_subset_var_unorder <- transformed_subset_var_reorder$ix + # transformed_subset_var_unorder <- sort(transformed_subset_var_reorder$ix, index.return = TRUE)$ix + } else { + transformed_subset_var_unorder <- 1:length(transformed_subset_var) + } + if (!selectors_are_indices) { # selectors are values + sub_array_of_sri <- selector_checker( + sub_array_of_selectors, transformed_subset_var, + tolerance = if (aiat) { + tolerance_params[[inner_dim]] + } else { + NULL + }) + if (!is.list(sub_array_of_sri)) { + sub_array_of_sri <- unique(sub_array_of_sri) + } + } else { # selectors are indices + # Need to transfer to values first, then use the values to get the new + # indices in transformed_subset_var. + if (is.list(sub_array_of_selectors)) { + ori_values <- vars_to_transform[[var_with_selectors_name]][sub_array_of_selectors[[1]]:sub_array_of_selectors[[2]]] + } else { + ori_values <- vars_to_transform[[var_with_selectors_name]][sub_array_of_selectors] + } + sub_array_of_sri <- selector_checker( + ori_values, transformed_subset_var, + tolerance = if (aiat) { + tolerance_params[[inner_dim]] + } else { + NULL + }) + # Here may need to further modify considering aiat. If aiat = FALSE, + # (i.e., indices are taken before transform), unique() is needed. + sub_array_of_sri <- unique(sub_array_of_sri) + } + + # Check if selectors fall out of the range of the transform grid + # It may happen when original lon is [-180, 180] while want to regrid to + # [0, 360], and lon selector = [-20, -10]. + if (any(is.na(sub_array_of_sri))) { + stop(paste0("The selectors of ", + inner_dim, " are out of range of transform grid '", + transform_params$grid, "'. Use parameter '", + inner_dim, "_reorder' or change ", inner_dim, + " selectors.")) + } + + if (goes_across_prime_meridian) { + + if (sub_array_of_sri[[1]] == sub_array_of_sri[[2]]) { + # global longitude + sub_array_of_sri <- c(1:length(transformed_subset_var)) + } else { + # the common case, i.e., non-global +# # NOTE: Because sub_array_of_sri order is exchanged due to +# # previous development, here [[1]] and [[2]] should exchange +# sub_array_of_sri <- c(1:sub_array_of_sri[[1]], +# sub_array_of_sri[[2]]:length(transformed_subset_var)) + #NOTE: the old code above is not suitable for all the possible cases. 
+ # If sub_array_of_selectors is not exactly the value in transformed_subset_var, sub_array_of_sri[[1]] will be larger than sub_array_of_sri[[2]]. + # Though here is not global case, we already have transformed_subset_var cropped as the desired region, so it is okay to use the whole length. Not sure if it will cause other problems... + sub_array_of_sri <- 1:length(transformed_subset_var) + } + + } else if (is.list(sub_array_of_sri)) { + sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]] + } + +#======================================================== + +# Instead of using values to find sri, directly use the destination grid to count. +#NOTE: sub_array_of_sri seems to start at 1 always (because crop = c(lonmin, lonmax, latmin, latmax) already?) + if (chunks[[inner_dim]]["n_chunks"] > 1) { + sub_array_of_sri <- sub_array_of_sri[get_chunk_indices( + length(sub_array_of_sri), + chunks[[inner_dim]]["chunk"], + chunks[[inner_dim]]["n_chunks"], + inner_dim)] + } +#======================================================== + + ordered_sri <- sub_array_of_sri + sub_array_of_sri <- transformed_subset_var_unorder[sub_array_of_sri] + +###########################old################################## +# if (chunks[[inner_dim]]["n_chunks"] > 1) { +# tmp <- which(transformed_subset_var >= min(sub_sub_array_of_values) & +# transformed_subset_var <= max(sub_sub_array_of_values)) +# sub_array_of_sri <- sub_array_of_sri[tmp] +# } +################################################################ + + # In this case, the tvi are not defined and the 'transformed_subset_var' + # will be taken instead of the var transformed before in the code. + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> FIRST INDEX:") +# print(first_index) + print("NOTE: Check function generate_sub_array_of_fri() in zzz.R") + print("-> LAST INDEX:") +# print(last_index) + print("NOTE: Check function generate_sub_array_of_fri() in zzz.R") + print("-> STRUCTURE OF FIRST ROUND INDICES:") + print(str(sub_array_of_fri)) + print("-> STRUCTURE OF SECOND ROUND INDICES:") + print(str(sub_array_of_sri)) + print("-> STRUCTURE OF TRANSFORMED VARIABLE INDICES:") + print(str(tvi)) + } + } + ### # If the selectors are expressed after transformation + ### } else { + ###if (debug) { + ###if (inner_dim %in% dims_to_check) { + ###print("-> SELECTORS REQUESTED AFTER TRANSFORM.") + ###} + ###} + ### if (goes_across_prime_meridian) { + ### sub_array_of_indices <- c(sub_array_of_indices[[1]]:m, + ### 1:sub_array_of_indices[[2]]) + ### } + ### first_index <- min(unlist(sub_array_of_indices)) + ### last_index <- max(unlist(sub_array_of_indices)) + ### first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1) + ### last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n) + ### sub_array_of_fri <- first_index_before_transform:last_index_before_transform + ### n_of_extra_cells <- round(beta / n * m) + ### if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) { + ### sub_array_of_sri <- 1:(last_index - first_index + 1) + ### if (is.null(tvi)) { + ### tvi <- sub_array_of_sri + first_index - 1 + ### } + ### } else { + ### sub_array_of_sri <- sub_array_of_indices - first_index + 1 + ### if (is.null(tvi)) { + ### tvi <- sub_array_of_indices + ### } + ### } + ### sub_array_of_sri <- sub_array_of_sri + n_of_extra_cells + sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), + list(value = sub_array_of_sri))) + + } else { # !with_transform + sub_array_of_fri <- 
generate_sub_array_of_fri(
+ with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta,
+ is_circular_dim)
+ }
+
+ # Reorder sub_array_of_fri if a reordering function is used.
+ # It was the index in the assigned order (e.g., in [-180, 180] if CircularSort(-180, 180)); here it is changed to the index in the original order.
+ if (!is.null(var_unorder_indices)) {
+ if (is.null(ordered_fri)) {
+ ordered_fri <- sub_array_of_fri
+ }
+ sub_array_of_fri <- var_unorder_indices[sub_array_of_fri]
+ }
+ fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position),
+ list(value = sub_array_of_fri)))
+
+ #NOTE: This part always existed but was never used. taken_chunks
+ # is related to merge_across_dims, but I don't know how it is
+ # used (maybe for higher efficiency?)
+# if (!is.null(crossed_file_dim)) {
+# taken_chunks[selector_store_position[[crossed_file_dim]]] <- TRUE
+# } else {
+ taken_chunks <- TRUE
+# }
+ }
+ } else {
+ # The inner dim goes across a file dim (e.g., time_across = 'sdate')
+ if (debug) {
+ if (inner_dim %in% dims_to_check) {
+ print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.")
+ }
+ }
+ # If "_across" is used together with merge_across_dims = FALSE while chunking
+ # over the across dimension, return an error because this combination is not logically correct.
+ if (chunks[[inner_dim]]["n_chunks"] > 1 & inner_dim %in% inner_dims_across_files &
+ merge_across_dims == FALSE) {
+ stop("Chunk over dimension '", inner_dim, "' is not allowed because '",
+ inner_dim, "' is across '",
+ names(inner_dims_across_files)[which(inner_dim %in% inner_dims_across_files)],
+ "' and 'merge_across_dims' is set to FALSE.")
+ }
+
+ if (inner_dim %in% names(dim(sub_array_of_selectors))) {
+ if (is.null(var_with_selectors_name)) {
+ if (!largest_dims_length | (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) { #old code
+ maximal_indice <- data_dims[inner_dim] * chunk_amount
+ } else { # files have different length of inner dim
+ maximal_indice <- sum(inner_dim_lengths)
+ }
+
+ if (any(na.omit(unlist(sub_array_of_selectors)) < 1) ||
+ any(na.omit(unlist(sub_array_of_selectors)) > maximal_indice)) {
+ stop("Provided indices out of range for dimension '", inner_dim, "' ",
+ "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ",
+ maximal_indice, ").")
+ }
+ } else {
+ if (inner_dim %in% names(dim(sub_array_of_values))) {
+ # NOTE: Put across-inner-dim at the 1st position.
+ # POSSIBLE PROB!! Only the inner dim is organized; the rest of the dims may not be in the same order as sub_array_of_selectors below.
+ inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_values)) == inner_dim)
+ if (inner_dim_pos_in_sub_array != 1) {
+ new_sub_array_order <- (1:length(dim(sub_array_of_values)))[-inner_dim_pos_in_sub_array]
+ new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order)
+ sub_array_of_values <- .aperm2(sub_array_of_values, new_sub_array_order)
+ }
+ }
+ }
+
+ # NOTE: Put across-inner-dim at the 1st position.
+ # POSSIBLE PROB!! Only the inner dim is organized; the rest of the dims may not be in the same order as sub_array_of_values above.
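+ # (Editor's illustration with made-up dims; .aperm2 is assumed to behave like
+ # base aperm() while preserving the named dimensions:)
+ # a <- array(1:24, dim = c(sdate = 2, time = 3, region = 4))
+ # b <- aperm(a, c(2, 1, 3)) # 'time' moved to the first position
+ # dim(b) # 3 2 4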
+ inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_selectors)) == inner_dim) + if (inner_dim_pos_in_sub_array != 1) { + new_sub_array_order <- (1:length(dim(sub_array_of_selectors)))[-inner_dim_pos_in_sub_array] + new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order) + sub_array_of_selectors <- .aperm2(sub_array_of_selectors, new_sub_array_order) + } + sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values, + tolerance = tolerance_params[[inner_dim]]) + # It is needed to expand the indices here, otherwise for + # values(list(date1, date2)) only 2 values are picked. + if (is.list(sub_array_of_indices)) { + sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]] + } + sub_array_of_indices <- sub_array_of_indices[get_chunk_indices(length(sub_array_of_indices), + chunks[[inner_dim]]['chunk'], + chunks[[inner_dim]]['n_chunks'], + inner_dim)] + sub_array_is_list <- FALSE + if (is.list(sub_array_of_indices)) { + sub_array_is_list <- TRUE + sub_array_of_indices <- unlist(sub_array_of_indices) + } + + # "indices_chunk" refers to in which file the + # sub_array_of_indices is; "transformed_indices" + # refers to the indices of sub_array_of_indices in each file. + if (!largest_dims_length | + (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) { + # old code; all the files have the same length of inner_dim + if (is.null(var_with_selectors_name)) { + indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1 + transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1 + } else { + indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1 + transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1 + } + } else { # files have different inner dim length + indices_chunk <- c() + for (item in 1:length(inner_dim_lengths)) { + tmp <- which(sub_array_of_indices <= cumsum(inner_dim_lengths)[item]) + indices_chunk <- c(indices_chunk, rep(item, length(tmp) - length(indices_chunk))) + } + sub_array_of_indices_by_file <- split(sub_array_of_indices, indices_chunk) + for (item in names((sub_array_of_indices_by_file))) { + # If item is 1, cumsum(inner_dim_lengths)[item - 1] returns numeric(0) + if (as.numeric(item) > 1) { + sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[as.numeric(item) - 1] + } + } + transformed_indices <- unlist(sub_array_of_indices_by_file, use.names = FALSE) + } + + if (sub_array_is_list) { + sub_array_of_indices <- as.list(sub_array_of_indices) + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> GOING TO ITERATE ALONG CHUNKS.") + } + } + + for (chunk in 1:chunk_amount) { + if (!is.null(names(selector_store_position))) { + selector_store_position[crossed_file_dim] <- chunk + } else { + selector_store_position <- chunk + } + sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)] + + #NOTE: This 'with_transform' part is probably not tested because + # here is for the inner dim that goes across a file dim, which + # is normally not lat and lon dimension. If in the future, we + # can interpolate time, this part needs to be examined. 
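+ # (Editor's note: transform_indices() is assumed here to map an index on a
+ # length-n grid to the corresponding index on a length-m grid; a hypothetical
+ # proportional version, for intuition only:)
+ # transform_indices_sketch <- function(ix, n, m) floor((ix - 1) * m / n) + 1
+ # transform_indices_sketch(1:4, n = 4, m = 8) # 1 3 5 7 (upscaling by 2)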
+ if (with_transform) { + # If the provided selectors are expressed in the world + # before transformation + if (!aiat) { + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + sub_array_of_fri <- max(c(first_index - beta, 1)):min(c(last_index + beta, n)) + sub_array_of_sri <- transform_indices(unlist(sub_array_of_indices) - first_index + 1, n, m) + if (is.list(sub_array_of_indices)) { + if (length(sub_array_of_sri) > 1) { + sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]] + } + } + ##TODO: TRANSFORM SUBSET VARIABLE AS ABOVE, TO COMPUTE SRI + # If the selectors are expressed after transformation + } else { + first_index <- min(unlist(sub_array_of_indices)) + last_index <- max(unlist(sub_array_of_indices)) + first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1) + last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n) + sub_array_of_fri <- first_index_before_transform:last_index_before_transform + if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) { + sub_array_of_sri <- 1:(last_index - first_index + 1) + + round(beta / n * m) + } else { + sub_array_of_sri <- sub_array_of_indices - first_index + 1 + + round(beta / n * m) + } + ##TODO: FILL IN TVI + } + sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position), + list(value = sub_array_of_sri))) + if (length(sub_array_of_sri) > 0) { + taken_chunks[chunk] <- TRUE + } + } else { + sub_array_of_fri <- sub_array_of_indices + if (length(sub_array_of_fri) > 0) { + taken_chunks[chunk] <- TRUE + } + } + + if (!is.null(var_unorder_indices)) { + ordered_fri <- sub_array_of_fri + sub_array_of_fri <- var_unorder_indices[sub_array_of_fri] + } + fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position), + list(value = sub_array_of_fri))) + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> FINISHED ITERATING ALONG CHUNKS") + } + } + } else { + stop("Provided array of indices for dimension '", inner_dim, "', ", + "which goes across the file dimension '", crossed_file_dim, "', but ", + "the provided array does not have the dimension '", inner_dim, + "', which is mandatory.") + } + } + } + } + if (debug) { + if (inner_dim %in% dims_to_check) { + print("-> PROCEEDING TO CROP VARIABLES") + } + } + #if ((length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last'))) { + #if (!is.null(var_with_selectors_name) || (is.null(var_with_selectors_name) && is.character(selector_array) && + # (length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last')))) { + empty_chunks <- which(!taken_chunks) + if (length(empty_chunks) >= length(taken_chunks)) { + stop("Selectors do not match any of the possible values for the dimension '", inner_dim, "'.") + } + if (length(empty_chunks) > 0) { + # # Get the first group of chunks to remove, and remove them. + # # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 1 and 2 + # dist <- abs(rev(empty_chunks) - c(rev(empty_chunks)[1] - 1, head(rev(empty_chunks), length(rev(empty_chunks)) - 1))) + # if (all(dist == 1)) { + # start_chunks_to_remove <- NULL + # } else { + # first_chunk_to_remove <- tail(which(dist > 1), 1) + # start_chunks_to_remove <- rev(rev(empty_chunks)[first_chunk_to_remove:length(empty_chunks)]) + # } + # # Get the last group of chunks to remove, and remove them. 
+ # # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 8 and 9 + # dist <- abs(empty_chunks - c(empty_chunks[1] - 1, head(empty_chunks, length(empty_chunks) - 1))) + # if (all(dist == 1)) { + # first_chunk_to_remove <- 1 + # } else { + # first_chunk_to_remove <- tail(which(dist > 1), 1) + # } + # end_chunks_to_remove <- empty_chunks[first_chunk_to_remove:length(empty_chunks)] + # chunks_to_keep <- which(!((1:length(taken_chunks)) %in% c(start_chunks_to_remove, end_chunks_to_remove))) + chunks_to_keep <- which(taken_chunks) + dims_to_crop[[crossed_file_dim]] <- c(dims_to_crop[[crossed_file_dim]], list(chunks_to_keep)) + # found_indices <- Subset(found_indices, crossed_file_dim, chunks_to_keep) + # # Crop dataset variables file dims. + # for (picked_var in names(picked_vars[[i]])) { + # if (crossed_file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) { + # picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], crossed_file_dim, chunks_to_keep) + # } + # } + } + #} + dat[[i]][['selectors']][[inner_dim]] <- list(fri = fri, sri = sri) + # Crop dataset variables inner dims. + # Crop common variables inner dims. + types_of_var_to_crop <- 'picked' + if (with_transform) { + types_of_var_to_crop <- c(types_of_var_to_crop, 'transformed') + } + if (!is.null(dim_reorder_params[[inner_dim]])) { + types_of_var_to_crop <- c(types_of_var_to_crop, 'reordered') + } + for (type_of_var_to_crop in types_of_var_to_crop) { + if (type_of_var_to_crop == 'transformed') { + if (is.null(tvi)) { + if (!is.null(dim_reorder_params[[inner_dim]])) { + crop_indices <- unique(unlist(ordered_sri)) + } else { + crop_indices <- unique(unlist(sri)) + } + } else { + crop_indices <- unique(unlist(tvi)) + } + vars_to_crop <- transformed_vars[[i]] + common_vars_to_crop <- transformed_common_vars + } else if (type_of_var_to_crop == 'reordered') { + crop_indices <- unique(unlist(ordered_fri)) + vars_to_crop <- picked_vars_ordered[[i]] + common_vars_to_crop <- picked_common_vars_ordered + } else { + #TODO: If fri has different indices in each list, the crop_indices should be + # separated for each list. Otherwise, picked_common_vars later will be wrong. 
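+ # (Editor's illustration of the caveat above, with made-up indices:)
+ # fri <- list(1:3, 2:5)
+ # unique(unlist(fri)) # 1 2 3 4 5, i.e., the union of all lists rather
+ # than the per-list indices, which is why differing fri entries could
+ # mis-crop picked_common_vars.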
+ crop_indices <- unique(unlist(fri)) + vars_to_crop <- picked_vars[[i]] + common_vars_to_crop <- picked_common_vars + } + for (var_to_crop in names(vars_to_crop)) { + if (inner_dim %in% names(dim(vars_to_crop[[var_to_crop]]))) { + if (!is.null(crop_indices)) { + if (type_of_var_to_crop == 'transformed') { + if (!aiat) { + if (!(length(selector_array) == 1 & + all(selector_array %in% c('all', 'first', 'last')))) { + vars_to_crop[[var_to_crop]] <- Subset(transformed_subset_var, inner_dim, crop_indices) + } else { + vars_to_crop[[var_to_crop]] <- + Subset(transformed_var_with_selectors, inner_dim, crop_indices) + } + } else { + vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices) + } + } else { + vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices) + } + } + } + } + if (i == length(dat)) { + for (common_var_to_crop in names(common_vars_to_crop)) { + if (inner_dim %in% names(dim(common_vars_to_crop[[common_var_to_crop]]))) { + + if (type_of_var_to_crop == 'transformed' & !aiat) { + if (!(length(selector_array) == 1 & + all(selector_array %in% c('all', 'first', 'last')))) { + common_vars_to_crop[[common_var_to_crop]] <- + Subset(transformed_subset_var, inner_dim, crop_indices) + } else { + common_vars_to_crop[[common_var_to_crop]] <- + Subset(transformed_var_with_selectors, inner_dim, crop_indices) + } + } else { + if (!is.null(crossed_file_dim)) { #merge_across_dims, crossed_file_dim is the depended file dim + #NOTE: When is not this case??? Maybe this condition is not needed + if (any(crossed_file_dim %in% names(dim(common_vars_to_crop[[common_var_to_crop]])))) { + tmp <- common_vars_to_crop[[common_var_to_crop]] + tmp_attributes <- attributes(common_vars_to_crop[[common_var_to_crop]]) + dim_extra_ind <- which(!names(dim(tmp)) %in% c(crossed_file_dim, inner_dim)) + if (!identical(dim_extra_ind, integer(0))) { + tmp_list <- asplit(tmp, dim_extra_ind) + dim_file_ind <- which(names(dim(tmp_list[[1]])) %in% crossed_file_dim) + tmp_list <- lapply(tmp_list, asplit, dim_file_ind) + } else { # only crossed_file_dim and inner_dim + dim_file_ind <- which(names(dim(tmp)) %in% crossed_file_dim) + tmp_list <- asplit(tmp, dim_file_ind) + # Add another layer to be consistent with the first case above + tmp_list <- list(tmp_list) + } + max_fri_length <- max(sapply(fri, length)) + for (i_extra_dim in 1:length(tmp_list)) { + for (i_fri in 1:length(fri)) { + tmp_list[[i_extra_dim]][[i_fri]] <- + tmp_list[[i_extra_dim]][[i_fri]][fri[[i_fri]]] + + if (length(tmp_list[[i_extra_dim]][[i_fri]]) != max_fri_length) { + tmp_list[[i_extra_dim]][[i_fri]] <- + c(tmp_list[[i_extra_dim]][[i_fri]], rep(NA, max_fri_length - length(tmp_list[[i_extra_dim]][[i_fri]]))) + } + } + } + # Change list back to array + tmp_new_dim <- c(max_fri_length, dim(tmp)[crossed_file_dim], dim(tmp)[dim_extra_ind]) + names(tmp_new_dim) <- c(inner_dim, crossed_file_dim, names(dim(tmp))[dim_extra_ind]) + common_vars_to_crop[[common_var_to_crop]] <- + array(unlist(tmp_list), dim = tmp_new_dim) + # Reorder back + common_vars_to_crop[[common_var_to_crop]] <- + aperm(common_vars_to_crop[[common_var_to_crop]], match(names(dim(tmp)), names(tmp_new_dim))) + # Put attributes back + tmp <- which(!names(tmp_attributes) %in% names(attributes(common_vars_to_crop[[common_var_to_crop]]))) + attributes(common_vars_to_crop[[common_var_to_crop]]) <- + c(attributes(common_vars_to_crop[[common_var_to_crop]]), + tmp_attributes[tmp]) + + if ('time' %in% synonims[[common_var_to_crop]]) { + # 
Convert number back to time + common_vars_to_crop[[common_var_to_crop]] <- + as.POSIXct(common_vars_to_crop[[common_var_to_crop]], + origin = "1970-01-01", tz = 'UTC') + } + } + } else { # old code + + common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) + } + + } + + } + } + } + if (type_of_var_to_crop == 'transformed') { + if (!is.null(vars_to_crop)) { + transformed_vars[[i]] <- vars_to_crop + } + if (i == length(dat)) { + transformed_common_vars <- common_vars_to_crop + } + } else if (type_of_var_to_crop == 'reordered') { + if (!is.null(vars_to_crop)) { + picked_vars_ordered[[i]] <- vars_to_crop + } + if (i == length(dat)) { + picked_common_vars_ordered <- common_vars_to_crop + } + } else { + if (!is.null(vars_to_crop)) { + picked_vars[[i]] <- vars_to_crop + } + if (i == length(dat)) { + #NOTE: To avoid redundant run + if (inner_dim %in% names(common_vars_to_crop)) { + picked_common_vars <- common_vars_to_crop + } + } + } + } + #} + } + # After the selectors have been picked (using the original variables), + # the variables are transformed. At that point, the original selectors + # for the transformed variables are also kept in the variable original_selectors. + #print("L") + } + } + } + # if (!is.null(transformed_common_vars)) { + # picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars + # } + # Remove the trailing chunks, if any. + for (file_dim in names(dims_to_crop)) { + # indices_to_keep <- min(sapply(dims_to_crop[[file_dim]], min)):max(sapply(dims_to_crop[[file_dim]], max)) + ## TODO: Merge indices in dims_to_crop with some advanced mechanism? + indices_to_keep <- unique(unlist(dims_to_crop[[file_dim]])) + array_of_files_to_load <- Subset(array_of_files_to_load, file_dim, indices_to_keep) + array_of_not_found_files <- Subset(array_of_not_found_files, file_dim, indices_to_keep) + for (i in 1:length(dat)) { + # Crop selectors + for (selector_dim in names(dat[[i]][['selectors']])) { + if (selector_dim == file_dim) { + for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['fri']])) { + dat[[i]][['selectors']][[selector_dim]][['fri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['fri']][[j]][indices_to_keep] + } + for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['sri']])) { + dat[[i]][['selectors']][[selector_dim]][['sri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['sri']][[j]][indices_to_keep] + } + } + if (file_dim %in% names(dim(dat[[i]][['selectors']][[selector_dim]][['fri']]))) { + dat[[i]][['selectors']][[selector_dim]][['fri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['fri']], file_dim, indices_to_keep) + dat[[i]][['selectors']][[selector_dim]][['sri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['sri']], file_dim, indices_to_keep) + } + } + # Crop dataset variables file dims. + for (picked_var in names(picked_vars[[i]])) { + if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) { + picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, indices_to_keep) + } + } + for (transformed_var in names(transformed_vars[[i]])) { + if (file_dim %in% names(dim(transformed_vars[[i]][[transformed_var]]))) { + transformed_vars[[i]][[transformed_var]] <- Subset(transformed_vars[[i]][[transformed_var]], file_dim, indices_to_keep) + } + } + } + # Crop common variables file dims. 
+ for (picked_common_var in names(picked_common_vars)) { + if (file_dim %in% names(dim(picked_common_vars[[picked_common_var]]))) { + picked_common_vars[[picked_common_var]] <- Subset(picked_common_vars[[picked_common_var]], file_dim, indices_to_keep) + } + } + for (transformed_common_var in names(transformed_common_vars)) { + if (file_dim %in% names(dim(transformed_common_vars[[transformed_common_var]]))) { + transformed_common_vars[[transformed_common_var]] <- Subset(transformed_common_vars[[transformed_common_var]], file_dim, indices_to_keep) + } + } + } + # Calculate the size of the final array. + total_inner_dims <- NULL + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + inner_dims <- expected_inner_dims[[i]] + inner_dims <- sapply(inner_dims, + function(x) { + if (!all(sapply(dat[[i]][['selectors']][[x]][['sri']], is.null))) { + max(sapply(dat[[i]][['selectors']][[x]][['sri']], length)) + } else { + if (length(var_params[[x]]) > 0) { + if (var_params[[x]] %in% names(transformed_vars[[i]])) { + length(transformed_vars[[i]][[var_params[[x]]]]) + } else if (var_params[[x]] %in% names(transformed_common_vars)) { + length(transformed_common_vars[[var_params[[x]]]]) + } else { + max(sapply(dat[[i]][['selectors']][[x]][['fri']], length)) + } + } else { + max(sapply(dat[[i]][['selectors']][[x]][['fri']], length)) + } + } + }) + names(inner_dims) <- expected_inner_dims[[i]] + if (is.null(total_inner_dims)) { + total_inner_dims <- inner_dims + } else { + new_dims <- .MergeArrayDims(total_inner_dims, inner_dims) + total_inner_dims <- new_dims[[3]] + } + } + } + new_dims <- .MergeArrayDims(dim(array_of_files_to_load), total_inner_dims) + final_dims <- new_dims[[3]][dim_names] + # final_dims_fake is the vector of final dimensions after having merged the + # 'across' file dimensions with the respective 'across' inner dimensions, and + # after having broken into multiple dimensions those dimensions for which + # multidimensional selectors have been provided. + # final_dims will be used for collocation of data, whereas final_dims_fake + # will be used for shaping the final array to be returned to the user. + final_dims_fake <- final_dims + if (merge_across_dims) { + final_dims_fake <- dims_merge(inner_dims_across_files, final_dims_fake) + } + #========================================================================= + # Find the dimension to split if split_multiselected_dims = TRUE. + # If there is no dimension able to be split, change split_multiselected_dims to FALSE. + all_split_dims <- NULL + inner_dim_has_split_dim <- NULL + if (split_multiselected_dims) { + tmp <- dims_split(dim_params, final_dims_fake) + final_dims_fake <- tmp[[1]] + # all_split_dims is a list containing all the split dims + all_split_dims <- tmp[[2]] + + if (is.null(all_split_dims)) { + split_multiselected_dims <- FALSE + .warning(paste0("Not found any dimensions able to be split. 
The parameter ", + "'split_multiselected_dims' is changed to FALSE.")) + } else { + tmp_fun <- function (x, y) { + any(names(dim(x)) %in% y) + } + if (!is.null(picked_common_vars)) { + inner_dim_has_split_dim <- names(which(unlist(lapply( + picked_common_vars, tmp_fun, names(all_split_dims))))) + if (!identical(inner_dim_has_split_dim, character(0))) { + # If merge_across_dims also, it will be replaced later + saved_reshaped_attr <- attr(picked_common_vars[[inner_dim_has_split_dim]], 'variables') + } + } + } + } + #====================================================================== + # If only merge_across_dims and merge_across_dims_narm and no split_multiselected_dims, + # the length of inner across dim (e.g., time) needs to be adjusted. Sum up the actual length + # without potential NAs. + if (merge_across_dims) { + # Prepare the arguments for later use + across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? + # Get the length of each inner_dim ('time') along each file_dim ('file_date') + length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) + dims_of_merge_dim <- dim(picked_common_vars[[across_inner_dim]]) + # Save attributes for later use. If split_multiselected_dims, this variable has been created above but is replaced here + saved_reshaped_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') + + if (merge_across_dims_narm & !split_multiselected_dims) { + final_dims_fake <- merge_narm_dims(final_dims_fake, across_inner_dim, length_inner_across_dim) + } + } + + if (!silent) { + .message("Detected dimension sizes:") + longest_dim_len <- max(sapply(names(final_dims_fake), nchar)) + longest_size_len <- max(sapply(paste0(final_dims_fake, ''), nchar)) + sapply(names(final_dims_fake), + function(x) { + message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''), + x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_fake[x], ''))), collapse = ''), + final_dims_fake[x])) + }) + bytes <- prod(c(final_dims_fake, 8)) + dim_sizes <- paste(final_dims_fake, collapse = ' x ') + if (retrieve) { + .message(paste("Total size of requested data:")) + } else { + .message(paste("Total size of involved data:")) + } + .message(paste(dim_sizes, " x 8 bytes =", + format(structure(bytes, class = "object_size"), units = "auto")), + indent = 2) + } + + # NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed. + # The inner_dim needs to be the first dim among split dims. + # TODO: Cannot control the rest dims are in the same order or not... + # Suppose users put the same order of across inner and file dims. + if (split_multiselected_dims & merge_across_dims) { + # TODO: More than one split? + inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files) + + # if inner_dim is not the first, change! 
+    if (inner_dim_pos_in_split_dims != 1) {
+      # Save the current final_dims_fake for reordering it back later
+      final_dims_fake_output <- final_dims_fake
+      tmp <- reorder_split_dims(all_split_dims[[1]], inner_dim_pos_in_split_dims, final_dims_fake)
+      final_dims_fake <- tmp[[1]]
+      all_split_dims[[1]] <- tmp[[2]]
+    }
+  }
+  if (merge_across_dims | split_multiselected_dims) {
+    if (!merge_across_dims & split_multiselected_dims & identical(inner_dim_has_split_dim, character(0))) {
+      final_dims_fake_metadata <- NULL
+    } else {
+      if (!merge_across_dims & split_multiselected_dims & !is.null(picked_common_vars)) {
+        if (any(names(all_split_dims[[1]]) %in% names(dim(picked_common_vars[[inner_dim_has_split_dim]]))) &
+            names(all_split_dims)[1] != inner_dim_has_split_dim) {
+          if (inner_dim_has_split_dim %in% names(final_dims)) {
+            stop("An inner dimension was detected in the split array, but merge_across_dims is not used. The output dimensions would be repeated. Check if the dimensions and parameters are correctly defined.")
+          } else {
+            # Only split, no merge: the time dim is not explicitly defined because its
+            # length is 1, and the sdate dim to be split has 'time' as one of its dimensions.
+            # --> Take the 'time' dim off from picked_common_vars.
+            dim(picked_common_vars[[inner_dim_has_split_dim]]) <- dim(picked_common_vars[[inner_dim_has_split_dim]])[-which(names(dim(picked_common_vars[[inner_dim_has_split_dim]])) == inner_dim_has_split_dim)]
+          }
+        }
+      }
+      final_dims_fake_metadata <- find_final_dims_fake_metadata(
+        merge_across_dims, split_multiselected_dims, picked_common_vars = picked_common_vars[[inner_dim_has_split_dim]], across_inner_dim,
+        final_dims_fake, dims_of_merge_dim, all_split_dims)
+    }
+  }
+
+  # Store warning messages from transform
+  warnings3 <- NULL
+
+  # The following several lines will only run if retrieve = TRUE
+  if (retrieve) {
+
+    ########## CREATING THE SHARED MATRIX AND DISPATCHING WORK PIECES ###########
+    # TODO: try performance of storing all in cols instead of rows
+    # Create the shared memory array, and a pointer to it, to be sent
+    # to the work pieces.
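+    # A minimal sketch of the shared-memory mechanism used below (toy sizes,
+    # not part of the pipeline): only the descriptor is sent to the workers,
+    # which re-attach to the same underlying memory.
+    #   m <- bigmemory::big.matrix(nrow = 10, ncol = 1)
+    #   desc <- bigmemory::describe(m)
+    #   # ... in a worker process:
+    #   m2 <- bigmemory::attach.big.matrix(desc)
+    #   m2[1, 1] <- 42  # the change is visible through 'm' as well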
+ if (is.null(ObjectBigmemory)) { + data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1) + } else { + data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1, + backingfile = ObjectBigmemory, + init = NA) + } + shared_matrix_pointer <- bigmemory::describe(data_array) + if (is.null(ObjectBigmemory)) { + name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$sharedName + } else { + name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$filename + } + + #warning(paste("SharedName:", attr(shared_matrix_pointer, 'description')$sharedName)) + #warning(paste("Filename:", attr(shared_matrix_pointer, 'description')$filename)) + #if (!is.null(ObjectBigmemory)) { + # attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory + #} + if (is.null(num_procs)) { + num_procs <- future::availableCores() + } + # Creating a shared tmp folder to store metadata from each chunk + array_of_metadata_flags <- array(FALSE, dim = dim(array_of_files_to_load)) + if (!is.null(metadata_dims)) { + metadata_indices_to_load <- as.list(rep(1, length(dim(array_of_files_to_load)))) + names(metadata_indices_to_load) <- names(dim(array_of_files_to_load)) + metadata_indices_to_load[metadata_dims] <- as.list(rep(TRUE, length(metadata_dims))) + array_of_metadata_flags <- do.call('[<-', c(list(array_of_metadata_flags), metadata_indices_to_load, + list(value = rep(TRUE, prod(dim(array_of_files_to_load)[metadata_dims]))))) + } + metadata_file_counter <- 0 + metadata_folder <- tempfile('metadata') + dir.create(metadata_folder) + # Build the work pieces, each with: + # - file path + # - total size (dims) of store array + # - start position in store array + # - file selectors (to provide extra info. useful e.g. to select variable) + # - indices to take from file + work_pieces <- list() + for (i in 1:length(dat)) { + if (dataset_has_files[i]) { + # metadata_file_counter may be changed by the following function + work_pieces <- build_work_pieces( + work_pieces = work_pieces, i = i, selectors = dat[[i]][['selectors']], + file_dims = found_file_dims[[i]], + inner_dims = expected_inner_dims[[i]], final_dims = final_dims, + found_pattern_dim = found_pattern_dim, + inner_dims_across_files = inner_dims_across_files, + array_of_files_to_load = array_of_files_to_load, + array_of_not_found_files = array_of_not_found_files, + array_of_metadata_flags = array_of_metadata_flags, + metadata_file_counter = metadata_file_counter, + depending_file_dims = depending_file_dims, transform = transform, + transform_vars = transform_vars, picked_vars = picked_vars[[i]], + picked_vars_ordered = picked_vars_ordered[[i]], + picked_common_vars = picked_common_vars, + picked_common_vars_ordered = picked_common_vars_ordered, + metadata_folder = metadata_folder, debug = debug) + } + } + #print("N") + if (debug) { + print("-> WORK PIECES BUILT") + } + + # Calculate the progress %s that will be displayed and assign them to + # the appropriate work pieces. + work_pieces <- retrieve_progress_message(work_pieces, num_procs, silent) + + + # NOTE: In .LoadDataFile(), metadata is saved in metadata_folder/1 or /2 etc. Before here, + # the path name is created in work_pieces but the path hasn't been built yet. 
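+    # Each work piece is a named list describing one file read; simplified
+    # sketch (field names as consumed by .LoadDataFile() further below):
+    #   list(file_path = <path>, store_dims = final_dims,
+    #        store_position = <start indices in the store array>,
+    #        first_round_indices = <indices to read from the file>,
+    #        second_round_indices = <indices to crop after transform>,
+    #        file_selectors = <...>, vars_to_transform = <...>,
+    #        save_metadata_in = <path under metadata_folder, or NULL>,
+    #        progress_amount = <progress message chunk, or NULL>)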
+    if (num_procs == 1) {
+      tmp <- .withWarnings(
+        lapply(work_pieces, .LoadDataFile,
+               shared_matrix_pointer = shared_matrix_pointer,
+               file_data_reader = file_data_reader,
+               synonims = synonims,
+               transform = transform,
+               transform_params = transform_params,
+               transform_crop_domain = transform_crop_domain,
+               silent = silent, debug = debug)
+      )
+      found_files <- tmp$value
+      warnings3 <- c(warnings3, tmp$warnings)
+
+    } else {
+      cluster <- parallel::makeCluster(num_procs, outfile = "")
+      # Send the heavy work to the workers
+      ##NOTE: .withWarnings() can't catch warnings like it does above (num_procs == 1). The warnings
+      ##      show below when "bigmemory::as.matrix(data_array)" is called. Don't know how to fix it for now.
+      work_errors <- try({
+        found_files <- parallel::clusterApplyLB(cluster, work_pieces, .LoadDataFile,
+                                                shared_matrix_pointer = shared_matrix_pointer,
+                                                file_data_reader = file_data_reader,
+                                                synonims = synonims,
+                                                transform = transform,
+                                                transform_params = transform_params,
+                                                transform_crop_domain = transform_crop_domain,
+                                                silent = silent, debug = debug)
+      })
+      parallel::stopCluster(cluster)
+    }
+
+    if (!silent) {
+      # if (progress_message != '')
+      if (length(work_pieces) / num_procs >= 2 && !silent) {
+        .message("\n", tag = '')
+      }
+    }
+    #print("P")
+
+    # If merge_across_dims = TRUE, there might be additional NAs due to unequal
+    # inner_dim ('time') length across file_dim ('file_date').
+    # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs.
+    # TODO: For now it assumes there is only one '_across'. Add a for loop for the more-than-one case.
+    if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) {
+      if (!merge_across_dims_narm) {
+        data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims)
+        tmp <- match(names(final_dims), names(dims_of_merge_dim))
+        if (any(diff(tmp[!is.na(tmp)]) < 0)) { #need to reorder
+          picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp[!is.na(tmp)])
+        }
+        metadata_tmp <- picked_common_vars[[across_inner_dim]]
+      } else {
+        tmp <- remove_additional_na_from_merge(
+          data_array = bigmemory::as.matrix(data_array),
+          merge_dim_metadata = picked_common_vars[[across_inner_dim]],
+          inner_dims_across_files, final_dims,
+          length_inner_across_dim)
+        data_array_tmp <- tmp$data_array
+        metadata_tmp <- tmp$merge_dim_metadata
+      }
+
+      if (length(data_array_tmp) != prod(final_dims_fake)) {
+        stop(paste0("After reshaping, the data do not fit into the expected output dimension. ",
+                    "Check if the reshaping parameters are used correctly."))
+      }
+      if (length(metadata_tmp) != prod(final_dims_fake_metadata)) {
+        stop(paste0("After reshaping, the metadata do not fit into the expected output dimension. ",
+                    "Check if the reshaping parameters are used correctly or contact support."))
+      }
+
+      #NOTE: When one file contains values for discrete dimensions, rearranging the
+      #      chunks (i.e., work_pieces) is necessary.
+      if (split_multiselected_dims) {
+        tmp <- rebuild_array_merge_split(
+          data_array = data_array_tmp, metadata = metadata_tmp, indices_chunk,
+          all_split_dims, final_dims_fake, across_inner_dim, length_inner_across_dim)
+        data_array_tmp <- tmp$data_array
+        metadata_tmp <- tmp$metadata
+      }
+
+      data_array <- array(data_array_tmp, dim = final_dims_fake)
+      metadata_tmp <- array(metadata_tmp, dim = final_dims_fake_metadata)
+
+      # If split_multiselected_dims + merge_across_dims, the dimension order may change above.
+ # To get the user-required dim order, we need to reorder the array again. + if (split_multiselected_dims) { + if (inner_dim_pos_in_split_dims != 1) { + correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) + data_array <- .aperm2(data_array, correct_order) + correct_order_metadata <- match(names(final_dims_fake_output), names(all_split_dims[[across_inner_dim]])) + metadata_tmp <- .aperm2(metadata_tmp, correct_order_metadata[!is.na(correct_order_metadata)]) + } + } + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- saved_reshaped_attr + + } else { # ! (merge_across_dims + split_multiselected_dims) (old version) + data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) + if (merge_across_dims) { + # merge_across_dims = TRUE but (merge_across_dims_narm = F & split_multiselected_dims = F) + + inner_dim_pos <- which(names(dims_of_merge_dim) == inner_dims_across_files) + file_dim_pos <- which(names(dims_of_merge_dim) == names(inner_dims_across_files)) + if (file_dim_pos < inner_dim_pos) { #need to reorder + tmp <- seq(1, length(dims_of_merge_dim)) + tmp[inner_dim_pos] <- file_dim_pos + tmp[file_dim_pos] <- inner_dim_pos + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp) + } + metadata_tmp <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_metadata) + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- saved_reshaped_attr + } + if (split_multiselected_dims & !is.null(picked_common_vars)) { + if (!identical(inner_dim_has_split_dim, character(0))) { + metadata_tmp <- array(picked_common_vars[[inner_dim_has_split_dim]], dim = final_dims_fake_metadata) + # Convert numeric back to dates + if (inherits(picked_common_vars[[inner_dim_has_split_dim]], 'POSIXct')) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[inner_dim_has_split_dim]] <- metadata_tmp + attr(picked_common_vars[[inner_dim_has_split_dim]], 'variables') <- saved_reshaped_attr + } + } + } + + gc() + + # Load metadata and remove the metadata folder + if (!is.null(metadata_dims)) { + loaded_metadata_files <- list.files(metadata_folder) + + if (!identical(loaded_metadata_files, character(0))) { # old code + loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS) + } else { + loaded_metadata <- NULL + } + + unlink(metadata_folder, recursive = TRUE) + + # Create a list of metadata of the variable (e.g., tas) + return_metadata <- create_metadata_list(array_of_metadata_flags, metadata_dims, pattern_dims, + loaded_metadata_files, loaded_metadata, dat_names, + dataset_has_files) + # TODO: Try to infer data type from loaded_metadata + # as.integer(data_array) + } + + failed_pieces <- work_pieces[which(unlist(found_files))] + for (failed_piece in failed_pieces) { + array_of_not_found_files <- do.call('[<-', + c(list(array_of_not_found_files), + as.list(failed_piece[['file_indices_in_array_of_files']]), + list(value = TRUE))) + } + if (any(array_of_not_found_files)) { + for (i in 
1:prod(dim(array_of_files_to_load))) {
+        if (is.na(array_of_not_found_files[i])) {
+          array_of_files_to_load[i] <- NA
+        } else {
+          if (array_of_not_found_files[i]) {
+            array_of_not_found_files[i] <- array_of_files_to_load[i]
+            array_of_files_to_load[i] <- NA
+          } else {
+            array_of_not_found_files[i] <- NA
+          }
+        }
+      }
+    } else {
+      array_of_not_found_files <- NULL
+    }
+
+  } # End if (retrieve)
+  else { # if retrieve = FALSE, the metadata still needs to be reshaped
+
+    if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) {
+      if (!merge_across_dims_narm) {
+        tmp <- match(names(final_dims), names(dims_of_merge_dim))
+        if (any(diff(tmp[!is.na(tmp)]) < 0)) { #need to reorder
+          picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp[!is.na(tmp)])
+        }
+        metadata_tmp <- picked_common_vars[[across_inner_dim]]
+      } else {
+        tmp <- remove_additional_na_from_merge(
+          data_array = NULL,
+          merge_dim_metadata = picked_common_vars[[across_inner_dim]],
+          inner_dims_across_files, final_dims,
+          length_inner_across_dim)
+        metadata_tmp <- tmp$merge_dim_metadata
+      }
+
+      if (length(metadata_tmp) != prod(final_dims_fake_metadata)) {
+        stop(paste0("After reshaping, the metadata do not fit into the expected output dimension. ",
+                    "Check if the reshaping parameters are used correctly or contact support."))
+      }
+
+      #NOTE: When one file contains values for discrete dimensions, rearranging the
+      #      chunks (i.e., work_pieces) is necessary.
+      if (split_multiselected_dims) {
+        tmp <- rebuild_array_merge_split(
+          data_array = NULL, metadata = metadata_tmp, indices_chunk,
+          all_split_dims, final_dims_fake, across_inner_dim, length_inner_across_dim)
+        metadata_tmp <- tmp$metadata
+      }
+      metadata_tmp <- array(metadata_tmp, dim = final_dims_fake_metadata)
+
+      # If split_multiselected_dims + merge_across_dims, the dimension order may change above.
+      # To get the user-required dim order, we need to reorder the array again.
+      if (split_multiselected_dims) {
+        if (inner_dim_pos_in_split_dims != 1) {
+          correct_order <- match(names(final_dims_fake_output), names(final_dims_fake))
+#          data_array <- .aperm2(data_array, correct_order)
+          correct_order_metadata <- match(names(final_dims_fake_output), names(all_split_dims[[across_inner_dim]]))
+          metadata_tmp <- .aperm2(metadata_tmp, correct_order_metadata[!is.na(correct_order_metadata)])
+        }
+      }
+      # Convert numeric back to dates
+      if ('time' %in% synonims[[across_inner_dim]]) {
+        metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC')
+      }
+      picked_common_vars[[across_inner_dim]] <- metadata_tmp
+      attr(picked_common_vars[[across_inner_dim]], 'variables') <- saved_reshaped_attr
+    } else { # !
(merge_across_dims + split_multiselected_dims) (old version) + if (merge_across_dims) { + # merge_across_dims = TRUE but (merge_across_dims_narm = F & split_multiselected_dims = F) + + inner_dim_pos <- which(names(dims_of_merge_dim) == inner_dims_across_files) + file_dim_pos <- which(names(dims_of_merge_dim) == names(inner_dims_across_files)) + if (file_dim_pos < inner_dim_pos) { #need to reorder + tmp <- seq(1, length(dims_of_merge_dim)) + tmp[inner_dim_pos] <- file_dim_pos + tmp[file_dim_pos] <- inner_dim_pos + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp) + } + metadata_tmp <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_metadata) + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- saved_reshaped_attr + } + if (split_multiselected_dims & !is.null(picked_common_vars)) { + if (!identical(inner_dim_has_split_dim, character(0))) { + metadata_tmp <- array(picked_common_vars[[inner_dim_has_split_dim]], dim = final_dims_fake_metadata) + # Convert numeric back to dates + if (inherits(picked_common_vars[[inner_dim_has_split_dim]], 'POSIXct')) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[inner_dim_has_split_dim]] <- metadata_tmp + attr(picked_common_vars[[inner_dim_has_split_dim]], 'variables') <- saved_reshaped_attr + } + } + } + # Retrieve variable metadata + # Compare array_of_metadata_flags with array_of_files_to_load to know which files to take for metadata + if (!is.null(metadata_dims)) { + array_of_metadata_flags <- array(FALSE, dim = dim(array_of_files_to_load)) + metadata_indices_to_load <- as.list(rep(1, length(dim(array_of_files_to_load)))) + names(metadata_indices_to_load) <- names(dim(array_of_files_to_load)) + metadata_indices_to_load[metadata_dims] <- as.list(rep(TRUE, length(metadata_dims))) + array_of_metadata_flags <- do.call('[<-', c(list(array_of_metadata_flags), metadata_indices_to_load, + list(value = rep(TRUE, prod(dim(array_of_files_to_load)[metadata_dims]))))) + + if (tail(names(dim(array_of_files_to_load)), 1) != found_pattern_dim) { + tmp1 <- s2dv::Reorder(array_of_files_to_load, c(2:length(dim(array_of_files_to_load)), 1)) + tmp2 <- s2dv::Reorder(array_of_metadata_flags, c(2:length(dim(array_of_metadata_flags)), 1)) + files_for_metadata <- tmp1[tmp2] + } else { + files_for_metadata <- array_of_files_to_load[array_of_metadata_flags] + } + + # Get variable name + #NOTE: This part probably will fail when one netCDF file has more than one variable. 
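+      # Illustration with hypothetical dims: for array_of_files_to_load of
+      # dim c(dat = 2, var = 2) and metadata_dims = c('dat', 'var'), all four
+      # positions are flagged and four file paths are kept for metadata; with
+      # metadata_dims = 'dat', only [dat = 1:2, var = 1] are flagged, so two
+      # paths are kept. The Reorder() calls below make sure the pattern
+      # dimension is the last one before the flags are applied, so the
+      # extraction order matches the "retrieve = TRUE" case.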
+      if (found_pattern_dim %in% metadata_dims) { # metadata_dims has "dat"
+        if (any(metadata_dims %in% c('var', 'variable'))) { # metadata_dims is c('dat', 'var')
+          how_many_vars <- length(dat[[1]][['selectors']]$var[[1]])
+        } else if (length(metadata_dims) > 1) { # metadata_dims is c('dat', xxx)
+          how_many_vars <- length(dat[[1]][['selectors']][[metadata_dims[which(found_pattern_dim != metadata_dims)]]][[1]])
+        } else { # metadata_dims is 'dat'
+          how_many_vars <- 1
+        }
+        tmp_var <- matrix(NA, how_many_vars, length(dat))
+        for (i_dat in 1:dim(array_of_metadata_flags)[found_pattern_dim]) {
+          if (any(metadata_dims %in% c('var', 'variable'))) { # metadata_dims has "var"
+            tmp_var[, i_dat] <- dat[[i_dat]][['selectors']]$var[[1]]
+          } else if (length(metadata_dims) > 1) { # metadata_dims is c('dat', xxx)
+            tmp_var[, i_dat] <- rep(dat[[i_dat]][['selectors']]$var[[1]][1],
+                                    length(dat[[1]][['selectors']][[metadata_dims[which(found_pattern_dim != metadata_dims)]]][[1]]))
+          } else { # metadata_dims is 'dat'
+            tmp_var[, i_dat] <- dat[[i_dat]][['selectors']]$var[[1]][1]
+          }
+        }
+
+        # If metadata_dims = c('dat', 'var') and [dat = 2, var = 2], tmp_var has length 4, like c('tas', 'tos', 'tas', 'tos').
+        # If metadata_dims = 'dat' and [dat = 2], tmp_var has length 2, like c('tas', 'tos').
+        tmp_var <- c(tmp_var)
+
+      } else { # metadata_dims doesn't have "dat"
+        if (any(metadata_dims %in% c('var', 'variable'))) { # metadata_dims has "var"
+          tmp_var <- dat[[1]][['selectors']]$var[[1]]
+        } else {
+          tmp_var <- rep(dat[[1]][['selectors']]$var[[1]][1], length(dat[[1]][['selectors']][[metadata_dims]][[1]]))
+        }
+        # If metadata_dims = 'var' and [var = 2], tmp_var has length 2, like c('tas', 'tos').
+        # If metadata_dims = 'table' and [table = 2], tmp_var has length 1, like 'tas'.
+      }
+
+      loaded_metadata <- vector('list', length = length(files_for_metadata))
+      for (i_file in 1:length(files_for_metadata)) {
+        #NOTE: Not using ncatt_get() because it only gets the attributes shown with ncdump -h
+        tmp <- file_opener(files_for_metadata[i_file])
+        if (!is.null(tmp)) { # if file exists
+          loaded_metadata[[i_file]][[1]] <- tmp$var[[tmp_var[i_file]]]
+          names(loaded_metadata[[i_file]]) <- tmp_var[i_file]
+          file_closer(tmp)
+        }
+      }
+      # Find loaded_metadata_files in the same way as in the "retrieve = TRUE" case.
+      # If dataset_has_files is FALSE for a dataset, exclude it from the counting.
+      ind_loaded_metadata_has_values <- which(!sapply(loaded_metadata, is.null)) # c(1, 2, 4)
+      if (!all(dataset_has_files)) { # If dataset_has_files has FALSE, exclude that dataset from the counting
+        if (found_pattern_dim %in% metadata_dims) { # metadata_dims has "dat"
+          dataset_has_files_expand <- rep(dataset_has_files, each = how_many_vars)
+          i_ind <- 1
+          while (i_ind <= length(ind_loaded_metadata_has_values)) { # 3, 4, 8
+            if (ind_loaded_metadata_has_values[i_ind] > i_ind) {
+              ind_loaded_metadata_has_values[i_ind] <- ind_loaded_metadata_has_values[i_ind] - length(which(!dataset_has_files_expand[1:dataset_has_files_expand[i_ind]]))
+            }
+            i_ind <- i_ind + 1
+          }
+        }
+      }
+      loaded_metadata_files <- as.character(ind_loaded_metadata_has_values)
+      loaded_metadata <- loaded_metadata[which(!sapply(loaded_metadata, is.null))]
+      return_metadata <- create_metadata_list(array_of_metadata_flags, metadata_dims, pattern_dims,
+                                              loaded_metadata_files, loaded_metadata, dat_names,
+                                              dataset_has_files)
+    }
+  }
+  # Print the warnings from transform
+  if (!is.null(c(warnings1, warnings2, warnings3))) {
+    transform_warnings_list <- lapply(c(warnings1, warnings2, warnings3), function(x) {
+      return(x$message)
+    })
+    transform_warnings_list <- unique(transform_warnings_list)
+    for (i in 1:length(transform_warnings_list)) {
+      .warning(transform_warnings_list[[i]])
+    }
+  }
+
+  # Change final_dims_fake back because retrieve = FALSE will use it for attributes later
+  if (exists("final_dims_fake_output")) {
+    final_dims_fake <- final_dims_fake_output
+  }
+  # Replace the vars and common vars by the transformed vars and common vars
+  for (i in 1:length(dat)) {
+    if (length(names(transformed_vars[[i]])) > 0) {
+      picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]]
+    } else if (length(names(picked_vars_ordered[[i]])) > 0) {
+      picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]]
+    }
+  }
+  if (length(names(transformed_common_vars)) > 0) {
+    picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars
+  } else if (length(names(picked_common_vars_ordered)) > 0) {
+    picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered
+  }
+  if (debug) {
+    print("-> THE TRANSFORMED VARS:")
+    print(str(transformed_vars))
+    print("-> THE PICKED VARS:")
+    print(str(picked_vars))
+  }
+
+  file_selectors <- NULL
+  for (i in 1:length(dat)) {
+    file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])]
+  }
+
+  # Prepare attr Variables
+  if (all(sapply(return_metadata, is.null))) {
+    # We don't have metadata of the variable (e.g., tas). The returned metadata list only
+    # contains the variables specified in argument "return_vars".
+    Variables_list <- c(list(common = picked_common_vars), picked_vars)
+    .warning(paste0("Metadata cannot be retrieved. The reason may be the ",
+                    "non-existence of the first file. Use parameter 'metadata_dims'",
+                    " to assign the file dimensions along which to return metadata, ",
+                    "or check the existence of the first file."))
+  } else {
+    # Add the metadata of the variable (e.g., tas) into the list of picked_vars or
+    # picked_common_vars.
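+    # The resulting structure is, e.g. (hypothetical contents):
+    #   list(common = <picked_common_vars plus metadata of common variables>,
+    #        dat1 = <picked_vars of dataset 1 plus its variable metadata>, ...)
+    # and is attached to the output below as the 'Variables' attribute.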
+ Variables_list <- combine_metadata_picked_vars( + return_metadata, picked_vars, picked_common_vars, + metadata_dims, pattern_dims, length(dat)) + } + + if (retrieve) { + if (!silent) { + .message("Successfully retrieved data.") + } + + attributes(data_array) <- c(attributes(data_array), + list(Variables = Variables_list, + Files = array_of_files_to_load, + NotFoundFiles = array_of_not_found_files, + FileSelectors = file_selectors, + PatternDim = found_pattern_dim, + ObjectBigmemory = name_bigmemory_obj) #attr(shared_matrix_pointer, 'description')$sharedName) + ) + attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class')) + data_array + + } else { # retrieve = FALSE + if (!silent) { + .message("Successfully discovered data dimensions.") + } + start_call <- match.call() + for (i in 2:length(start_call)) { + if (class(start_call[[i]]) %in% c('name', 'call')) { + tmp <- eval.parent(start_call[[i]]) + if (is.null(tmp)) { + start_call[i] <- list(NULL) + } else { + start_call[[i]] <- eval.parent(start_call[[i]]) + } + } + } + start_call[['retrieve']] <- TRUE + attributes(start_call) <- c(attributes(start_call), + list(Dimensions = final_dims_fake, + Variables = Variables_list, + ExpectedFiles = array_of_files_to_load, + FileSelectors = file_selectors, + PatternDim = found_pattern_dim, + MergedDims = if (merge_across_dims) { + inner_dims_across_files + } else { + NULL + }, + SplitDims = if (split_multiselected_dims) { + all_split_dims + } else { + NULL + }) + ) + attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class')) + start_call + } +} + +# This function is the responsible for loading the data of each work +# piece. +.LoadDataFile <- function(work_piece, shared_matrix_pointer, + file_data_reader, synonims, + transform, transform_params, transform_crop_domain = NULL, + silent = FALSE, debug = FALSE) { + #warning(attr(shared_matrix_pointer, 'description')$sharedName) + # suppressPackageStartupMessages({library(bigmemory)}) + ### TODO: Specify dependencies as parameter + # suppressPackageStartupMessages({library(ncdf4)}) + + #print("1") + store_indices <- as.list(work_piece[['store_position']]) + first_round_indices <- work_piece[['first_round_indices']] + second_round_indices <- work_piece[['second_round_indices']] + #print("2") + file_to_open <- work_piece[['file_path']] + # Get data and metadata + sub_array <- file_data_reader(file_to_open, NULL, + work_piece[['file_selectors']], + first_round_indices, synonims) + if (debug) { + if (all(unlist(store_indices[1:6]) == 1)) { + print("-> LOADING A WORK PIECE") + print("-> STRUCTURE OF READ UNTRANSFORMED DATA:") + print(str(sub_array)) + print("-> STRUCTURE OF VARIABLES TO TRANSFORM:") + print(str(work_piece[['vars_to_transform']])) + print("-> COMMON ARRAY DIMENSIONS:") + print(str(work_piece[['store_dims']])) + } + } + if (!is.null(sub_array)) { + # Apply data transformation once we have the data arrays. 
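+  # The 'transform' function is invoked below as
+  #   do.call(transform, c(list(data_array = sub_array, variables = ...,
+  #                             file_selectors = ..., crop_domain = ...),
+  #           transform_params))
+  # and is expected to return a list with at least a 'data_array' component,
+  # from which the transformed array is taken.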
+  if (!is.null(transform)) {
+    if (debug) {
+      if (all(unlist(store_indices[1:6]) == 1)) {
+        print("-> PROCEEDING TO TRANSFORM ARRAY")
+        print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:")
+        print(dim(sub_array))
+      }
+    }
+    sub_array <- do.call(transform, c(list(data_array = sub_array,
+                                           variables = work_piece[['vars_to_transform']],
+                                           file_selectors = work_piece[['file_selectors']],
+                                           crop_domain = transform_crop_domain),
+                                      transform_params))
+    if (debug) {
+      if (all(unlist(store_indices[1:6]) == 1)) {
+        print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:")
+        print(str(sub_array))
+        print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:")
+        print(dim(sub_array$data_array))
+      }
+    }
+    sub_array <- sub_array$data_array
+    # Subset with second round of indices
+    dims_to_crop <- which(!sapply(second_round_indices, is.null))
+    if (length(dims_to_crop) > 0) {
+      dimnames_to_crop <- names(second_round_indices)[dims_to_crop]
+      sub_array <- ClimProjDiags::Subset(sub_array, dimnames_to_crop,
+                                         second_round_indices[dimnames_to_crop])
+    }
+    if (debug) {
+      if (all(unlist(store_indices[1:6]) == 1)) {
+        print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:")
+        print(str(sub_array))
+      }
+    }
+  }
+
+  metadata <- attr(sub_array, 'variables')
+
+  names_bk <- names(store_indices)
+  store_indices <- lapply(names(store_indices),
+                          function (x) {
+                            if (!(x %in% names(first_round_indices))) {
+                              store_indices[[x]]
+                            } else if (is.null(second_round_indices[[x]])) {
+                              1:dim(sub_array)[x]
+                            } else {
+                              if (is.numeric(second_round_indices[[x]])) {
+                                ## TODO: Review carefully this line. Inner indices are all
+                                ## aligned to the left-most positions. If dataset A has longitudes
+                                ## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then
+                                ## they will be stored as follows:
+                                ## 1, 2, 3, 4
+                                ## 3, 4, NA, NA
+                                ##x - min(x) + 1
+                                1:length(second_round_indices[[x]])
+                              } else {
+                                1:length(second_round_indices[[x]])
+                              }
+                            }
+                          })
+  names(store_indices) <- names_bk
+  if (debug) {
+    if (all(unlist(store_indices) == 1)) {
+      print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:")
+      print(str(first_round_indices))
+      print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:")
+      print(str(second_round_indices))
+      print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
+      print(str(store_indices))
+    }
+  }
+
+  store_indices <- lapply(store_indices, as.integer)
+  store_dims <- work_piece[['store_dims']]
+
+  # Split the storage work of the loaded subset in parts
+  largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))]
+  max_parts <- length(store_indices[[largest_dim_name]])
+
+  # Indexing a data file of N MB with expand.grid takes 30*N MB.
+  # The peak RAM of Start() is, at minimum, 2 * the total data to load from
+  # all files, due to inefficiencies in other regions of the code.
+  # The more parts we split the indexing done below into, the lower the
+  # memory footprint of the indexing and the faster it runs.
+  # But more than 10 indexing iterations (parts) for each MB processed
+  # makes the iteration slower (tested empirically on BSC workstations).
+  subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024
+  best_n_parts <- ceiling(subset_size_in_mb * 10)
+  # We want to set n_parts to a greater value than the one that would
+  # result in a memory footprint (of the subset indexing code below) equal
+  # to 2 * total data to load from all files.
+  # s = subset size in MB
+  # p = number of parts to break it in
+  # T = total size of data to load
+  # then, s / p * 30 = 2 * T
+  # then, p = s * 15 / T
+  min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims))
+  # Make sure we pick n_parts much greater than the minimum calculated
+  n_parts <- min_n_parts * 10
+  if (n_parts > best_n_parts) {
+    n_parts <- best_n_parts
+  }
+  # Boundary checks
+  if (n_parts < 1) {
+    n_parts <- 1
+  }
+  if (n_parts > max_parts) {
+    n_parts <- max_parts
+  }
+
+  if (n_parts > 1) {
+    make_parts <- function(length, n) {
+      clusters <- cut(1:length, n, labels = FALSE)
+      lapply(1:n, function(y) which(clusters == y))
+    }
+    part_indices <- make_parts(max_parts, n_parts)
+    parts <- lapply(part_indices,
+                    function(x) {
+                      store_indices[[largest_dim_name]][x]
+                    })
+  } else {
+    part_indices <- list(1:max_parts)
+    parts <- store_indices[largest_dim_name]
+  }
+
+  # Do the storage work
+  weights <- sapply(1:length(store_dims),
+                    function(i) prod(c(1, store_dims)[1:i]))
+  part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array))))
+  names(part_indices_in_sub_array) <- names(dim(sub_array))
+  data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
+  for (i in 1:n_parts) {
+    store_indices[[largest_dim_name]] <- parts[[i]]
+    # Converting array indices to vector indices
+    matrix_indices <- do.call("expand.grid", store_indices)
+    # Given a matrix where each row is a set of array indices of an element,
+    # the vector indices are computed
+    matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights)
+    part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]]
+    data_array[matrix_indices] <- as.vector(do.call('[',
+                                                    c(list(x = sub_array),
+                                                      part_indices_in_sub_array)))
+  }
+  rm(data_array)
+  gc()
+
+  if (!is.null(work_piece[['save_metadata_in']])) {
+    saveRDS(metadata, file = work_piece[['save_metadata_in']])
+  }
+  }
+  if (!is.null(work_piece[['progress_amount']]) && !silent) {
+    message(work_piece[['progress_amount']], appendLF = FALSE)
+  }
+  is.null(sub_array)
+}
diff --git a/modules/Loading/tmp/startR/R/Step.R b/modules/Loading/tmp/startR/R/Step.R
new file mode 100644
index 00000000..3c997f00
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/Step.R
@@ -0,0 +1,141 @@
+#'Define the operation applied on declared data.
+#'
+#'The step of the startR workflow after declaring data by a Start() call. It
+#'identifies the operation (i.e., function) and the target and output
+#'dimensions of the data array for the function. Ideally, it expects the
+#'dimension names to be in the same order as the one requested in the Start()
+#'call. If a different order is specified, startR will reorder the subset
+#'dimensions to the expected order for this function.
+#'
+#'@param fun A function in R format defining the operation to be applied to the
+#'  data declared by a Start() call. It should only work on the essential
+#'  dimensions rather than all the data dimensions. Since the function will be
+#'  called numerous times through all the non-essential dimensions, it is
+#'  recommended to keep them as light as possible.
+#'@param target_dims A vector for a single input array, or a list of vectors
+#'  for multiple input arrays, indicating the names of the dimensions that
+#'  'fun' is to be applied along.
+#'@param output_dims A vector for a single returned array, or a list of
+#'  vectors for multiple returned arrays, indicating the dimension names of
+#'  the function output.
+#'@param use_libraries A vector of character strings indicating the R library
+#'  names to be used in 'fun'. 
Only used when the jobs are run on HPCs; if the
+#'  jobs are run locally, load the necessary libraries with \code{library()}
+#'  directly. The default value is NULL.
+#'@param use_attributes One or more lists of vectors of character strings
+#'  indicating the data attributes to be used in 'fun'. The list names should
+#'  be consistent with the list names of 'data' in AddStep(). The default
+#'  value is NULL.
+#'@return A closure that contains all the objects assigned. It serves as the
+#'  input of AddStep().
+#'@examples
+#' data_path <- system.file('extdata', package = 'startR')
+#' path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc')
+#' sdates <- c('200011', '200012')
+#' data <- Start(dat = list(list(path = path_obs)),
+#'               var = 'tos',
+#'               sdate = sdates,
+#'               time = 'all',
+#'               latitude = 'all',
+#'               longitude = 'all',
+#'               return_vars = list(latitude = 'dat',
+#'                                  longitude = 'dat',
+#'                                  time = 'sdate'),
+#'               retrieve = FALSE)
+#' fun <- function(x) {
+#'   lat = attributes(x)$Variables$dat1$latitude
+#'   weight = sqrt(cos(lat * pi / 180))
+#'   corrected = Apply(list(x), target_dims = "latitude",
+#'                     fun = function(x) {x * weight})
+#' }
+#' step <- Step(fun = fun,
+#'              target_dims = 'latitude',
+#'              output_dims = 'latitude',
+#'              use_libraries = c('multiApply'),
+#'              use_attributes = list(data = "Variables"))
+#' wf <- AddStep(data, step)
+#'
+#'@export
+Step <- function(fun, target_dims, output_dims,
+                 use_libraries = NULL, use_attributes = NULL) {
+  # Check fun
+  if (!is.function(fun)) {
+    stop("Parameter 'fun' must be a function.")
+  }
+
+  # Check target_dims
+  if (is.character(target_dims)) {
+    target_dims <- list(target_dims)
+    names(target_dims) <- 'input1'
+  }
+  if (is.list(target_dims)) {
+    sapply(target_dims,
+           function(x) {
+             if (!(is.character(x) && (length(x) > 0))) {
+               stop("Parameter 'target_dims' must be one or a list of vectors ",
+                    "of target dimension names for each data array input in ",
+                    "the function 'fun'.")
+             }
+           })
+    if (is.null(names(target_dims))) {
+      names(target_dims) <- paste0('input', 1:length(target_dims))
+    }
+  }
+
+  # Check output_dims
+  if (is.character(output_dims) || is.null(output_dims)) {
+    output_dims <- list(output_dims)
+    names(output_dims) <- 'output1'
+  }
+  if (is.list(output_dims)) {
+    sapply(output_dims,
+           function(x) {
+             if (!(is.character(x) || is.null(x))) {
+               stop("Parameter 'output_dims' must be one or a list of vectors ",
+                    "of output dimension names for each data array returned ",
+                    "by the function 'fun'.")
+             }
+           })
+    if (is.null(names(output_dims))) {
+      names(output_dims) <- paste0('output', 1:length(output_dims))
+    }
+  }
+
+  # Check use_libraries
+  if (!is.null(use_libraries)) {
+    if (!is.character(use_libraries)) {
+      stop("Parameter 'use_libraries' must be a vector of character ",
+           "strings.")
+    }
+  }
+
+  # Check use_attributes
+  if (!is.null(use_attributes)) {
+    raise_error <- FALSE
+    if (!is.list(use_attributes)) {
+      raise_error <- TRUE
+    }
+    if (!all(sapply(use_attributes,
+                    function(x) {
+                      is.character(x) ||
+                        (is.list(x) && all(sapply(x, is.character)))
+                    }))) {
+      raise_error <- TRUE
+    }
+    if (raise_error) {
+      stop("Parameter 'use_attributes' must be a list of vectors of ",
+           "character strings or of lists of vectors of character ",
+           "strings.")
+    }
+  }
+
+  attr(fun, 'TargetDims') <- target_dims
+  attr(fun, 'OutputDims') <- output_dims
+  attr(fun, 'UseLibraries') <- use_libraries
+  attr(fun, 'UseAttributes') <- use_attributes
+
+  # TODO: Add provenance info
+  class(fun) <- 'startR_step_fun'
+
+  fun
+}
diff --git a/modules/Loading/tmp/startR/R/Utils.R b/modules/Loading/tmp/startR/R/Utils.R
new file mode 100644
index 00000000..e440ddee
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/Utils.R
@@ -0,0 +1,1017 @@
+#'@import abind
+#'@importFrom methods is
+#'@importFrom ClimProjDiags Subset
+.chunk <- function(chunk, n_chunks, selectors) {
+  if (any(chunk > n_chunks)) {
+    stop("Requested chunk index out of bounds.")
+  }
+  if (length(chunk) == 1 && length(n_chunks) == 1) {
+    if (!is.null(attr(selectors, 'chunk'))) {
+      attr(selectors, 'chunk') <- c((attr(selectors, 'chunk')['chunk'] - 1) * n_chunks +
+                                      chunk,
+                                    attr(selectors, 'chunk')['n_chunks'] * n_chunks)
+    } else {
+      attr(selectors, 'chunk') <- c(chunk = unname(chunk), n_chunks = unname(n_chunks))
+    }
+  } else {
+    # Chunking arrays of multidimensional selectors.
+    # This should be done in Start.R but implies modifications.
+    if (length(chunk) != length(n_chunks)) {
+      stop("Wrong chunk specification.")
+    }
+    #NOTE: 1. Should this check be for the case above, not the multidimensional selector one (?)
+    #      2. It was !is.null before, but it should be is.null (?)
+    #    if (!is.null(attr(selectors, 'values'))) {
+    #      stop("Multidimensional chunking only available when selector ",
+    #           "values provided.")
+    #    }
+    if (is.null(dim(selectors))) {
+      stop("Multidimensional chunking only available when multidimensional ",
+           "selector values provided.")
+    }
+    if (length(dim(selectors)) != length(chunk)) {
+      stop("As many chunk indices and chunk lengths as dimensions in the ",
+           "multidimensional selector array must be specified.")
+    }
+    old_indices <- attr(selectors, 'indices')
+    old_values <- attr(selectors, 'values')
+    selectors <- ClimProjDiags::Subset(selectors, names(chunk),
+                                       lapply(names(chunk),
+                                              function(x) {
+                                                n_indices <- dim(selectors)[x]
+                                                chunk_sizes <- rep(floor(n_indices / n_chunks[x]), n_chunks[x])
+                                                chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks[x]
+                                                if (chunks_to_extend > 0) {
+                                                  chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1
+                                                }
+                                                chunk_size <- chunk_sizes[chunk[x]]
+                                                offset <- 0
+                                                if (chunk[x] > 1) {
+                                                  offset <- sum(chunk_sizes[1:(chunk[x] - 1)])
+                                                }
+                                                1:chunk_sizes[chunk[x]] + offset
+                                              }))
+    attr(selectors, 'indices') <- old_indices
+    attr(selectors, 'values') <- old_values
+  }
+  selectors
+}
+
+.ReplaceVariablesInString <- function(string, replace_values, allow_undefined_key_vars = FALSE) {
+  # This function replaces all the occurrences of a variable in a string by
+  # their corresponding string stored in replace_values.
+  if (length(strsplit(string, "\\$")[[1]]) > 1) {
+    parts <- strsplit(string, "\\$")[[1]]
+    output <- ""
+    i <- 0
+    for (part in parts) {
+      if (i %% 2 == 0) {
+        output <- paste(output, part, sep = "")
+      } else {
+        if (part %in% names(replace_values)) {
+          output <- paste(output, .ReplaceVariablesInString(replace_values[[part]], replace_values, allow_undefined_key_vars), sep = "")
+        } else if (allow_undefined_key_vars) {
+          output <- paste0(output, "$", part, "$")
+        } else {
+          stop(paste('Error: The variable $', part, '$ was not defined in the configuration file.', sep = ''))
+        }
+      }
+      i <- i + 1
+    }
+    output
+  } else {
+    string
+  }
+}
+
+.ReplaceGlobExpressions <- function(path_with_globs, actual_path,
+                                    replace_values, tags_to_keep,
+                                    dataset_name, permissive) {
+  # The goal of this function is to replace the shell globbing expressions in
+  # a path pattern (that may contain shell globbing expressions and Load()
+  # tags) by the corresponding part of the real existing path.
+  # What is actually done is to replace all the values of the tags in the
+  # actual path by the corresponding $TAG$.
+  #
+  # It takes mainly two inputs. The path with expressions and tags, e.g.:
+  #   /data/experiments/*/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_*$START_DATE$*.nc
+  # and a complete known path to one of the matching files, e.g.:
+  #   /data/experiments/ecearth/i00k/tos/tos_fc0-1_19901101_199011-199110.nc
+  # and it returns the path pattern but without shell globbing expressions:
+  #   /data/experiments/ecearth/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_fc0-1_$START_DATE$_199011-199110.nc
+  #
+  # To do that, it also needs as input the list of replace values (the
+  # association of each tag to its value).
+  #
+  # All the tags not present in the parameter tags_to_keep will be replaced.
+  #
+  # Not all cases can be resolved with the implemented algorithm. In an
+  # unsolvable case a warning is given and one possible guess is returned.
+  #
+  # In some cases it is interesting to replace only the expressions in the
+  # path to the file, but not the ones in the file name itself. To keep the
+  # expressions in the file name, the parameter permissive can be set to
+  # TRUE. To replace all the expressions it can be set to FALSE.
+
+  # Tests
+  #a <- "/esarchive/exp/ecearth/a13c/3hourly/$var$_*/$var$_*-LR_historical_r1i1p1f1_gr_$chunk$.nc"
+  #b <- "/esarchive/exp/ecearth/a13c/3hourly/psl_f6h/psl_E3hrPt_EC-Earth3-LR_historical_r1i1p1f1_gr_195001010000-195001312100.nc"
+  #c <- list(dat = 'dat1', var = 'psl', chunk = '195001010000-195001312100')
+  #d <- c('dat', 'var', 'chunk')
+  #e <- 'dat1'
+  #f <- FALSE #TRUE/0/1/2/3
+  #r <- .ReplaceGlobExpressions(a, b, c, d, e, f)
+
+  clean <- function(x) {
+    if (nchar(x) > 0) {
+      x <- gsub('\\\\', '', x)
+      x <- gsub('\\^', '', x)
+      x <- gsub('\\$', '', x)
+      x <- unname(sapply(strsplit(x, '[', fixed = TRUE)[[1]], function(y) gsub('.*]', '.', y)))
+      do.call(paste0, as.list(x))
+    } else {
+      x
+    }
+  }
+
+  strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse = "")
+
+  if (permissive == 0) {
+    permissive <- FALSE
+  } else {
+    if (permissive == TRUE) {
+      permissive_levels <- 1
+    } else {
+      permissive_levels <- round(permissive[1])
+      permissive <- TRUE
+    }
+  }
+
+  if (permissive) {
+    actual_path_chunks <- strsplit(actual_path, '/')[[1]]
+    if (permissive_levels >= length(actual_path_chunks)) {
+      stop("Error: Provided levels out of scope in parameter 'permissive'.")
+    }
+    permissive_levels <- 1:permissive_levels
+    permissive_levels <- length(actual_path_chunks) - (rev(permissive_levels) - 1)
+    actual_path <- paste(actual_path_chunks[-permissive_levels], collapse = '/')
+    file_name <- paste(actual_path_chunks[permissive_levels], collapse = '/')
+    if (length(actual_path_chunks) > 1) {
+      file_name <- paste0('/', file_name)
+    }
+    path_with_globs_chunks <- strsplit(path_with_globs, '/')[[1]]
+    path_with_globs <- paste(path_with_globs_chunks[-permissive_levels],
+                             collapse = '/')
+    path_with_globs_no_tags <- .ReplaceVariablesInString(path_with_globs, replace_values)
+    file_name_with_globs <- paste(path_with_globs_chunks[permissive_levels], collapse = '/')
+    if (length(path_with_globs_chunks) > 1) {
+      file_name_with_globs <- paste0('/', file_name_with_globs)
+    }
+    right_known <- head(strsplit(file_name_with_globs, '*', fixed = TRUE)[[1]], 1)
+    right_known_no_tags <- .ReplaceVariablesInString(right_known, replace_values)
+    path_with_globs_no_tags_rx <- utils::glob2rx(paste0(path_with_globs_no_tags, right_known_no_tags))
+    match <- regexpr(gsub('$', '', 
path_with_globs_no_tags_rx, fixed = TRUE), paste0(actual_path, file_name)) + if (match != 1) { + stop("Incorrect parameters to replace glob expressions. The path with expressions does not match the actual path.") + } + #if (attr(match, 'match.length') - nchar(right_known_no_tags) < nchar(actual_path)) { + # path_with_globs_no_tags <- paste0(path_with_globs_no_tags, right_known_no_tags, '*') + # file_name_with_globs <- sub(right_known, '/*', file_name_with_globs) + #} + } + path_with_globs_rx <- utils::glob2rx(path_with_globs) + values_to_replace <- c() + tags_to_replace_starts <- c() + tags_to_replace_ends <- c() + give_warning <- FALSE + for (tag in tags_to_keep) { + matches <- gregexpr(paste0('$', tag, '$'), path_with_globs_rx, fixed = TRUE)[[1]] + lengths <- attr(matches, 'match.length') + if (!(length(matches) == 1 && matches[1] == -1)) { + for (i in 1:length(matches)) { + left <- NULL + if (matches[i] > 1) { + left <- .ReplaceVariablesInString(substr(path_with_globs_rx, 1, matches[i] - 1), replace_values) + left_known <- strReverse(head(strsplit(strReverse(left), strReverse('.*'), fixed = TRUE)[[1]], 1)) + } + right <- NULL + if ((matches[i] + lengths[i] - 1) < nchar(path_with_globs_rx)) { + right <- .ReplaceVariablesInString(substr(path_with_globs_rx, matches[i] + lengths[i], nchar(path_with_globs_rx)), replace_values) + right_known <- head(strsplit(right, '.*', fixed = TRUE)[[1]], 1) + } + final_match <- NULL + match_limits <- NULL + if (!is.null(left)) { + left_match <- regexpr(paste0(left, replace_values[[tag]], right_known), actual_path) + match_len <- attr(left_match, 'match.length') + + right_known_nchar <- nchar(clean(right_known)) + if (identical(right_known_nchar, integer(0))) right_known_nchar <- 0 + left_match_limits <- c(left_match + match_len - 1 - right_known_nchar - nchar(replace_values[[tag]]) + 1, + left_match + match_len - 1 - right_known_nchar) + + if (!(left_match < 1)) { + match_limits <- left_match_limits + } + } + right_match <- NULL + if (!is.null(right)) { + right_match <- regexpr(paste0(left_known, replace_values[[tag]], right), actual_path) + match_len <- attr(right_match, 'match.length') + + left_known_nchar <- nchar(clean(left_known)) + if (identical(left_known_nchar, integer(0))) left_known_nchar <- 0 + right_match_limits <- c(right_match + left_known_nchar, + right_match + left_known_nchar + nchar(replace_values[[tag]]) - 1) + if (is.null(match_limits) && !(right_match < 1)) { + match_limits <- right_match_limits + } + } + if (!is.null(right_match) && !is.null(left_match)) { + if (!identical(right_match_limits, left_match_limits)) { + give_warning <- TRUE + } + } + if (is.null(match_limits)) { + stop("Too complex path pattern specified for ", dataset_name, + ". 
Specify a simpler path pattern for this dataset.") + } + values_to_replace <- c(values_to_replace, tag) + tags_to_replace_starts <- c(tags_to_replace_starts, match_limits[1]) + tags_to_replace_ends <- c(tags_to_replace_ends, match_limits[2]) + } + } + } + + actual_path_with_tags <- actual_path + if (length(tags_to_replace_starts) > 0) { + reorder <- sort(tags_to_replace_starts, index.return = TRUE) + tags_to_replace_starts <- reorder$x + values_to_replace <- values_to_replace[reorder$ix] + tags_to_replace_ends <- tags_to_replace_ends[reorder$ix] + while (length(values_to_replace) > 0) { + actual_path_with_tags <- paste0(substr(actual_path_with_tags, 1, head(tags_to_replace_starts, 1) - 1), + '$', head(values_to_replace, 1), '$', + substr(actual_path_with_tags, head(tags_to_replace_ends, 1) + 1, nchar(actual_path_with_tags))) + extra_chars <- nchar(head(values_to_replace, 1)) + 2 - (head(tags_to_replace_ends, 1) - head(tags_to_replace_starts, 1) + 1) + values_to_replace <- values_to_replace[-1] + tags_to_replace_starts <- tags_to_replace_starts[-1] + tags_to_replace_ends <- tags_to_replace_ends[-1] + tags_to_replace_starts <- tags_to_replace_starts + extra_chars + tags_to_replace_ends <- tags_to_replace_ends + extra_chars + } + } + + if (give_warning) { + .warning(paste0("Too complex path pattern specified for ", dataset_name, + ". Double check carefully the '$Files' fetched for this dataset or specify a simpler path pattern.")) + } + + if (permissive) { + paste0(actual_path_with_tags, file_name_with_globs) + } else { + actual_path_with_tags + } +} + +.FindTagValue <- function(path_with_globs_and_tag, actual_path, tag) { + + addition_warning = FALSE + + if (!all(sapply(c(path_with_globs_and_tag, actual_path, tag), is.character))) { + stop("All 'path_with_globs_and_tag', 'actual_path' and 'tag' must be character strings.") + } + + if (grepl('$', tag, fixed = TRUE)) { + stop("The provided 'tag' must not contain '$' symbols.") + } + full_tag <- paste0('$', tag, '$') + + if (!grepl(full_tag, path_with_globs_and_tag, fixed = TRUE)) { + stop("The provided 'path_with_globs_and_tag' must contain the tag in 'tag' surrounded by '$' symbols.") + } + + parts <- strsplit(path_with_globs_and_tag, full_tag, fixed = TRUE)[[1]] + if (length(parts) == 1) { + parts <- c(parts, '') + } + parts[1] <- paste0('^', parts[1]) + parts[length(parts)] <- paste0(parts[length(parts)], '$') + + # Group the parts in 2 groups, in a way that both groups have a number + # of characters as similar as possible. + part_lengths <- sapply(parts, nchar) + group_len_diffs <- sapply(1:(length(parts) - 1), + function(x) { + sum(part_lengths[(x + 1):length(parts)]) - sum(part_lengths[1:x]) + } + ) + clp <- chosen_left_part <- which.min(group_len_diffs)[1] + + left_expr <- paste(parts[1:clp], collapse = full_tag) + + #because ? means sth, use . (any char) to substitute ? + left_expr <- gsub('?', '.', left_expr, fixed = TRUE) + test_left_expr <- left_expr + + # because * means zero or more char, use . to substitute *. + # And the * behind . means zero or more char. '?' for lazy evaluation. + left_expr <- gsub('*', '.*?', left_expr, fixed = TRUE) + left_expr <- gsub(full_tag, '.*?', left_expr, fixed = TRUE) + + # To test if the pattern matches only one... 
don't use lazy evaluation
+  test_left_expr <- gsub('*', '.*', test_left_expr, fixed = TRUE)
+  test_left_expr <- gsub(full_tag, '.*', test_left_expr, fixed = TRUE)
+
+  # Find the match chars from the left
+  left_match <- regexec(left_expr, actual_path)[[1]]
+  test_left_match <- regexec(test_left_expr, actual_path)[[1]]
+
+  if (left_match < 0) {
+    stop("Unexpected error in .FindTagValue.")
+  }
+
+  if (attr(test_left_match, "match.length") != attr(left_match, "match.length")) {
+    addition_warning = TRUE
+    warning("Detected more than one possibility derived from the glob expression of the path.")
+  }
+
+  # Cut down the left match part
+  actual_path_sub <- substr(actual_path,
+                            attr(left_match, 'match.length') + 1,
+                            nchar(actual_path))
+
+  #----------Search match chars from the right
+  right_expr <- paste(parts[(clp + 1):(length(parts))], collapse = full_tag)
+  right_expr <- gsub('?', '.', right_expr, fixed = TRUE)
+
+  test_right_expr <- right_expr
+  # For lazy evaluation to work, pattern and string have to be reversed.
+  right_expr <- gsub('*', '.*?', right_expr, fixed = TRUE)
+  right_expr <- gsub(full_tag, '.*?', right_expr, fixed = TRUE)
+  right_expr <- gsub('$', '^', right_expr, fixed = TRUE)
+
+  # To test if the pattern matches only one... don't use lazy evaluation
+  test_right_expr <- gsub('*', '.*', test_right_expr, fixed = TRUE)
+  test_right_expr <- gsub(full_tag, '.*', test_right_expr, fixed = TRUE)
+  test_right_expr <- gsub('$', '^', test_right_expr, fixed = TRUE)
+
+  rev_str <- function(s) {
+    paste(rev(strsplit(s, NULL)[[1]]), collapse = '')
+  }
+
+  right_expr <- rev_str(right_expr)
+  test_right_expr <- rev_str(test_right_expr)
+
+  right_expr <- gsub('?*.', '.*?', right_expr, fixed = TRUE)
+  right_match <- regexec(right_expr, rev_str(actual_path))[[1]]
+
+  test_right_expr <- gsub('*.', '.*', test_right_expr, fixed = TRUE)
+  test_right_match <- regexec(test_right_expr, rev_str(actual_path_sub))[[1]]
+
+  if (right_match < 0) {
+    stop("Unexpected error in .FindTagValue.")
+  }
+
+  if (attr(test_right_match, "match.length") != attr(right_match, "match.length")) {
+    addition_warning = TRUE
+    warning(paste0("Detected more than one possibility derived from the glob ",
+                   "expression of the path."))
+  }
+
+  #-------------Get the tag value
+  right_match[] <- nchar(actual_path) -
+    (right_match[] + attr(right_match, 'match.length') - 1) + 1
+
+  if (addition_warning) {
+    warning(paste0("The extracted parameter ", full_tag, " is ",
+                   substr(actual_path, left_match + attr(left_match, 'match.length'),
+                          right_match - 1),
+                   ". Check if all the desired files were read in. ",
+                   "If not, specify parameter '", tag,
+                   "' by values instead of indices, or set parameter ",
+                   "'path_glob_permissive' to TRUE."))
+  }
+
+  if ((left_match + attr(left_match, 'match.length')) >
+        (right_match - 1)) {
+    NULL
+  } else {
+    substr(actual_path, left_match + attr(left_match, 'match.length'),
+           right_match - 1)
+  }
+}
+
+.message <- function(...) {
+  # Function to use the 'message' R function with our custom settings
+  # Default: new line at end of message, indent to 0, exdent to 3,
+  # collapse to \n*
+  args <- list(...)
+
+  ## In case we need to specify message arguments
+  if (!is.null(args[["appendLF"]])) {
+    appendLF <- args[["appendLF"]]
+  } else {
+    ## Default value in message function
+    appendLF <- TRUE
+  }
+  if (!is.null(args[["domain"]])) {
+    domain <- args[["domain"]]
+  } else {
+    ## Default value in message function
+    domain <- NULL
+  }
+  args[["appendLF"]] <- NULL
+  args[["domain"]] <- NULL
+
+  ## To modify strwrap indent and exdent arguments
+  if (!is.null(args[["indent"]])) {
+    indent <- args[["indent"]]
+  } else {
+    indent <- 0
+  }
+  if (!is.null(args[["exdent"]])) {
+    exdent <- args[["exdent"]]
+  } else {
+    exdent <- 3
+  }
+  args[["indent"]] <- NULL
+  args[["exdent"]] <- NULL
+
+  ## To modify paste collapse argument
+  if (!is.null(args[["collapse"]])) {
+    collapse <- args[["collapse"]]
+  } else {
+    collapse <- "\n*"
+  }
+  args[["collapse"]] <- NULL
+
+  ## Message tag
+  if (!is.null(args[["tag"]])) {
+    tag <- args[["tag"]]
+  } else {
+    tag <- "* "
+  }
+  args[["tag"]] <- NULL
+
+  message(paste0(tag, paste(strwrap(
+    args, indent = indent, exdent = exdent
+  ), collapse = collapse)), appendLF = appendLF, domain = domain)
+}
+
+.warning <- function(...) {
+  # Function to use the 'warning' R function with our custom settings
+  # Default: no call information, indent to 0, exdent to 3,
+  # collapse to \n
+  args <- list(...)
+
+  ## In case we need to specify warning arguments
+  if (!is.null(args[["call."]])) {
+    call <- args[["call."]]
+  } else {
+    ## Default: don't show info about the call where the warning came up
+    call <- FALSE
+  }
+  if (!is.null(args[["immediate."]])) {
+    immediate <- args[["immediate."]]
+  } else {
+    ## Default value in warning function
+    immediate <- FALSE
+  }
+  if (!is.null(args[["noBreaks."]])) {
+    noBreaks <- args[["noBreaks."]]
+  } else {
+    ## Default value in warning function
+    noBreaks <- FALSE
+  }
+  if (!is.null(args[["domain"]])) {
+    domain <- args[["domain"]]
+  } else {
+    ## Default value in warning function
+    domain <- NULL
+  }
+  args[["call."]] <- NULL
+  args[["immediate."]] <- NULL
+  args[["noBreaks."]] <- NULL
+  args[["domain"]] <- NULL
+
+  ## To modify strwrap indent and exdent arguments
+  if (!is.null(args[["indent"]])) {
+    indent <- args[["indent"]]
+  } else {
+    indent <- 0
+  }
+  if (!is.null(args[["exdent"]])) {
+    exdent <- args[["exdent"]]
+  } else {
+    exdent <- 3
+  }
+  args[["indent"]] <- NULL
+  args[["exdent"]] <- NULL
+
+  ## To modify paste collapse argument
+  if (!is.null(args[["collapse"]])) {
+    collapse <- args[["collapse"]]
+  } else {
+    collapse <- "\n!"
+  }
+  args[["collapse"]] <- NULL
+
+  ## Warning tag
+  if (!is.null(args[["tag"]])) {
+    tag <- args[["tag"]]
+  } else {
+    tag <- "! Warning: "
+  }
+  args[["tag"]] <- NULL
+
+  warning(paste0(tag, paste(strwrap(
+    args, indent = indent, exdent = exdent
+  ), collapse = collapse)), call. = call, immediate. = immediate,
+    noBreaks. = noBreaks, domain = domain)
+}
+
+# Function to permute arrays of non-atomic elements (e.g. POSIXct)
+.aperm2 <- function(x, new_order) {
+  old_dims <- dim(x)
+  attr_bk <- attributes(x)
+  if ('dim' %in% names(attr_bk)) {
+    attr_bk[['dim']] <- NULL
+  }
+  if (is.numeric(x)) {
+    x <- aperm(x, new_order)
+  } else {
+    y <- array(1:length(x), dim = dim(x))
+    y <- aperm(y, new_order)
+    x <- x[as.vector(y)]
+  }
+  dim(x) <- old_dims[new_order]
+  attributes(x) <- c(attributes(x), attr_bk)
+  x
+}
+
+# Function to bind arrays of non-atomic elements (e.g. 
POSIXct) +# 'x' and 'y' must have dimension names +# parameter 'along' must be a dimension name +.abind2 <- function(x, y, along) { + x_along <- which(names(dim(x)) == along) + if (x_along != length(dim(x))) { + tmp_order_x <- c((1:length(dim(x)))[-x_along], x_along) + x <- .aperm2(x, tmp_order_x) + } + y_along <- which(names(dim(y)) == along) + if (y_along != length(dim(y))) { + tmp_order_y <- c((1:length(dim(y)))[-y_along], y_along) + y <- .aperm2(y, tmp_order_y) + } + r <- c(x, y) + new_dims <- dim(x) + new_dims[length(new_dims)] <- dim(x)[length(dim(x))] + dim(y)[length(dim(y))] + dim(r) <- new_dims + if (x_along != length(dim(x))) { + final_order <- NULL + if (x_along > 1) { + final_order <- c(final_order, (1:length(dim(r)))[1:(x_along - 1)]) + } + final_order <- c(final_order, length(dim(r))) + final_order <- c(final_order, (1:length(dim(r)))[x_along:(length(dim(r)) - 1)]) + r <- .aperm2(r, final_order) + } + r +} + +# This function is a helper for the function .MergeArrays. +# It expects as inputs two named numeric vectors, and it extends them +# with dimensions of length 1 until an ordered common dimension +# format is reached. +# The first output is dims1 extended with 1s. +# The second output is dims2 extended with 1s. +# The third output is a merged dimension vector. If dimensions with +# the same name are found in the two inputs, and they have a different +# length, the maximum is taken. +.MergeArrayDims <- function(dims1, dims2) { + new_dims1 <- c() + new_dims2 <- c() + while (length(dims1) > 0) { + if (names(dims1)[1] %in% names(dims2)) { + pos <- which(names(dims2) == names(dims1)[1]) + dims_to_add <- rep(1, pos - 1) + if (length(dims_to_add) > 0) { + names(dims_to_add) <- names(dims2[1:(pos - 1)]) + } + new_dims1 <- c(new_dims1, dims_to_add, dims1[1]) + new_dims2 <- c(new_dims2, dims2[1:pos]) + dims1 <- dims1[-1] + dims2 <- dims2[-c(1:pos)] + } else { + new_dims1 <- c(new_dims1, dims1[1]) + new_dims2 <- c(new_dims2, 1) + names(new_dims2)[length(new_dims2)] <- names(dims1)[1] + dims1 <- dims1[-1] + } + } + if (length(dims2) > 0) { + dims_to_add <- rep(1, length(dims2)) + names(dims_to_add) <- names(dims2) + new_dims1 <- c(new_dims1, dims_to_add) + new_dims2 <- c(new_dims2, dims2) + } + list(new_dims1, new_dims2, pmax(new_dims1, new_dims2)) +} + +# This function takes two named arrays and merges them, filling with +# NA where needed. 
+# dim(array1)
+#  'b'  'c'  'e'  'f'
+#   1    3    7    9
+# dim(array2)
+#  'a'  'b'  'd'  'f'  'g'
+#   2    3    5    9   11
+# dim(.MergeArrays(array1, array2, 'b'))
+#  'a'  'b'  'c'  'e'  'd'  'f'  'g'
+#   2    4    3    7    5    9   11
+.MergeArrays <- function(array1, array2, along) {
+  if (!(is.null(array1) || is.null(array2))) {
+    if (!(identical(names(dim(array1)), names(dim(array2))) &&
+          identical(dim(array1)[-which(names(dim(array1)) == along)],
+                    dim(array2)[-which(names(dim(array2)) == along)]))) {
+      new_dims <- .MergeArrayDims(dim(array1), dim(array2))
+      dim(array1) <- new_dims[[1]]
+      dim(array2) <- new_dims[[2]]
+      for (j in 1:length(dim(array1))) {
+        if (names(dim(array1))[j] != along) {
+          if (dim(array1)[j] != dim(array2)[j]) {
+            if (which.max(c(dim(array1)[j], dim(array2)[j])) == 1) {
+              na_array_dims <- dim(array2)
+              na_array_dims[j] <- dim(array1)[j] - dim(array2)[j]
+              na_array <- array(dim = na_array_dims)
+              array2 <- abind(array2, na_array, along = j)
+              names(dim(array2)) <- names(na_array_dims)
+            } else {
+              na_array_dims <- dim(array1)
+              na_array_dims[j] <- dim(array2)[j] - dim(array1)[j]
+              na_array <- array(dim = na_array_dims)
+              array1 <- abind(array1, na_array, along = j)
+              names(dim(array1)) <- names(na_array_dims)
+            }
+          }
+        }
+      }
+    }
+    if (!(along %in% names(dim(array2)))) {
+      stop("The dimension specified in 'along' is not present in the ",
+           "provided arrays.")
+    }
+    array1 <- abind(array1, array2, along = which(names(dim(array1)) == along))
+    names(dim(array1)) <- names(dim(array2))
+  } else if (is.null(array1)) {
+    array1 <- array2
+  }
+  array1
+}
+
+# Takes as input a list of arrays. The list must have named dimensions.
+.MergeArrayOfArrays <- function(array_of_arrays) {
+  MergeArrays <- .MergeArrays
+  array_dims <- dim(array_of_arrays)
+  dim_names <- names(array_dims)
+
+  # Merge the chunks.
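+  # Sketch of the merging order (assumed 2x3 chunk grid, not from the patch):
+  # with dim(array_of_arrays) = c(lat = 2, lon = 3), the first pass merges
+  # pairs of chunks along 'lat', leaving 3 pieces along 'lon'; the second
+  # pass merges those 3 pieces along 'lon' into the final single array.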
+  for (dim_index in 1:length(dim_names)) {
+    dim_sub_array_of_chunks <- dim_sub_array_of_chunk_indices <- NULL
+    if (dim_index < length(dim_names)) {
+      dim_sub_array_of_chunks <- array_dims[(dim_index + 1):length(dim_names)]
+      names(dim_sub_array_of_chunks) <- dim_names[(dim_index + 1):length(dim_names)]
+      dim_sub_array_of_chunk_indices <- dim_sub_array_of_chunks
+      sub_array_of_chunk_indices <- array(1:prod(dim_sub_array_of_chunk_indices),
+                                          dim_sub_array_of_chunk_indices)
+    } else {
+      sub_array_of_chunk_indices <- NULL
+    }
+    sub_array_of_chunks <- vector('list', prod(dim_sub_array_of_chunks))
+    dim(sub_array_of_chunks) <- dim_sub_array_of_chunks
+    for (i in 1:prod(dim_sub_array_of_chunks)) {
+      if (!is.null(sub_array_of_chunk_indices)) {
+        chunk_sub_indices <- which(sub_array_of_chunk_indices == i, arr.ind = TRUE)[1, ]
+      } else {
+        chunk_sub_indices <- NULL
+      }
+      for (j in 1:(array_dims[dim_index])) {
+        new_chunk <- do.call('[[', c(list(x = array_of_arrays),
+                                     as.list(c(j, chunk_sub_indices))))
+        if (is.null(new_chunk)) {
+          stop("Chunks missing.")
+        }
+        if (is.null(sub_array_of_chunks[[i]])) {
+          sub_array_of_chunks[[i]] <- new_chunk
+        } else {
+          sub_array_of_chunks[[i]] <- MergeArrays(sub_array_of_chunks[[i]],
+                                                  new_chunk,
+                                                  dim_names[dim_index])
+        }
+      }
+    }
+    array_of_arrays <- sub_array_of_chunks
+    rm(sub_array_of_chunks)
+    gc()
+  }
+
+  array_of_arrays[[1]]
+}
+
+.MergeChunks <- function(shared_dir, suite_id, remove) {
+  MergeArrays <- .MergeArrays
+
+  args <- NULL
+  shared_dir <- paste0(shared_dir, '/STARTR_CHUNKING_', suite_id)
+
+  all_chunk_files_original <- list.files(paste0(shared_dir, '/'), '.*\\.Rds$')
+  all_chunk_files <- gsub('\\.Rds$', '', all_chunk_files_original)
+  chunk_filename_parts_all_components <- strsplit(all_chunk_files, '__')
+  all_components <- sapply(chunk_filename_parts_all_components, '[[', 1)
+  components <- unique(all_components)
+  result <- vector('list', length(components))
+  names(result) <- components
+  for (component in components) {
+    chunk_files_original <- all_chunk_files_original[which(all_components == component)]
+    chunk_filename_parts <- chunk_filename_parts_all_components[which(all_components == component)]
+    chunk_filename_parts <- lapply(chunk_filename_parts, '[', -1)
+    if (length(unique(sapply(chunk_filename_parts, length))) != 1) {
+      stop("Detected chunks with more dimensions than others.")
+    }
+    dim_names <- sapply(chunk_filename_parts[[1]],
+                        # TODO: strsplit by the last '_' match, not the first.
+                        function(x) strsplit(x, '_')[[1]][1])
+    # TODO check all files have exactly the same dimnames
+    found_chunk_indices <- sapply(chunk_filename_parts,
+                                  function(x) as.numeric(sapply(strsplit(x, '_'), '[[', 2)))
+    found_chunk_indices <- array(found_chunk_indices,
+                                 dim = c(length(dim_names),
+                                         length(found_chunk_indices) / length(dim_names)))
+    found_chunks_str <- apply(found_chunk_indices, 2, paste, collapse = '_')
+
+    if (length(args) > 0) {
+      if ((length(args) %% 2) != 0) {
+        stop("Wrong number of parameters.")
+      }
+      expected_dim_names <- args[(1:(length(args) / 2) - 1) * 2 + 1]
+      if (any(!is.character(expected_dim_names))) {
+        stop("Expected dimension names in parameters at odd positions.")
+      }
+      dim_indices <- args[(1:(length(args) / 2) - 1) * 2 + 2]
+      if (!any(dim_indices == 'all')) {
+        stop("Expected one dimension index to be 'all'.")
+      }
+      dim_to_merge <- which(dim_indices == 'all')
+      if (length(dim_indices) > 1) {
+        if (!all(is.numeric(dim_indices[-dim_to_merge]))) {
+          stop("Expected all dimension indices but one to be numeric.")
+        }
+      }
+      # Check expected dim names match dim names
+      ## TODO
+      # Merge indices that vary along dim_to_merge whereas others are fixed by dim_indices
+      # REMOVE FILES
+      ## TODO
+      stop("Feature not implemented.")
+    } else {
+      chunks_indices <- 1:length(dim_names)
+      chunks_indices <- lapply(chunks_indices, function(x) sort(unique(found_chunk_indices[x, ])))
+      names(chunks_indices) <- dim_names
+
+      # Load all found chunks into the array 'array_of_chunks'.
+      array_dims <- sapply(chunks_indices, length)
+      names(array_dims) <- dim_names
+      array_of_chunks <- vector('list', prod(array_dims))
+      dim(array_of_chunks) <- array_dims
+      array_of_chunks_indices <- array(1:prod(array_dims), array_dims)
+      for (i in 1:prod(array_dims)) {
+        chunk_indices <- which(array_of_chunks_indices == i, arr.ind = TRUE)[1, ]
+        j <- 1
+        chunk_indices_on_file <- sapply(chunk_indices,
+                                        function(x) {
+                                          r <- chunks_indices[[j]][x]
+                                          j <<- j + 1
+                                          r
+                                        })
+        found_chunk <- which(found_chunks_str == paste(chunk_indices_on_file,
+                                                       collapse = '_'))[1]
+        if (length(found_chunk) > 0) {
+          num_tries <- 5
+          found <- FALSE
+          try_num <- 1
+          while ((try_num <= num_tries) && !found) {
+            array_of_chunks[[i]] <- try({
+              readRDS(paste0(shared_dir, '/',
+                             chunk_files_original[found_chunk]))
+            })
+            if (is(array_of_chunks[[i]], 'try-error')) {
+              message("Waiting for an incomplete file transfer...")
+              Sys.sleep(5)
+            } else {
+              found <- TRUE
+            }
+            try_num <- try_num + 1
+          }
+          if (!found) {
+            stop("Could not open one of the chunks. Might be a large chunk ",
+                 "in transfer. Merge aborted, files have been preserved.")
+          }
+        }
+      }
+
+      result[[component]] <- .MergeArrayOfArrays(array_of_chunks)
+      rm(array_of_chunks)
+      gc()
+    }
+  }
+
+  if (remove) {
+    sapply(all_chunk_files_original,
+           function(x) {
+             file.remove(paste0(shared_dir, '/', x))
+           })
+  }
+
+  result
+}
+
+.KnownLonNames <- function() {
+  known_lon_names <- c('lon', 'longitude', 'x', 'i', 'nav_lon')
+}
+
+.KnownLatNames <- function() {
+  known_lat_names <- c('lat', 'latitude', 'y', 'j', 'nav_lat')
+}
+
+.ReplaceElementInVector <- function(x, target, new_val) {
+  # x is a named vector
+  # target is a string
+  # new_val is a named vector
+  # E.g., change [a = 2, b = 3] to [c = 1, d = 2, b = 3], then:
+  # x = c(a = 2, b = 3), target = 'a', new_val = c(c = 1, d = 2)
+  new_names <- unlist(lapply(as.list(names(x)), function(x) if (x == target) names(new_val) else x))
+  new_list <- vector('list', length = length(new_names))
+  for (i in 1:length(new_list)) {
+    new_list[[i]] <- c(new_val, x)[which(c(names(new_val), names(x)) == new_names[i])]
+  }
+  return(unlist(new_list))
+}
+
+.withWarnings <- function(expr) {
+  myWarnings <- NULL
+  wHandler <- function(w) {
+    myWarnings <<- c(myWarnings, list(w))
+    invokeRestart("muffleWarning")
+  }
+  val <- withCallingHandlers(expr, warning = wHandler)
+  list(value = val, warnings = myWarnings)
+}
+
+# This function writes startR_autosubmit.sh to the local startR_autosubmit folder, under expID/
+write_autosubmit_bash <- function(chunks, cluster, autosubmit_suite_dir) {
+  # "chunks" should be the argument "chunks" in Compute() plus the redundant margin dims,
+  # e.g., list(dat = 1, var = 1, sdate = 1, time = 1, lat = 2, lon = 3)
+
+  # Loop through chunks to create a load script for each
+  for (n_chunk in 0:(prod(unlist(chunks)) - 1)) {
+
+    # Create chunk args
+    chunk_names <- names(chunks)
+    chunk_args <- matrix(NA, 2, length(chunks))
+    chunk_args[1, ] <- paste0('%JOBS.CHUNK_', n_chunk, '.', chunk_names, '%')
+    chunk_args[2, ] <- paste0('%JOBS.CHUNK_', n_chunk, '.', chunk_names, '_N%')
+    chunk_args <- paste0('(', paste(c(chunk_args), collapse = ' '), ')')
+
+    bash_script_template <- file(system.file('chunking/Autosubmit/startR_autosubmit.sh',
+                                             package = 'startR'))
+    bash_script_lines <- readLines(bash_script_template)
+    close(bash_script_template)
+
+    # Rewrite chunk_args=
+    bash_script_lines <- gsub('^chunk_args=*', paste0('chunk_args=', chunk_args),
+                              bash_script_lines)
+    # Include init commands
+    bash_script_lines <- gsub('^include_init_commands',
+                              paste0(paste0(cluster[['init_commands']], collapse = '\n'), '\n'),
+                              bash_script_lines)
+    # Rewrite include_module_load
+    bash_script_lines <- gsub('^include_module_load',
+                              paste0('module load ', cluster[['r_module']]),
+                              bash_script_lines)
+    # Rewrite cd run_dir
+    # If run_dir is not specified, the script will run under ${proj_dir}
+    if (!is.null(cluster[['run_dir']])) {
+      bash_script_lines <- gsub('^cd_run_dir',
+                                paste0('cd ', cluster[['run_dir']]),
+                                bash_script_lines)
+    } else {
+      bash_script_lines <- gsub('^cd_run_dir', 'cd ${proj_dir}',
+                                bash_script_lines)
+    }
+
+    # Save the modified .sh file under local$PROJECT_PATH in expdef.yml
+    #NOTE: dest_dir is ecflow_suite_dir_suite in ByChunks_autosubmit()
+    #NOTE: the file will be copied to proj/ by "autosubmit create"
+    dest_dir <- file.path(autosubmit_suite_dir, paste0("STARTR_CHUNKING_", cluster$expid))
+
+    if (!file.exists(dest_dir)) {
+      dir.create(dest_dir, recursive = TRUE)
+    }
+    writeLines(bash_script_lines, paste0(dest_dir, '/startR_autosubmit_', n_chunk, '.sh'))
+  }
+}
+
+# This function generates the .yml files under autosubmit conf/
+write_autosubmit_confs <- function(chunks, cluster, autosubmit_suite_dir) {
+  # "chunks" is from Compute() input, e.g., chunks <- list(lat = 2, lon = 3)
+  # "cluster" is the argument "cluster" in Compute(), to set machine configuration
+  # "autosubmit_suite_dir" should be the local folder that has the R script, like ecflow_suite_dir in Compute()
+
+  # Get config template files from package
+  template_dir <- system.file('chunking/Autosubmit/', package = 'startR')
+  config_files <- list.files(template_dir, pattern = "*\\.yml$")
+
+  for (i_file in config_files) {
+
+    conf <- yaml::read_yaml(file.path(template_dir, i_file))
+    conf_type <- strsplit(i_file, split = "[.]")[[1]][1]
+
+############################################################
+    if (conf_type == "autosubmit") {
+
+      #Q: Should it be the total number of chunks?
+      conf$config$MAXWAITINGJOBS <- as.integer(prod(unlist(chunks)))  # total number of chunks
+      #NOTE: Nord3's maximum number of queued jobs is 366
+      if (conf$config$MAXWAITINGJOBS > 366) conf$config$MAXWAITINGJOBS <- 366
+      conf$config$TOTALJOBS <- as.integer(cluster$max_jobs)
+
+############################################################
+    } else if (conf_type == "expdef") {
+      conf$default$EXPID <- cluster$expid
+      conf$default$HPCARCH <- cluster$queue_host
+      # PROJECT_PATH should be where submit.sh and load....R are stored --> the local startR_autosubmit folder, under expID/
+      conf$local$PROJECT_PATH <- file.path(autosubmit_suite_dir, paste0("STARTR_CHUNKING_", cluster$expid))
+
+############################################################
+    } else if (conf_type == "jobs") {
+
+      chunks_vec <- lapply(lapply(chunks, seq, 1), rev)  # list(lat = 1:2, lon = 1:3)
+      chunk_df <- expand.grid(chunks_vec)
+      nchunks <- nrow(chunk_df)
+      chunk_name <- paste0("CHUNK_", 0:(nchunks - 1))
+
+      # Fill in common configurations
+      jobs <- conf$JOBS
+      # wallclock from '01:00:00' to '01:00'
+      jobs[[1]]$WALLCLOCK <- substr(cluster$job_wallclock, 1, 5)
+      jobs[[1]]$PLATFORM <- cluster$queue_host
+      jobs[[1]]$THREADS <- as.integer(cluster$cores_per_job)
+      jobs[[1]][paste0(names(chunks), "_N")] <- as.integer(unlist(chunks))
+      jobs[[1]][names(chunks)] <- ""
+
+      # Create chunks and fill in info for each chunk
+      if (nchunks > 1) {
+        jobs <- c(jobs, rep(jobs, nchunks - 1))
+        names(jobs) <- chunk_name
+      }
+      for (i_chunk in 1:nchunks) {
+        jobs[[i_chunk]][names(chunks)] <- chunk_df[i_chunk, ]
+        jobs[[i_chunk]]$FILE <- paste0('startR_autosubmit_', i_chunk - 1, '.sh')
+      }
+
+      conf$JOBS <- jobs
+
+############################################################
+    } else if (conf_type == "platforms") {
+      if (tolower(cluster$queue_host) != "local") {
+        conf$Platforms[[cluster$queue_host]]$USER <- cluster$hpc_user
+        conf$Platforms[[cluster$queue_host]]$PROCESSORS_PER_NODE <- as.integer(cluster$cores_per_job)
+        if (!is.null(cluster$extra_queue_params)) {
+          tmp <- unlist(cluster$extra_queue_params)
+          for (ii in 1:length(tmp)) {
+            tmp[ii] <- paste0('\"', tmp[ii], '\"')
+          }
+          conf$Platforms[[cluster$queue_host]]$CUSTOM_DIRECTIVES <- paste0('[ ', paste(tmp, collapse = ','), ' ]')
+        }
+      }
+
+############################################################
+    } else {
+      stop("File ", i_file, " is not considered in this function.")
+    }
+
+############################################################
+    # Output directory
+    dest_dir <- paste0("/esarchive/autosubmit/", cluster$expid, "/conf/")
+    dest_file <- paste0(conf_type, "_", cluster$expid, ".yml")
+
+    # Write config file inside autosubmit dir
+    yaml::write_yaml(conf, paste0(dest_dir, dest_file))
+    Sys.chmod(paste0(dest_dir, dest_file), mode = "755", use_umask = F)
+
+  }  # for loop each file
+}
diff --git a/modules/Loading/tmp/startR/R/indices.R b/modules/Loading/tmp/startR/R/indices.R
new file mode 100644
index 00000000..a7ad7a08
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/indices.R
@@ -0,0 +1,34 @@
+#'Specify dimension selectors with indices
+#'
+#'This is a helper function used in a Start() call to define the desired range
+#'of dimensions. It selects the indices of the coordinate variable from the
+#'original data. See details in the documentation of the parameter \code{\dots}
+#''indices to take' of the function Start().
+#'
+#'@param x A numeric vector or a list with two numerics to take all the
+#'  elements between the two specified indices (both extremes inclusive).
+#'@return Same as input, but with additional attributes 'indices', 'values', and
+#'  'chunk'.
+#'@examples
+#' # Used in Start():
+#' data_path <- system.file('extdata', package = 'startR')
+#' path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc')
+#' sdates <- c('200011', '200012')
+#' data <- Start(dat = list(list(path = path_obs)),
+#'               var = 'tos',
+#'               sdate = sdates,
+#'               time = 'all',
+#'               latitude = indices(1:2),
+#'               longitude = indices(list(2, 14)),
+#'               return_vars = list(latitude = 'dat',
+#'                                  longitude = 'dat',
+#'                                  time = 'sdate'),
+#'               retrieve = FALSE)
+#'@seealso \code{\link{values}}
+#'@export
+indices <- function(x) {
+  attr(x, 'indices') <- TRUE
+  attr(x, 'values') <- FALSE
+  attr(x, 'chunk') <- c(chunk = 1, n_chunks = 1)
+  x
+}
diff --git a/modules/Loading/tmp/startR/R/values.R b/modules/Loading/tmp/startR/R/values.R
new file mode 100644
index 00000000..592aa1d1
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/values.R
@@ -0,0 +1,35 @@
+#'Specify dimension selectors with actual values
+#'
+#'This is a helper function used in a Start() call to define the desired range
+#'of dimensions. It specifies the actual value to be matched with the
+#'coordinate variable. See details in the documentation of the parameter
+#'\code{\dots} 'indices to take' of the function Start().
+#'@param x A numeric vector or a list with two numerics to take all the
+#'  elements between the two specified values (both extremes inclusive).
+#'@return Same as input, but with additional attributes 'indices', 'values', and
+#'  'chunk'.
+#'@examples
+#' # Used in Start():
+#' data_path <- system.file('extdata', package = 'startR')
+#' path_obs <- file.path(data_path, 'obs/monthly_mean/$var$/$var$_$sdate$.nc')
+#' sdates <- c('200011', '200012')
+#' data <- Start(dat = list(list(path = path_obs)),
+#'               var = 'tos',
+#'               sdate = sdates,
+#'               time = 'all',
+#'               latitude = values(seq(-80, 80, 20)),
+#'               latitude_reorder = Sort(),
+#'               longitude = values(list(10, 300)),
+#'               longitude_reorder = CircularSort(0, 360),
+#'               return_vars = list(latitude = 'dat',
+#'                                  longitude = 'dat',
+#'                                  time = 'sdate'),
+#'               retrieve = FALSE)
+#'@seealso \code{\link{indices}}
+#'@export
+values <- function(x) {
+  attr(x, 'indices') <- FALSE
+  attr(x, 'values') <- TRUE
+  attr(x, 'chunk') <- c(chunk = 1, n_chunks = 1)
+  x
+}
diff --git a/modules/Loading/tmp/startR/R/zzz.R b/modules/Loading/tmp/startR/R/zzz.R
new file mode 100644
index 00000000..f1987463
--- /dev/null
+++ b/modules/Loading/tmp/startR/R/zzz.R
@@ -0,0 +1,1579 @@
+# Take *_var parameters apart
+take_var_params <- function(dim_params) {
+  # Take *_var parameters apart
+  var_params_ind <- grep('_var$', names(dim_params))
+  var_params <- dim_params[var_params_ind]
+  # Check all *_var are NULL or vectors of character strings, and
+  # that they all have a matching dimension param.
+  i <- 1
+  for (var_param in var_params) {
+    if (!is.character(var_param)) {
+      stop("All '*_var' parameters must be character strings.")
+    } else if (!any(grepl(paste0('^', strsplit(names(var_params)[i],
+                                               '_var$')[[1]][1], '$'),
+                          names(dim_params)))) {
+      stop(paste0("All '*_var' parameters must be associated to a dimension parameter. Found parameter '",
+                  names(var_params)[i], "' but no parameter '",
+                  strsplit(names(var_params)[i], '_var$')[[1]][1], "'."))
+    }
+    i <- i + 1
+  }
+  # Make the keys of 'var_params' to be the name of
+  # the corresponding dimension.
+  if (length(var_params) < 1) {
+    var_params <- NULL
+  } else {
+    names(var_params) <- gsub('_var$', '', names(var_params))
+  }
+  return(var_params)
+}
+
+# Take *_reorder parameters apart
+take_var_reorder <- function(dim_params) {
+  # Take *_reorder parameters apart
+  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
+  dim_reorder_params <- dim_params[dim_reorder_params_ind]
+  # Make the keys of 'dim_reorder_params' to be the name of
+  # the corresponding dimension.
+  if (length(dim_reorder_params) < 1) {
+    dim_reorder_params <- NULL
+  } else {
+    names(dim_reorder_params) <- gsub('_reorder$', '', names(dim_reorder_params))
+  }
+  return(dim_reorder_params)
+}
+
+# Take *_depends parameters apart
+take_var_depends <- function(dim_params) {
+  depends_params_ind <- grep('_depends$', names(dim_params))
+  depends_params <- dim_params[depends_params_ind]
+  # Check all *_depends are NULL or vectors of character strings, and
+  # that they all have a matching dimension param.
+  i <- 1
+  for (depends_param in depends_params) {
+    if (!is.character(depends_param) || (length(depends_param) > 1)) {
+      stop("All '*_depends' parameters must be single character strings.")
+    } else if (!any(grepl(paste0('^', strsplit(names(depends_params)[i],
+                                               '_depends$')[[1]][1], '$'),
+                          names(dim_params)))) {
+      stop(paste0("All '*_depends' parameters must be associated to a dimension parameter. Found parameter '",
+                  names(depends_params)[i], "' but no parameter '",
+                  strsplit(names(depends_params)[i], '_depends$')[[1]][1], "'."))
+    }
+    i <- i + 1
+  }
+  # Make the keys of 'depends_params' to be the name of
+  # the corresponding dimension.
+  if (length(depends_params) < 1) {
+    depends_params <- NULL
+  } else {
+    names(depends_params) <- gsub('_depends$', '', names(depends_params))
+  }
+  return(depends_params)
+}
+
+# Take *_across parameters apart
+take_var_across <- function(dim_params) {
+  across_params_ind <- grep('_across$', names(dim_params))
+  across_params <- dim_params[across_params_ind]
+  # Check all *_across are NULL or vectors of character strings, and
+  # that they all have a matching dimension param.
+  i <- 1
+  for (across_param in across_params) {
+    if (!is.character(across_param) || (length(across_param) > 1)) {
+      stop("All '*_across' parameters must be single character strings.")
+    } else if (!any(grepl(paste0('^', strsplit(names(across_params)[i],
+                                               '_across$')[[1]][1], '$'),
+                          names(dim_params)))) {
+      stop(paste0("All '*_across' parameters must be associated to a dimension parameter. Found parameter '",
+                  names(across_params)[i], "' but no parameter '",
+                  strsplit(names(across_params)[i], '_across$')[[1]][1], "'."))
+    }
+    i <- i + 1
+  }
+  # Make the keys of 'across_params' to be the name of
+  # the corresponding dimension.
+  if (length(across_params) < 1) {
+    across_params <- NULL
+  } else {
+    names(across_params) <- gsub('_across$', '', names(across_params))
+  }
+  return(across_params)
+}
+
+# Leave alone the dimension parameters in the variable dim_params
+rebuild_dim_params <- function(dim_params, merge_across_dims,
+                               inner_dims_across_files) {
+  var_params_ind <- grep('_var$', names(dim_params))
+  dim_reorder_params_ind <- grep('_reorder$', names(dim_params))
+  tolerance_params_ind <- grep('_tolerance$', names(dim_params))
+  depends_params_ind <- grep('_depends$', names(dim_params))
+  across_params_ind <- grep('_across$', names(dim_params))
+  # Leave alone the dimension parameters in the variable dim_params
+  if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind,
+               depends_params_ind, across_params_ind)) > 0) {
+    dim_params <- dim_params[-c(var_params_ind, dim_reorder_params_ind,
+                                tolerance_params_ind, depends_params_ind,
+                                across_params_ind)]
+    # Reallocating pairs of across file and inner dimensions if they have
+    # to be merged. They are put one next to the other to ease merge later.
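+    # Worked example (hypothetical dims, not from the patch): with parameters
+    # ordered as c('dat', 'var', 'sdate', 'time', 'lat') and time_across =
+    # 'sdate', the loop below yields c('dat', 'var', 'time', 'sdate', 'lat'),
+    # i.e. the inner dim 'time' is placed right before its file dim 'sdate'.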
+    if (merge_across_dims) {
+      if (any(!names(inner_dims_across_files) %in% names(dim_params)) |
+          any(!unlist(inner_dims_across_files) %in% names(dim_params)))
+        stop("All *_across parameters must have a file dimension name as value.")
+      for (inner_dim_across in names(inner_dims_across_files)) {
+        inner_dim_pos <- which(names(dim_params) == inner_dim_across)
+        file_dim_pos <- which(names(dim_params) == inner_dims_across_files[[inner_dim_across]])
+        new_pos <- inner_dim_pos
+        if (file_dim_pos < inner_dim_pos) {
+          new_pos <- new_pos - 1
+        }
+        dim_params_to_move <- dim_params[c(inner_dim_pos, file_dim_pos)]
+        dim_params <- dim_params[-c(inner_dim_pos, file_dim_pos)]
+        new_dim_params <- list()
+        if (new_pos > 1) {
+          new_dim_params <- c(new_dim_params, dim_params[1:(new_pos - 1)])
+        }
+        new_dim_params <- c(new_dim_params, dim_params_to_move)
+        if (length(dim_params) >= new_pos) {
+          new_dim_params <- c(new_dim_params, dim_params[new_pos:length(dim_params)])
+        }
+        dim_params <- new_dim_params
+      }
+    }
+  }
+  dim_names <- names(dim_params)
+  if (is.null(dim_names)) {
+    stop("At least one pattern dim must be specified.")
+  }
+  return(dim_params)
+}
+
+# Look for chunked dims
+look_for_chunks <- function(dim_params, dim_names) {
+  chunks <- vector('list', length(dim_names))
+  names(chunks) <- dim_names
+  for (dim_name in dim_names) {
+    if (!is.null(attr(dim_params[[dim_name]], 'chunk'))) {
+      chunks[[dim_name]] <- attr(dim_params[[dim_name]], 'chunk')
+      attributes(dim_params[[dim_name]]) <- attributes(dim_params[[dim_name]])[-which(names(attributes(dim_params[[dim_name]])) == 'chunk')]
+    } else {
+      chunks[[dim_name]] <- c(chunk = 1, n_chunks = 1)
+    }
+  }
+  return(chunks)
+}
+
+# This is a helper function to compute the chunk indices to take once the total
+# number of indices for a dimension has been discovered.
+get_chunk_indices <- function(n_indices, chunk, n_chunks, dim_name) {
+  if (n_chunks > n_indices) {
+    stop("Requested to divide dimension '", dim_name, "' of length ",
+         n_indices, " in ", n_chunks, " chunks, which is not possible.")
+  }
+  chunk_sizes <- rep(floor(n_indices / n_chunks), n_chunks)
+  chunks_to_extend <- n_indices - chunk_sizes[1] * n_chunks
+  if (chunks_to_extend > 0) {
+    chunk_sizes[1:chunks_to_extend] <- chunk_sizes[1:chunks_to_extend] + 1
+  }
+  chunk_size <- chunk_sizes[chunk]
+  offset <- 0
+  if (chunk > 1) {
+    offset <- sum(chunk_sizes[1:(chunk - 1)])
+  }
+  indices <- 1:chunk_sizes[chunk] + offset
+  array(indices, dim = setNames(length(indices), dim_name))
+}
+
+# Check pattern_dims
+# Function found_pattern_dims may change pattern_dims in the parent env.
+found_pattern_dims <- function(pattern_dims, dim_names, var_params,
+                               dim_params, dim_reorder_params) {
+  if (is.null(pattern_dims)) {
+    .warning(paste0("Parameter 'pattern_dims' not specified. Taking the first dimension, '",
+                    dim_names[1], "' as 'pattern_dims'."))
+    assign('pattern_dims', dim_names[1], envir = parent.frame())
+    pattern_dims <- dim_names[1]
+  } else if (is.character(pattern_dims) && (length(pattern_dims) > 0)) {
+    assign('pattern_dims', unique(pattern_dims), envir = parent.frame())
+    pattern_dims <- unique(pattern_dims)
+  } else {
+    stop("Parameter 'pattern_dims' must be a vector of character strings.")
+  }
+  if (any(names(var_params) %in% pattern_dims)) {
+    stop("'*_var' parameters specified for pattern dimensions. Remove or fix them.")
Remove or fix them.") + } + # Find the pattern dimension with the pattern specifications + found_pattern_dim <- NULL + for (pattern_dim in pattern_dims) { + # Check all specifications in pattern_dim are valid +# dat <- datasets <- dim_params[[pattern_dim]] + dat <- dim_params[[pattern_dim]] + if (is.null(dat) || !(is.character(dat) && all(nchar(dat) > 0)) && !is.list(dat)) { + stop(paste0("Parameter '", pattern_dim, + "' must be a list of lists with pattern specifications or a vector of character strings.")) + } + if (!is.null(dim_reorder_params[[pattern_dim]])) { + .warning(paste0("A reorder for the selectors of '", pattern_dim, + "' has been specified, but it is a pattern dimension and the reorder will be ignored.")) + } + if (is.list(dat) || any(sapply(dat, is.list))) { + if (is.null(found_pattern_dim)) { + found_pattern_dim <- pattern_dim + } else { + stop("Found more than one pattern dim with pattern specifications (list of lists). One and only one pattern dim must contain pattern specifications.") + } + } + } + if (is.null(found_pattern_dim)) { + .warning(paste0("Could not find any pattern dim with explicit data set descriptions (in the form of list of lists). Taking the first pattern dim, '", pattern_dims[1], "', as dimension with pattern specifications.")) + found_pattern_dim <- pattern_dims[1] + } + return(found_pattern_dim) +} + + +# The variable 'dat' is mounted with the information (name, path) of each dataset. +# NOTE: This function creates the object 'dat_names' in the parent env. +mount_dat <- function(dat, pattern_dims, found_pattern_dim, dat_names) { + +# dat_info_names <- c('name', 'path')#, 'nc_var_name', 'suffix', 'var_min', 'var_max', 'dimnames') + dat_to_fetch <- c() +# dat_names <- c() + if (!is.list(dat)) { + dat <- as.list(dat) + } else { + if (!any(sapply(dat, is.list))) { + dat <- list(dat) + } + } + for (i in 1:length(dat)) { + if (is.character(dat[[i]]) && length(dat[[i]]) == 1 && nchar(dat[[i]]) > 0) { + if (grepl('^(\\./|\\.\\./|/.*/|~/)', dat[[i]])) { + dat[[i]] <- list(path = dat[[i]]) + } else { + dat[[i]] <- list(name = dat[[i]]) + } + } else if (!is.list(dat[[i]])) { + stop(paste0("Parameter '", pattern_dims, + "' is incorrect. It must be a list of lists or character strings.")) + } + #if (!(all(names(dat[[i]]) %in% dat_info_names))) { + # stop("Error: parameter 'dat' is incorrect. There are unrecognized components in the information of some of the datasets. Check 'dat' in ?Load for details.") + #} + if (!('name' %in% names(dat[[i]]))) { + dat[[i]][['name']] <- paste0('dat', i) + if (!('path' %in% names(dat[[i]]))) { + stop(paste0("Parameter '", found_pattern_dim, + "' is incorrect. A 'path' should be provided for each dataset if no 'name' is provided.")) + } + } else if (!('path' %in% names(dat[[i]]))) { + dat_to_fetch <- c(dat_to_fetch, i) + } + #if ('path' %in% names(dat[[i]])) { + # if (!('nc_var_name' %in% names(dat[[i]]))) { + # dat[[i]][['nc_var_name']] <- '$var_name$' + # } + # if (!('suffix' %in% names(dat[[i]]))) { + # dat[[i]][['suffix']] <- '' + # } + # if (!('var_min' %in% names(dat[[i]]))) { + # dat[[i]][['var_min']] <- '' + # } + # if (!('var_max' %in% names(dat[[i]]))) { + # dat[[i]][['var_max']] <- '' + # } + #} + dat_names <- c(dat_names, dat[[i]][['name']]) + } + if ((length(dat_to_fetch) > 0) && (length(dat_to_fetch) < length(dat))) { + .warning("'path' has been provided for some datasets. 
+  if (length(dat_to_fetch) > 0) {
+    stop("Specified only the name for some data sets, but not the path ",
+         "pattern. This option has not been yet implemented.")
+  }
+
+  assign('dat_names', dat_names, envir = parent.frame())
+  return(dat)
+}
+
+# Add attributes indicating whether this dimension selector is a value or an index
+add_value_indices_flag <- function(x) {
+  if (is.null(attr(x, 'values')) || is.null(attr(x, 'indices'))) {
+    flag <- (any(x %in% c('all', 'first', 'last')) || is.numeric(unlist(x)))
+    attr(x, 'values') <- !flag
+    attr(x, 'indices') <- flag
+  }
+  return(x)
+}
+
+
+# Find the value for the undefined selector (i.e., indices()). Use the value from the first
+# found file.
+# Note that "dat[[i]][['path']]" in the parent env. is changed in this function.
+find_ufd_value <- function(undefined_file_dims, dat, i, replace_values,
+                           first_file, file_dims, path_glob_permissive,
+                           depending_file_dims, dat_selectors, selector_checker, chunks) {
+  first_values <- vector('list', length = length(undefined_file_dims))
+  names(first_values) <- undefined_file_dims
+  found_values <- 0
+  stop <- FALSE
+  try_dim <- 1
+  last_success <- 1
+  while ((found_values < length(undefined_file_dims)) && !stop) {
+    u_file_dim <- undefined_file_dims[try_dim]
+    if (is.null(first_values[[u_file_dim]])) {
+      path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']],
+                                                           replace_values[-which(file_dims == u_file_dim)],
+                                                           allow_undefined_key_vars = TRUE)
+      found_value <- .FindTagValue(path_with_globs_and_tag,
+                                   first_file, u_file_dim)
+      if (!is.null(found_value)) {
+        found_values <- found_values + 1
+        last_success <- try_dim
+        first_values[[u_file_dim]] <- found_value
+        replace_values[[u_file_dim]] <- found_value
+      }
+    }
+    try_dim <- (try_dim %% length(undefined_file_dims)) + 1
+    if (try_dim == last_success) {
+      stop <- TRUE
+    }
+  }
+  if (found_values < length(undefined_file_dims)) {
+    stop(paste0("Path pattern of dataset '", dat[[i]][['name']],
+                "' is too complex. Could not automatically ",
+                "detect values for all non-explicitly defined ",
+                "indices. Check its pattern: ", dat[[i]][['path']]))
+  }
+  ## TODO: Replace ReplaceGlobExpressions by looped call to FindTagValue? As done above
+  ## Maybe it can solve more cases actually. I got warnings in ReplGlobExp with a typical
+  ## cmor case, requesting all members and chunks for fixed var and sdate. Not fixing
+  ## sdate raised 'too complex' error.
+  # Replace shell globs in path pattern and keep the file_dims as tags
+  dat[[i]][['path']] <- .ReplaceGlobExpressions(dat[[i]][['path']], first_file, replace_values,
+                                                file_dims, dat[[i]][['name']], path_glob_permissive)
+
+  # Now time to look for the available values for the non
+  # explicitly defined selectors for the file dimensions.
+  #print("H")
+  # Check first the ones that do not depend on others.
+  ufd <- c(undefined_file_dims[which(!(undefined_file_dims %in% names(depending_file_dims)))],
+           undefined_file_dims[which(undefined_file_dims %in% names(depending_file_dims))])
+
+  for (u_file_dim in ufd) {
+    replace_values[undefined_file_dims] <- first_values
+    replace_values[[u_file_dim]] <- '*'
+    depended_dim <- NULL
+    depended_dim_values <- NA
+
+    #NOTE: Here 'selectors' is always 1. Is it supposed to be like this?
+    selectors <- dat_selectors[[u_file_dim]][[1]]
+    if (u_file_dim %in% names(depending_file_dims)) {
+      depended_dim <- depending_file_dims[[u_file_dim]]
+      depended_dim_values <- dat_selectors[[depended_dim]][[1]]
+      dat_selectors[[u_file_dim]] <- vector('list', length = length(depended_dim_values))
+      names(dat_selectors[[u_file_dim]]) <- depended_dim_values
+    } else {
+      dat_selectors[[u_file_dim]] <- list()
+    }
+    if (u_file_dim %in% unlist(depending_file_dims)) {
+      depending_dims <- names(depending_file_dims)[which(sapply(depending_file_dims, function(x) u_file_dim %in% x))]
+      replace_values[depending_dims] <- rep('*', length(depending_dims))
+    }
+    # If u_file_dim depends on the same depended dimension as another depending
+    # dimension, then the value of the depending dim should be replaced with '*'
+    # to avoid only the first value being used, which can result in the wrong
+    # path specification.
+    other_depending_file_dims <- depending_file_dims[-which(names(depending_file_dims) == u_file_dim)]
+    if (length(depending_file_dims) > 1 &&
+        any(unlist(other_depending_file_dims) == depended_dim)) {
+      depending_dims <- names(other_depending_file_dims)[which(other_depending_file_dims == depended_dim)]
+      replace_values[depending_dims] <- rep('*', length(depending_dims))
+    }
+    for (j in 1:length(depended_dim_values)) {
+      parsed_values <- c()
+      if (!is.null(depended_dim)) {
+        replace_values[[depended_dim]] <- depended_dim_values[j]
+      }
+      path_with_globs <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values)
+      found_files <- Sys.glob(path_with_globs)
+      ## TODO: Enhance this error message, or change it to a warning.
+      ## Raises if a wrong sdate is specified, for example.
+      if (length(found_files) == 0) {
+        .warning(paste0("Could not find files for any '", u_file_dim,
+                        "' for '", depended_dim, "' = '",
+                        depended_dim_values[j], "'."))
+        dat_selectors[[u_file_dim]][[j]] <- NA
+      } else {
+        for (found_file in found_files) {
+          path_with_globs_and_tag <- .ReplaceVariablesInString(dat[[i]][['path']],
+                                                               replace_values[-which(file_dims == u_file_dim)],
+                                                               allow_undefined_key_vars = TRUE)
+          parsed_values <- c(parsed_values,
+                             .FindTagValue(path_with_globs_and_tag, found_file,
+                                           u_file_dim))
+        }
+        #TODO: selector_checker() doesn't allow selectors to be characters. For selectors
+        # like "member = 'r7i1p1f1'", it cannot be defined with values.
+        dat_selectors[[u_file_dim]][[j]] <- selector_checker(selectors = selectors,
+                                                             var = unique(parsed_values),
+                                                             return_indices = FALSE)
+        # Take chunk if needed
+        dat_selectors[[u_file_dim]][[j]] <- dat_selectors[[u_file_dim]][[j]][get_chunk_indices(length(dat_selectors[[u_file_dim]][[j]]),
+                                                                                               chunks[[u_file_dim]]['chunk'],
+                                                                                               chunks[[u_file_dim]]['n_chunks'],
+                                                                                               u_file_dim)]
+      }
+    }
+  }
+  #NOTE: change 'dat' in the parent env. because "dat[[i]][['path']]" is changed.
+  assign('dat', dat, envir = parent.frame())
+  return(dat_selectors)
+}
+
+
+# Adjust the argument 'return_vars' if users don't assign them properly.
+# Force return_vars = (time = NULL) to (time = 'sdate') in one of these situations:
+# (1) selector = [sdate = 2, time = 4], or
+# (2) time_across = 'sdate'.
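+# E.g. (a sketch assuming case (2), with hypothetical arguments):
+#   correct_return_vars('time', c(sdate = 'time'), 'dat', FALSE)
+# returns 'sdate' and raises the warning below, so that the caller can
+# register return_vars = list(time = 'sdate') on the user's behalf.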
+correct_return_vars <- function(inner_dim, inner_dims_across_files, found_pattern_dim,
+                                file_dim_as_selector_array_dim) {
+  # inner_dim is not in return_vars or is NULL
+  if (is.character(file_dim_as_selector_array_dim)) { #(1)
+    if (any(file_dim_as_selector_array_dim %in% found_pattern_dim)) {
+      stop(paste0("Found '", inner_dim, "' selector has dimension of the pattern dim '",
+                  found_pattern_dim,
+                  "', which is not allowed. To assign the dependency on the pattern dim, ",
+                  "use 'return_vars = list(", inner_dim, " = 'dat')' instead."))
+    } else {
+      corrected_value <- file_dim_as_selector_array_dim
+    }
+  } else if (inner_dim %in% inner_dims_across_files) { #(2)
+    file_dim_name <- names(which(inner_dim == inner_dims_across_files))
+    if (file_dim_name %in% found_pattern_dim) {
+      stop(paste0("Found '", inner_dim, "' has across dependency on the pattern dim '",
+                  found_pattern_dim, "', which is not allowed."))
+    } else {
+      corrected_value <- file_dim_name
+    }
+  }
+  .warning(paste0("Found '", inner_dim, "' dependency on file dimension '", corrected_value,
+                  "', but '", inner_dim, "' is not in the return_vars list or does not include '",
+                  corrected_value, "'. To provide the correct metadata, '", corrected_value,
+                  "' is included under '", inner_dim, "' in 'return_vars'."))
+  return(corrected_value)
+}
+
+# The time classes that are needed to adjust the time zone back to UTC.
+time_special_types <- function() {
+  list('POSIXct' = as.POSIXct, 'POSIXlt' = as.POSIXlt, 'Date' = as.Date)
+}
+
+# Replace the dim names read from the netCDF file with the user-specified synonims.
+replace_with_synonmins <- function(read_dims, synonims) {
+  corrected_dim_name <- sapply(names(read_dims),
+                               function(x) {
+                                 which_entry <- which(sapply(synonims, function(y) x %in% y))
+                                 if (length(which_entry) > 0) {
+                                   names(synonims)[which_entry]
+                                 } else {
+                                   x
+                                 }
+                               })
+  return(corrected_dim_name)
+}
+
+
+# Prepare vars_to_read for this dataset (i loop) and this file (j loop)
+generate_vars_to_read <- function(return_vars, changed_dims, first_found_file, common_return_vars,
+                                  common_first_found_file, i) {
+  vars_to_read <- NULL
+  if (length(return_vars) > 0) {
+    #NOTE: because return_vars has changed 'dat' to character(0) above (line 1775),
+    # 'dat' won't be included in vars_to_read here.
+    vars_to_read <- names(return_vars)[sapply(return_vars, function(x) any(names(changed_dims) %in% x))]
+  }
+  if (!is.null(first_found_file)) {
+    if (any(!first_found_file)) {
+      vars_to_read <- c(vars_to_read, names(first_found_file[which(!first_found_file)]))
+    }
+  }
+  if ((i == 1) && (length(common_return_vars) > 0)) {
+    vars_to_read <- c(vars_to_read, names(common_return_vars)[sapply(common_return_vars, function(x) any(names(changed_dims) %in% x))])
+  }
+  if (!is.null(common_first_found_file)) {
+    if (any(!common_first_found_file)) {
+      vars_to_read <- c(vars_to_read, names(common_first_found_file[which(!common_first_found_file)]))
+    }
+  }
+  return(vars_to_read)
+}
+
+# Find the largest dims length within one dataset.
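+# E.g. (a sketch with hypothetical files): if the three files of one dataset
+# report inner dims c(time = 31, lat = 10), c(time = 28, lat = 10) and
+# c(time = 31, lat = 10), the returned 'largest_data_dims' is
+# c(time = 31, lat = 10).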
+find_largest_dims_length <- function(selectors_total_list, array_of_files_to_load,
+                                     selector_indices_save, dat, expected_inner_dims,
+                                     synonims, file_dim_reader) {
+  # Open and get all the dims from all the files
+  data_dims_all_files <- vector('list', length = length(selectors_total_list))
+
+  for (selectors_kk in 1:length(data_dims_all_files)) {
+    file_to_open <- do.call("[", c(list(array_of_files_to_load),
+                                   as.list(selector_indices_save[[selectors_kk]])))
+    data_dims_all_files[[selectors_kk]] <- try(
+      file_dim_reader(file_to_open, NULL, selectors_total_list[[selectors_kk]],
+                      lapply(dat[['selectors']][expected_inner_dims], '[[', 1),
+                      synonims), silent = TRUE)
+
+  }
+
+  # Remove the missing files (i.e., failed try above)
+  if (!identical(which(substr(data_dims_all_files, 1, 5) == 'Error'), integer(0))) {
+    tmp <- which(substr(data_dims_all_files, 1, 5) == 'Error')
+    data_dims_all_files <- data_dims_all_files[-tmp]
+  }
+
+  # Find the longest dimensions from all the files
+  largest_data_dims <- rep(0, length(data_dims_all_files[[1]]))
+
+  # The inner dim order may differ among files. Need to align them before
+  # finding out the largest dim length.
+  dim_names_first_file <- names(data_dims_all_files[[1]])
+  same_dim_order <- lapply(lapply(data_dims_all_files, names),
+                           identical, dim_names_first_file)
+  for (to_fix in which(!unlist(same_dim_order))) {
+    data_dims_all_files[[to_fix]] <- data_dims_all_files[[to_fix]][match(dim_names_first_file,
+                                                                         names(data_dims_all_files[[to_fix]]))]
+  }
+
+  for (kk in 1:length(data_dims_all_files[[1]])) {
+    largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk))
+  }
+  names(largest_data_dims) <- names(data_dims_all_files[[1]])
+  return(list(largest_data_dims = largest_data_dims,
+              data_dims_all_files = data_dims_all_files))
+}
+
+# Generate vars_to_transform from picked_vars[[i]] and picked_common_vars
+generate_vars_to_transform <- function(vars_to_transform, picked_vars, transform_vars,
+                                       picked_vars_ordered) {
+  # In Start(), picked_vars can be picked_vars[[i]] or picked_common_vars
+  picked_vars_to_transform <- which(names(picked_vars) %in% transform_vars)
+  if (length(picked_vars_to_transform) > 0) {
+    picked_vars_to_transform <- names(picked_vars)[picked_vars_to_transform]
+    new_vars_to_transform <- picked_vars[picked_vars_to_transform]
+    which_are_ordered <- which(!sapply(picked_vars_ordered[picked_vars_to_transform], is.null))
+
+    if (length(which_are_ordered) > 0) {
+      tmp <- which(!is.na(match(names(picked_vars_ordered), names(which_are_ordered))))
+      new_vars_to_transform[which_are_ordered] <- picked_vars_ordered[tmp]
+    }
+    vars_to_transform <- c(vars_to_transform, new_vars_to_transform)
+  }
+  return(vars_to_transform)
+}
+
+# Turn indices into values for transform_crop_domain
+generate_transform_crop_domain_values <- function(transform_crop_domain,
+                                                  picked_vars,
+                                                  transform_var) {
+  if (any(transform_crop_domain == 'all')) {
+    if (transform_var %in% .KnownLatNames()) {
+      transform_crop_domain <- c(-90, 90)
+    } else if (transform_var %in% .KnownLonNames()) {
+      if (any(picked_vars > 180)) {
+        transform_crop_domain <- c(0, 360)
+      } else {
+        transform_crop_domain <- c(-180, 180)
+      }
+    } else {
+      transform_crop_domain <- c(picked_vars[1], tail(picked_vars, 1))
+    }
+  } else { # indices()
+    if (is.list(transform_crop_domain)) {
+      transform_crop_domain <- picked_vars[unlist(transform_crop_domain)]
+    } else { # vector
+      transform_crop_domain <-
+        c(picked_vars[transform_crop_domain[1]],
+          picked_vars[tail(transform_crop_domain, 1)])
+    }
+  }
+
+  return(transform_crop_domain)
+}
+
+# Out-of-range warning
+show_out_of_range_warning <- function(inner_dim, range, bound) {
+  # bound: 'lower' or 'upper'
+  .warning(paste0("The ", bound, " boundary of the selector of ", inner_dim,
+                  " is out of range [", min(range), ", ", max(range), "]. ",
+                  "Check if the desired range is all included."))
+}
+
+# 'sub_sub_array_of_values' is for sri chunking. If this inner dim is chunked,
+# the sri has to follow the chunking of fri. Therefore, we save the original
+# value of this chunk here for later use. We'll find the corresponding
+# transformed value within 'sub_sub_array_of_values' and chunk sri. This
+# function also returns 'previous_sub_sub_array_of_values', which is used for
+# checking if there is any sri being skipped.
+generate_sub_sub_array_of_values <- function(input_array_of_values, sub_array_of_indices,
+                                             number_of_chunk) {
+  previous_sub_sub_array_of_values <- NULL
+
+  if (is.list(sub_array_of_indices)) {
+    sub_sub_array_of_values <- list(input_array_of_values[sub_array_of_indices[[1]]],
+                                    input_array_of_values[sub_array_of_indices[[2]]])
+    if (number_of_chunk > 1) {
+      if (diff(unlist(sub_array_of_indices)) > 0) {
+        previous_sub_sub_array_of_values <-
+          input_array_of_values[sub_array_of_indices[[1]] - 1]
+      } else {
+        previous_sub_sub_array_of_values <-
+          input_array_of_values[sub_array_of_indices[[1]] + 1]
+      }
+    }
+  } else { # is vector
+    sub_sub_array_of_values <- input_array_of_values[sub_array_of_indices]
+    if (number_of_chunk > 1) {
+      if (diff(sub_array_of_indices[1:2]) > 0) {
+        previous_sub_sub_array_of_values <-
+          input_array_of_values[sub_array_of_indices[1] - 1]
+      } else {
+        previous_sub_sub_array_of_values <-
+          input_array_of_values[sub_array_of_indices[1] + 1]
+      }
+    }
+  }
+
+  return(list(sub_sub_array_of_values = sub_sub_array_of_values,
+              previous_sub_sub_array_of_values = previous_sub_sub_array_of_values))
+}
+
+
+# Generate sub_array_of_fri
+generate_sub_array_of_fri <- function(with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta,
+                                      is_circular_dim, add_beta = TRUE) {
+  print_warning <- FALSE
+  if (goes_across_prime_meridian) {
+    #NOTE: The potential problem here is, if it is global longitude,
+    #      and the indices overlap (e.g., lon = [0, 359.723] and
+    #      CircularSort(-180, 180), then sub_array_of_indices = list(649, 649)).
+    #      Therefore, sub_array_of_fri will be c(1:649, 649:1296). We'll get two 649s.
+    #      The fix below may not be the best solution, but it works for the example above.
+
+    if (sub_array_of_indices[[1]] == sub_array_of_indices[[2]]) {
+      # global longitude
+      sub_array_of_fri <- 1:n  # n = prod(dim(var_with_selectors))
+
+      if (with_transform & beta != 0 & add_beta) {
+        # Warning if transform_extra_cell != 0
+        print_warning <- TRUE
+      }
+
+    } else {
+      # normal case, i.e., not global
+      first_index <- min(unlist(sub_array_of_indices))
+      last_index <- max(unlist(sub_array_of_indices))
+      if (with_transform & add_beta) {
+        gap_width <- last_index - first_index - 1
+        actual_beta <- min(gap_width, beta)
+        sub_array_of_fri <- c(1:(first_index + actual_beta),
+                              (last_index - actual_beta):n)
+        if (actual_beta != beta) {
+          print_warning <- TRUE
+        }
+      } else {
+        sub_array_of_fri <- c(1:first_index, last_index:n)
+      }
+    }
+
+  } else {
+    #NOTE: This if seems redundant.
+#    if (is.list(sub_array_of_indices)) {
+#      sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]]
+#    }
+    #NOTE: sub_array_of_indices may be a vector or a list
+    if (with_transform & add_beta) {
+      first_index <- min(unlist(sub_array_of_indices))
+      last_index <- max(unlist(sub_array_of_indices))
+      start_padding <- min(beta, first_index - 1)
+      end_padding <- min(beta, n - last_index)
+
+      if (!is_circular_dim) { # latitude or when _reorder is not used
+        sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding)
+        if (start_padding != beta | end_padding != beta) {
+          print_warning <- TRUE
+        }
+      } else { # longitude
+        if (start_padding == beta & end_padding == beta) {
+          # normal regional situation
+          sub_array_of_fri <- (first_index - start_padding):(last_index + end_padding)
+        } else if (start_padding < beta & end_padding < beta) {
+          # global
+          sub_array_of_fri <- 1:n
+        } else if (start_padding < beta) {
+          # left side too close to the border, need to go to the right side
+          sub_array_of_fri <- c((first_index - start_padding):(last_index + end_padding), (n - (beta - start_padding - 1)):n)
+          sub_array_of_fri <- unique(sub_array_of_fri)
+        } else if (end_padding < beta) {
+          # right side too close to the border, need to go to the left side
+          sub_array_of_fri <- c(1:(beta - end_padding), (first_index - start_padding):(last_index + end_padding))
+          sub_array_of_fri <- unique(sub_array_of_fri)
+        }
+      }
+
+    } else {
+      if (is.list(sub_array_of_indices)) {
+        sub_array_of_fri <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]]
+      } else {
+        sub_array_of_fri <- sub_array_of_indices
+      }
+    }
+  }
+
+  if (print_warning) {
+    .warning(paste0("Adding parameter transform_extra_cells = ", beta,
+                    " to the transformed index exceeds the border. ",
+                    "The border index is used for transformation."))
+  }
+
+  return(sub_array_of_fri)
+}
+
+# This function merges two dimensions (e.g., time and sdate if "time_across = 'sdate'") into one.
+# The two dimensions have to be next to each other. In Start(), it is used to reshape
+# final_dims_fake if merge_across_dims = TRUE
+dims_merge <- function(inner_dims_across_files, final_dims_fake) {
+  # inner_dims_across_files would be like: $sdate: "time"
+  for (file_dim_across in names(inner_dims_across_files)) {
+    inner_dim_pos <- which(names(final_dims_fake) == inner_dims_across_files[[file_dim_across]])
+    new_dims <- c()
+    # part 1: Put the dims before 'time' in new_dims
+    if (inner_dim_pos > 1) {
+      new_dims <- c(new_dims, final_dims_fake[1:(inner_dim_pos - 1)])
+    }
+    # part 2: Merge 'time' and 'sdate' together, and name this dim as 'time'
+    # The crossing and crossed dims are next to each other, e.g., [time, sdate]
+    new_dims <- c(new_dims, setNames(prod(final_dims_fake[c(inner_dim_pos, inner_dim_pos + 1)]),
+                                     inner_dims_across_files[[file_dim_across]]))
+    # part 3: Put the dims after 'sdate' in new_dims
+    if (inner_dim_pos + 1 < length(final_dims_fake)) {
+      new_dims <- c(new_dims, final_dims_fake[(inner_dim_pos + 2):length(final_dims_fake)])
+    }
+    final_dims_fake <- new_dims
+  }
+  return(final_dims_fake)
+}
+
+# This function splits one dimension into two. In Start(), it is used to reshape final_dims_fake
+# if split_multiselected_dims = TRUE.
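+# E.g. (a sketch with hypothetical dims): with an 'sdate' selector of dim
+# c(year = 3, month = 4) and final_dims_fake = c(dat = 1, sdate = 12, lat = 10),
+# the first returned element is c(dat = 1, year = 3, month = 4, lat = 10) and
+# all_split_dims records sdate = c(year = 3, month = 4).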
+dims_split <- function(dim_params, final_dims_fake) {
+  all_split_dims <- NULL
+  for (dim_param in 1:length(dim_params)) {
+    split_dims <- dim(dim_params[[dim_param]])
+    if (!is.null(split_dims)) {
+      if (length(split_dims) > 1) {
+        all_split_dims <- c(all_split_dims, setNames(list(split_dims),
+                                                     names(dim_params)[dim_param]))
+        if (is.null(names(split_dims))) {
+          names(split_dims) <- paste0(names(dim_params)[dim_param],
+                                      1:length(split_dims))
+        }
+        old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param])
+
+        # If merge_across_dims and split_multiselected_dims are both used
+        # on one file dim, and this file dim is multi-dim, it doesn't work.
+        if (identical(old_dim_pos, integer(0))) {
+          stop(paste0("The dimension '", names(dim_params)[dim_param],
+                      "' to be split cannot be found after 'merge_across_dims' ",
+                      "is used. Check if the reshape parameters are used appropriately."))
+        }
+
+        # NOTE: Three steps to create new dims.
+        # 1st: Put in the dims before split_dim.
+        # 2nd: Replace the old_dim with split_dims.
+        # 3rd: Put in the dims after split_dim.
+        new_dims <- c()
+        if (old_dim_pos > 1) {
+          new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)])
+        }
+        new_dims <- c(new_dims, split_dims)
+        if (old_dim_pos < length(final_dims_fake)) {
+          new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)])
+        }
+        final_dims_fake <- new_dims
+      }
+    }
+  }
+  return(list(final_dims_fake, all_split_dims))
+}
+
+
+# This function sums up the lengths of all the inner across dims (e.g., time: list(31, 29, 31, 30))
+# and uses the total to replace the value of that inner dim. That is, it returns the actual
+# length of time rather than the one including NAs. In Start(), it is used to reshape
+# final_dims_fake if merge_across_dims = TRUE, merge_across_dims_narm = TRUE, and
+# split_multiselected_dims = FALSE.
+merge_narm_dims <- function(final_dims_fake, across_inner_dim, length_inner_across_dim) {
+  final_dims_fake_name <- names(final_dims_fake)
+  pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim)
+  new_length_inner_dim <- sum(unlist(length_inner_across_dim))
+  if (pos_across_inner_dim != length(final_dims_fake)) {
+    final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
+                         new_length_inner_dim,
+                         final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)])
+  } else {
+    final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
+                         new_length_inner_dim)
+  }
+  names(final_dims_fake) <- final_dims_fake_name
+  return(final_dims_fake)
+}
+
+
+
+# Adjust the dim order. If split_multiselected_dims + merge_across_dims, the dim order may
+# need to be changed. The inner_dim needs to be the first dim among the split dims.
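+# E.g. (a sketch with hypothetical dims): all_split_dims = c(month = 4, year = 3)
+# with inner_dim_pos_in_split_dims = 2 becomes c(year = 3, month = 4), and
+# final_dims_fake is rearranged accordingly so the inner dim comes first.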
+reorder_split_dims <- function(all_split_dims, inner_dim_pos_in_split_dims, final_dims_fake) {
+  all_split_dims <- c(all_split_dims[inner_dim_pos_in_split_dims],
+                      all_split_dims[1:length(all_split_dims)][-inner_dim_pos_in_split_dims])
+  split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(all_split_dims))))
+  new_dims <- c()
+  if (split_dims_pos[1] != 1) {
+    new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
+  }
+  new_dims <- c(new_dims, all_split_dims)
+  if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
+    new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
+  }
+  final_dims_fake <- new_dims
+
+  return(list(final_dims_fake, all_split_dims))
+}
+
+# Find the final_dims_fake for metadata if it needs to be reshaped
+find_final_dims_fake_metadata <- function(merge_across_dims, split_multiselected_dims,
+                                          picked_common_vars, across_inner_dim, final_dims_fake,
+                                          dims_of_merge_dim, all_split_dims) {
+  if (merge_across_dims) {
+    if (!split_multiselected_dims) {
+      final_dims_fake_metadata <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)]
+    } else {
+      final_dims_fake_metadata <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])]
+    }
+  } else if (split_multiselected_dims) {
+    target_split_dim_ind <- which(names(dim(picked_common_vars)) == names(all_split_dims))
+    margin_dim_ind <- c(1:length(dim(picked_common_vars)))[-target_split_dim_ind]
+    if (identical(margin_dim_ind, numeric(0)) | identical(margin_dim_ind, integer(0))) {
+      final_dims_fake_metadata <- all_split_dims[[1]]
+    } else {
+      final_dims_fake_metadata <- .ReplaceElementInVector(dim(picked_common_vars), target = names(all_split_dims), new_val = all_split_dims[[1]])
+    }
+  }
+
+  return(final_dims_fake_metadata)
+}
+
+# Build the work pieces.
+build_work_pieces <- function(work_pieces, i, selectors, file_dims, inner_dims, final_dims,
+                              found_pattern_dim, inner_dims_across_files, array_of_files_to_load,
+                              array_of_not_found_files, array_of_metadata_flags,
+                              metadata_file_counter, depending_file_dims, transform,
+                              transform_vars, picked_vars, picked_vars_ordered, picked_common_vars,
+                              picked_common_vars_ordered, metadata_folder, debug = debug) {
+  sub_array_dims <- final_dims[file_dims]
+  sub_array_dims[found_pattern_dim] <- 1
+  sub_array_of_files_to_load <- array(1:prod(sub_array_dims),
+                                      dim = sub_array_dims)
+  names(dim(sub_array_of_files_to_load)) <- names(sub_array_dims)
+  # Detect which of the dimensions of the dataset go across files.
+  file_dim_across_files <- lapply(inner_dims,
+                                  function(x) {
+                                    dim_across <- sapply(inner_dims_across_files, function(y) x %in% y)
+                                    if (any(dim_across)) {
+                                      names(inner_dims_across_files)[which(dim_across)[1]]
+                                    } else {
+                                      NULL
+                                    }
+                                  })
+  names(file_dim_across_files) <- inner_dims
+  j <- 1
+  while (j <= prod(sub_array_dims)) {
+    # Work out the file path.
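+    # (Sketch, hypothetical dims) The linear index j is mapped back to
+    # per-dimension file indices: with sub_array_dims = c(dat = 1, sdate = 3),
+    # j = 2 corresponds to file indices c(dat = 1, sdate = 2).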
+    file_to_load_sub_indices <- which(sub_array_of_files_to_load == j, arr.ind = TRUE)[1, ]
+    names(file_to_load_sub_indices) <- names(sub_array_dims)
+    file_to_load_sub_indices[found_pattern_dim] <- i
+    big_dims <- rep(1, length(dim(array_of_files_to_load)))
+    names(big_dims) <- names(dim(array_of_files_to_load))
+    file_to_load_indices <- .MergeArrayDims(file_to_load_sub_indices, big_dims)[[1]]
+    file_to_load <- do.call('[[', c(list(array_of_files_to_load),
+                                    as.list(file_to_load_indices)))
+    not_found_file <- do.call('[[', c(list(array_of_not_found_files),
+                                      as.list(file_to_load_indices)))
+    load_file_metadata <- do.call('[', c(list(array_of_metadata_flags),
+                                         as.list(file_to_load_indices)))
+    if (load_file_metadata) {
+      metadata_file_counter <- metadata_file_counter + 1
+      assign('metadata_file_counter', metadata_file_counter, envir = parent.frame())
+    }
+    if (!is.na(file_to_load) && !not_found_file) {
+      # Work out indices to take
+      first_round_indices <- lapply(inner_dims,
+        function (x) {
+          if (is.null(file_dim_across_files[[x]])) {
+            x_dim_name <- attr(attr(selectors[[x]][['fri']], "dim"), "names")
+            if (!is.null(x_dim_name)) {
+              which_chunk <- file_to_load_sub_indices[x_dim_name]
+              if (length(which_chunk) > 1) {
+                tmp_dim <- attr(selectors[[x]][['fri']], "dim")
+                vec_ind <- which_chunk[1]
+                for (i_dim in length(tmp_dim):2) {
+                  vec_ind <- vec_ind + (which_chunk[i_dim] - 1) * prod(tmp_dim[1:(i_dim - 1)])
+                }
+                selectors[[x]][['fri']][[vec_ind]]
+              } else { # old code
+                selectors[[x]][['fri']][[which_chunk]]
+              }
+            } else {
+              selectors[[x]][['fri']][[1]]
+            }
+          } else {
+            which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]]
+            selectors[[x]][['fri']][[which_chunk]]
+          }
+        })
+      names(first_round_indices) <- inner_dims
+      second_round_indices <- lapply(inner_dims,
+        function (x) {
+          if (is.null(file_dim_across_files[[x]])) {
+            x_dim_name <- attr(attr(selectors[[x]][['sri']], "dim"), "names")
+            if (!is.null(x_dim_name)) {
+              which_chunk <- file_to_load_sub_indices[x_dim_name]
+              if (length(which_chunk) > 1) {
+                tmp_dim <- attr(selectors[[x]][['sri']], "dim")
+                vec_ind <- which_chunk[1]
+                for (i_dim in length(tmp_dim):2) {
+                  vec_ind <- vec_ind + (which_chunk[i_dim] - 1) * prod(tmp_dim[1:(i_dim - 1)])
+                }
+                selectors[[x]][['sri']][[vec_ind]]
+              } else { # old code
+                selectors[[x]][['sri']][[which_chunk]]
+              }
+            } else {
+              selectors[[x]][['sri']][[1]]
+            }
+          } else {
+            which_chunk <- file_to_load_sub_indices[file_dim_across_files[[x]]]
+            selectors[[x]][['sri']][[which_chunk]]
+          }
+        })
+      if (debug) {
+        print("-> BUILDING A WORK PIECE")
+        #print(str(selectors))
+      }
+      names(second_round_indices) <- inner_dims
+      if (!any(sapply(first_round_indices, length) == 0)) {
+        work_piece <- list()
+        work_piece[['first_round_indices']] <- first_round_indices
+        work_piece[['second_round_indices']] <- second_round_indices
+        work_piece[['file_indices_in_array_of_files']] <- file_to_load_indices
+        work_piece[['file_path']] <- file_to_load
+        work_piece[['store_dims']] <- final_dims
+        # Work out store position
+        store_position <- final_dims
+        store_position[names(file_to_load_indices)] <- file_to_load_indices
+        store_position[inner_dims] <- rep(1, length(inner_dims))
+        work_piece[['store_position']] <- store_position
+        # Work out file selectors
+        file_selectors <- sapply(file_dims,
+          function (x) {
+            vector_to_pick <- 1
+            if (x %in% names(depending_file_dims)) {
+              vector_to_pick <- file_to_load_indices[depending_file_dims[[x]]]
+            }
+            if (x != found_pattern_dim) {
+              selectors[[x]][[vector_to_pick]][file_to_load_indices[x]]
+            } else {
+              # dat_dim only has one value in each work_piece
+              selectors[[x]][[vector_to_pick]]
+            }
+          })
+        names(file_selectors) <- file_dims
+        work_piece[['file_selectors']] <- file_selectors
+        # Send variables for transformation
+        if (!is.null(transform) && (length(transform_vars) > 0)) {
+          vars_to_transform <- NULL
+          picked_vars_to_transform <- which(names(picked_vars) %in% transform_vars)
+          if (length(picked_vars_to_transform) > 0) {
+            picked_vars_to_transform <- names(picked_vars)[picked_vars_to_transform]
+            vars_to_transform <- c(vars_to_transform, picked_vars[picked_vars_to_transform])
+            if (any(picked_vars_to_transform %in% names(picked_vars_ordered))) {
+              picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered))]
+              vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[picked_vars_ordered_to_transform]
+            }
+          }
+          picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars)
+          if (length(picked_common_vars_to_transform) > 0) {
+            picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform]
+            vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform])
+            if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) {
+              picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))]
+              vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform]
+            }
+          }
+          work_piece[['vars_to_transform']] <- vars_to_transform
+        }
+        # Send flag to load metadata
+        if (load_file_metadata) {
+          work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter)
+        }
+        work_pieces <- c(work_pieces, list(work_piece))
+      }
+    }
+    j <- j + 1
+  }
+  return(work_pieces)
+}
+
+# Calculate the progress %s that will be displayed and assign them to the appropriate work pieces.
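+# For example (hypothetical numbers): with 35 work pieces and 4 processes,
+# length(work_pieces) / num_procs = 8.75, so reps = ceiling(8.75) = 9 and
+# amount = 100 / 9 ~ 11.11; nine pieces at roughly even positions each get a
+# ' + 11.11%' step appended to the progress message when they complete.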
+retrieve_progress_message <- function(work_pieces, num_procs, silent) {
+  if (length(work_pieces) / num_procs >= 2 && !silent) {
+    if (length(work_pieces) / num_procs < 10) {
+      amount <- 100 / ceiling(length(work_pieces) / num_procs)
+      reps <- ceiling(length(work_pieces) / num_procs)
+    } else {
+      amount <- 10
+      reps <- 10
+    }
+    progress_steps <- rep(amount, reps)
+    if (length(work_pieces) < (reps + 1)) {
+      selected_pieces <- length(work_pieces)
+      progress_steps <- c(sum(head(progress_steps, reps)),
+                          tail(progress_steps, reps))
+    } else {
+      selected_pieces <- round(seq(1, length(work_pieces),
+                                   length.out = reps + 1))[-1]
+    }
+    progress_steps <- paste0(' + ', round(progress_steps, 2), '%')
+    progress_message <- 'Progress: 0%'
+  } else {
+    progress_message <- ''
+    selected_pieces <- NULL
+  }
+  piece_counter <- 1
+  step_counter <- 1
+  work_pieces <- lapply(work_pieces,
+                        function (x) {
+                          if (piece_counter %in% selected_pieces) {
+                            wp <- c(x, list(progress_amount = progress_steps[step_counter]))
+                            step_counter <<- step_counter + 1
+                          } else {
+                            wp <- x
+                          }
+                          piece_counter <<- piece_counter + 1
+                          wp
+                        })
+  if (!silent) {
+    .message("If the size of the requested data is close to or above the free shared RAM memory, R may crash.")
+    .message("If the size of the requested data is close to or above half of the free RAM memory, R may crash.")
+    .message(paste0("Will now proceed to read and process ", length(work_pieces), " data files:"))
+    if (length(work_pieces) < 30) {
+      lapply(work_pieces, function (x) .message(x[['file_path']], indent = 2))
+    } else {
+      .message("The list of files is long. You can check it after Start() finishes in the output '$Files'.", indent = 2, exdent = 5)
+    }
+  }
+
+  # Build the cluster of processes that will do the work and dispatch work pieces.
+  # The function .LoadDataFile is applied to each work piece. This function will
+  # open the data file, regrid if needed, subset, apply the mask,
+  # compute and apply the weights if needed,
+  # disable extreme values and store in the shared memory matrix.
+  if (!silent) {
+    .message("Loading... This may take several minutes...")
+    if (progress_message != '') {
+      .message(progress_message, appendLF = FALSE)
+    }
+  }
+  return(work_pieces)
+}
+
+# If merge_across_dims = TRUE and merge_across_dims_narm = TRUE, remove the additional NAs
+# due to unequal inner_dim ('time') length across file_dim ('sdate').
+remove_additional_na_from_merge <- function(data_array = NULL, merge_dim_metadata = NULL,
+                                            inner_dims_across_files, final_dims, length_inner_across_dim) {
+  # data_array is a vector from bigmemory::as.matrix
+  # merge_dim_metadata is an array
+
+  across_file_dim <- names(inner_dims_across_files) #TODO: more than one?
+  across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one?
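+  # Illustration with hypothetical sizes: four files ('sdate') whose 'time'
+  # lengths are 2, 3, 3, 2 are merged into a [time = 3, file_date = 4] block,
+  # so positions (3, 1) and (3, 4) are padding. 'logi_array' below marks the
+  # real values TRUE and the padding FALSE so the padding can be dropped.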
+  # Get the length of these two dimensions in final_dims
+  length_inner_across_store_dims <- final_dims[across_inner_dim]
+  length_file_across_store_dims <- final_dims[across_file_dim]
+
+  # Create a logical array for merge_across_dims
+  logi_array <- array(rep(FALSE,
+                          length_file_across_store_dims * length_inner_across_store_dims),
+                      dim = c(length_inner_across_store_dims, length_file_across_store_dims))
+  for (i in 1:length_file_across_store_dims) {
+    logi_array[1:length_inner_across_dim[[i]], i] <- TRUE
+  }
+
+  if (!is.null(data_array)) {
+    # First, turn the data vector into an array with final_dims
+    data_array_final_dims <- array(data_array, dim = final_dims)
+  }
+
+  # Change the NAs derived from the additional spaces to -9999, then remove these -9999
+  func_remove_blank <- function(data_array, logi_array) {
+    # dim(data_array) = [time, file_date]
+    # dim(logi_array) = [time, file_date]
+    # data_array can be data or metadata; if data, change the blank spaces from
+    # NA to -9999; if metadata (supposed to be 'time'), change the corresponding
+    # spaces to -12^10.
+    if (is(data_array, "POSIXct")) {
+      # change to numeric first
+      data_array <- array(as.vector(data_array), dim = dim(data_array))
+      data_array[which(!logi_array)] <- -12^10
+    } else {
+      data_array[which(!logi_array)] <- -9999
+    }
+    return(data_array)
+  }
+
+  if (!is.null(data_array)) {
+    data_array_final_dims <- multiApply::Apply(data_array_final_dims,
+                                               target_dims = c(across_inner_dim, across_file_dim), # c('time', 'file_date')
+                                               output_dims = c(across_inner_dim, across_file_dim),
+                                               fun = func_remove_blank,
+                                               logi_array = logi_array)$output1
+  }
+  if (!is.null(merge_dim_metadata)) {
+    tmp_attr <- attributes(merge_dim_metadata)$variables
+    merge_dim_metadata <- multiApply::Apply(merge_dim_metadata,
+                                            target_dims = c(across_inner_dim, across_file_dim),
+                                            output_dims = c(across_inner_dim, across_file_dim),
+                                            fun = func_remove_blank,
+                                            logi_array = logi_array)$output1
+  }
+
+  if (!is.null(data_array)) {
+    ## reorder back to the correct dim
+    tmp <- match(names(final_dims), names(dim(data_array_final_dims)))
+    data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
+    data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # becomes a vector
+  } else {
+    data_array_tmp <- NULL
+  }
+  if (!is.null(merge_dim_metadata)) {
+    # Reorder metadata dims as the final dims
+    tmp <- match(names(final_dims), names(dim(merge_dim_metadata)))
+    merge_dim_metadata <- aperm(merge_dim_metadata, tmp[!is.na(tmp)])
+    merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != -12^10]
+    attr(merge_dim_metadata, 'variables') <- tmp_attr
+  }
+
+  #NOTE: Both outputs are vectors. If 'merge_dim_metadata' is actually time, it is just numeric here.
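+  #NOTE: The sentinel values are only used transiently: -9999 marks padding in
+  # the data and -12^10 marks padding in the (numeric) time metadata. This
+  # assumes the real values never take exactly these sentinel values.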
+  return(list(data_array = data_array_tmp, merge_dim_metadata = merge_dim_metadata))
+}
+
+
+# When merge_across_dims = TRUE and split_multiselected_dims = TRUE, rearranging the chunks
+# (i.e., work_pieces) is necessary if one file contains values for discrete dimensions
+rebuild_array_merge_split <- function(data_array = NULL, metadata = NULL, indices_chunk,
+                                      all_split_dims, final_dims_fake, across_inner_dim, length_inner_across_dim) {
+
+  rebuild_data <- !is.null(data_array)
+  rebuild_metadata <- !is.null(metadata)
+
+  # Generate the correct order list from indices_chunk
+  final_order_list <- list()
+  i <- 1
+  j <- 1
+  a <- indices_chunk[i]
+  while (i <= length(indices_chunk)) {
+    while (indices_chunk[i + 1] == indices_chunk[i] & i < length(indices_chunk)) {
+      a <- c(a, indices_chunk[i + 1])
+      i <- i + 1
+    }
+    final_order_list[[j]] <- a
+    a <- indices_chunk[i + 1]
+    i <- i + 1
+    j <- j + 1
+  }
+  names(final_order_list) <- sapply(final_order_list, '[[', 1)
+  final_order_list <- lapply(final_order_list, length)
+
+  if (!all(diff(as.numeric(names(final_order_list))) > 0)) {
+    # Shape the vector into the array without split_dims
+    split_dims_pos <- match(names(all_split_dims[[1]]), names(final_dims_fake))
+    new_dims <- c()
+    if (split_dims_pos[1] > 1) {
+      new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
+    }
+    new_dims <- c(new_dims, prod(all_split_dims[[1]]))
+    names(new_dims)[split_dims_pos[1]] <- across_inner_dim
+    if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
+      new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
+    }
+
+    if (rebuild_data) {
+      data_array <- array(data_array, dim = new_dims)
+      # Separate the 'time' dim into each work_piece length
+      data_array_seperate <- vector('list', length = length(length_inner_across_dim))
+      array_piece <- vector('list', length = length(final_order_list))
+    }
+    if (rebuild_metadata) {
+      metadata <- array(metadata, dim = length(metadata)) # metadata_no_split
+      names(dim(metadata)) <- across_inner_dim
+      metadata_seperate <- vector('list', length = length(length_inner_across_dim))
+      metadata_piece <- vector('list', length = length(final_order_list))
+    }
+
+    tmp <- cumsum(unlist(length_inner_across_dim))
+    tmp <- c(0, tmp)
+    for (i in 1:length(length_inner_across_dim)) {
+      if (rebuild_data) {
+        data_array_seperate[[i]] <- ClimProjDiags::Subset(data_array,
+                                                          across_inner_dim,
+                                                          (tmp[i] + 1):tmp[i + 1])
+      }
+      if (rebuild_metadata) {
+        metadata_seperate[[i]] <- ClimProjDiags::Subset(metadata,
+                                                        across_inner_dim,
+                                                        (tmp[i] + 1):tmp[i + 1])
+      }
+    }
+
+    # Re-build the array: chunk
+    which_chunk <- as.numeric(names(final_order_list))
+    sort_which_chunk <- sort(unique(which_chunk))
+    which_chunk <- sapply(lapply(which_chunk, '==', sort_which_chunk), which)
+    how_many_indices <- unlist(final_order_list)
+
+    if (rebuild_data) {
+      ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
+    } else if (rebuild_metadata) {
+      ind_in_array_seperate <- as.list(rep(1, length(metadata_seperate)))
+    }
+
+    for (i in 1:length(final_order_list)) {
+      if (rebuild_data) {
+        array_piece[[i]] <- ClimProjDiags::Subset(
+          data_array_seperate[[which_chunk[i]]], across_inner_dim,
+          ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
+      }
+      if (rebuild_metadata) {
+        metadata_piece[[i]] <- ClimProjDiags::Subset(
+          metadata_seperate[[which_chunk[i]]], across_inner_dim,
+          ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
+      }
+      ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
+    }
+
+    # Re-build the array: paste
+    if (rebuild_data) {
+      data_array_tmp <- array_piece[[1]]
+    } else {
+      data_array_tmp <- NULL
+    }
+    if (rebuild_metadata) {
+      metadata_tmp <- metadata_piece[[1]]
+    } else {
+      metadata_tmp <- NULL
+    }
+
+    if (rebuild_data) {
+      along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
+      length_piece <- length(array_piece)
+    }
+    if (rebuild_metadata) {
+      along_pos_metadata <- which(names(dim(metadata_tmp)) == across_inner_dim)
+      if (!rebuild_data)
+        length_piece <- length(metadata_piece)
+    }
+
+    if (length_piece > 1) {
+      for (i in 2:length_piece) {
+        if (rebuild_data) {
+          data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
+                                         along = along_pos)
+        }
+        if (rebuild_metadata) {
+          metadata_tmp <- abind::abind(metadata_tmp, metadata_piece[[i]],
+                                       along = along_pos_metadata)
+        }
+      }
+    }
+  } else {
+    data_array_tmp <- data_array
+    metadata_tmp <- metadata
+  }
+
+  return(list(data_array = data_array_tmp, metadata = metadata_tmp))
+}
+
+
+# Create a list of the metadata of the variable (e.g., tas)
+create_metadata_list <- function(array_of_metadata_flags, metadata_dims, pattern_dims,
+                                 loaded_metadata_files, loaded_metadata, dat_names,
+                                 dataset_has_files) {
+  #NOTE: Here, metadata can be saved in one of two ways: one for $common and the other for $dat.
+  # For $common, it is a list of metadata length. For $dat, it is a list of dat length,
+  # and each sublist has the metadata for each dat.
+  dim_of_metadata <- dim(array_of_metadata_flags)[metadata_dims]
+  if (!any(names(dim_of_metadata) == pattern_dims) |
+      (any(names(dim_of_metadata) == pattern_dims) &
+       dim_of_metadata[pattern_dims] == 1)) { # put under $common; old code
+    return_metadata <- vector('list',
+                              length = prod(dim_of_metadata))
+    return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata
+    dim(return_metadata) <- dim_of_metadata
+
+  } else { # put under $dat. metadata_dims has 'dat' and dat length > 1
+    return_metadata <- vector('list',
+                              length = dim_of_metadata[pattern_dims])
+    names(return_metadata) <- dat_names
+    for (kk in 1:length(return_metadata)) {
+      return_metadata[[kk]] <- vector('list', length = prod(dim_of_metadata[-1])) # 1 is dat
+    }
+    loaded_metadata_count <- 1
+    for (kk in 1:length(return_metadata)) {
+      for (jj in 1:length(return_metadata[[kk]])) {
+        if (dataset_has_files[kk]) {
+          if (loaded_metadata_count %in% loaded_metadata_files) {
+            return_metadata[[kk]][jj] <- loaded_metadata[[which(loaded_metadata_files == loaded_metadata_count)]]
+            names(return_metadata[[kk]])[jj] <- names(loaded_metadata[[which(loaded_metadata_files == loaded_metadata_count)]])
+          } else {
+            return_metadata[[kk]][jj] <- NULL
+          }
+          loaded_metadata_count <- loaded_metadata_count + 1
+        } else {
+          return_metadata[[kk]][jj] <- NULL
+        }
+      }
+    }
+  }
+
+  return(return_metadata)
+}
+
+# This function adds the metadata of the variable (e.g., tas) into the list of picked_vars or
+# picked_common_vars. The metadata is only retrieved when 'retrieve = TRUE'.
+combine_metadata_picked_vars <- function(return_metadata, picked_vars, picked_common_vars,
+                                         metadata_dims, pattern_dims, length_dat) {
+  #NOTE: The metadata of variables can be saved in one of two different structures.
+  # (1) metadata_dims != 'dat', or (metadata_dims == 'dat' & length(dat) == 1):
+  #     put under $common
+  # (2) (metadata_dims == 'dat' & length(dat) > 1):
+  #     put under $dat1, $dat2, .... Put it in the picked_vars list
+  #TODO: The current (2) uses an inefficient method. The list structure should be
+  # defined first and then filled, rather than expanded inside the for loop.
+
+  if (any(metadata_dims == pattern_dims) & length_dat > 1) { # (2)
+    for (kk in 1:length(return_metadata)) {
+      sublist_names <- lapply(return_metadata, names)[[kk]]
+      if (!is.null(sublist_names)) {
+        for (jj in 1:length(sublist_names)) {
+          if (!is.null(return_metadata[[kk]][[jj]])) {
+            picked_vars[[kk]] <- c(picked_vars[[kk]], list(return_metadata[[kk]][[jj]]))
+            names(picked_vars[[kk]])[length(picked_vars[[kk]])] <- names(return_metadata[[kk]][jj])
+          }
+        }
+      }
+    }
+    Variables_list <- c(list(common = picked_common_vars), picked_vars)
+
+  } else { # (1)
+    len <- unlist(lapply(return_metadata, length))
+    len <- sum(len) + length(which(len == 0)) # 0 means NULL
+    name_list <- lapply(return_metadata, names)
+    new_list <- vector('list', length = len)
+    count <- 1
+
+    for (kk in 1:length(return_metadata)) {
+      if (length(return_metadata[[kk]]) == 0) { # NULL
+        count <- count + 1
+      } else {
+        for (jj in 1:length(return_metadata[[kk]])) {
+          new_list[[count]] <- return_metadata[[kk]][[jj]]
+          names(new_list)[count] <- name_list[[kk]][jj]
+          count <- count + 1
+        }
+      }
+    }
+    Variables_list <- c(list(common = c(picked_common_vars, new_list)), picked_vars)
+  }
+
+  return(Variables_list)
+}
+
+# This function generates a list of 3, containing picked(_common)_vars,
+# picked(_common)_vars_ordered, and picked(_common)_vars_unorder_indices for the 'var_to_read'
+# of this dataset (i) and file (j).
+generate_picked_var_of_read <- function(var_to_read, var_to_check, array_of_files_to_load,
+                                        var_dims, array_of_var_files, file_var_reader,
+                                        file_object, synonims, associated_dim_name,
+                                        dim_reorder_params, aiat, current_indices, var_params,
+                                        either_picked_vars,
+                                        either_picked_vars_ordered,
+                                        either_picked_vars_unorder_indices) {
+  var_file_dims <- NULL
+
+  if (any(names(dim(array_of_files_to_load)) %in% var_to_check)) {
+    var_file_dims <- dim(array_of_files_to_load)[which(names(dim(array_of_files_to_load)) %in%
+                                                       var_to_check)]
+  }
+  if (is.null(either_picked_vars)) {
+
+    if (any(names(var_file_dims) %in% names(var_dims))) {
+      stop("Found a requested var in 'return_var' requested for a ",
+           "file dimension which also appears in the dimensions of ",
+           "the variable inside the file.\n", array_of_var_files)
+    }
+    first_sample <- file_var_reader(NULL, file_object, NULL,
+                                    var_to_read, synonims)
+    if (any(class(first_sample) %in% names(time_special_types()))) {
+      array_size <- prod(c(var_file_dims, var_dims))
+      new_array <- rep(time_special_types()[[class(first_sample)[1]]](NA), array_size)
+      dim(new_array) <- c(var_file_dims, var_dims)
+    } else {
+      new_array <- array(dim = c(var_file_dims, var_dims))
+    }
+    attr(new_array, 'variables') <- attr(first_sample, 'variables')
+
+    either_picked_vars <- new_array
+    pick_ordered <- FALSE
+    if (var_to_read %in% unlist(var_params)) {
+      if (associated_dim_name %in% names(dim_reorder_params) && !aiat) {
+        either_picked_vars_ordered <- new_array
+        pick_ordered <- TRUE
+      }
+    }
+    if (!pick_ordered) {
+      either_picked_vars_ordered <- NULL
+    }
+
+  } else {
+    array_var_dims <- dim(either_picked_vars)
+    full_array_var_dims <- array_var_dims
+    if (any(names(array_var_dims) %in% names(var_file_dims))) {
+      array_var_dims <- array_var_dims[-which(names(array_var_dims) %in% names(var_file_dims))]
+    }
+    if (any(names(array_var_dims) != names(var_dims))) {
+      stop("Error while reading the variable '", var_to_read, "' from ",
+           "the file. Dimensions do not match.\nExpected ",
+           paste(paste0("'", names(array_var_dims), "'"), collapse = ', '),
+           " but found ",
+           paste(paste0("'", names(var_dims), "'"), collapse = ', '),
+           ".\n", array_of_var_files)
+    }
+    if (any(var_dims > array_var_dims)) {
+      longer_dims <- which(var_dims > array_var_dims)
+      if (length(longer_dims) == 1) {
+        longer_dims_in_full_array <- longer_dims
+        if (any(names(full_array_var_dims) %in% names(var_file_dims))) {
+          candidates <- (1:length(full_array_var_dims))[-which(names(full_array_var_dims) %in% names(var_file_dims))]
+          longer_dims_in_full_array <- candidates[longer_dims]
+        }
+        padding_dims <- full_array_var_dims
+        padding_dims[longer_dims_in_full_array] <-
+          var_dims[longer_dims] - array_var_dims[longer_dims]
+
+        var_class <- class(either_picked_vars)
+        if (any(var_class %in% names(time_special_types()))) {
+          padding_size <- prod(padding_dims)
+          padding <- rep(time_special_types()[[var_class[1]]](NA), padding_size)
+          dim(padding) <- padding_dims
+        } else {
+          padding <- array(dim = padding_dims)
+        }
+        tmp_attr <- attributes(either_picked_vars)$variables
+        either_picked_vars <- .abind2(either_picked_vars, padding,
+                                      names(full_array_var_dims)[longer_dims_in_full_array])
+        attr(either_picked_vars, 'variables') <- tmp_attr
+
+      } else {
+        stop("Error while reading the variable '", var_to_read, "' from ",
+             "the file. Found size (", paste(var_dims, collapse = ' x '),
+             ") is greater than expected maximum size (",
+             paste(array_var_dims, collapse = ' x '), ").")
+      }
+    }
+  }
+
+  var_store_indices <- c(as.list(current_indices[names(var_file_dims)]),
+                         lapply(var_dims, function(x) 1:x))
+  var_values <- file_var_reader(NULL, file_object, NULL, var_to_read, synonims)
+  if (var_to_read %in% unlist(var_params)) {
+    if ((associated_dim_name %in% names(dim_reorder_params)) && !aiat) {
+      ## Is this check really needed?
+      if (length(dim(var_values)) > 1) {
+        stop("Requested a '", associated_dim_name, "_reorder' for a dimension ",
+             "whose coordinate variable has more than 1 dimension. This is ",
+             "not supported.")
+      }
+      ordered_var_values <- dim_reorder_params[[associated_dim_name]](var_values)
+      attr(ordered_var_values$x, 'variables') <- attr(var_values, 'variables')
+      if (!all(c('x', 'ix') %in% names(ordered_var_values))) {
+        stop("All the dimension reorder functions must return a list with the components 'x' and 'ix'.")
+      }
+      # Save the indices to reorder the ordered variable values back to the original order.
+      # 'unorder' refers to the indices of 'ordered_var_values' if it is unordered.
+      # This will be used to define the first round indices.
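+      # For instance (hypothetical values): if var_values = c(30, 10, 20), a sorting
+      # reorder function returns x = c(10, 20, 30) and ix = c(2, 3, 1); then
+      # sort(ix, index.return = TRUE)$ix = c(3, 1, 2), and x[c(3, 1, 2)] recovers
+      # the original c(30, 10, 20).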
+      unorder <- sort(ordered_var_values$ix, index.return = TRUE)$ix
+      either_picked_vars_ordered <- do.call('[<-',
+                                            c(list(x = either_picked_vars_ordered),
+                                              var_store_indices,
+                                              list(value = ordered_var_values$x)))
+      either_picked_vars_unorder_indices <- do.call('[<-',
+                                                    c(list(x = either_picked_vars_unorder_indices),
+                                                      var_store_indices,
+                                                      list(value = unorder)))
+
+    }
+  }
+
+  either_picked_vars <- do.call('[<-',
+                                c(list(x = either_picked_vars),
+                                  var_store_indices,
+                                  list(value = var_values)))
+  # Turn the time zone back to UTC if this var_to_read is 'time'
+  if (all(class(either_picked_vars) %in% names(time_special_types()))) {
+    attr(either_picked_vars, "tzone") <- 'UTC'
+  }
+
+  return(list(either_picked_vars = either_picked_vars,
+              either_picked_vars_ordered = either_picked_vars_ordered,
+              either_picked_vars_unorder_indices = either_picked_vars_unorder_indices))
+}
+
+
+# Transforms a vector of indices v expressed in a world of
+# length N from 1 to N, into a world of length M, from
+# 1 to M. Repeated adjacent indices are collapsed, e.g.
+# transform_indices(c(1, 2, 3), 3, 5) returns c(1, 3, 5).
+transform_indices <- function(v, n, m) {
+  # unique2 turns e.g. 1 1 2 2 2 3 3 1 1 1 into 1 2 3 1
+  unique2 <- function(v) {
+    if (length(v) < 2) {
+      v
+    } else {
+      v[c(1, v[2:length(v)] - v[1:(length(v) - 1)]) != 0]
+    }
+  }
+  unique2(round(((v - 1) / (n - 1)) * (m - 1))) + 1 #TODO: This rounding may generate 0s. What then?
+}
+
+replace_character_with_indices <- function(selectors, data_dims, chunk_amount) {
+  if (selectors == 'all') {
+    selectors <- indices(1:(data_dims * chunk_amount))
+  } else if (selectors == 'first') {
+    selectors <- indices(1)
+  } else if (selectors == 'last') {
+    selectors <- indices(data_dims * chunk_amount)
+  }
+  return(selectors)
+}
--
GitLab