From e937ab8ff61e15d0b688674d8b12d18cc0a87290 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 12 Feb 2025 14:54:40 +0100 Subject: [PATCH 1/3] (WIP) Fix inconsistencies in AS running locally vs via ssh --- R/ByChunks_autosubmit.R | 29 ++++++++++++++----------- inst/chunking/Autosubmit/autosubmit.yml | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/R/ByChunks_autosubmit.R b/R/ByChunks_autosubmit.R index ec33632..7719c9f 100644 --- a/R/ByChunks_autosubmit.R +++ b/R/ByChunks_autosubmit.R @@ -29,8 +29,9 @@ #' as autosubmit machine. The default value is NULL, and a temporary folder #' under the current working folder will be created. #'@param autosubmit_server A character vector indicating the login node of the -#' autosubmit machine. It can be "bscesautosubmit01" or "bscesautosubmit02". -#' The default value is NULL, and the node will be randomly chosen. +#' autosubmit machine. It can be "bscesautosubmit01" or "bscesautosubmit02". +#' If NULL, Autosubmit will be run locally on the current machine. +#' The default value is NULL. #'@param silent A logical value deciding whether to print the computation #' progress (FALSE) on the R session or not (TRUE). It only works when the #' execution runs locally or the parameter 'wait' is TRUE. 
The default value @@ -167,8 +168,6 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', if (!autosubmit_server %in% c('bscesautosubmit01', 'bscesautosubmit02')) { stop("Parameter 'autosubmit_server' must be one existing Autosubmit machine login node, 'bscesautosubmit01' or 'bscesautosubmit02'.") } - } else { - autosubmit_server <- paste0('bscesautosubmit0', sample(1:2, 1)) } ## silent @@ -238,7 +237,7 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', ### queue_host support_hpcs <- c('local', 'nord3') # names in platforms.yml if (is.null(cluster$queue_host) || !cluster$queue_host %in% support_hpcs) { - stop("Cluster component 'queue_host' must be one of the follows: ", + stop("Cluster component 'queue_host' must be one of the following: ", paste(support_hpcs, collapse = ','), '.') } @@ -329,10 +328,14 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', ### expid as_module <- cluster[['autosubmit_module']] if (is.null(cluster[['expid']])) { - text <- system( - paste0("module load ", as_module, "; ", - "autosubmit expid -H local -d 'startR computation'"), - intern = T) + sys_commands <- paste0("module load ", as_module, "; ", + "autosubmit expid -H local -d 'startR computation'") + if (!is.null(autosubmit_server)) { + as_login <- paste0(Sys.getenv("USER"), '@', autosubmit_server, '.bsc.es') + sys_commands <- paste0('ssh ', as_login, ' "', sys_commands, '"') + } + + text <- system(sys_commands, intern = TRUE) cluster[['expid']] <- strsplit( text[grep("The new experiment", text)], "\"")[[1]][2] @@ -600,14 +603,16 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', } time_begin_first_chunk <- Sys.time() sys_commands <- paste0("module load ", as_module, "; ", - "autosubmit create ", suite_id, " -np; ", + "autosubmit create ", suite_id, " -v -np; ", "autosubmit refresh ", suite_id, "; ") if (wait) { sys_commands <- paste0(sys_commands, "autosubmit run ", 
suite_id) } else { sys_commands <- paste0(sys_commands, "nohup autosubmit run ", suite_id, " >/dev/null 2>&1 &") # disown? } - if (gsub('[[:digit:]]', "", Sys.getenv('HOSTNAME')) == 'bscesautosubmit') { + ## TODO: Review logic here + if ((is.null(autosubmit_server)) || + (gsub('[[:digit:]]', "", Sys.getenv('HOSTNAME')) == 'bscesautosubmit')) { #NOTE: If we ssh to AS VM and run everything there, we don't need to ssh here system(sys_commands) @@ -618,8 +623,6 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', sys_commands <- paste0('ssh ', as_login, ' "', sys_commands, '"') #'; exit"') system(sys_commands) -# } else { -# stop("Cannot identify host", Sys.getenv("HOSTNAME"), ". Where to run AS exp?") } # Check the size of tmp/ASLOGS/jobs_failed_status.log. If it is not 0, the jobs failed. diff --git a/inst/chunking/Autosubmit/autosubmit.yml b/inst/chunking/Autosubmit/autosubmit.yml index 8b129a0..b08fca4 100644 --- a/inst/chunking/Autosubmit/autosubmit.yml +++ b/inst/chunking/Autosubmit/autosubmit.yml @@ -1,5 +1,5 @@ config: - AUTOSUBMIT_VERSION: 4.0.0b0 + AUTOSUBMIT_VERSION: MAXWAITINGJOBS: # Should it be the total amount of chunk? 
TOTALJOBS: SAFETYSLEEPTIME: 10 -- GitLab From a1af493c1aacebf606efaa44c030973d88f82e93 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 12 Feb 2025 17:01:14 +0100 Subject: [PATCH 2/3] Add 'autosubmit_version' to cluster components and edit expdef_*.yml file with version --- R/ByChunks_autosubmit.R | 18 ++++++++++++++---- R/Utils.R | 1 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/R/ByChunks_autosubmit.R b/R/ByChunks_autosubmit.R index 7719c9f..b5bd3a8 100644 --- a/R/ByChunks_autosubmit.R +++ b/R/ByChunks_autosubmit.R @@ -287,6 +287,17 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', if (!is.character(cluster[['autosubmit_module']])) { stop("The component 'autosubmit_module' of the parameter 'cluster' must be a character string.") } + ### autosubmit_version + if (!is.null(cluster[['autosubmit_version']])) { + if (!is.character(cluster[['autosubmit_version']])) { + stop("The component 'autosubmit_version' of the parameter 'cluster' must be a character string.") + } + } else { + cluster[['autosubmit_version']] <- stringr::str_extract(cluster['autosubmit_module'], + "(?<=/)(.+)(?=\\-foss)") + warning("The component 'autosubmit_version' has not been provided. It will ", + "be parsed from 'autosubmit_module'.") + } ### cores_per_job if (is.null(cluster[['cores_per_job']])) { cluster[['cores_per_job']] <- threads_compute @@ -610,19 +621,18 @@ ByChunks_autosubmit <- function(step_fun, cube_headers, ..., chunks = 'auto', } else { sys_commands <- paste0(sys_commands, "nohup autosubmit run ", suite_id, " >/dev/null 2>&1 &") # disown? 
} - ## TODO: Review logic here + # Execute system commands locally or remotely if ((is.null(autosubmit_server)) || (gsub('[[:digit:]]', "", Sys.getenv('HOSTNAME')) == 'bscesautosubmit')) { + # If autosubmit_server is NULL or we are already on bscesautosubmit0x #NOTE: If we ssh to AS VM and run everything there, we don't need to ssh here system(sys_commands) } else { -# } else if (gsub("[[:digit:]]", "", Sys.getenv("HOSTNAME")) == "bscearth") { - # ssh from WS to AS VM to run exp + # ssh from local machine to AS VM to run exp as_login <- paste0(Sys.getenv("USER"), '@', autosubmit_server, '.bsc.es') sys_commands <- paste0('ssh ', as_login, ' "', sys_commands, '"') #'; exit"') system(sys_commands) - } # Check the size of tmp/ASLOGS/jobs_failed_status.log. If it is not 0, the jobs failed. diff --git a/R/Utils.R b/R/Utils.R index e440dde..242ec70 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -943,6 +943,7 @@ write_autosubmit_confs <- function(chunks, cluster, autosubmit_suite_dir) { ############################################################ if (conf_type == "autosubmit") { + conf$config$AUTOSUBMIT_VERSION <- cluster['autosubmit_version'] #Q: Should it be the total amount of chunk? conf$config$MAXWAITINGJOBS <- as.integer(prod(unlist(chunks))) # total amount of chunk #NOTE: Nord3 max. amount of queued jobs is 366 -- GitLab From 53744e2493e10a0afc5fa33ccf61feaf88d29d72 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 17 Feb 2025 11:30:24 +0100 Subject: [PATCH 3/3] Fix typos and update practical guide --- R/ByChunks_autosubmit.R | 2 +- inst/doc/practical_guide.md | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/R/ByChunks_autosubmit.R b/R/ByChunks_autosubmit.R index b5bd3a8..16d273d 100644 --- a/R/ByChunks_autosubmit.R +++ b/R/ByChunks_autosubmit.R @@ -20,7 +20,7 @@ #'@param threads_compute An integer indicating the number of execution threads #' to use for the computation. The default value is 1. 
#'@param cluster A list of components that define the configuration of the -#' machine to be run on. The comoponents vary from different machines. Check +#' machine to be run on. The components vary for different machines. Check #' \href{https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/practical_guide.md}{practical guide} #' for more details and examples. #'@param autosubmit_suite_dir A character string indicating the path to a folder diff --git a/inst/doc/practical_guide.md b/inst/doc/practical_guide.md index 91c11ee..637474e 100644 --- a/inst/doc/practical_guide.md +++ b/inst/doc/practical_guide.md @@ -581,7 +581,7 @@ res <- Compute(wf, #### 4-3-2. Compute() on HPCs with ecFlow We can use workflow manager (ecFlow or Autosubmit) to dispatch computation jobs on a HPC. -To use Autosubmit, check the next session. +To use Autosubmit, check the next section. You will need to make sure that the passwordless connection with the login node of that HPC is configured, as shown at the beginning of this guide. If possible, in both directions. Also, you will need to know whether there is a shared file system between your workstation and that HPC, and will need information on the number of nodes, cores per node, threads per core, RAM memory per node, and type of workload used by that HPC (Slurm, PBS and LSF supported). @@ -712,12 +712,13 @@ To use Autosubmit as workflow manager, add the following parameters to your Comp `autosubmit_suite_dir` is the path where to store temporary files generated for Autosubmit to establish the workflow. It should be found in both workstation and the Autosubmit machine. -`autosubmit_server` is the login node of the Autosubmit machine, i.e., 'bscesautosubmit01'or 'bscesautosubmit02'. +`autosubmit_server` is the login node of the Autosubmit machine, i.e., 'bscesautosubmit01' or 'bscesautosubmit02'. If it is NULL (the default), Autosubmit is run locally, so the machine R is running on must have Autosubmit installed (e.g. the BSC-ES Hub).
-The parameter `cluster` expects a list of components that provide the configuration of Autosubmit machine. For now, the supported platforms are 'local' (run on Autosubmit machine) and 'nord3' (Autosubmit submits jobs to Nord3). +The parameter `cluster` expects a list of components that provide the configuration of the Autosubmit machine. For now, the supported platforms are 'local' (run on the current machine) and 'nord3' (Autosubmit submits jobs to Nord3). You can see one example of cluster configuration below. ```r + # Launch Autosubmit on the autosubmit machine res <- Compute(wf, chunks = list(sdate = 2), threads_compute = 4, threads_load = 2, cluster = list( @@ -727,6 +728,7 @@ You can see one example of cluster configuration below. r_module = "R/4.1.2-foss-2019b", CDO_module = "CDO/1.9.8-foss-2019b", autosubmit_module = 'autosubmit/4.0.0b-foss-2015a-Python-3.7.3', + autosubmit_version = '4.0.0b', cores_per_job = 4, job_wallclock = '01:00:00', max_jobs = 4 @@ -736,6 +738,28 @@ You can see one example of cluster configuration below. autosubmit_server = 'bscesautosubmit01', wait = TRUE ) + + # Launch Autosubmit locally from the bsceshub + res <- Compute(wf, chunks = list(sdate = 2), + threads_compute = 4, threads_load = 2, + cluster = list( + queue_host = 'nord3', + expid = <expid>, + hpc_user = "bsc32xxx", + r_module = "R/4.1.2-foss-2019b", + CDO_module = "CDO/1.9.8-foss-2019b", + autosubmit_module = 'autosubmit/4.0.98-foss-2021b-Python-3.9.6', + autosubmit_version = '4.0.98', + cores_per_job = 4, + job_wallclock = '01:00:00', + max_jobs = 4 + ), + workflow_manager = 'autosubmit', + autosubmit_suite_dir = "/home/Earth/<user>/startR_local_autosubmit/", + autosubmit_server = NULL, + wait = TRUE + ) + ``` The cluster components and options are explained next: @@ -751,6 +775,7 @@ To have the good practice, note down the expid if it is automatically created by - `r_module`: Name of the UNIX environment module to be used for R. If not specified, `module load R` will be used.
- `CDO_module`: Name of the UNIX environment module to be used for CDO. If not specified, it is NULL and no CDO module will be loaded. Make sure to assign it if `tranform` is required in Start(). - `autosubmit_module`: The name of the Autosubmit module. If not specified, `module load autosubmit` will be used. +- `autosubmit_version`: The Autosubmit version (e.g. '4.0.0b'). If not specified, it will be parsed from `autosubmit_module` (the text between the '/' and '-foss' in the module name) and a warning will be issued, so provide it explicitly when the module name does not follow that pattern. - `cores_per_job`: Number of computing cores to be requested when submitting the job for each chunk to the HPC queue. It is corresponded to the parameter "THREADS" in _jobs.yml_ and "PROCESSORS_PER_NODE" in _platforms.yml_. - `job_wallclock`: amount of time to reserve the resources when submitting the job for each chunk. Must follow the specific format required by the specified `queue_type`. @@ -1105,6 +1130,7 @@ r <- Compute(wf, ### Nord3-v2 ```r +# Using ecFlow cluster = list(queue_host = 'nord4.bsc.es', queue_type = 'slurm', temp_dir = '/gpfs/scratch/bsc32/bsc32734/startR_hpc/', @@ -1113,7 +1139,8 @@ cluster = list(queue_host = 'nord4.bsc.es', max_jobs = 4, bidirectional = FALSE, polling_period = 10 - ) + ) + ``` ### Nord3 (deprecated) -- GitLab