From 077cd9ad691f64e9d37bea3d1f951a2633883dad Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 28 Jul 2022 16:19:58 +0200 Subject: [PATCH 001/213] over_wallclock fix --- autosubmit/job/job.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 226b85c37..948269142 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -768,6 +768,22 @@ class Job(object): except BaseException as e: pass return + def parse_time(self,wallclock): + format = "minute" + regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') + parts = regex.match(wallclock) + if not parts: + return + parts = parts.groupdict() + if int(parts['hours']) > 0 : + format = "hour" + else: + format = "minute" + time_params = {} + for name, param in parts.items(): + if param: + time_params[name] = int(param) + return datetime.timedelta(**time_params),format # Duplicated for wrappers and jobs to fix in 4.0.0 def is_over_wallclock(self, start_time, wallclock): """ @@ -777,25 +793,13 @@ class Job(object): :return: """ elapsed = datetime.datetime.now() - start_time - wallclock = datetime.datetime.strptime(wallclock, '%H:%M') - total = 0.0 - if wallclock.hour > 0: - total = wallclock.hour - format = "hour" - else: - format = "minute" - if format == "hour": - if wallclock.minute > 0: - total += wallclock.minute / 60.0 - if wallclock.second > 0: - total += wallclock.second / 60.0 / 60.0 + wallclock,time_format = self.parse_time(wallclock) + if time_format == "hour": + total = wallclock.days * 24 + wallclock.seconds / 60 / 60 else: - if wallclock.minute > 0: - total += wallclock.minute - if wallclock.second > 0: - total += wallclock.second / 60.0 + total = wallclock.days * 24 + wallclock.seconds / 60 total = total * 1.30 # in this case we only want to avoid slurm issues so the time is increased by 50% - if format == "hour": + if time_format == "hour": hour = int(total) minute = int((total - int(total)) 
* 60.0) second = int(((total - int(total)) * 60 - -- GitLab From 45008de2246909d4c4c0d441ce2647d234f95089 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 4 Aug 2022 04:21:27 +0200 Subject: [PATCH 002/213] adds support for post+1 --- autosubmit/job/job_list.py | 13 ++++++++++++- docs/source/usage/configuration/new_job.rst | 4 +++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 3d55bb040..d880e3dc4 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -376,6 +376,8 @@ class JobList(object): # Get current job dependency relations. Used for select chunk option. This is the job in where select chunks option is defined if len(dependency.select_chunks_orig) > 0: # find chunk relation other_parents = dic_jobs.get_jobs(dependency.section, date, member, None) + jobs_by_section = [p for p in other_parents if p.section == dependency.section] + chunk_relation_indx = 0 while chunk_relation_indx < len(dependency.select_chunks_orig): if job.running in ["once"] or len(dependency.select_chunks_orig[chunk_relation_indx]) == 0 or job.chunk in dependency.select_chunks_orig[chunk_relation_indx]: @@ -425,7 +427,16 @@ class JobList(object): JobList._add_edge(graph, job, parent) other_parents.remove(parent) visited_parents.add(parent) - + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
+ if len(job.parents) <= 0: + for relation_indx in chunk_relations_to_add: + for parent in jobs_by_section: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + visited_parents.add(parent) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) diff --git a/docs/source/usage/configuration/new_job.rst b/docs/source/usage/configuration/new_job.rst index e8fb39692..b9099e348 100644 --- a/docs/source/usage/configuration/new_job.rst +++ b/docs/source/usage/configuration/new_job.rst @@ -31,10 +31,12 @@ This is the minimum job definition and usually is not enough. You usually will n .. code-block:: ini [jobs] - SELECT_CHUNKS = SIM*[1]*[3] # Enables the dependency of chunk 1 with chunk 3. While chunks 2,4 won't be linked. + SELECT_CHUNKS = SIM*[1:3] # Enables the dependency of chunk 1,2 and 3. While 4 won't be linked. SELECT_CHUNKS = SIM*[1,3] # Enables the dependency of chunk 1 and 3. While 2 and 4 won't be linked SELECT_CHUNKS = SIM*[1] # Enables the dependency of chunk 1. While 2, 3 and 4 won't be linked + SELECT_CHUNKS = SIM*[1]*[3] # Enables the dependency of SIM_1 with CHILD_3. While chunks 2,4 won't be linked. + SELECT_CHUNKS = SIM*[2:4]*[2:4] SIM*[2]*[1] # Links SIM_2:4 with CHILDREN_2:4 and links SIM_2 with CHILD_1 * SELECT_MEMBERS (optional): by default, all sections depend on all jobs the items specified on the DEPENDENCIES parameter. However, with this parameter, you could select the members of a specific job section. At the end of this doc, you will find diverse examples of this feature. Caution, you must pick the member index, not the member name. 
-- GitLab From 1cec0bab0ee05c47f8bc2b5b3bc5de6c6422b6ec Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 8 Aug 2022 16:36:54 +0200 Subject: [PATCH 003/213] fix project_Destination --- autosubmit/autosubmit.py | 3 ++- autosubmit/config/config_common.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 19dc23baf..5bdb10116 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4324,7 +4324,8 @@ class Autosubmit: """ project_destination = as_conf.get_project_destination() if project_destination is None or len(project_destination) == 0: - raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) + if project_type.lower() != "none": + raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": submitter = Autosubmit._get_submitter(as_conf) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 7b2a6a12b..e3e9188a4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1119,11 +1119,14 @@ class AutosubmitConfig(object): elif self.get_project_type().lower() == "git": value = self.get_git_project_origin().split( '/')[-1].split('.')[-2] - return value + if value != "": + return value + else: + return "project_files" except Exception as exp: Log.debug(str(exp)) Log.debug(traceback.format_exc()) - return '' + return "project_files" def set_git_project_commit(self, as_conf): """ -- GitLab From 3fbfdb808c309b83d63192cd78aa2212ce77f191 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:09:33 +0200 Subject: [PATCH 004/213] tkinter --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 2b4a65497..e6112df7d 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -11,7 +11,7 @@ The 
Autosubmit code is maintained in *PyPi*, the main source for python packages .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. -- GitLab From 71cd5a6bed9d30231a562d9664f89d585934b5f5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 14:56:12 +0200 Subject: [PATCH 005/213] tkinter --- docs/source/installation.rst | 4 ++-- requeriments.txt | 1 + setup.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index e6112df7d..5dd60a136 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -7,11 +7,11 @@ How to install The Autosubmit code is maintained in *PyPi*, the main source for python packages. -- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk, python2-dev, graphviz >= 2.41, pip2 +- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk(tkinter in centOS), python2-dev, graphviz >= 2.41, pip2 .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10 .. 
important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. diff --git a/requeriments.txt b/requeriments.txt index f2dfdd0aa..d57974475 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -13,6 +13,7 @@ typing bscearth.utils cryptography==3.3.2 PyNaCl==1.4.0 +six>=1.10.0 requests xlib Pygments \ No newline at end of file diff --git a/setup.py b/setup.py index 35e8f4f4f..7935f7a42 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], extras_require={ -- GitLab From 46b2c19842702a6f82aef43089326fd1d385f86d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:42:36 +0200 Subject: [PATCH 006/213] author change --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7935f7a42..d4d0f0179 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ setup( version=version, description='Autosubmit: a versatile tool to manage Weather and Climate Experiments in diverse ' 'Supercomputing Environments', - author='Domingo Manubens-Gil', - author_email='domingo.manubens@bsc.es', + author='Daniel Beltran Mora', + author_email='daniel.beltran@bsc.es', url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], -- GitLab From f7a014ba3d3b554ddc52e6a27be42de501e0f6ff Mon Sep 17 00:00:00 2001 From: dbeltran 
Date: Wed, 10 Aug 2022 15:08:10 +0200 Subject: [PATCH 007/213] Added requests, improvement exception recovery for wrappers , added more info, bugfixed status appearing in log.out , bug fixed lc level not being able to change --- autosubmit/autosubmit.py | 54 ++++++++++++++--------- autosubmit/platforms/paramiko_platform.py | 32 ++++++++------ autosubmit/platforms/platform.py | 2 +- environment.yml | 1 + log/log.py | 15 ++++++- setup.py | 2 +- 6 files changed, 69 insertions(+), 37 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5bdb10116..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -162,7 +162,7 @@ class Autosubmit: parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='WARNING', type=str, + default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='INFO', type=str, @@ -1659,7 +1659,11 @@ class Autosubmit: Log.debug('Checking Wrapper {0}'.format(str(job_id))) wrapper_job.checked_time = datetime.datetime.now() # This is where wrapper will be checked on the slurm platform, update takes place. - platform.check_job(wrapper_job) + try: + platform.check_job(wrapper_job,is_wrapper=True) + except BaseException as e: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking wrapper {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) #Log.info("FD 3Wrapper checked: {0}".format(log.fd_show.fd_table_status_str())) try: if wrapper_job.status != wrapper_job.new_status: @@ -1671,8 +1675,12 @@ class Autosubmit: "Wrapper is in Unknown Status couldn't get wrapper parameters", 7050) # New status will be saved and inner_jobs will be checked. 
- wrapper_job.check_status( - wrapper_job.new_status) + try: + wrapper_job.check_status(wrapper_job.new_status) + except: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking the inner_jobs of {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) + # Erase from packages if the wrapper failed to be queued ( Hold Admin bug ) if wrapper_job.status == Status.WAITING: for inner_job in wrapper_job.job_list: @@ -1782,9 +1790,18 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - consecutive_retrials = 1 - delay = min(15*consecutive_retrials,120) + consecutive_retrials = 0 + failed_names = {} + Log.info("Storing failed job count...") + try: + for job in job_list.get_job_list(): + if job.fail_count > 0: + failed_names[job.name] = job.fail_count + except BaseException as e: + Log.printlog("Error trying to store failed job count",Log.WARNING) + Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: + delay = min(15 * consecutive_retrials, 120) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1794,6 +1811,7 @@ class Autosubmit: Log.info("Recovering job_list...") job_list = Autosubmit.load_job_list( expid, as_conf, notransitive=notransitive) + Log.info("Recovering job_list... Done") if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1801,26 +1819,20 @@ class Autosubmit: "Only jobs with member value in {0} or no member will be allowed in this run. 
Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) platforms_to_test = set() + Log.info("Recovering platform information...") for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) - #Recover job_list while keeping job.fail_count - failed_names = {} - for job in job_list.get_job_list(): - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] - platforms_to_test.add(job.platform) - if job.fail_count > 0: - failed_names[job.name] = job.fail_count + + Log.info("Recovering platform information... Done") + Log.info("Recovering Failure count...") for job in job_list.get_job_list(): if job.name in failed_names.keys(): job.fail_count = failed_names[job.name] - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] + Log.info("Recovering Failure count... Done") + Log.info("Recovering parameters...") Autosubmit._load_parameters(as_conf, job_list, submitter.platforms) # Recovery wrapper [Packages] @@ -1876,9 +1888,11 @@ class Autosubmit: None, None, jobs[0].platform, as_conf, jobs[0].hold) job_list.job_package_map[jobs[0].id] = wrapper_job + Log.info("Recovering wrappers... Done") job_list.update_list(as_conf) Log.info("Saving recovered job list...") job_list.save() + Log.info("Saving recovered job list... 
Done") recovery = True Log.result("Recover of job_list is completed") except AutosubmitError as e: @@ -1886,10 +1900,10 @@ class Autosubmit: Log.result("Recover of job_list has fail {0}".format(e.message)) except IOError as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) except BaseException as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) # Restore platforms and try again, to avoid endless loop with failed configuration, a hard limit is set. reconnected = False mail_notify = True diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 43adfd5c6..e57512f55 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -452,17 +452,20 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status :param retries: retries :param job: job + :type job: autosubmit.job.job.Job + :param default_status: default status if job is not found :type job: class(job) :param default_status: status to assign if it can be retrieved from the platform :type default_status: autosubmit.job.job_common.Status :return: current job status :rtype: autosubmit.job.job_common.Status + """ job_id = job.id job_status = Status.UNKNOWN @@ -491,19 +494,20 @@ class ParamikoPlatform(Platform): job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: job_status = Status.RUNNING - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - 
wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + if not is_wrapper: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED elif job_status in self.job_status['QUEUING'] and job.hold is False: job_status = Status.QUEUING elif job_status in self.job_status['QUEUING'] and job.hold is True: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index c2ccf3575..acbb20aa7 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -384,7 +384,7 @@ class Platform(object): """ raise NotImplementedError - def check_job(self, jobid, default_status=Status.COMPLETED, retries=5): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status diff --git a/environment.yml b/environment.yml index 4585486d9..bc6e7308b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - portalocker - networkx - python=2.7 +- requests - pip: - bscearth.utils - Xlib diff --git a/log/log.py b/log/log.py index ae3ca5a74..216fc23eb 100644 --- a/log/log.py +++ b/log/log.py @@ -161,7 +161,7 @@ class Log: logging.getLogger(name) 
@staticmethod - def set_file(file_path, type='out', level=WARNING): + def set_file(file_path, type='out', level="WARNING"): """ Configure the file to store the log. If another file was specified earlier, new messages will only go to the new file. @@ -169,6 +169,19 @@ class Log: :param file_path: file to store the log :type file_path: str """ + levels = {} + levels["STATUS_FAILED"] = 500 + levels["STATUS"] = 1000 + levels["DEBUG"] = 2000 + levels["WARNING"] = 3000 + levels["INFO"] = 4000 + levels["RESULT"] = 5000 + levels["ERROR"] = 6000 + levels["CRITICAL"] = 7000 + levels["NO_LOG"] = levels["CRITICAL"] + 1000 + + level = levels.get(str(level).upper(),"DEBUG") + max_retrials = 3 retrials = 0 timeout = 5 diff --git a/setup.py b/setup.py index d4d0f0179..8e56eb8c5 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, -- GitLab From 417305910c1b0331d341cde655b9ad518b6bed96 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 15:53:09 +0200 Subject: [PATCH 008/213] stat fix --- autosubmit/autosubmit.py | 1 + autosubmit/job/job.py | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..03853b178 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,6 +1763,7 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() + if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, 
packages_persistence, hold=False) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 948269142..28c9b2be9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -630,10 +630,9 @@ class Job(object): found = False retrials = 0 while retrials < 3 and not found: - sleep(2) if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): found = True - retrials = retrials - 1 + retrials = retrials + 1 for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i)): last_log = i @@ -1181,18 +1180,18 @@ class Job(object): if self.type == Type.BASH: template = 'sleep 5' + "\n" elif self.type == Type.PYTHON: - template = 'time.sleep(30)' + "\n" + template = 'time.sleep(5)' + "\n" elif self.type == Type.R: - template = 'Sys.sleep(30)' + "\n" + template = 'Sys.sleep(5)' + "\n" template += template_file.read() template_file.close() else: if self.type == Type.BASH: - template = 'sleep 35' + template = 'sleep 5' elif self.type == Type.PYTHON: - template = 'time.sleep(35)' + template = 'time.sleep(5)' elif self.type == Type.R: - template = 'Sys.sleep(35)' + template = 'Sys.sleep(5)' else: template = '' except: -- GitLab From 29676178bca47777f1e5561eedd1f001bf73a7a2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 16:44:23 +0200 Subject: [PATCH 009/213] wrapper_type is now being saved correctly --- autosubmit/autosubmit.py | 1 - autosubmit/job/job_list.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 03853b178..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,7 +1763,6 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() - if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=False) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 
d880e3dc4..395e07467 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -215,6 +215,15 @@ class JobList(object): new, notransitive, update_structure=update_structure) for job in self._job_list: job.parameters = parameters + job_data = jobs_data.get(job.name,"none") + try: + if job_data != "none": + job.wrapper_type = job_data[12] + else: + job.wrapper_type = "none" + except BaseException as e: + job.wrapper_type = "none" + # Checking for member constraints if len(run_only_members) > 0: # Found -- GitLab From 5137b0022be260c23150e66bb153a52d86abf5a9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 08:45:34 +0200 Subject: [PATCH 010/213] erased debug info, changed exception for baseexception --- autosubmit/job/job.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 28c9b2be9..1056b93f6 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -620,10 +620,9 @@ class Job(object): submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() - except: - pass + + platform.test_connection() + max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -643,7 +642,7 @@ class Job(object): else: remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except Exception as e: + except BaseException as e: Log.printlog( "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False -- GitLab From ce7a4b1be6d3a6bd1ca782d1e3fda5ee2545f4ab Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 12:53:18 +0200 Subject: [PATCH 011/213] Fixed delay issue #862 --- autosubmit/job/job_list.py | 60 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 395e07467..395c97e4c 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -415,37 +415,37 @@ class JobList(object): if dependency.splits is not None: parent = filter( lambda _parent: _parent.split in dependency.splits, parent) - #Select chunk + select member - if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - elif len(dependency.select_members_orig) > 0: - for relation_indx in member_relations_to_add: - if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - elif len(dependency.select_chunks_orig) > 0: + #Select chunk + select member + if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + elif len(dependency.select_members_orig) > 0: + for relation_indx in member_relations_to_add: + if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + elif 
len(dependency.select_chunks_orig) > 0: + for relation_indx in chunk_relations_to_add: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. + if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
- if len(job.parents) <= 0: - for relation_indx in chunk_relations_to_add: - for parent in jobs_by_section: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - visited_parents.add(parent) + for parent in jobs_by_section: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + visited_parents.add(parent) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) -- GitLab From 9e68d54234bbbf51a089776dde1db8e9e494e195 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 13:40:57 +0200 Subject: [PATCH 012/213] Added 5min retrial in case that something is wrong while recovering the As_conf info inside a thread. 
--- autosubmit/job/job.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1056b93f6..325564bec 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -614,15 +614,27 @@ class Job(object): max_logs = 0 sleep(5) stat_file = self.script_name[:-4] + "_STAT_" + retries = 2 + count = 0 + success = False + error_message = "" + while count < retries or success: + try: + as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) + as_conf.reload() + submitter = self._get_submitter(as_conf) + submitter.load_platforms(as_conf) + success = True + except BaseException as e: + error_message = str(e) + sleep(60*5) + pass + count=count+1 + if not success: + raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) + platform = submitter.platforms[platform_name.lower()] try: - as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.reload() - submitter = self._get_submitter(as_conf) - submitter.load_platforms(as_conf) - platform = submitter.platforms[platform_name.lower()] - platform.test_connection() - max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -644,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False err_exist = False retries = 3 -- GitLab From e065788458a9e0ec3077463c0ef24844e2eeebcc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:30:45 +0200 Subject: [PATCH 013/213] e --- autosubmit/autosubmit.py | 2 +- autosubmit/job/job.py | 2 +- autosubmit/platforms/slurmplatform.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..b299c7dcc 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2227,7 +2227,7 @@ class Autosubmit: "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( - e.job_name), 7014, e.message) + e.job_name), 7014, str(e)) except AutosubmitError as e: raise except AutosubmitCritical as e: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 325564bec..1068dca65 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while count < retries or success: + while (count < retries) or success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index cd96b21cc..5d31690c4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -362,8 +362,8 @@ class SlurmPlatform(ParamikoPlatform): return export + self._submit_hold_cmd + job_script else: if not hold: - self._submit_script_file.write( - export + self._submit_cmd + job_script + "\n") + write_this = export + self._submit_cmd + job_script +"\n" + self._submit_script_file.write(write_this) else: self._submit_script_file.write( export + self._submit_hold_cmd + job_script + "\n") -- GitLab From 
1d79e5d748f7e7071ec0d4650c1b16d11bff7c96 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:34:19 +0200 Subject: [PATCH 014/213] e --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1068dca65..9365e516f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while (count < retries) or success: + while (count < retries) or not success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() -- GitLab From 4ce7f18eaa5288980bffa600211e8c6cb884675e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:50:30 +0200 Subject: [PATCH 015/213] fixed message --- autosubmit/platforms/paramiko_submitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index c597274f7..acba2bcce 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -184,8 +184,9 @@ class ParamikoSubmitter(Submitter): None) remote_platform.custom_directives = parser.get_option(section, 'CUSTOM_DIRECTIVES', None) - Log.debug("Custom directives from platform.conf: {0}".format( - remote_platform.custom_directives)) + if remote_platform.custom_directives is not None and remote_platform.custom_directives != '' and remote_platform.custom_directives != 'None': + Log.debug("Custom directives from platform.conf: {0}".format( + remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, -- GitLab From e768dde8f1892ce6c5c75712a539f129c21fff7b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 10:53:49 +0200 Subject: [PATCH 016/213] conda fix --- 
docs/source/installation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 5dd60a136..9a90c4e54 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -172,9 +172,9 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. .. code-block:: bash # Download conda - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh./Miniconda3-py39_4.12.0-Linux-x86_64.sh + wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it - ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh # Download git apt install git -y -q # Download autosubmit -- GitLab From 578751f40bf1a910b3adcba0862dec86c455e6be Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 11:01:23 +0200 Subject: [PATCH 017/213] conda fix --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 9a90c4e54..64b314886 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -174,7 +174,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download conda wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it - chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh # Download git apt install git -y -q # Download autosubmit -- GitLab From 3cdfa7f700b99e04217cf77ad570ce332980fb9d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 7 Sep 2022 15:23:52 +0200 Subject: [PATCH 018/213] Wrapper is now fully independent from total and waiting jobs as expected #857 --- autosubmit/autosubmit.py | 5 +-- autosubmit/config/config_common.py | 9 ++-- autosubmit/job/job_packager.py | 52 +++++++++++----------- autosubmit/platforms/paramiko_submitter.py | 4 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b299c7dcc..6fd5932a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1372,8 +1372,8 @@ class Autosubmit: while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) - for job in job_list.get_uncompleted_and_not_waiting(): - job.status = Status.COMPLETED + #for job in job_list.get_uncompleted_and_not_waiting(): + # job.status = Status.COMPLETED job_list.update_list(as_conf, False) @staticmethod @@ -2071,7 +2071,6 @@ class Autosubmit: platform.open_submit_script() valid_packages_to_submit = [] # type: List[JobPackageBase] for package in packages_to_submit: - try: # If called from inspect command or -cw if only_wrappers or inspect: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index e3e9188a4..3f5c39a3b 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1600,7 +1600,9 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - return 
int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', self.get_total_jobs())) + #total_jobs = self.get_total_jobs() + #unlimited because wrapper should count as one + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', 999999999)) def get_max_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ @@ -1609,8 +1611,7 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', -1)) def get_max_wrapped_jobs_horizontal(self, wrapper_section_name="wrapper"): """ @@ -1620,7 +1621,7 @@ class AutosubmitConfig(object): :rtype: int """ max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', -1)) def get_min_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 54a6268c3..cfc1235e8 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -57,7 +57,12 @@ class JobPackager(object): # Submitted + Queuing Jobs for specific Platform queuing_jobs = jobs_list.get_queuing(platform) # We now consider the running jobs count - running_jobs_count = len(jobs_list.get_running(platform)) + running_jobs = jobs_list.get_running(platform) + running_by_id = dict() + for running_job in running_jobs: + running_by_id[running_job.id] = running_job + running_jobs_len = len(running_by_id.keys()) + queued_by_id = dict() for queued_job in queuing_jobs: queued_by_id[queued_job.id] = queued_job @@ -76,10 +81,9 @@ class JobPackager(object): # .total_jobs Maximum number of jobs at the 
same time self._max_jobs_to_submit = platform.total_jobs - queuing_jobs_len # Substracting running jobs - self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_count + self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_len self._max_jobs_to_submit = self._max_jobs_to_submit if self._max_jobs_to_submit > 0 else 0 - self.max_jobs = min(self._max_wait_jobs_to_submit, - self._max_jobs_to_submit) + self.max_jobs = min(self._max_wait_jobs_to_submit,self._max_jobs_to_submit) self.wrapper_type["wrapper"] = self._as_config.get_wrapper_type() self.wrapper_policy["wrapper"] = self._as_config.get_wrapper_policy() @@ -94,24 +98,15 @@ class JobPackager(object): self.jobs_in_wrapper[wrapper_section] = self._as_config.get_wrapper_jobs(wrapper_section) self.extensible_wallclock[wrapper_section] = int(self._as_config.get_extensible_wallclock(wrapper_section)) self.wrapper_info = [self.wrapper_type,self.wrapper_policy,self.wrapper_method,self.jobs_in_wrapper,self.extensible_wallclock] # to pass to job_packages - - - # True or False - - Log.debug( - "Number of jobs available: {0}", self._max_wait_jobs_to_submit) + Log.debug("Number of jobs available: {0}", self._max_wait_jobs_to_submit) if self.hold: - Log.debug("Number of jobs prepared: {0}", len( - jobs_list.get_prepared(platform))) + Log.debug("Number of jobs prepared: {0}", len(jobs_list.get_prepared(platform))) if len(jobs_list.get_prepared(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_prepared(platform))) + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len(jobs_list.get_prepared(platform))) else: - Log.debug("Number of jobs ready: {0}", len( - jobs_list.get_ready(platform, hold=False))) + Log.debug("Number of jobs ready: {0}", len(jobs_list.get_ready(platform, hold=False))) if len(jobs_list.get_ready(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_ready(platform))) + Log.debug("Jobs ready 
for {0}: {1}", self._platform.name, len(jobs_list.get_ready(platform))) self._maxTotalProcessors = 0 def compute_weight(self, job_list): @@ -210,8 +205,7 @@ class JobPackager(object): # Sort by Priority, highest first list_of_available = sorted( available_sorted, key=lambda k: k.priority, reverse=True) - num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len( - jobs_ready), self._max_jobs_to_submit) + num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len(jobs_ready), self._max_jobs_to_submit) # Take the first num_jobs_to_submit from the list of available jobs_to_submit_tmp = list_of_available[0:num_jobs_to_submit] #jobs_to_submit = [ @@ -248,6 +242,10 @@ class JobPackager(object): wrapper_limits["max_h"] = self._as_config.get_max_wrapped_jobs_horizontal(self.current_wrapper_section) if wrapper_limits["max"] < wrapper_limits["max_v"] * wrapper_limits["max_h"]: wrapper_limits["max"] = wrapper_limits["max_v"] * wrapper_limits["max_h"] + if wrapper_limits["max_v"] == -1: + wrapper_limits["max_v"] = wrapper_limits["max"] + if wrapper_limits["max_h"] == -1: + wrapper_limits["max_h"] = wrapper_limits["max"] if '&' not in section: if self._as_config.jobs_parser.has_option(section, 'DEPENDENCIES'): dependencies_keys = self._as_config.jobs_parser.get( @@ -552,7 +550,7 @@ class JobPackager(object): def _build_horizontal_packages(self, section_list, wrapper_limits, section): packages = [] horizontal_packager = JobPackagerHorizontal(section_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) + wrapper_limits["max"], self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) package_jobs = horizontal_packager.build_horizontal_package() @@ -585,11 +583,11 @@ class JobPackager(object): """ packages = [] for job in section_list: - if self.max_jobs > 0: + if wrapper_limits["max"] > 0: if job.packed is False: job.packed = True dict_jobs = 
self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) - job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, self.max_jobs, wrapper_limits, self._platform.max_wallclock) + job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock) jobs_list = job_vertical_packager.build_vertical_package(job) packages.append(JobPackageVertical(jobs_list, configuration=self._as_config,wrapper_section=self.current_wrapper_section,wrapper_info=wrapper_info)) @@ -605,7 +603,7 @@ class JobPackager(object): ## READY JOBS ## ## Create the horizontal ## horizontal_packager = JobPackagerHorizontal(jobs_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node) + wrapper_limits["max"], self._platform.processors_per_node) if self.wrapper_type[self.current_wrapper_section] == 'vertical-horizontal': return self._build_vertical_horizontal_package(horizontal_packager, jobs_resources) @@ -654,7 +652,7 @@ class JobPackager(object): horizontal_packager.wrapper_limits["max_by_section"][section] = horizontal_packager.wrapper_limits["max_by_section"][section] - 1 horizontal_packager.wrapper_limits["max"] = horizontal_packager.wrapper_limits["max"] - actual_wrapped_jobs for job in horizontal_package: - job_list = JobPackagerVerticalSimple([job], job.wallclock, self.max_jobs, + job_list = JobPackagerVerticalSimple([job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, self._platform.max_wallclock).build_vertical_package(job) @@ -706,7 +704,7 @@ class JobPackagerVertical(object): :rtype: List() of Job Object \n """ # self.jobs_list starts as only 1 member, but wrapped jobs are added in the recursion - if len(self.jobs_list) >= self.max_jobs or len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= 
self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: + if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: return self.jobs_list child = self.get_wrappable_child(job) # If not None, it is wrappable @@ -897,7 +895,7 @@ class JobPackagerHorizontal(object): for section in jobs_by_section: current_package_by_section[section] = 0 for job in jobs_by_section[section]: - if self.max_jobs > 0 and len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: + if len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: if int(job.tasks) != 0 and int(job.tasks) != int(self.processors_node) and \ int(job.tasks) < job.total_processors: nodes = int( diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index acba2bcce..1f577426f 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -159,8 +159,10 @@ class ParamikoSubmitter(Submitter): asconf.get_max_processors()) remote_platform.max_waiting_jobs = int(parser.get_option(section, 'MAX_WAITING_JOBS', asconf.get_max_waiting_jobs())) - remote_platform.total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', + totaljobs = int(parser.get_option(section, 'TOTALJOBS', asconf.get_total_jobs())) + total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', asconf.get_total_jobs())) + remote_platform.total_jobs = min(min(totaljobs, total_jobs),asconf.get_total_jobs()) remote_platform.hyperthreading = parser.get_option(section, 'HYPERTHREADING', 'false').lower() remote_platform.project = 
parser.get_option( -- GitLab From f8a51172cb2f483cac5c013cfc090de213de1353 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 11:45:53 +0200 Subject: [PATCH 019/213] error message fix --- autosubmit/platforms/paramiko_platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e57512f55..e1b36f116 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -901,7 +901,7 @@ class ParamikoPlatform(Platform): except AutosubmitError as e: raise except IOError as e: - raise AutosubmitError(e.message,6016) + raise AutosubmitError("IO issues, something seems wrong with {0}".format(self.name),6016,e.message) except BaseException as e: raise AutosubmitError('Command {0} in {1} warning: {2}'.format( command, self.host, '\n'.join(stderr_readlines)), 6005, e.message) -- GitLab From 835215e84ce616477a46c4233eff5a5ec41e7114 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 15:45:24 +0200 Subject: [PATCH 020/213] docs update --- docs/source/devel_proj.rst | 19 ++++++++++++++++++- docs/source/faq.rst | 4 +++- .../usage/configuration/new_platform.rst | 4 ++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/source/devel_proj.rst b/docs/source/devel_proj.rst index 056fb3265..17caddcf5 100644 --- a/docs/source/devel_proj.rst +++ b/docs/source/devel_proj.rst @@ -699,4 +699,21 @@ The custom directives can be used for multiple parameters at the same time using # [test [80] // small [40] // large [1040] MAX_PROCESSORS = 80 # test [40] / small [40] // large [40] - PROCESSORS_PER_NODE = 40 \ No newline at end of file + PROCESSORS_PER_NODE = 40 + +Controling the number of active concurrent tasks in an experiment +---------------------------------------------------------------------- + +In some cases, you may want to control the number of concurrent tasks/jobs that can be active in an 
experiment. + +To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS`` and ``MAX_WAITING_JOBS`` variable in the ``conf/autosubmit_cxxx.conf`` file. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + # Maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 + diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 7d1e31b34..b659c6bdc 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -155,7 +155,9 @@ Minor errors - Error codes [6000+] +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6013 | Configuration issues | Check log output for more info | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ -| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6015 | Submission failed | Automatically, if there aren't bigger issues | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6016 | Temporal connection issues | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ diff --git a/docs/source/usage/configuration/new_platform.rst b/docs/source/usage/configuration/new_platform.rst index 173dafae4..675d4edc6 100644 --- a/docs/source/usage/configuration/new_platform.rst 
+++ b/docs/source/usage/configuration/new_platform.rst @@ -53,9 +53,9 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be waiting in this platform. +* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. -* TOTAL_JOBS: maximum number of jobs to be running at the same time in this platform. +* TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. * CUSTOM_DIRECTIVES: Custom directives for the resource manager of this platform. -- GitLab From 50b2db0ce581933ea7c3f9e2f510ebee574fbc0b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 16 Sep 2022 15:48:55 +0200 Subject: [PATCH 021/213] Now critical issues messages is always shown --- autosubmit/autosubmit.py | 7 ++++--- autosubmit/config/config_common.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6fd5932a3..355260a76 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4341,12 +4341,13 @@ class Autosubmit: raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": - submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) + try: + submitter = Autosubmit._get_submitter(as_conf) + submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform", 7014, e.message) + raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 3f5c39a3b..cc8aa3e1c 100644 --- 
a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -540,6 +540,8 @@ class AutosubmitConfig(object): # In case that there are critical errors in the configuration, Autosubmit won't continue. if running_time is True: raise AutosubmitCritical(e.message, e.code, e.trace) + else: + Log.printlog(e.message+"\n") except Exception as e: raise AutosubmitCritical( "There was an error while showing the config log messages", 7014, str(e)) -- GitLab From 022a881c7c83e2d40665d5267a984206985a4db0 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 21 Sep 2022 16:09:27 +0200 Subject: [PATCH 022/213] Patch for db_fix --- autosubmit/autosubmit.py | 20 ++++++++++++-------- requeriments.txt | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 355260a76..60b064de9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1556,7 +1556,8 @@ class Autosubmit: exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) except Exception as e: # This error is important - raise AutosubmitCritical("Error while processing historical database.", 7005, str(e)) + Log.printlog("Error while processing historical database.", 7005, str(e)) + try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -4224,13 +4225,16 @@ class Autosubmit: except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) - Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) + try: + Autosubmit.database_fix(expid) + 
exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), + as_conf.get_full_config_as_json(), + job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: if group_by: status = list() diff --git a/requeriments.txt b/requeriments.txt index d57974475..c34451db2 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +configparser argparse>=1.2,<2 python-dateutil>2 matplotlib -- GitLab From d680e3652664c7acc4a05a80aef869392667d8c8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 22 Sep 2022 09:53:04 +0200 Subject: [PATCH 023/213] Patch for db_fix (1) --- autosubmit/autosubmit.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 60b064de9..153c0c8a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1776,9 +1776,22 @@ class Autosubmit: job_list.update_list(as_conf, submitter=submitter) job_list.save() # Safe spot to store changes - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - if len(job_changes_tracker) > 0: - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except BaseException as e: + Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", + Log.INFO) + try: + Autosubmit.database_fix(expid) + exp_history 
= ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} if Autosubmit.exit: job_list.save() @@ -1949,8 +1962,16 @@ class Autosubmit: raise AutosubmitCritical("There is a bug in the code, please contact via git",7070,e.message) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.printlog() # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True -- GitLab From 6b7ff9ef2c0f5cd355530e4a3971ae382e5dedb9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Sep 2022 15:05:21 +0200 Subject: [PATCH 024/213] Does an sql dump everytime a change is detected. 
Then db_fix load this sql dump --- autosubmit/autosubmit.py | 102 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 153c0c8a3..75baab6de 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -58,6 +58,7 @@ import locale from distutils.util import strtobool from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set +import sqlite3 try: import dialog @@ -71,6 +72,7 @@ import tarfile import time import copy import os +import glob import pwd import sys import shutil @@ -1553,11 +1555,14 @@ class Autosubmit: # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) except Exception as e: - # This error is important - Log.printlog("Error while processing historical database.", 7005, str(e)) - + try: + Autosubmit.database_fix(expid) + # This error is important + except: + pass try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -1781,6 +1786,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) @@ -1790,6 
+1796,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} @@ -1965,13 +1972,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) except: - Log.printlog() + pass # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True @@ -3901,6 +3907,17 @@ class Autosubmit: raise @staticmethod + def database_backup(expid): + try: + database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) + backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) + Log.info("Backing up jobs_data...") + subprocess.call(command, shell=True) + Log.result("Jobs_data database backup completed.") + except BaseException as e: + Log.info("Jobs_data database backup failed.") + @staticmethod def database_fix(expid): """ Database methods. Performs a sql dump of the database and restores it. 
@@ -3912,52 +3929,31 @@ class Autosubmit: """ os.umask(0) # Overrides user permissions current_time = int(time.time()) + corrupted_db_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_corrupted.db".format(expid)) + database_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) - database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_{1}.db".format(expid, str(current_time))) - dump_file_name = 'job_data_{0}_{1}.sql'.format(expid, current_time) + database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + dump_file_name = 'job_data_{0}.sql'.format(expid, current_time) dump_file_path = os.path.join(BasicConfig.JOBDATA_DIR, dump_file_name) - bash_command = 'sqlite3 {0} .dump > {1}'.format(database_path, dump_file_path) + bash_command = 'cat {1} | sqlite3 {0}'.format(database_path, dump_file_path) try: - if os.path.exists(database_path): + if os.path.exists(database_path): + result = os.popen("mv {0} {1}".format(database_path, corrupted_db_path)).read() + time.sleep(10) + Log.info("Original database moved.") + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + Log.info("Restoring from sql") result = os.popen(bash_command).read() - if result is not None and os.path.exists(dump_file_path): - Log.info("sqldump {0} created".format(dump_file_path)) - Log.info( - "Backing up original database {0}".format(database_path)) - result = os.popen("mv {0} {1}".format(database_path, database_backup_path)).read() - time.sleep(10) - if result is not None and not os.path.exists(database_path): - Log.info("Original database moved.") - Log.info("Restoring from sqldump") - HUtils.create_file_with_full_permissions(database_path) - result = os.popen("cat {0} | sqlite3 {1}".format( - dump_file_path, database_path)).read() - time.sleep(10) - if result is not None and 
os.path.exists(database_path): - Log.info( - "Database {0} restored.".format(database_path)) - Log.info("Deleting sqldump.") - result = os.popen( - "rm {0}".format(dump_file_path)).read() - sleep(5) - if result is not None and not os.path.exists(dump_file_path): - ExperimentHistory(expid).initialize_database() - Log.info("sqldump file deleted.") - Log.result( - "The database {0} has been fixed.".format(database_path)) - else: - raise Exception( - "The sqldump file could not be removed.") - else: - raise Exception( - "It was not possible to restore the sqldump file.") - else: - raise Exception( - "It was not possible to delete the original database.") - else: - raise Exception("The sqldump file couldn't be created.") - else: - raise Exception("The database file doesn't exist.") + except: + Log.warning("It was not possible to restore the jobs_data.db file... , a new blank db will be created") + result = os.popen("rm {0}".format(database_path)).read() + + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() except Exception as exp: Log.critical(str(exp)) @@ -4243,17 +4239,12 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), 
as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: @@ -5018,6 +5009,7 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) -- GitLab From 7213cb18e2abe25090b6a75f440eda6e730b4302 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Sep 2022 09:21:49 +0200 Subject: [PATCH 025/213] database changes #870 --- autosubmit/autosubmit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 75baab6de..337247605 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3912,11 +3912,11 @@ class Autosubmit: database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) - Log.info("Backing up jobs_data...") + Log.debug("Backing up jobs_data...") subprocess.call(command, shell=True) - Log.result("Jobs_data database backup completed.") + Log.debug("Jobs_data database backup completed.") except BaseException as e: - Log.info("Jobs_data database backup failed.") + Log.debug("Jobs_data database backup failed.") @staticmethod def database_fix(expid): """ -- GitLab From f1f3ea23923b2eabb669cdf3ecb517915c9365d9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:03 +0200 Subject: [PATCH 
026/213] #877 conda typo --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 64b314886..4f68c3788 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -183,7 +183,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. # Create conda environment conda env update -f environment.yml -n autosubmit python=2 # Activate env - source activate autosubmit + conda activate autosubmit # Test autosubmit autosubmit -v # Configure autosubmitrc and install database as indicated in this doc -- GitLab From 2867216631fe6d9c1017af331afc13c0635f2dc3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:53 +0200 Subject: [PATCH 027/213] #877 changed version to the lastest one (3.14.0b) --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 4f68c3788..7159ac7c0 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -178,7 +178,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download git apt install git -y -q # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b cd autosubmit # Create conda environment conda env update -f environment.yml -n autosubmit python=2 -- GitLab From e41fab2383d907df115cbeaf1310755e60a0878c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 13:03:40 +0200 Subject: [PATCH 028/213] #inline comments, fixes for slrum --- autosubmit/autosubmit.py | 20 ++++-- autosubmit/platforms/paramiko_platform.py | 74 ++++++++++++++++------- test/regression/tests_runner.py | 1 + 3 files changed, 69 insertions(+), 26 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 337247605..09ce96335 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1584,7 +1584,7 @@ class Autosubmit: if unparsed_two_step_start != "": job_list.parse_jobs_by_filter(unparsed_two_step_start) - main_loop_retrials = 3650 # Hard limit of tries 3650 tries at 15-120seconds sleep each try + main_loop_retrials = 11250*2 # Hard limit of tries ( 48h min 72h max), 2 retrials per stop # establish the connection to all platforms Autosubmit.restore_platforms(platforms_to_test) @@ -1822,7 +1822,7 @@ class Autosubmit: Log.printlog("Error trying to store failed job count",Log.WARNING) Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: - delay = min(15 * consecutive_retrials, 120) + delay = min(15 * consecutive_retrials, 30) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1959,7 +1959,7 @@ class Autosubmit: except BaseException: reconnected = False if main_loop_retrials <= 0: - raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of 4hours reached", 7051, e.message) + raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of {0} 
retrials reached".format(main_loop_retrials), 7051, e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) except portalocker.AlreadyLocked: @@ -3322,7 +3322,12 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser @staticmethod def describe(experiment_id): """ @@ -3497,6 +3502,7 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") + parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3591,6 +3597,8 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) + parser = Autosubmit.removeInlineComments(parser) + if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') if parser.has_option('database', 'filename'): @@ -3723,11 +3731,15 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) + parser = Autosubmit.removeInlineComments(parser) + parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", width=50, height=5) os.system('clear') + + except (IOError, OSError) as e: raise AutosubmitCritical( "Can not write config file", 7012, e.message) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e1b36f116..fb9059915 100644 --- 
a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -550,35 +550,63 @@ class ParamikoPlatform(Platform): cmd = self.get_checkAlljobs_cmd(job_list_cmd) sleep_time = 5 sleep(sleep_time) - self.send_command(cmd) - while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + slurm_error = False + e_msg = "" + try: self.send_command(cmd) - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - retries -= 1 - sleep(sleep_time) - sleep_time = sleep_time + 5 + except AutosubmitError as e: + e_msg = e.trace+" "+e.message + slurm_error = True + if not slurm_error: + while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + try: + self.send_command(cmd) + except AutosubmitError as e: + e_msg = e.trace + " " + e.message + slurm_error = True + break + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + retries -= 1 + sleep(sleep_time) + sleep_time = sleep_time + 5 + job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') in_queue_jobs = [] list_queue_jobid = "" for job in job_list: - job_id = job.id - job_status = self.parse_Alljobs_output(job_list_status, job_id) - while len(job_status) <= 0 and retries >= 0: - retries -= 1 - self.send_command(cmd) - job_list_status = self.get_ssh_output() + if not slurm_error: + job_id = job.id job_status = self.parse_Alljobs_output(job_list_status, job_id) - if len(job_status) <= 0: - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - sleep(sleep_time) - sleep_time = sleep_time + 5 - # URi: define status list in HPC Queue Class + while len(job_status) <= 0 and retries >= 0: + retries -= 1 + 
self.send_command(cmd) + job_list_status = self.get_ssh_output() + job_status = self.parse_Alljobs_output(job_list_status, job_id) + if len(job_status) <= 0: + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + sleep(sleep_time) + sleep_time = sleep_time + 5 + # URi: define status list in HPC Queue Class + else: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -595,12 +623,12 @@ class ParamikoPlatform(Platform): elif retries == 0: job_status = Status.COMPLETED job.update_status(remote_logs) - else: job_status = Status.UNKNOWN Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) job.new_status = job_status + reason = str() if self.type == 'slurm' and len(in_queue_jobs) > 0: cmd = self.get_queue_status_cmd(list_queue_jobid) @@ -639,6 +667,8 @@ class ParamikoPlatform(Platform): 'check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status + if slurm_error: + raise AutosubmitError(e_msg, 6000) def get_jobid_by_jobname(self,job_name,retries=2): """ diff --git a/test/regression/tests_runner.py b/test/regression/tests_runner.py index ffd490888..ab186e849 100644 --- a/test/regression/tests_runner.py +++ 
b/test/regression/tests_runner.py @@ -79,6 +79,7 @@ def run(current_experiment_id, only_list=None, exclude_list=None, max_threads=5) tests_parser.optionxform = str tests_parser.read(tests_parser_file) + # Resetting the database clean_database(db_path) create_database() -- GitLab From 540e8a02d20e486d3302df07ab938aaa778d3eb8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 15:43:21 +0200 Subject: [PATCH 029/213] Remove inline comments working #870 --- autosubmit/autosubmit.py | 13 +++---------- autosubmit/config/config_common.py | 9 +++++++++ autosubmit/config/config_parser.py | 5 ++++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 09ce96335..82e4b44e9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3322,12 +3322,7 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - @staticmethod - def removeInlineComments(cfgparser): - for section in cfgparser.sections(): - for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) - return cfgparser + @staticmethod def describe(experiment_id): """ @@ -3502,7 +3497,6 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") - parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3597,7 +3591,6 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) - parser = Autosubmit.removeInlineComments(parser) if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') @@ -3731,8 +3724,6 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 
'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) - parser = Autosubmit.removeInlineComments(parser) - parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", @@ -5398,10 +5389,12 @@ class Autosubmit: raise AutosubmitCritical('Can not test a RERUN experiment', 7014) content = open(as_conf.experiment_file).read() + if random_select: if hpc is None: platforms_parser = as_conf.get_parser( ConfigParserFactory(), as_conf.platforms_file) + test_platforms = list() for section in platforms_parser.sections(): if platforms_parser.get_option(section, 'TEST_SUITE', 'false').lower() == 'true': diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index cc8aa3e1c..74dcc3e1e 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1759,6 +1759,13 @@ class AutosubmitConfig(object): commit = self.get_git_project_commit() return origin_exists and (branch is not None or commit is not None) + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser + @staticmethod def get_parser(parser_factory, file_path): """ @@ -1794,5 +1801,7 @@ class AutosubmitConfig(object): raise Exception( "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) + #remove inline comments + parser = AutosubmitConfig.removeInlineComments(parser) return parser diff --git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 87b28456a..99d92fd8c 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -14,8 +14,11 @@ class ConfigParserFactory: def __init__(self): pass + + def create_parser(self): - return ConfigParser() + parser = ConfigParser() + return parser class ConfigParser(ConfPar, object): -- GitLab From 
31c925f9c2718cba9f48a62eae37dc589f490eb3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 16:04:02 +0200 Subject: [PATCH 030/213] setstatus doesn't crash anymore if the id does not exists --- autosubmit/autosubmit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 82e4b44e9..37aa84475 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4467,7 +4467,10 @@ class Autosubmit: if job.status in [Status.SUBMITTED, Status.QUEUING, Status.HELD] and final_status not in [Status.QUEUING, Status.HELD, Status.SUSPENDED]: job.hold = False if job.platform_name and job.platform_name.lower() != "local": - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + try: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + except: + pass elif job.status in [Status.QUEUING, Status.RUNNING, Status.SUBMITTED] and final_status == Status.SUSPENDED: if job.platform_name and job.platform_name.lower() != "local": job.platform.send_command("scontrol hold " + "{0}".format(job.id), ignore_log=True) -- GitLab From 25c11e3c0521957fec64cec162ae96001b2bab8a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 11:08:39 +0200 Subject: [PATCH 031/213] Fixed e message error --- autosubmit/autosubmit.py | 14 +++++++------- autosubmit/config/config_common.py | 4 ++-- autosubmit/git/autosubmit_git.py | 4 ++-- autosubmit/job/job.py | 8 ++++---- autosubmit/job/job_dict.py | 2 +- autosubmit/job/job_list.py | 6 +++--- autosubmit/monitor/monitor.py | 2 +- autosubmit/platforms/paramiko_platform.py | 16 ++++++++-------- test/regression/tests_utils.py | 2 +- 9 files changed, 29 insertions(+), 29 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 37aa84475..2fca7cb7b 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -590,7 +590,7 @@ class Autosubmit: except Exception as e: 
if type(e) is SystemExit: # Version keyword force an exception in parse arg due and os_exit(0) but the program is succesfully finished - if e.message == 0: + if str(e) == 0: print(Autosubmit.autosubmit_version) os._exit(0) raise AutosubmitCritical( @@ -836,28 +836,28 @@ class Autosubmit: if ret: Log.result("Experiment {0} deleted".format(expid_delete)) except BaseException as e: - error_message += 'Can not delete experiment entry: {0}\n'.format(e.message) + error_message += 'Can not delete experiment entry: {0}\n'.format(str(e)) Log.info("Removing experiment directory...") try: shutil.rmtree(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)) except BaseException as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) try: Log.info("Removing Structure db...") structures_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, "structure_{0}.db".format(expid_delete)) if os.path.exists(structures_path): os.remove(structures_path) except BaseException as e: - error_message += 'Can not delete structure: {0}\n'.format(e.message) + error_message += 'Can not delete structure: {0}\n'.format(str(e)) try: Log.info("Removing job_data db...") job_data_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid_delete)) if os.path.exists(job_data_path): os.remove(job_data_path) except BaseException as e: - error_message += 'Can not delete job_data: {0}\n'.format(e.message) + error_message += 'Can not delete job_data: {0}\n'.format(str(e)) except OSError as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) else: if not eadmin: raise AutosubmitCritical( @@ -1811,7 +1811,7 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - 
consecutive_retrials = 0 + consecutive_retrials = 1 failed_names = {} Log.info("Storing failed job count...") try: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 74dcc3e1e..ddbb04c78 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -513,11 +513,11 @@ class AutosubmitConfig(object): self.reload() except IOError as e: raise AutosubmitError( - "I/O Issues con config files", 6016, e.message) + "I/O Issues con config files", 6016, str(e)) except (AutosubmitCritical, AutosubmitError) as e: raise except BaseException as e: - raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,e.message) + raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,str(e)) # Annotates all errors found in the configuration files in dictionaries self.warn_config and self.wrong_config. self.check_expdef_conf() self.check_platforms_conf() diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 817b5e09b..c191c21df 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -60,7 +60,7 @@ class AutosubmitGit: shell=True) except subprocess.CalledProcessError as e: raise AutosubmitCritical( - "Failed to retrieve git info ...", 7064, e.message) + "Failed to retrieve git info ...", 7064, str(e)) if output: Log.info("Changes not committed detected... 
SKIPPING!") raise AutosubmitCritical("Commit needed!", 7013) @@ -231,7 +231,7 @@ class AutosubmitGit: output_1 = subprocess.check_output(command_1, shell=True) except BaseException as e: submodule_failure = True - Log.printlog("Trace: {0}".format(e.message), 6014) + Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( "Submodule {0} has a wrong configuration".format(submodule), 6014) else: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9365e516f..6653c51f9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -600,13 +600,13 @@ class Job(object): self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - e.message, self.name)) + str(e), self.name)) except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) return @threaded @@ -656,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 0b16d29af..d0aef9f42 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -126,7 +126,7 @@ class DicJobs: except BaseException as e: raise AutosubmitCritical( "Wrong format for {1} parameter in section {0}".format(section,called_from), 7011, - e.message) + str(e)) pass return parsed_list def read_section(self, section, priority, default_job_type, jobs_data=dict()): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 395c97e4c..ae52a0c78 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -249,7 +249,7 @@ class JobList(object): else: self._ordered_jobs_by_date_member[wrapper_section] = {} except BaseException as e: - raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,e.message) + raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,str(e)) pass @@ -1419,11 +1419,11 @@ class JobList(object): self._persistence_file, self._job_list if self.run_members is None or job_list is None else job_list) pass except BaseException as e: - raise AutosubmitError(e.message,6040,"Failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Failure while saving the job_list") except AutosubmitError as e: raise except BaseException as e: - raise AutosubmitError(e.message,6040,"Unknown failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Unknown failure while saving the job_list") def backup_save(self): diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 55c60156a..9556e7d3d 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -353,7 +353,7 @@ class Monitor: except: pass - 
Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only writted in txt".format(e.message),7014) + Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only written in txt".format(e.message),7014) def generate_output_txt(self, expid, joblist, path, classictxt=False, job_list_object=None): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index fb9059915..1c1177510 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -112,7 +112,7 @@ class ParamikoPlatform(Platform): except EOFError as e: self.connected = False raise AutosubmitError("[{0}] not alive. Host: {1}".format( - self.name, self.host), 6002, e.message) + self.name, self.host), 6002, str(e)) except (AutosubmitError,AutosubmitCritical,IOError): self.connected = False raise @@ -136,7 +136,7 @@ class ParamikoPlatform(Platform): self.host.split(',')[0]), 6002) else: raise AutosubmitCritical( - "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,e.message) + "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: self.connect(True) @@ -155,7 +155,7 @@ class ParamikoPlatform(Platform): raise except Exception as e: raise AutosubmitCritical( - 'Cant connect to this platform due an unknown error', 7050, e.message) + 'Cant connect to this platform due an unknown error', 7050, str(e)) def threaded(fn): def wrapper(*args, **kwargs): @@ -219,12 +219,12 @@ class ParamikoPlatform(Platform): elif "name or service not known" in e.strerror.lower(): raise SSHException(" {0} doesn't accept remote connections. 
Check if there is an typo in the hostname".format(self.host)) else: - raise AutosubmitError("File can't be located due an slow connection", 6016, e.message) + raise AutosubmitError("File can't be located due an slow connection", 6016, str(e)) except BaseException as e: self.connected = False - if "Authentication failed." in e.message: + if "Authentication failed." in str(e): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( - self._host_config['hostname']), 7050, e.message) + self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: self.restore_connection(reconnect=True) else: @@ -284,7 +284,7 @@ class ParamikoPlatform(Platform): return True except IOError as e: raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) + self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -358,7 +358,7 @@ class ParamikoPlatform(Platform): except BaseException as e: Log.error('Could not remove file {0} due a wrong configuration'.format( os.path.join(self.get_files_path(), filename))) - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: raise AutosubmitCritical( "Wrong User or invalid .ssh/config. 
Or invalid user in platform.conf or public key not set ", 7051, e.message) diff --git a/test/regression/tests_utils.py b/test/regression/tests_utils.py index 297fb8f75..53ead0dd5 100644 --- a/test/regression/tests_utils.py +++ b/test/regression/tests_utils.py @@ -23,7 +23,7 @@ def check_cmd(command, path=BIN_PATH, verbose='AS_TEST_VERBOSE' in os.environ): except subprocess.CalledProcessError as e: if verbose: - print e.output + print str(e) return False -- GitLab From 5640508259cc54472d91afa33ea3e2e6eb60e1a9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:16:06 +0200 Subject: [PATCH 032/213] log error --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index ddbb04c78..4b683f1e4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1763,7 +1763,10 @@ class AutosubmitConfig(object): def removeInlineComments(cfgparser): for section in cfgparser.sections(): for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + try: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + except: + pass return cfgparser @staticmethod -- GitLab From 3d42d2e3f4f5af861b9244b88334ed92ee46403f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:33:32 +0200 Subject: [PATCH 033/213] CUSTOM directive has # crashing with the removeinlinecomments --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 4b683f1e4..50c4d69e8 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1764,7 +1764,10 @@ class AutosubmitConfig(object): for section in cfgparser.sections(): for item in cfgparser.items(section): try: - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + if 
str(item[0]).upper() == "CUSTOM_DIRECTIVES": + pass + else: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) except: pass return cfgparser -- GitLab From b6609fb36022d6d6c1a8fceafed01083246b3e54 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:01:34 +0200 Subject: [PATCH 034/213] Changed delete message, added complete list of directories --- autosubmit/autosubmit.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 2fca7cb7b..0720672e7 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -818,6 +818,16 @@ class Autosubmit: :return: True if succesfully deleted, False otherwise :rtype: boolean """ + message = "The {0} experiment was removed from the local disk and from the database.".format(expid_delete) + message+= " Note that this action does not delete any data written by the experiment.\n" + message+= "Complete list of files/directories deleted:\n" + for root, dirs, files in os.walk(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): + for dir in dirs: + message += os.path.join(root, dir) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, + "structure_{0}.db".format(expid_delete)) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, + "job_data_{0}.db".format(expid_delete)) + "\n" owner,eadmin,currentOwner = Autosubmit._check_ownership(expid_delete) if expid_delete == '' or expid_delete is None and not os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR,expid_delete)): Log.printlog("Experiment directory does not exist.",Log.WARNING) @@ -865,6 +875,7 @@ class Autosubmit: else: raise AutosubmitCritical( 'Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7012) + Log.printlog(message, Log.RESULT) except Exception as e: # Avoid calling Log at this point since it is possible that tmp folder is already deleted. 
error_message += "Couldn't delete the experiment".format(e.message) -- GitLab From 70f6711de4d5cedba985e576472bdbf6ed559e8e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:19:47 +0200 Subject: [PATCH 035/213] disable inline delete --- autosubmit/config/config_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 50c4d69e8..63b31483d 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1808,6 +1808,6 @@ class AutosubmitConfig(object): "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) #remove inline comments - parser = AutosubmitConfig.removeInlineComments(parser) + #parser = AutosubmitConfig.removeInlineComments(parser) return parser -- GitLab From 2942e7e6462fa9c136c31a42bf62858c47db592a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 16:11:47 +0200 Subject: [PATCH 036/213] Fixed node missconfiguration slurm message not being detected correclty --- autosubmit/autosubmit.py | 10 ++++++---- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 4 +++- autosubmit/platforms/slurmplatform.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0720672e7..48e5b2e28 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2191,11 +2191,11 @@ class Autosubmit: platform.cancel_job(id) jobs_id = None platform.connected = False - if type(e.trace) is not None: - has_trace_bad_parameters = e.trace.lower().find("bad parameters") != -1 + if e.trace is not None: + has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 else: has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or 
e.message.lower().find("scheduler is not installed") != -1: + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2206,7 +2206,9 @@ class Autosubmit: else: error_message+="Check that {1} platform has set the correct scheduler. Sections that could be affected: {0}".format( error_msg[:-1], platform.name) - raise AutosubmitCritical(error_message,7014,e.message+"\n"+e.trace) + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) except IOError as e: raise AutosubmitError( "IO issues ", 6016, e.message) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 52afa70cc..a3a6a3b58 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -155,7 +155,7 @@ class JobPackageBase(object): exit=True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 1f577426f..12e1e70bc 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -203,6 +203,8 @@ class ParamikoSubmitter(Submitter): if parser.has_option(section, 'SERIAL_PLATFORM'): platforms[section.lower()].serial_platform = 
platforms[parser.get_option(section, 'SERIAL_PLATFORM', - None).lower()] + None)] + if platforms[section.lower()].serial_platform is not None: + platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 5d31690c4..d757256a4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -466,7 +466,7 @@ class SlurmPlatform(ParamikoPlatform): else: retries = 9999 except BaseException as e: # Unrecoverable error - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 -- GitLab From df2165954c91e66e713605cbc1d74644c207abaa Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 11:16:06 +0200 Subject: [PATCH 037/213] Added include_members and chunks #748 --- autosubmit/job/job_dict.py | 36 ++++++++++++++++++++++++++++++------ test/unit/test_dic_jobs.py | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index d0aef9f42..b7e6b4a6d 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -152,11 +152,19 @@ class DicJobs: elif running == 'date': self._create_jobs_startdate(section, priority, frequency, default_job_type, jobs_data,splits) elif running == 'member': - self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits,self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits, \ + self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + elif running == 'chunk': 
synchronize = self.get_option(section, "SYNCHRONIZE", None) delay = int(self.get_option(section, "DELAY", -1)) - self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data,excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"),excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data, \ + excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"), \ + excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + included_chunks=self.parse_relation(section,False,self.get_option(section, "INCLUDED_CHUNKS", []),"INCLUDED_CHUNKS"), \ + included_members=self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + pass def _create_jobs_once(self, section, priority, default_job_type, jobs_data=dict(),splits=0): @@ -218,7 +226,7 @@ class DicJobs: - def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[]): + def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[],included_members=[]): """ Create jobs to be run once per member @@ -242,11 +250,18 @@ class DicJobs: count = 0 if splits > 0: for member in self._member_list: - if self._member_list.index(member) not in excluded_members: - tmp_dic[section][date][member] = [] + if len(included_members) == 0: + if self._member_list.index(member) not in excluded_members: + tmp_dic[section][date][member] = [] + else: + if self._member_list.index(member) in included_members: + tmp_dic[section][date][member] = [] for member in self._member_list: if 
self._member_list.index(member) in excluded_members: continue + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue count += 1 if count % frequency == 0 or count == len(self._member_list): if splits <= 0: @@ -259,7 +274,7 @@ class DicJobs: - def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[]): + def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]): """ Create jobs to be run once per chunk @@ -282,6 +297,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): @@ -311,6 +329,9 @@ class DicJobs: for date in self._date_list: self._dic[section][date] = dict() for member in self._member_list: + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue if self._member_list.index(member) in excluded_members: continue self._dic[section][date][member] = dict() @@ -318,6 +339,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 5565c9328..39f7690b2 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -123,7 +123,7 @@ class TestDicJobs(TestCase): self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() 
self.dictionary._create_jobs_member.assert_not_called() - self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[]) + self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]) def test_dic_creates_right_jobs_by_startdate(self): # arrange -- GitLab From 6127001e9093604718f0f1546b4de2a0eef92bb7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:17:17 +0200 Subject: [PATCH 038/213] Bugfix timeout #812 --- autosubmit/platforms/locplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 3fe62f5cc..e7734b133 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -83,7 +83,7 @@ class LocalPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From 99aec684a83eeafd9db75d2f9f9c0378f23ef9e9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:29:32 +0200 Subject: [PATCH 039/213] Erased wrong info about TOTAL_JOBS --- .../usage/configuration/new_platform.rst | 2 +- docs/source/usage/run_modes/wrappers.rst | 34 ++----------------- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/docs/source/usage/configuration/new_platform.rst b/docs/source/usage/configuration/new_platform.rst index 675d4edc6..971778061 100644 --- a/docs/source/usage/configuration/new_platform.rst +++ b/docs/source/usage/configuration/new_platform.rst @@ -53,7 
+53,7 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. +* MAX_WAITING_JOBS: Maximum number of jobs to be queuing or submitted in this platform. * TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. diff --git a/docs/source/usage/run_modes/wrappers.rst b/docs/source/usage/run_modes/wrappers.rst index 8085e4884..388c215ef 100644 --- a/docs/source/usage/run_modes/wrappers.rst +++ b/docs/source/usage/run_modes/wrappers.rst @@ -14,34 +14,6 @@ At the moment there are 4 types of wrappers that can be used depending on the ex When using the wrapper, it is useful to be able to visualize which packages are being created. So, when executing *autosubmit monitor cxxx*, a dashed box indicates the jobs that are wrapped together in the same job package. -How to configure -======================== - -In ``autosubmit_cxxx.conf``, regardless of the wrapper type, you need to make sure that the values of the variables **MAXWAITINGJOBS** and **TOTALJOBS** are increased according to the number of jobs expected to be waiting/running at the same time in your experiment. - -For example: - -.. code-block:: ini - - [config] - EXPID = .... - AUTOSUBMIT_VERSION = 3.13.0 - ... - - MAXWAITINGJOBS = 100 - TOTALJOBS = 100 - ... - -and below the [config] block, add the wrapper directive, indicating the wrapper type: - -.. code-block:: ini - - [wrapper] - TYPE = - -You can also specify which job types should be wrapped. This can be done using the **JOBS_IN_WRAPPER** parameter. -It is only required for the vertical-mixed type (in which the specified job types will be wrapped together), so if nothing is specified, all jobs will be wrapped. -By default, jobs of the same type will be wrapped together, as long as the constraints are satisfied. 
Number of jobs in a package *************************** @@ -57,7 +29,7 @@ Number of jobs in a package - **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` - - If **MAX_WRAPPED** is not defined, then **TOTALJOBS** is used by default + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. - **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain - If not defined, it considers that **MIN_WRAPPED** is 2. - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. @@ -241,7 +213,7 @@ In `autosubmit_cxxx.conf`: # JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM # METHOD : Select between MACHINESFILES or Shared-Memory. # MIN_WRAPPED set the minim number of jobs that should be included in the wrapper. DEFAULT = 2 - # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS + # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = 99999999999 # Policy : Select the behaviour of the inner jobs Strict/Flexible/Mixed # EXTEND_WALLCLOCK: Allows to extend the wallclock by the max wallclock of the horizontal package (max inner job). Values are integer units (0,1,2) # RETRIALS : Enables a retrial mechanism for vertical wrappers, or default retrial mechanism for the other wrappers @@ -250,7 +222,7 @@ In `autosubmit_cxxx.conf`: TYPE = Vertical #REQUIRED JOBS_IN_WRAPPER = SIM # Job types (as defined in jobs_cxxx.conf) separated by space. REQUIRED only if vertical-mixed MIN_WRAPPED = 2 - MAX_WRAPPED = 9999 # OPTIONAL. 
Integer value, overrides TOTALJOBS + MAX_WRAPPED = 999999 # OPTIONAL. Integer value. CHECK_TIME_WRAPPER = # OPTIONAL. Time in seconds, overrides SAFETYSLEEPTIME POLICY = flexible # OPTIONAL, Wrapper policy, mixed, flexible, strict QUEUE = bsc_es # If not specified, queue will be the same of the first SECTION specified on JOBS_IN_WRAPPER -- GitLab From 03fef134b138167b7ad8bfaeabac899011ccbbf2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:38:51 +0200 Subject: [PATCH 040/213] Added wrapper info under devel_proj -> Controling the number of active concurrent tasks in an experiment #857 --- docs/source/devel_proj.rst | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/docs/source/devel_proj.rst b/docs/source/devel_proj.rst index 17caddcf5..0dda37b3c 100644 --- a/docs/source/devel_proj.rst +++ b/docs/source/devel_proj.rst @@ -712,8 +712,34 @@ To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS .. code-block:: ini - # Maximum number of submitted,waiting and running tasks - TOTAL_JOBS = 10 - # Maximum number of submitted and waiting tasks - MAX_WAITING_JOBS = 10 + # Controls the maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Controls the maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 +To control the number of jobs included in a wrapper, you can use the `MAX_WRAPPED_JOBS` and `MIN_WRAPPED_JOBS` variables in the ``conf/autosubmit_cxxx.conf`` file. + +Note that a wrapped job is counted as a single job regardless of the number of tasks it contains. Therefore, `TOTAL_JOBS` and `MAX_WAITING_JOBS` won't have an impact inside a wrapper. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + [wrapper] + TYPE = + MIN_WRAPPED = 2 # Minium amount of jobs that will be wrapped together in any given time. + MIN_WRAPPED_H = 2 # Same as above but only for the horizontal packages. 
+ MIN_WRAPPED_V = 2 # Same as above but only for the vertical packages. + MAX_WRAPPED = 99999 # Maximum amount of jobs that will be wrapped together in any given time. + MAX_WRAPPED_H = 99999 # Same as above but only for the horizontal packages. + MAX_WRAPPED_V = 99999 # Same as above but only for the vertical packages. + +- **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section + - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. +- **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain + - If not defined, it considers that **MIN_WRAPPED** is 2. + - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. + - If **POLICY** is mixed and there are failed jobs inside a wrapper, these jobs will be submitted as individual jobs. + - If **POLICY** is strict and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will not be submitted until there are enough tasks to build a package. + - strict and mixed policies can cause **deadlocks**. 
-- GitLab From 0408c45ce3d202f46502b837f91e08cde02f082a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 16:07:50 +0200 Subject: [PATCH 041/213] Deleted argcomplete --- autosubmit/autosubmit.py | 4 +--- setup.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 48e5b2e28..ccb1bbac9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# PYTHON_ARGCOMPLETE_OK # Copyright 2015-2020 Earth Sciences Department, BSC-CNS @@ -65,7 +64,7 @@ try: except Exception: dialog = None from time import sleep -import argparse, argcomplete +import argparse import subprocess import json import tarfile @@ -583,7 +582,6 @@ class Autosubmit: # Changelog subparsers.add_parser('changelog', description='show changelog') - argcomplete.autocomplete(parser) args = parser.parse_args() diff --git a/setup.py b/setup.py index 8e56eb8c5..a5a7801ef 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ -- GitLab From c6f88d53a0c5b7d30be2c8187fbae4c46f859404 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:33:10 +0200 Subject: [PATCH 042/213] Fixed an issue with main_platform = local and no platforms configured --- autosubmit/config/config_common.py | 8 +++++--- autosubmit/history/data_classes/job_data.py | 3 ++- autosubmit/job/job_dict.py | 7 ++++--- 
autosubmit/platforms/psplatform.py | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 63b31483d..26ce6ec50 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -607,9 +607,7 @@ class AutosubmitConfig(object): """ Checks experiment's queues configuration file. """ - if len(self._platforms_parser.sections()) == 0: - self.wrong_config["Platform"] += [["Global", - "Platform file is not well-configured or found"]] + if len(self._platforms_parser.sections()) != len(set(self._platforms_parser.sections())): self.wrong_config["Platform"] += [["Global", @@ -619,7 +617,11 @@ class AutosubmitConfig(object): main_platform_found = True elif self.ignore_undefined_platforms: main_platform_found = True + if len(self._platforms_parser.sections()) == 0 and not main_platform_found: + self.wrong_config["Platform"] += [["Global", + "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): + if section in self.hpcarch: main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index b5249b797..93a88797a 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -57,7 +57,8 @@ class JobData(object): platform) > 0 else "NA" self.job_id = job_id if job_id else 0 try: - self.extra_data_parsed = loads(extra_data) + if extra_data != "": + self.extra_data_parsed = loads(extra_data) except Exception as exp: self.extra_data_parsed = {} # Fail fast self.extra_data = extra_data diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index b7e6b4a6d..29ca59e28 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -402,9 +402,10 @@ class DicJobs: for d in self._date_list: self._get_date(jobs, dic, d, member, 
chunk) try: - if type(jobs[0]) is list: - jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] - jobs = jobs_flattened + if len(jobs) > 0: + if type(jobs[0]) is list: + jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] + jobs = jobs_flattened except BaseException as e: pass return jobs diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index aee3e4eb7..e2c3ede88 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ -76,7 +76,7 @@ class PsPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From e0564c48d6230b7fd80bc048dff78a806f30069b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:41:04 +0200 Subject: [PATCH 043/213] fixed tests --- requeriments.txt | 1 + test/unit/test_dic_jobs.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requeriments.txt b/requeriments.txt index c34451db2..b5783046b 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +pytest==2.9.2 configparser argparse>=1.2,<2 python-dateutil>2 diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 39f7690b2..f955f96dc 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -81,9 +81,10 @@ class TestDicJobs(TestCase): frequency = 123 splits = 0 excluded_list_m = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='member') - self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m,included_list_m]) 
self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() @@ -95,7 +96,7 @@ class TestDicJobs(TestCase): # assert self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() - self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m) + self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m,included_list_m) self.dictionary._create_jobs_chunk.assert_not_called() def test_read_section_running_chunk_create_jobs_chunk(self): @@ -108,9 +109,11 @@ class TestDicJobs(TestCase): splits = 0 excluded_list_c = [] excluded_list_m = [] + included_list_c = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='chunk') - self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m,included_list_c,included_list_m]) self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() -- GitLab From 425f667a50ffa4b3577f4405bfbcbff0353e24cf Mon Sep 17 00:00:00 2001 From: jberlin Date: Fri, 7 Oct 2022 11:24:08 +0200 Subject: [PATCH 044/213] Made small changes to documentation concerning the Conda installation - #864 --- docs/source/installation.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 7159ac7c0..157f28ecc 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -160,7 +160,7 @@ Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. 
autosubmit install # Get expid - autosubmit expid -H TEST -d "Test exp." + autosubmit expid -H local -d "Test exp." # Create with -np # Since it was a new install the expid will be a000 @@ -175,7 +175,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git + # Download git (if it is not already installed) apt install git -y -q # Download autosubmit git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b @@ -186,4 +186,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. conda activate autosubmit # Test autosubmit autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc + # Configure autosubmitrc and install the database as indicated in the installation instructions above this section + +.. hint:: + After installing conda, you may need to close the terminal and re-open it so the installation takes effect. 
\ No newline at end of file -- GitLab From f4658dd56bde49fbd72d3eacada1d83a9fbd769d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 7 Oct 2022 13:10:25 +0200 Subject: [PATCH 045/213] updateversion does not need -v, and now stores the change in the db #882 #881 --- autosubmit/autosubmit.py | 25 +++++++++++++------------ autosubmit/config/config_common.py | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ccb1bbac9..be15c0bec 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -755,18 +755,19 @@ class Autosubmit: force_update_version = args.update_version else: force_update_version = False - if force_update_version: - if as_conf.get_version() != Autosubmit.autosubmit_version: - Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", - as_conf.get_version(), Autosubmit.autosubmit_version, expid) - as_conf.set_version(Autosubmit.autosubmit_version) - else: - if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: - raise AutosubmitCritical( - "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" - "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), - Autosubmit.autosubmit_version, expid,args.command), - 7067) + if args.command not in ["upgrade","updateversion"]: + if force_update_version: + if as_conf.get_version() != Autosubmit.autosubmit_version: + Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", + as_conf.get_version(), Autosubmit.autosubmit_version, expid) + as_conf.set_version(Autosubmit.autosubmit_version) + else: + if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: + raise 
AutosubmitCritical( + "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" + "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), + Autosubmit.autosubmit_version, expid,args.command), + 7067) else: if expid == 'None': exp_id = "" diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 26ce6ec50..c0cacf190 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -35,6 +35,7 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from autosubmit.config.basicConfig import BasicConfig from collections import defaultdict +from autosubmit.database.db_common import update_experiment_descrip_version class AutosubmitConfig(object): @@ -1369,6 +1370,7 @@ class AutosubmitConfig(object): content = content.replace(re.search('AUTOSUBMIT_VERSION =.*', content).group(0), "AUTOSUBMIT_VERSION = " + autosubmit_version) open(self._conf_parser_file, 'w').write(content) + update_experiment_descrip_version(self.expid, description=None, version=autosubmit_version) def get_version(self): """ -- GitLab From 1ba44999f5545f29caaaa3c4d98d18348a2c931b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 11 Oct 2022 15:33:59 +0200 Subject: [PATCH 046/213] NEW RUN RULES changes #847 --- autosubmit/autosubmit.py | 16 ++++++++---- autosubmit/history/experiment_history.py | 32 +++++++++++++----------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be15c0bec..be6c31665 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -59,10 +59,11 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set import sqlite3 -try: - import dialog -except Exception: - dialog = None +#try: +# import dialog +#except 
Exception: +# dialog = None +dialog = None from time import sleep import argparse import subprocess @@ -4253,7 +4254,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + + #exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + exp_history.process_status_changes(job_list.get_job_list(), + chunk_unit=as_conf.get_chunk_size_unit(), + chunk_size=as_conf.get_chunk_size(), + current_config=as_conf.get_full_config_as_json(),create=True) Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index f1e0be68c..ecd06067b 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -190,15 +190,15 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() update_these_changes = self._get_built_list_of_changes(job_list) - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size) + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) if len(update_these_changes) > 0 and should_create_new_run == False: self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: + if should_create_new_run: return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: @@ -217,11 +217,14 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size): - if len(job_list) != current_experiment_run_dc.total: - return True - if changes_count > int(self._get_date_member_completed_count(job_list)): - return True + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): + if create: + return True + elif not create and self.expid[0].lower() == "t": + if len(job_list) != current_experiment_run_dc.total: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)): + return True return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): @@ -274,15 +277,16 @@ class ExperimentHistory(): def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. 
Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {job.name: job for job in job_list} + job_name_to_job = {str(job.name): job for job in job_list} current_job_data_dcs = self.manager.get_all_last_job_data_dcs() differences = [] for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: - if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): - # If the job is not changing from a finalized status to a starting status - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) + if job_dc.job_name in job_name_to_job: + if job_dc.status != job_name_to_job[job_dc.job_name].status_str: + if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): + # If the job is not changing from a finalized status to a starting status + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) return differences def _get_defined_rowtype(self, code): -- GitLab From d9c47c385a6511ff1c991bbf96f7eaafe48eb2d7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 09:02:40 +0200 Subject: [PATCH 047/213] new run --- autosubmit/history/experiment_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index ecd06067b..96651df99 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -220,7 +220,7 @@ class ExperimentHistory(): def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): if create: return True - elif not create and self.expid[0].lower() == "t": + elif not create and self.expid[0].lower() != "t": if len(job_list) != 
current_experiment_run_dc.total: return True if changes_count > int(self._get_date_member_completed_count(job_list)): -- GitLab From 522d15890808a9cf67a76bed6b52babc48c8f419 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 13:05:59 +0200 Subject: [PATCH 048/213] Fixed pipeline tests --- test/unit/test_autosubmit_config.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test/unit/test_autosubmit_config.py b/test/unit/test_autosubmit_config.py index c4c8480df..00e624406 100644 --- a/test/unit/test_autosubmit_config.py +++ b/test/unit/test_autosubmit_config.py @@ -181,13 +181,22 @@ class TestAutosubmitConfig(TestCase): open_mock.assert_any_call(config.experiment_file, 'w') def test_set_version(self): - # arrange + + #ARRAGE + FakeBasicConfig.DB_PATH = 'fake-path' + sys.modules['os'].path.exists = Mock(return_value=True) + connection_mock = Mock() + cursor_mock = Mock() + connection_mock.cursor = Mock(return_value=cursor_mock) + cursor_mock.fetchone = Mock(return_value=[0]) + + sys.modules['sqlite3'].connect = Mock(return_value=connection_mock) config = AutosubmitConfig(self.any_expid, FakeBasicConfig, ConfigParserFactory()) open_mock = mock_open(read_data='AUTOSUBMIT_VERSION = dummy') with patch.object(builtins, "open", open_mock): # act - config.set_version('dummy-vesion') + config.set_version('dummy-version') # assert open_mock.assert_any_call(getattr(config, '_conf_parser_file'), 'w') @@ -461,3 +470,4 @@ class FakeBasicConfig: LOCAL_PROJ_DIR = '/dummy/local/proj/dir' DEFAULT_PLATFORMS_CONF = '' DEFAULT_JOBS_CONF = '' + -- GitLab From 288ea4e64e0120eb569de151ceb90d43364b12bc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 17 Oct 2022 16:30:31 +0200 Subject: [PATCH 049/213] when creating the experiment, it now also see if there is a folder called as it. 
If there is a folder, the (old) experiment will be registered --- autosubmit/autosubmit.py | 1 - autosubmit/database/db_common.py | 9 ++++++++- autosubmit/experiment/experiment_common.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be6c31665..1ea5d3a97 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -916,7 +916,6 @@ class Autosubmit: os.mkdir(os.path.join( BasicConfig.LOCAL_ROOT_DIR, exp_id, 'conf')) Log.info("Copying config files...") - # autosubmit config and experiment copied from AS. files = resource_listdir('autosubmit.config', 'files') for filename in files: diff --git a/autosubmit/database/db_common.py b/autosubmit/database/db_common.py index aaaf4875a..47cc770eb 100644 --- a/autosubmit/database/db_common.py +++ b/autosubmit/database/db_common.py @@ -24,7 +24,7 @@ import os import sqlite3 import multiprocessing import Queue - +import autosubmit from log.log import Log, AutosubmitCritical, AutosubmitError Log.get_logger("Autosubmit") from autosubmit.config.basicConfig import BasicConfig @@ -319,6 +319,7 @@ def _check_experiment_exists(name, error_on_inexistence=True): :return: If experiment exists returns true, if not returns false :rtype: bool """ + if not check_db(): return False try: @@ -339,6 +340,12 @@ def _check_experiment_exists(name, error_on_inexistence=True): if error_on_inexistence: raise AutosubmitCritical( 'The experiment name "{0}" does not exist yet!!!'.format(name), 7005) + if os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR, name)): + try: + _save_experiment(name, 'No description', "3.14.0") + except BaseException as e: + pass + return True return False return True diff --git a/autosubmit/experiment/experiment_common.py b/autosubmit/experiment/experiment_common.py index 160f15158..3c31346c2 100644 --- a/autosubmit/experiment/experiment_common.py +++ b/autosubmit/experiment/experiment_common.py @@ -58,7 +58,7 @@ def 
new_experiment(description, version, test=False, operational=False): else: new_name = 'a000' else: - new_name = next_experiment_id(last_exp_name) + new_name = last_exp_name if new_name == '': return '' while db_common.check_experiment_exists(new_name, False): -- GitLab From 1225dc4636e913733a24bb1a15ad5f04d56756f2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 11:44:25 +0200 Subject: [PATCH 050/213] Added more reasons to a job for stop #837 --- autosubmit/autosubmit.py | 6 ++++-- autosubmit/job/job.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1ea5d3a97..27bda288e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1467,8 +1467,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( - )] + try: + job.platform = submitter.platforms[job.platform_name.lower()] + except: + raise AutosubmitCritical("hpcarch={0} not found in the platforms configuration file".format(job.platform_name), 7014) # noinspection PyTypeChecker if job.status not in (Status.COMPLETED, Status.SUSPENDED): platforms_to_test.add(job.platform) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 6653c51f9..08b39d27c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1257,7 +1257,8 @@ class Job(object): 'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: @@ -1639,7 +1640,8 @@ class WrapperJob(Job): 
'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: -- GitLab From 18ca02106a2faa2259b9f5eaddab104089cd4b85 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 16:25:49 +0200 Subject: [PATCH 051/213] dbfix pipeline --- test/unit/test_expid.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/test_expid.py b/test/unit/test_expid.py index 85e5a012b..7eee22bfc 100644 --- a/test/unit/test_expid.py +++ b/test/unit/test_expid.py @@ -31,21 +31,21 @@ class TestExpid(TestCase): @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "a006" + current_experiment_id = "a007" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version) self.assertEquals("a007", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_test_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "t0ab" + current_experiment_id = "t0ac" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version, True) self.assertEquals("t0ac", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_operational_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "o112" + current_experiment_id = "o113" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = 
new_experiment(self.description, self.version, False, True) self.assertEquals("o113", experiment_id) -- GitLab From 27ed656eff50553f9f14742df4e4c9d5f2c5d88e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 19 Oct 2022 13:32:13 +0200 Subject: [PATCH 052/213] Recursive submodules --- autosubmit/git/autosubmit_git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index c191c21df..8d194de74 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -223,7 +223,7 @@ class AutosubmitGit: else: command_1 += " git submodule init;".format(project_destination) for submodule in git_project_submodules: - command_1 += " git submodule update {0};".format(submodule) + command_1 += " git submodule update --init --recursive {0};".format(submodule) if git_remote_project_path == '': try: command_1 = "cd {0}; {1} ".format(git_path,command_1) -- GitLab From bf2246f5617b5959d63cbcfe2037fb8cea02e133 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:17:23 +0200 Subject: [PATCH 053/213] Fixed an issue raised when a platform has no project expid user or scratch dir defined --- autosubmit/autosubmit.py | 7 +++++-- autosubmit/config/config_common.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 19 +++++++++++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 27bda288e..ff24f0967 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4385,8 +4385,11 @@ class Autosubmit: submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) - + try: + hpcarch = submitter.platforms[as_conf.get_platform()] + except: + hpcarch = "local" + Log.warning("Remote clone may be disabled due to: "+e.message) 
return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": svn_project_url = as_conf.get_svn_project_url() diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index c0cacf190..ff6f31ea4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -623,7 +623,7 @@ class AutosubmitConfig(object): "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): - if section in self.hpcarch: + if section.lower() in self.hpcarch.lower(): main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): self.wrong_config["Platform"] += [[section, diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 12e1e70bc..92594abdd 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -20,7 +20,8 @@ import os -from log.log import Log + +from log.log import Log,AutosubmitCritical,AutosubmitError from autosubmit.config.basicConfig import BasicConfig from autosubmit.config.config_common import AutosubmitConfig from submitter import Submitter @@ -72,7 +73,7 @@ class ParamikoSubmitter(Submitter): :return: platforms used by the experiment :rtype: dict """ - + raise_message="" platforms_used = list() hpcarch = asconf.get_platform() platforms_used.append(hpcarch) @@ -191,12 +192,16 @@ class ParamikoSubmitter(Submitter): remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) - remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, - remote_platform.user, remote_platform.expid) + try: + remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, + remote_platform.user, remote_platform.expid) + remote_platform.update_cmds() + platforms[section.lower()] = remote_platform + + except: + raise_message = 
"Error in platform.conf: SCRATCH_DIR, PROJECT, USER, EXPID must be defined for platform {0}".format(section) # Executes update_cmds() from corresponding Platform Object - remote_platform.update_cmds() # Save platform into result dictionary - platforms[section.lower()] = remote_platform for section in parser.sections(): # if this section is included in platforms @@ -208,3 +213,5 @@ class ParamikoSubmitter(Submitter): platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms + if raise_message != "": + raise AutosubmitError(raise_message) -- GitLab From c65ca656d52a27f60e409e06a6a6ca6f9ba63cb4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:39 +0200 Subject: [PATCH 054/213] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ad59f742d..cd56dd095 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0b +#3.14.0b -- GitLab From c41ef89f05ab40a890174f3f7f523077b55cb804 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:49 +0200 Subject: [PATCH 055/213] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index cd56dd095..ad59f742d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -#3.14.0b +3.14.0b -- GitLab From be9bc4a4d4293b9cee2e0f4c6bafe9b30925a6f4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:39:37 +0200 Subject: [PATCH 056/213] test local git not working proprly --- autosubmit/autosubmit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ff24f0967..5133dc4ca 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1359,6 +1359,7 @@ class Autosubmit: for wrapper_section in as_conf.get_wrapper_multi(): wrapper_jobs[wrapper_section] = as_conf.get_wrapper_jobs(wrapper_section) wrapper_jobs["wrapper"] = as_conf.get_wrapper_jobs("wrapper") + # 
Log.warning("Aux Job_list was generated successfully") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) -- GitLab From 330574b835d9571f42645dd4bc6af079a353cff6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 3 Nov 2022 09:42:59 +0100 Subject: [PATCH 057/213] Unbound variable fixes in some messages and job_status #893 Fixed an issue with recovery not cancelling all jobs --- autosubmit/autosubmit.py | 17 ++++---- autosubmit/git/autosubmit_git.py | 4 +- autosubmit/job/job.py | 49 ++++++++++++----------- autosubmit/job/job_list.py | 14 +++---- autosubmit/monitor/diagram.py | 12 ++++-- autosubmit/platforms/paramiko_platform.py | 37 ++++++++--------- 6 files changed, 69 insertions(+), 64 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5133dc4ca..cbb4b142d 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2068,7 +2068,6 @@ class Autosubmit: platform.connected = False Log.printlog("[{1}] Connection failed to host {0}".format( platform.host, platform.name),Log.WARNING) if issues != "": - platform.connected = False raise AutosubmitCritical( "Issues while checking the connectivity of platforms.", 7010, issues+"\n"+ssh_config_issues) @@ -2221,7 +2220,7 @@ class Autosubmit: "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: raise AutosubmitError( - "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(e.message,e.trace), 6015) + "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(e),""), 6015) i = 0 if hold: sleep(10) @@ -2677,9 +2676,9 @@ class Autosubmit: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) for platform in platforms_to_test: platform.test_connection() - 
job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) if not force: raise AutosubmitCritical( "Experiment can't be recovered due being {0} active jobs in your experiment, If you want to recover the experiment, please use the flag -f and all active jobs will be cancelled".format( @@ -3235,16 +3234,16 @@ class Autosubmit: # Preparation for section parameters no_load_sections = False no_load_platforms = False - try: - job_list = Autosubmit.load_job_list( - expid, as_conf, notransitive=False) - except Exception as e: - no_load_sections = True + + job_list = Autosubmit.load_job_list( + expid, as_conf, notransitive=False) + try: submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) except Exception as e: no_load_platforms = True + submitter = Autosubmit._get_submitter(as_conf) submitter.load_local_platform(as_conf) try: # Gathering parameters of autosubmit and expdef config files @@ -4049,7 +4048,7 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - Autosubmit.unarchive(expid, uncompress=False) + Autosubmit.unarchive(expid, uncompressed=False) raise AutosubmitCritical( "Can not remove or rename experiments folder", 7012, str(e)) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 8d194de74..493358ed0 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -203,7 +203,7 @@ class AutosubmitGit: command_0 = "cd {0} ; {1}".format(project_path, command_0) output_0 = subprocess.check_output(command_0, shell=True) else: - command_0 = "cd {0} ; {1}".format(git_remote_path, command_0) + command_0 = "cd {0} ; {1}".format(project_path, command_0) hpcarch.send_command(command_0) ##command 1 if os.path.exists(os.path.join(git_path, ".githooks")): @@ -233,7 +233,7 @@ class AutosubmitGit: submodule_failure = True Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( - 
"Submodule {0} has a wrong configuration".format(submodule), 6014) + "Submodule has a wrong configuration.\n{0}".format(command_1), 6014) else: command_1 = "cd {0}; {1} ".format(git_remote_path, command_1) hpcarch.send_command(command_1) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 08b39d27c..739216c4a 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -624,6 +624,7 @@ class Job(object): as_conf.reload() submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) + platform = submitter.platforms[platform_name.lower()] success = True except BaseException as e: error_message = str(e) @@ -632,31 +633,31 @@ class Job(object): count=count+1 if not success: raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) - platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() + else: max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count - if self.wrapper_type is not None and self.wrapper_type == "vertical": - found = False - retrials = 0 - while retrials < 3 and not found: - if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): - found = True - retrials = retrials + 1 - for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i)): - last_log = i - else: - break - remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." 
+ str(last_log)) + try: + platform.test_connection() + if self.wrapper_type is not None and self.wrapper_type == "vertical": + found = False + retrials = 0 + while retrials < 3 and not found: + if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): + found = True + retrials = retrials + 1 + for i in range(max_logs-1,-1,-1): + if platform.check_stat_file_by_retrials(stat_file + str(i)): + last_log = i + else: + break + remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) - else: - remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) + else: + remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except BaseException as e: - Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) + except BaseException as e: + Log.printlog( + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 @@ -1730,7 +1731,7 @@ class WrapperJob(Job): self.name, reason), 6009) # while running jobs? self._check_running_jobs() - self.update_failed_jobs(canceled_wrapper=True) + self.update_failed_jobs() self.cancel_failed_wrapper_job() return @@ -1760,8 +1761,8 @@ class WrapperJob(Job): job.hold = self.hold job.status = self.status if self.status == Status.WAITING: - for job in self.job_list: - job.packed = False + for job2 in self.job_list: + job2.packed = False def _check_inner_job_wallclock(self, job): start_time = self.running_jobs_start[job] diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index ae52a0c78..2a687dbd8 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -439,13 +439,13 @@ class JobList(object): # If job doesn't have any parent after a first search, search in all dependency.section. 
This is to avoid +1 being added only to the last one. if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - for parent in jobs_by_section: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + for parent2 in jobs_by_section: + if parent2.chunk in dependency.select_chunks_dest[relation_indx] or len( dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - visited_parents.add(parent) + if parent2 not in visited_parents: + job.add_parent(parent2) + JobList._add_edge(graph, job, parent2) + visited_parents.add(parent2) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) @@ -2042,7 +2042,7 @@ class JobList(object): # root exists if root is not None: - result += self._recursion_print(root, 0) + result += self._recursion_print(root, 0,[]) else: result += "\nCannot find root." 
diff --git a/autosubmit/monitor/diagram.py b/autosubmit/monitor/diagram.py index 8e8753167..b1f0f6744 100644 --- a/autosubmit/monitor/diagram.py +++ b/autosubmit/monitor/diagram.py @@ -66,14 +66,18 @@ def create_bar_diagram(experiment_id, jobs_list, general_stats, output_file, per exp_stats.calculate_statistics() exp_stats.calculate_summary() exp_stats.make_old_format() - failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + # Stats variables definition + normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) + failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) except Exception as exp: + if not isinstance(normal_plots_count,int): + normal_plots_count = 0 + if not isinstance(failed_jobs_plots_count,int): + failed_jobs_plots_count = 0 print(exp) print(traceback.format_exc()) - # Stats variables definition - normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) - failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) total_plots_count = normal_plots_count + failed_jobs_plots_count # num_plots = norma # ind = np.arange(int(MAX_JOBS_PER_PLOT)) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 1c1177510..4b5c2d4b4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -118,7 +118,7 @@ class ParamikoPlatform(Platform): raise except BaseException as e: self.connected = False - raise AutosubmitCritical(message,7051) + raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) def restore_connection(self): @@ -226,7 +226,7 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( self._host_config['hostname']), 7050, str(e)) if not 
reconnect and "," in self._host_config['hostname']: - self.restore_connection(reconnect=True) + self.restore_connection() else: raise AutosubmitError( "Couldn't establish a connection to the specified host, wrong configuration?", 6003, e.message) @@ -283,8 +283,8 @@ class ParamikoPlatform(Platform): self._ftpChannel.chmod(remote_path, os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) + + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path,filename), code=6004, trace=str(e))) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -594,19 +594,20 @@ class ParamikoPlatform(Platform): sleep_time = sleep_time + 5 # URi: define status list in HPC Queue Class else: - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + job_status = job.status + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = 
job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -989,7 +990,7 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def parse_queue_reason(self, output): + def parse_queue_reason(self, output, job_id): raise NotImplementedError def get_ssh_output(self): -- GitLab From d1152bffe2dbadb296f598a6e79a4176f8018905 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 28 Jul 2022 16:19:58 +0200 Subject: [PATCH 058/213] over_wallclock fix --- autosubmit/job/job.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 226b85c37..948269142 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -768,6 +768,22 @@ class Job(object): except BaseException as e: pass return + def parse_time(self,wallclock): + format = "minute" + regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') + parts = regex.match(wallclock) + if not parts: + return + parts = parts.groupdict() + if int(parts['hours']) > 0 : + format = "hour" + else: + format = "minute" + time_params = {} + for name, param in parts.items(): + if param: + time_params[name] = int(param) + return datetime.timedelta(**time_params),format # Duplicated for wrappers and jobs to fix in 4.0.0 def is_over_wallclock(self, start_time, wallclock): """ @@ -777,25 +793,13 @@ class Job(object): :return: """ elapsed = datetime.datetime.now() - start_time - wallclock = datetime.datetime.strptime(wallclock, '%H:%M') - total = 0.0 - if wallclock.hour > 0: - total = wallclock.hour - format = "hour" - else: - format = "minute" - if format == "hour": - if wallclock.minute > 0: - total += wallclock.minute / 60.0 - if wallclock.second > 0: - total += wallclock.second / 60.0 / 60.0 + wallclock,time_format = self.parse_time(wallclock) + if time_format == "hour": + 
total = wallclock.days * 24 + wallclock.seconds / 60 / 60 else: - if wallclock.minute > 0: - total += wallclock.minute - if wallclock.second > 0: - total += wallclock.second / 60.0 + total = wallclock.days * 24 + wallclock.seconds / 60 total = total * 1.30 # in this case we only want to avoid slurm issues so the time is increased by 50% - if format == "hour": + if time_format == "hour": hour = int(total) minute = int((total - int(total)) * 60.0) second = int(((total - int(total)) * 60 - -- GitLab From 1129425403695cc7b418a9e305e2fc3641e2e707 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 8 Aug 2022 16:36:54 +0200 Subject: [PATCH 059/213] fix project_Destination --- autosubmit/autosubmit.py | 3 ++- autosubmit/config/config_common.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 19dc23baf..5bdb10116 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4324,7 +4324,8 @@ class Autosubmit: """ project_destination = as_conf.get_project_destination() if project_destination is None or len(project_destination) == 0: - raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) + if project_type.lower() != "none": + raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": submitter = Autosubmit._get_submitter(as_conf) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 7b2a6a12b..e3e9188a4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1119,11 +1119,14 @@ class AutosubmitConfig(object): elif self.get_project_type().lower() == "git": value = self.get_git_project_origin().split( '/')[-1].split('.')[-2] - return value + if value != "": + return value + else: + return "project_files" except Exception as exp: Log.debug(str(exp)) Log.debug(traceback.format_exc()) - return '' + return "project_files" def 
set_git_project_commit(self, as_conf): """ -- GitLab From 746cc1691a97d728ef30036409248140f655d218 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:09:33 +0200 Subject: [PATCH 060/213] tkinter --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 6573f8723..55938efa0 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -8,7 +8,7 @@ The Autosubmit code is maintained in *PyPi*, the main source for python packages .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. -- GitLab From a33c2afea042c8c888a559aaacd21e849c1100fb Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 14:56:12 +0200 Subject: [PATCH 061/213] tkinter --- docs/source/installation/index.rst | 4 ++-- requeriments.txt | 1 + setup.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 55938efa0..f1a9640a9 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -4,11 +4,11 @@ How to Install Autosubmit The Autosubmit code is maintained in *PyPi*, the main source for python packages. 
-- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk, python2-dev, graphviz >= 2.41, pip2 +- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk(tkinter in centOS), python2-dev, graphviz >= 2.41, pip2 .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10 .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. diff --git a/requeriments.txt b/requeriments.txt index f2dfdd0aa..d57974475 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -13,6 +13,7 @@ typing bscearth.utils cryptography==3.3.2 PyNaCl==1.4.0 +six>=1.10.0 requests xlib Pygments \ No newline at end of file diff --git a/setup.py b/setup.py index 35e8f4f4f..7935f7a42 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], extras_require={ -- GitLab From 7b2fd57f194a48f4811804d6ed5c2234e00c71e4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:42:36 +0200 Subject: [PATCH 062/213] author change --- 
setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7935f7a42..d4d0f0179 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ setup( version=version, description='Autosubmit: a versatile tool to manage Weather and Climate Experiments in diverse ' 'Supercomputing Environments', - author='Domingo Manubens-Gil', - author_email='domingo.manubens@bsc.es', + author='Daniel Beltran Mora', + author_email='daniel.beltran@bsc.es', url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], -- GitLab From 75a6c968042d9f11c86ba297d8c5e2e7a1a72c12 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 15:08:10 +0200 Subject: [PATCH 063/213] Added requests, improvement exception recovery for wrappers , added more info, bugfixed status appearing in log.out , bug fixed lc level not being able to change --- autosubmit/autosubmit.py | 54 ++++++++++++++--------- autosubmit/platforms/paramiko_platform.py | 32 ++++++++------ autosubmit/platforms/platform.py | 2 +- environment.yml | 1 + log/log.py | 15 ++++++- setup.py | 2 +- 6 files changed, 69 insertions(+), 37 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5bdb10116..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -162,7 +162,7 @@ class Autosubmit: parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='WARNING', type=str, + default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='INFO', type=str, @@ -1659,7 +1659,11 @@ class Autosubmit: Log.debug('Checking Wrapper {0}'.format(str(job_id))) wrapper_job.checked_time = datetime.datetime.now() # This 
is where wrapper will be checked on the slurm platform, update takes place. - platform.check_job(wrapper_job) + try: + platform.check_job(wrapper_job,is_wrapper=True) + except BaseException as e: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking wrapper {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) #Log.info("FD 3Wrapper checked: {0}".format(log.fd_show.fd_table_status_str())) try: if wrapper_job.status != wrapper_job.new_status: @@ -1671,8 +1675,12 @@ class Autosubmit: "Wrapper is in Unknown Status couldn't get wrapper parameters", 7050) # New status will be saved and inner_jobs will be checked. - wrapper_job.check_status( - wrapper_job.new_status) + try: + wrapper_job.check_status(wrapper_job.new_status) + except: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking the inner_jobs of {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) + # Erase from packages if the wrapper failed to be queued ( Hold Admin bug ) if wrapper_job.status == Status.WAITING: for inner_job in wrapper_job.job_list: @@ -1782,9 +1790,18 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - consecutive_retrials = 1 - delay = min(15*consecutive_retrials,120) + consecutive_retrials = 0 + failed_names = {} + Log.info("Storing failed job count...") + try: + for job in job_list.get_job_list(): + if job.fail_count > 0: + failed_names[job.name] = job.fail_count + except BaseException as e: + Log.printlog("Error trying to store failed job count",Log.WARNING) + Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: + delay = min(15 * consecutive_retrials, 120) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1794,6 +1811,7 @@ class Autosubmit: Log.info("Recovering job_list...") job_list = 
Autosubmit.load_job_list( expid, as_conf, notransitive=notransitive) + Log.info("Recovering job_list... Done") if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1801,26 +1819,20 @@ class Autosubmit: "Only jobs with member value in {0} or no member will be allowed in this run. Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) platforms_to_test = set() + Log.info("Recovering platform information...") for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) - #Recover job_list while keeping job.fail_count - failed_names = {} - for job in job_list.get_job_list(): - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] - platforms_to_test.add(job.platform) - if job.fail_count > 0: - failed_names[job.name] = job.fail_count + + Log.info("Recovering platform information... Done") + Log.info("Recovering Failure count...") for job in job_list.get_job_list(): if job.name in failed_names.keys(): job.fail_count = failed_names[job.name] - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] + Log.info("Recovering Failure count... Done") + Log.info("Recovering parameters...") Autosubmit._load_parameters(as_conf, job_list, submitter.platforms) # Recovery wrapper [Packages] @@ -1876,9 +1888,11 @@ class Autosubmit: None, None, jobs[0].platform, as_conf, jobs[0].hold) job_list.job_package_map[jobs[0].id] = wrapper_job + Log.info("Recovering wrappers... Done") job_list.update_list(as_conf) Log.info("Saving recovered job list...") job_list.save() + Log.info("Saving recovered job list... 
Done") recovery = True Log.result("Recover of job_list is completed") except AutosubmitError as e: @@ -1886,10 +1900,10 @@ class Autosubmit: Log.result("Recover of job_list has fail {0}".format(e.message)) except IOError as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) except BaseException as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) # Restore platforms and try again, to avoid endless loop with failed configuration, a hard limit is set. reconnected = False mail_notify = True diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 43adfd5c6..e57512f55 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -452,17 +452,20 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status :param retries: retries :param job: job + :type job: autosubmit.job.job.Job + :param default_status: default status if job is not found :type job: class(job) :param default_status: status to assign if it can be retrieved from the platform :type default_status: autosubmit.job.job_common.Status :return: current job status :rtype: autosubmit.job.job_common.Status + """ job_id = job.id job_status = Status.UNKNOWN @@ -491,19 +494,20 @@ class ParamikoPlatform(Platform): job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: job_status = Status.RUNNING - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - 
wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + if not is_wrapper: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED elif job_status in self.job_status['QUEUING'] and job.hold is False: job_status = Status.QUEUING elif job_status in self.job_status['QUEUING'] and job.hold is True: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index c2ccf3575..acbb20aa7 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -384,7 +384,7 @@ class Platform(object): """ raise NotImplementedError - def check_job(self, jobid, default_status=Status.COMPLETED, retries=5): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status diff --git a/environment.yml b/environment.yml index 4585486d9..bc6e7308b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - portalocker - networkx - python=2.7 +- requests - pip: - bscearth.utils - Xlib diff --git a/log/log.py b/log/log.py index ae3ca5a74..216fc23eb 100644 --- a/log/log.py +++ b/log/log.py @@ -161,7 +161,7 @@ class Log: logging.getLogger(name) 
@staticmethod - def set_file(file_path, type='out', level=WARNING): + def set_file(file_path, type='out', level="WARNING"): """ Configure the file to store the log. If another file was specified earlier, new messages will only go to the new file. @@ -169,6 +169,19 @@ class Log: :param file_path: file to store the log :type file_path: str """ + levels = {} + levels["STATUS_FAILED"] = 500 + levels["STATUS"] = 1000 + levels["DEBUG"] = 2000 + levels["WARNING"] = 3000 + levels["INFO"] = 4000 + levels["RESULT"] = 5000 + levels["ERROR"] = 6000 + levels["CRITICAL"] = 7000 + levels["NO_LOG"] = levels["CRITICAL"] + 1000 + + level = levels.get(str(level).upper(),"DEBUG") + max_retrials = 3 retrials = 0 timeout = 5 diff --git a/setup.py b/setup.py index d4d0f0179..8e56eb8c5 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, -- GitLab From 5d1fa462207ce7e05ee8597c9b1ff25ae2164ecf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 15:53:09 +0200 Subject: [PATCH 064/213] stat fix --- autosubmit/autosubmit.py | 1 + autosubmit/job/job.py | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..03853b178 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,6 +1763,7 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() + if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, 
packages_persistence, hold=False) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 948269142..28c9b2be9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -630,10 +630,9 @@ class Job(object): found = False retrials = 0 while retrials < 3 and not found: - sleep(2) if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): found = True - retrials = retrials - 1 + retrials = retrials + 1 for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i)): last_log = i @@ -1181,18 +1180,18 @@ class Job(object): if self.type == Type.BASH: template = 'sleep 5' + "\n" elif self.type == Type.PYTHON: - template = 'time.sleep(30)' + "\n" + template = 'time.sleep(5)' + "\n" elif self.type == Type.R: - template = 'Sys.sleep(30)' + "\n" + template = 'Sys.sleep(5)' + "\n" template += template_file.read() template_file.close() else: if self.type == Type.BASH: - template = 'sleep 35' + template = 'sleep 5' elif self.type == Type.PYTHON: - template = 'time.sleep(35)' + template = 'time.sleep(5)' elif self.type == Type.R: - template = 'Sys.sleep(35)' + template = 'Sys.sleep(5)' else: template = '' except: -- GitLab From c21485af35cbeb4f24fc21dbc2adf1d18f11e127 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 16:44:23 +0200 Subject: [PATCH 065/213] wrapper_type is now being saved correctly --- autosubmit/autosubmit.py | 1 - autosubmit/job/job_list.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 03853b178..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,7 +1763,6 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() - if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=False) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 
3d55bb040..d8abc0eff 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -215,6 +215,15 @@ class JobList(object): new, notransitive, update_structure=update_structure) for job in self._job_list: job.parameters = parameters + job_data = jobs_data.get(job.name,"none") + try: + if job_data != "none": + job.wrapper_type = job_data[12] + else: + job.wrapper_type = "none" + except BaseException as e: + job.wrapper_type = "none" + # Checking for member constraints if len(run_only_members) > 0: # Found -- GitLab From 401a1b3e36064aa0f86f4e48f06acc8328916d20 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 08:45:34 +0200 Subject: [PATCH 066/213] erased debug info, changed exception for baseexception --- autosubmit/job/job.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 28c9b2be9..1056b93f6 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -620,10 +620,9 @@ class Job(object): submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() - except: - pass + + platform.test_connection() + max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -643,7 +642,7 @@ class Job(object): else: remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except Exception as e: + except BaseException as e: Log.printlog( "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False -- GitLab From 0c0bcd3c2138f64b9def6c8a8801df211bed08e1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 12:53:18 +0200 Subject: [PATCH 067/213] Fixed delay issue #862 --- autosubmit/job/job_list.py | 51 ++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index d8abc0eff..b26f24e74 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -413,28 +413,37 @@ class JobList(object): if dependency.splits is not None: parent = filter( lambda _parent: _parent.split in dependency.splits, parent) - #Select chunk + select member - if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - elif len(dependency.select_members_orig) > 0: - for relation_indx in member_relations_to_add: - if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - elif len(dependency.select_chunks_orig) > 0: + #Select chunk + select member + if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + elif len(dependency.select_members_orig) > 0: + for relation_indx in member_relations_to_add: + if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + elif 
len(dependency.select_chunks_orig) > 0: + for relation_indx in chunk_relations_to_add: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. + if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - + for parent in jobs_by_section: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + visited_parents.add(parent) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) -- GitLab From fbc02f6ae0962dd5536c0c3c10ab613e86df1b13 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 13:40:57 +0200 Subject: [PATCH 068/213] Added 5min retrial in case that something is wrong while recovering the As_conf info inside a thread. 
--- autosubmit/job/job.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1056b93f6..325564bec 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -614,15 +614,27 @@ class Job(object): max_logs = 0 sleep(5) stat_file = self.script_name[:-4] + "_STAT_" + retries = 2 + count = 0 + success = False + error_message = "" + while count < retries or success: + try: + as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) + as_conf.reload() + submitter = self._get_submitter(as_conf) + submitter.load_platforms(as_conf) + success = True + except BaseException as e: + error_message = str(e) + sleep(60*5) + pass + count=count+1 + if not success: + raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) + platform = submitter.platforms[platform_name.lower()] try: - as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.reload() - submitter = self._get_submitter(as_conf) - submitter.load_platforms(as_conf) - platform = submitter.platforms[platform_name.lower()] - platform.test_connection() - max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -644,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False err_exist = False retries = 3 -- GitLab From 7dc55ee35d06f663a3c0174cbd6896a3e7613534 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:30:45 +0200 Subject: [PATCH 069/213] e --- autosubmit/autosubmit.py | 2 +- autosubmit/job/job.py | 2 +- autosubmit/platforms/slurmplatform.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..b299c7dcc 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2227,7 +2227,7 @@ class Autosubmit: "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( - e.job_name), 7014, e.message) + e.job_name), 7014, str(e)) except AutosubmitError as e: raise except AutosubmitCritical as e: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 325564bec..1068dca65 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while count < retries or success: + while (count < retries) or success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index cd96b21cc..5d31690c4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -362,8 +362,8 @@ class SlurmPlatform(ParamikoPlatform): return export + self._submit_hold_cmd + job_script else: if not hold: - self._submit_script_file.write( - export + self._submit_cmd + job_script + "\n") + write_this = export + self._submit_cmd + job_script +"\n" + self._submit_script_file.write(write_this) else: self._submit_script_file.write( export + self._submit_hold_cmd + job_script + "\n") -- GitLab From 
4690246861edd1b3809440933a5929008506b7a6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:34:19 +0200 Subject: [PATCH 070/213] e --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1068dca65..9365e516f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while (count < retries) or success: + while (count < retries) or not success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() -- GitLab From 359b1eb147af2ee3c5392e1c0d64ff5d1ef74e5e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:50:30 +0200 Subject: [PATCH 071/213] fixed message --- autosubmit/platforms/paramiko_submitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index c597274f7..acba2bcce 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -184,8 +184,9 @@ class ParamikoSubmitter(Submitter): None) remote_platform.custom_directives = parser.get_option(section, 'CUSTOM_DIRECTIVES', None) - Log.debug("Custom directives from platform.conf: {0}".format( - remote_platform.custom_directives)) + if remote_platform.custom_directives is not None and remote_platform.custom_directives != '' and remote_platform.custom_directives != 'None': + Log.debug("Custom directives from platform.conf: {0}".format( + remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, -- GitLab From 54dbb37052d2516572d1ebdfd6a9c0d3e78b9698 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 10:53:49 +0200 Subject: [PATCH 072/213] conda fix --- 
docs/source/installation/index.rst | 154 +++++++++++++++-------------- 1 file changed, 79 insertions(+), 75 deletions(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index f1a9640a9..9a90c4e54 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -1,6 +1,9 @@ -######################### -How to Install Autosubmit -######################### +############ +Installation +############ + +How to install +============== The Autosubmit code is maintained in *PyPi*, the main source for python packages. @@ -35,76 +38,8 @@ or download, unpack and: .. hint:: To see the changelog, use ``autosubmit changelog`` -Examples -======== - -Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. ------------------------------------------------------------------------------- - -.. code-block:: bash - - - # Update repositories - apt update - - # Avoid interactive stuff - export DEBIAN_FRONTEND=noninteractive - - # Dependencies - apt install wget curl python2 python-tk python2-dev graphviz -y -q - - # Additional dependencies related with pycrypto - apt install build-essential libssl-dev libffi-dev -y -q - - # Download get pip script and launch it - wget https://bootstrap.pypa.io/pip/2.7/get-pip.py - python2 get-pip.py - - # Install autosubmit using pip - pip2 install autosubmit - - # Check that we can execute autosubmit commands - autosubmit -h - - # Configure - autosubmit configure - - # Install - autosubmit install - - # Get expid - autosubmit expid -H TEST -d "Test exp." - - # Create with -np - # Since it was a new install the expid will be a000 - autosubmit create a000 -np - -Sequence of instructions to install Autosubmit and its dependencies with conda. -------------------------------------------------------------------------------- - -.. 
code-block:: bash - - # Download conda - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Launch it - ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git - apt install git -y -q - # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 - cd autosubmit - # Create conda environment - conda env update -f environment.yml -n autosubmit python=2 - # Activate env - source activate autosubmit - # Test autosubmit - autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc - - -################ How to configure -################ +================ After installation, you have to configure database and path for Autosubmit. In order to use the default settings, just create a directory called `autosubmit` in your home directory before running the configure command. @@ -136,9 +71,11 @@ For installing the database for Autosubmit on the configured folder, when no dat autosubmit install -.. important:: Be careful ! autosubmit install will create a blank database. +.. danger:: Be careful ! autosubmit install will create a blank database. -Lastly, if autosubmit configure doesn't work for you or you need to configure additional info create or modify /etc/autosubmitrc file or ~/.autosubmitrc with the information as follows: +Lastly, if autosubmit configure doesn't work for you or you need to configure additional info create: + +Create or modify /etc/autosubmitrc file or ~/.autosubmitrc with the information as follows: .. code-block:: ini @@ -182,4 +119,71 @@ From 3.14+ onwards, autosubmit commands can be tailored to run on specific machi * If no commands are defined, all commands are authorized. * If no machines are defined, all machines are authorized. -Now you are ready to use Autosubmit ! \ No newline at end of file +Now you are ready to use Autosubmit ! 
+ + +Examples +======== + +Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. +------------------------------------------------------------------------------ + +.. code-block:: bash + + + # Update repositories + apt update + + # Avoid interactive stuff + export DEBIAN_FRONTEND=noninteractive + + # Dependencies + apt install wget curl python2 python-tk python2-dev graphviz -y -q + + # Additional dependencies related with pycrypto + apt install build-essential libssl-dev libffi-dev -y -q + + # Download get pip script and launch it + wget https://bootstrap.pypa.io/pip/2.7/get-pip.py + python2 get-pip.py + + # Install autosubmit using pip + pip2 install autosubmit + + # Check that we can execute autosubmit commands + autosubmit -h + + # Configure + autosubmit configure + + # Install + autosubmit install + + # Get expid + autosubmit expid -H TEST -d "Test exp." + + # Create with -np + # Since it was a new install the expid will be a000 + autosubmit create a000 -np + +Sequence of instructions to install Autosubmit and its dependencies with conda. +------------------------------------------------------------------------------- + +.. 
code-block:: bash + + # Download conda + wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh + # Launch it + chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + # Download git + apt install git -y -q + # Download autosubmit + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 + cd autosubmit + # Create conda environment + conda env update -f environment.yml -n autosubmit python=2 + # Activate env + source activate autosubmit + # Test autosubmit + autosubmit -v + # Configure autosubmitrc and install database as indicated in this doc -- GitLab From 21b77c1153765fe83e22e6f2e6be769808b1dcd9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 11:01:23 +0200 Subject: [PATCH 073/213] conda fix --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 9a90c4e54..64b314886 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -174,7 +174,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download conda wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it - chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh # Download git apt install git -y -q # Download autosubmit -- GitLab From e9da166cbfb546c31afeb70eaed9066a5d32ad2c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 7 Sep 2022 15:23:52 +0200 Subject: [PATCH 074/213] Wrapper is now fully independent from total and waiting jobs as expected #857 --- autosubmit/autosubmit.py | 5 +-- autosubmit/config/config_common.py | 9 ++-- autosubmit/job/job_packager.py | 52 +++++++++++----------- autosubmit/platforms/paramiko_submitter.py | 4 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b299c7dcc..6fd5932a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1372,8 +1372,8 @@ class Autosubmit: while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) - for job in job_list.get_uncompleted_and_not_waiting(): - job.status = Status.COMPLETED + #for job in job_list.get_uncompleted_and_not_waiting(): + # job.status = Status.COMPLETED job_list.update_list(as_conf, False) @staticmethod @@ -2071,7 +2071,6 @@ class Autosubmit: platform.open_submit_script() valid_packages_to_submit = [] # type: List[JobPackageBase] for package in packages_to_submit: - try: # If called from inspect command or -cw if only_wrappers or inspect: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index e3e9188a4..3f5c39a3b 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1600,7 +1600,9 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - return 
int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', self.get_total_jobs())) + #total_jobs = self.get_total_jobs() + #unlimited because wrapper should count as one + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', 999999999)) def get_max_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ @@ -1609,8 +1611,7 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', -1)) def get_max_wrapped_jobs_horizontal(self, wrapper_section_name="wrapper"): """ @@ -1620,7 +1621,7 @@ class AutosubmitConfig(object): :rtype: int """ max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', -1)) def get_min_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 54a6268c3..cfc1235e8 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -57,7 +57,12 @@ class JobPackager(object): # Submitted + Queuing Jobs for specific Platform queuing_jobs = jobs_list.get_queuing(platform) # We now consider the running jobs count - running_jobs_count = len(jobs_list.get_running(platform)) + running_jobs = jobs_list.get_running(platform) + running_by_id = dict() + for running_job in running_jobs: + running_by_id[running_job.id] = running_job + running_jobs_len = len(running_by_id.keys()) + queued_by_id = dict() for queued_job in queuing_jobs: queued_by_id[queued_job.id] = queued_job @@ -76,10 +81,9 @@ class JobPackager(object): # .total_jobs Maximum number of jobs at the 
same time self._max_jobs_to_submit = platform.total_jobs - queuing_jobs_len # Substracting running jobs - self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_count + self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_len self._max_jobs_to_submit = self._max_jobs_to_submit if self._max_jobs_to_submit > 0 else 0 - self.max_jobs = min(self._max_wait_jobs_to_submit, - self._max_jobs_to_submit) + self.max_jobs = min(self._max_wait_jobs_to_submit,self._max_jobs_to_submit) self.wrapper_type["wrapper"] = self._as_config.get_wrapper_type() self.wrapper_policy["wrapper"] = self._as_config.get_wrapper_policy() @@ -94,24 +98,15 @@ class JobPackager(object): self.jobs_in_wrapper[wrapper_section] = self._as_config.get_wrapper_jobs(wrapper_section) self.extensible_wallclock[wrapper_section] = int(self._as_config.get_extensible_wallclock(wrapper_section)) self.wrapper_info = [self.wrapper_type,self.wrapper_policy,self.wrapper_method,self.jobs_in_wrapper,self.extensible_wallclock] # to pass to job_packages - - - # True or False - - Log.debug( - "Number of jobs available: {0}", self._max_wait_jobs_to_submit) + Log.debug("Number of jobs available: {0}", self._max_wait_jobs_to_submit) if self.hold: - Log.debug("Number of jobs prepared: {0}", len( - jobs_list.get_prepared(platform))) + Log.debug("Number of jobs prepared: {0}", len(jobs_list.get_prepared(platform))) if len(jobs_list.get_prepared(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_prepared(platform))) + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len(jobs_list.get_prepared(platform))) else: - Log.debug("Number of jobs ready: {0}", len( - jobs_list.get_ready(platform, hold=False))) + Log.debug("Number of jobs ready: {0}", len(jobs_list.get_ready(platform, hold=False))) if len(jobs_list.get_ready(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_ready(platform))) + Log.debug("Jobs ready 
for {0}: {1}", self._platform.name, len(jobs_list.get_ready(platform))) self._maxTotalProcessors = 0 def compute_weight(self, job_list): @@ -210,8 +205,7 @@ class JobPackager(object): # Sort by Priority, highest first list_of_available = sorted( available_sorted, key=lambda k: k.priority, reverse=True) - num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len( - jobs_ready), self._max_jobs_to_submit) + num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len(jobs_ready), self._max_jobs_to_submit) # Take the first num_jobs_to_submit from the list of available jobs_to_submit_tmp = list_of_available[0:num_jobs_to_submit] #jobs_to_submit = [ @@ -248,6 +242,10 @@ class JobPackager(object): wrapper_limits["max_h"] = self._as_config.get_max_wrapped_jobs_horizontal(self.current_wrapper_section) if wrapper_limits["max"] < wrapper_limits["max_v"] * wrapper_limits["max_h"]: wrapper_limits["max"] = wrapper_limits["max_v"] * wrapper_limits["max_h"] + if wrapper_limits["max_v"] == -1: + wrapper_limits["max_v"] = wrapper_limits["max"] + if wrapper_limits["max_h"] == -1: + wrapper_limits["max_h"] = wrapper_limits["max"] if '&' not in section: if self._as_config.jobs_parser.has_option(section, 'DEPENDENCIES'): dependencies_keys = self._as_config.jobs_parser.get( @@ -552,7 +550,7 @@ class JobPackager(object): def _build_horizontal_packages(self, section_list, wrapper_limits, section): packages = [] horizontal_packager = JobPackagerHorizontal(section_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) + wrapper_limits["max"], self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) package_jobs = horizontal_packager.build_horizontal_package() @@ -585,11 +583,11 @@ class JobPackager(object): """ packages = [] for job in section_list: - if self.max_jobs > 0: + if wrapper_limits["max"] > 0: if job.packed is False: job.packed = True dict_jobs = 
self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) - job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, self.max_jobs, wrapper_limits, self._platform.max_wallclock) + job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock) jobs_list = job_vertical_packager.build_vertical_package(job) packages.append(JobPackageVertical(jobs_list, configuration=self._as_config,wrapper_section=self.current_wrapper_section,wrapper_info=wrapper_info)) @@ -605,7 +603,7 @@ class JobPackager(object): ## READY JOBS ## ## Create the horizontal ## horizontal_packager = JobPackagerHorizontal(jobs_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node) + wrapper_limits["max"], self._platform.processors_per_node) if self.wrapper_type[self.current_wrapper_section] == 'vertical-horizontal': return self._build_vertical_horizontal_package(horizontal_packager, jobs_resources) @@ -654,7 +652,7 @@ class JobPackager(object): horizontal_packager.wrapper_limits["max_by_section"][section] = horizontal_packager.wrapper_limits["max_by_section"][section] - 1 horizontal_packager.wrapper_limits["max"] = horizontal_packager.wrapper_limits["max"] - actual_wrapped_jobs for job in horizontal_package: - job_list = JobPackagerVerticalSimple([job], job.wallclock, self.max_jobs, + job_list = JobPackagerVerticalSimple([job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, self._platform.max_wallclock).build_vertical_package(job) @@ -706,7 +704,7 @@ class JobPackagerVertical(object): :rtype: List() of Job Object \n """ # self.jobs_list starts as only 1 member, but wrapped jobs are added in the recursion - if len(self.jobs_list) >= self.max_jobs or len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= 
self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: + if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: return self.jobs_list child = self.get_wrappable_child(job) # If not None, it is wrappable @@ -897,7 +895,7 @@ class JobPackagerHorizontal(object): for section in jobs_by_section: current_package_by_section[section] = 0 for job in jobs_by_section[section]: - if self.max_jobs > 0 and len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: + if len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: if int(job.tasks) != 0 and int(job.tasks) != int(self.processors_node) and \ int(job.tasks) < job.total_processors: nodes = int( diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index acba2bcce..1f577426f 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -159,8 +159,10 @@ class ParamikoSubmitter(Submitter): asconf.get_max_processors()) remote_platform.max_waiting_jobs = int(parser.get_option(section, 'MAX_WAITING_JOBS', asconf.get_max_waiting_jobs())) - remote_platform.total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', + totaljobs = int(parser.get_option(section, 'TOTALJOBS', asconf.get_total_jobs())) + total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', asconf.get_total_jobs())) + remote_platform.total_jobs = min(min(totaljobs, total_jobs),asconf.get_total_jobs()) remote_platform.hyperthreading = parser.get_option(section, 'HYPERTHREADING', 'false').lower() remote_platform.project = 
parser.get_option( -- GitLab From 68b2a800cd2fa822f11eb746d993ff59df459439 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 11:45:53 +0200 Subject: [PATCH 075/213] error message fix --- autosubmit/platforms/paramiko_platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e57512f55..e1b36f116 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -901,7 +901,7 @@ class ParamikoPlatform(Platform): except AutosubmitError as e: raise except IOError as e: - raise AutosubmitError(e.message,6016) + raise AutosubmitError("IO issues, something seems wrong with {0}".format(self.name),6016,e.message) except BaseException as e: raise AutosubmitError('Command {0} in {1} warning: {2}'.format( command, self.host, '\n'.join(stderr_readlines)), 6005, e.message) -- GitLab From 5c84e366d4ce509f37357194ce0a45fe7d199739 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 15:45:24 +0200 Subject: [PATCH 076/213] docs update --- .../html/_sources/usage/new_platform.rst.txt | 4 +- docs/source/troubleshooting/error-codes.rst | 4 +- .../userguide/configure/develop_a_project.rst | 48 +++++++++++++------ 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/docs/build/html/_sources/usage/new_platform.rst.txt b/docs/build/html/_sources/usage/new_platform.rst.txt index 173dafae4..675d4edc6 100644 --- a/docs/build/html/_sources/usage/new_platform.rst.txt +++ b/docs/build/html/_sources/usage/new_platform.rst.txt @@ -53,9 +53,9 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be waiting in this platform. +* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. 
-* TOTAL_JOBS: maximum number of jobs to be running at the same time in this platform. +* TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. * CUSTOM_DIRECTIVES: Custom directives for the resource manager of this platform. diff --git a/docs/source/troubleshooting/error-codes.rst b/docs/source/troubleshooting/error-codes.rst index c92ba38ad..ed9154997 100644 --- a/docs/source/troubleshooting/error-codes.rst +++ b/docs/source/troubleshooting/error-codes.rst @@ -155,7 +155,9 @@ Minor errors - Error codes [6000+] +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6013 | Configuration issues | Check log output for more info | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ -| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6015 | Submission failed | Automatically, if there aren't bigger issues | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6016 | Temporal connection issues | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst index 39960413a..17caddcf5 100644 --- a/docs/source/userguide/configure/develop_a_project.rst 
+++ b/docs/source/userguide/configure/develop_a_project.rst @@ -1,6 +1,7 @@ .. _develproject: +==================== Developing a project ==================== @@ -8,14 +9,14 @@ This section contains some examples on how to develop a new project. All files, with the exception of user-defined scripts, are located in the ``/conf`` directory. -Configuration files are written in ``ini`` format. Autosubmit supports user-defined scripts are written in ``bash``, ``python``, and ``R``. +Configuration files are written in ``ini`` format. On the other hand, the user-defined scripts are written in ``bash/python or R`` format. -To configure the experiment, edit ``autosubmit_cxxx.conf``, ``expdef_cxxx.conf``, ``jobs_cxxx.conf`` , ``platforms_cxxx.conf``, and ``proj_cxxx.conf``` in the ``conf`` folder of the experiment. +To configure the experiment, edit ``autosubmit_cxxx.conf``, ``expdef_cxxx.conf``, ``jobs_cxxx.conf`` , ``platforms_cxxx.conf`` and ``proj_cxxx.conf``` in the ``conf`` folder of the experiment. Expdef configuration --------------------- +==================== - ``vi /cxxx/conf/expdef_cxxx.conf`` + vi /cxxx/conf/expdef_cxxx.conf .. code-block:: ini @@ -92,9 +93,9 @@ Expdef configuration FILE_JOBS_CONF = templates/common/jobs.conf Autosubmit configuration ------------------------- +======================== - ``vi /cxxx/conf/autosubmit_cxxx.conf`` + vi /cxxx/conf/autosubmit_cxxx.conf .. code-block:: ini @@ -128,9 +129,9 @@ Autosubmit configuration # [wrappers] Jobs configuration ------------------- +================== - ``vi /cxxx/conf/jobs_cxxx.conf`` + vi /cxxx/conf/jobs_cxxx.conf .. code-block:: ini @@ -234,9 +235,9 @@ Jobs configuration RUNNING = member Platform configuration ----------------------- +====================== - ``vi /cxxx/conf/platforms_cxxx.conf`` + vi /cxxx/conf/platforms_cxxx.conf .. 
code-block:: ini @@ -291,9 +292,9 @@ Platform configuration TEST_SUITE = True Proj configuration ------------------- +================== -After filling the experiment configuration and executing ``autosubmit create cxxx -np``, a copy of the model is stored in ``proj``. +After filling the experiment configuration and running ``autosubmit create cxxx -np``, the user can go into ``proj``, which has a copy of the model. The experiment project contains the scripts specified in ``jobs_cxxx.conf`` and a copy of model source code and data specified in ``expdef_xxxx.conf``. @@ -511,7 +512,7 @@ Example: PISCES_timestep = 3600 Proj configuration:: Full example -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------- This section contains a full example of a valid proj file with a valid user script. @@ -560,7 +561,7 @@ Final script, which is generated by `autosubmit run` or ``autosubmit inspect`` (...) Detailed platform configuration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------- In this section, we describe the platform configuration using `-QOS` and also `PARTITION` @@ -673,7 +674,7 @@ In this section, we describe the platform configuration using `-QOS` and also `P The custom directives can be used for multiple parameters at the same time using the follow syntax. - `vi /conf/platform_cxxx.conf` + vi /conf/platform_cxxx.conf .. code-block:: ini @@ -699,3 +700,20 @@ The custom directives can be used for multiple parameters at the same time using MAX_PROCESSORS = 80 # test [40] / small [40] // large [40] PROCESSORS_PER_NODE = 40 + +Controlling the number of active concurrent tasks in an experiment +---------------------------------------------------------------------- + +In some cases, you may want to control the number of concurrent tasks/jobs that can be active in an experiment. + +To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS`` and ``MAX_WAITING_JOBS`` variables in the ``conf/autosubmit_cxxx.conf`` file. 
+ + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + # Maximum number of submitted, waiting and running tasks + TOTAL_JOBS = 10 + # Maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 + -- GitLab From ba4ea0319f5338708d1ff51b2a1f084f8f2e6e2c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 16 Sep 2022 15:48:55 +0200 Subject: [PATCH 077/213] Now critical issue messages are always shown --- autosubmit/autosubmit.py | 7 ++++--- autosubmit/config/config_common.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6fd5932a3..355260a76 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4341,12 +4341,13 @@ class Autosubmit: raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": - submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) + try: + submitter = Autosubmit._get_submitter(as_conf) + submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform", 7014, e.message) + raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 3f5c39a3b..cc8aa3e1c 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -540,6 +540,8 @@ class AutosubmitConfig(object): # In case that there are critical errors in the configuration, Autosubmit won't continue. 
if running_time is True: raise AutosubmitCritical(e.message, e.code, e.trace) + else: + Log.printlog(e.message+"\n") except Exception as e: raise AutosubmitCritical( "There was an error while showing the config log messages", 7014, str(e)) -- GitLab From 6a9d1f01c731241d11cf1010f37fb7d4ecf07240 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 21 Sep 2022 16:09:27 +0200 Subject: [PATCH 078/213] Patch for db_fix --- autosubmit/autosubmit.py | 20 ++++++++++++-------- requeriments.txt | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 355260a76..60b064de9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1556,7 +1556,8 @@ class Autosubmit: exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) except Exception as e: # This error is important - raise AutosubmitCritical("Error while processing historical database.", 7005, str(e)) + Log.printlog("Error while processing historical database.", 7005, str(e)) + try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -4224,13 +4225,16 @@ class Autosubmit: except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) - Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + 
exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), + as_conf.get_full_config_as_json(), + job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: if group_by: status = list() diff --git a/requeriments.txt b/requeriments.txt index d57974475..c34451db2 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +configparser argparse>=1.2,<2 python-dateutil>2 matplotlib -- GitLab From 46d9cafcdecff1fc614af30ef13aac6fa5021193 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 22 Sep 2022 09:53:04 +0200 Subject: [PATCH 079/213] Patch for db_fix (1) --- autosubmit/autosubmit.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 60b064de9..153c0c8a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1776,9 +1776,22 @@ class Autosubmit: job_list.update_list(as_conf, submitter=submitter) job_list.save() # Safe spot to store changes - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - if len(job_changes_tracker) > 0: - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except BaseException as e: + Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", + Log.INFO) + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + 
exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} if Autosubmit.exit: job_list.save() @@ -1949,8 +1962,16 @@ class Autosubmit: raise AutosubmitCritical("There is a bug in the code, please contact via git",7070,e.message) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.printlog() # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True -- GitLab From fd14bb03d995652453b27427c33e82b3938c8803 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Sep 2022 15:05:21 +0200 Subject: [PATCH 080/213] Does an sql dump everytime a change is detected. 
Then db_fix load this sql dump --- autosubmit/autosubmit.py | 102 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 153c0c8a3..75baab6de 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -58,6 +58,7 @@ import locale from distutils.util import strtobool from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set +import sqlite3 try: import dialog @@ -71,6 +72,7 @@ import tarfile import time import copy import os +import glob import pwd import sys import shutil @@ -1553,11 +1555,14 @@ class Autosubmit: # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) except Exception as e: - # This error is important - Log.printlog("Error while processing historical database.", 7005, str(e)) - + try: + Autosubmit.database_fix(expid) + # This error is important + except: + pass try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -1781,6 +1786,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) @@ -1790,6 
+1796,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} @@ -1965,13 +1972,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) except: - Log.printlog() + pass # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True @@ -3901,6 +3907,17 @@ class Autosubmit: raise @staticmethod + def database_backup(expid): + try: + database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) + backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) + Log.info("Backing up jobs_data...") + subprocess.call(command, shell=True) + Log.result("Jobs_data database backup completed.") + except BaseException as e: + Log.info("Jobs_data database backup failed.") + @staticmethod def database_fix(expid): """ Database methods. Performs a sql dump of the database and restores it. 
@@ -3912,52 +3929,31 @@ class Autosubmit: """ os.umask(0) # Overrides user permissions current_time = int(time.time()) + corrupted_db_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_corrupted.db".format(expid)) + database_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) - database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_{1}.db".format(expid, str(current_time))) - dump_file_name = 'job_data_{0}_{1}.sql'.format(expid, current_time) + database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + dump_file_name = 'job_data_{0}.sql'.format(expid, current_time) dump_file_path = os.path.join(BasicConfig.JOBDATA_DIR, dump_file_name) - bash_command = 'sqlite3 {0} .dump > {1}'.format(database_path, dump_file_path) + bash_command = 'cat {1} | sqlite3 {0}'.format(database_path, dump_file_path) try: - if os.path.exists(database_path): + if os.path.exists(database_path): + result = os.popen("mv {0} {1}".format(database_path, corrupted_db_path)).read() + time.sleep(10) + Log.info("Original database moved.") + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + Log.info("Restoring from sql") result = os.popen(bash_command).read() - if result is not None and os.path.exists(dump_file_path): - Log.info("sqldump {0} created".format(dump_file_path)) - Log.info( - "Backing up original database {0}".format(database_path)) - result = os.popen("mv {0} {1}".format(database_path, database_backup_path)).read() - time.sleep(10) - if result is not None and not os.path.exists(database_path): - Log.info("Original database moved.") - Log.info("Restoring from sqldump") - HUtils.create_file_with_full_permissions(database_path) - result = os.popen("cat {0} | sqlite3 {1}".format( - dump_file_path, database_path)).read() - time.sleep(10) - if result is not None and 
os.path.exists(database_path): - Log.info( - "Database {0} restored.".format(database_path)) - Log.info("Deleting sqldump.") - result = os.popen( - "rm {0}".format(dump_file_path)).read() - sleep(5) - if result is not None and not os.path.exists(dump_file_path): - ExperimentHistory(expid).initialize_database() - Log.info("sqldump file deleted.") - Log.result( - "The database {0} has been fixed.".format(database_path)) - else: - raise Exception( - "The sqldump file could not be removed.") - else: - raise Exception( - "It was not possible to restore the sqldump file.") - else: - raise Exception( - "It was not possible to delete the original database.") - else: - raise Exception("The sqldump file couldn't be created.") - else: - raise Exception("The database file doesn't exist.") + except: + Log.warning("It was not possible to restore the jobs_data.db file... , a new blank db will be created") + result = os.popen("rm {0}".format(database_path)).read() + + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() except Exception as exp: Log.critical(str(exp)) @@ -4243,17 +4239,12 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), 
as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: @@ -5018,6 +5009,7 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) -- GitLab From a1a2492b864712a050ac2cf7556ccf4fa0d4a791 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Sep 2022 09:21:49 +0200 Subject: [PATCH 081/213] database changes #870 --- autosubmit/autosubmit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 75baab6de..337247605 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3912,11 +3912,11 @@ class Autosubmit: database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) - Log.info("Backing up jobs_data...") + Log.debug("Backing up jobs_data...") subprocess.call(command, shell=True) - Log.result("Jobs_data database backup completed.") + Log.debug("Jobs_data database backup completed.") except BaseException as e: - Log.info("Jobs_data database backup failed.") + Log.debug("Jobs_data database backup failed.") @staticmethod def database_fix(expid): """ -- GitLab From 61a5b9cba349381c6cd421f219a7f8e4e959b34a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:03 +0200 Subject: [PATCH 
082/213] #877 conda typo --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 64b314886..4f68c3788 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -183,7 +183,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. # Create conda environment conda env update -f environment.yml -n autosubmit python=2 # Activate env - source activate autosubmit + conda activate autosubmit # Test autosubmit autosubmit -v # Configure autosubmitrc and install database as indicated in this doc -- GitLab From 315b55c2eef0be1a3cebf0203e8746c9a4dd89a9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:53 +0200 Subject: [PATCH 083/213] #877 changed version to the lastest one (3.14.0b) --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 4f68c3788..7159ac7c0 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -178,7 +178,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download git apt install git -y -q # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b cd autosubmit # Create conda environment conda env update -f environment.yml -n autosubmit python=2 -- GitLab From 1ebc81cab2c26df63628e956ab4a27a683adc229 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 13:03:40 +0200 Subject: [PATCH 084/213] #inline comments, fixes for slrum --- autosubmit/autosubmit.py | 20 ++++-- autosubmit/platforms/paramiko_platform.py | 74 ++++++++++++++++------- test/regression/tests_runner.py | 1 + 3 files changed, 69 insertions(+), 26 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 337247605..09ce96335 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1584,7 +1584,7 @@ class Autosubmit: if unparsed_two_step_start != "": job_list.parse_jobs_by_filter(unparsed_two_step_start) - main_loop_retrials = 3650 # Hard limit of tries 3650 tries at 15-120seconds sleep each try + main_loop_retrials = 11250*2 # Hard limit of tries ( 48h min 72h max), 2 retrials per stop # establish the connection to all platforms Autosubmit.restore_platforms(platforms_to_test) @@ -1822,7 +1822,7 @@ class Autosubmit: Log.printlog("Error trying to store failed job count",Log.WARNING) Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: - delay = min(15 * consecutive_retrials, 120) + delay = min(15 * consecutive_retrials, 30) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1959,7 +1959,7 @@ class Autosubmit: except BaseException: reconnected = False if main_loop_retrials <= 0: - raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of 4hours reached", 7051, e.message) + raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of {0} 
retrials reached".format(main_loop_retrials), 7051, e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) except portalocker.AlreadyLocked: @@ -3322,7 +3322,12 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser @staticmethod def describe(experiment_id): """ @@ -3497,6 +3502,7 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") + parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3591,6 +3597,8 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) + parser = Autosubmit.removeInlineComments(parser) + if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') if parser.has_option('database', 'filename'): @@ -3723,11 +3731,15 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) + parser = Autosubmit.removeInlineComments(parser) + parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", width=50, height=5) os.system('clear') + + except (IOError, OSError) as e: raise AutosubmitCritical( "Can not write config file", 7012, e.message) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e1b36f116..fb9059915 100644 --- 
a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -550,35 +550,63 @@ class ParamikoPlatform(Platform): cmd = self.get_checkAlljobs_cmd(job_list_cmd) sleep_time = 5 sleep(sleep_time) - self.send_command(cmd) - while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + slurm_error = False + e_msg = "" + try: self.send_command(cmd) - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - retries -= 1 - sleep(sleep_time) - sleep_time = sleep_time + 5 + except AutosubmitError as e: + e_msg = e.trace+" "+e.message + slurm_error = True + if not slurm_error: + while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + try: + self.send_command(cmd) + except AutosubmitError as e: + e_msg = e.trace + " " + e.message + slurm_error = True + break + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + retries -= 1 + sleep(sleep_time) + sleep_time = sleep_time + 5 + job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') in_queue_jobs = [] list_queue_jobid = "" for job in job_list: - job_id = job.id - job_status = self.parse_Alljobs_output(job_list_status, job_id) - while len(job_status) <= 0 and retries >= 0: - retries -= 1 - self.send_command(cmd) - job_list_status = self.get_ssh_output() + if not slurm_error: + job_id = job.id job_status = self.parse_Alljobs_output(job_list_status, job_id) - if len(job_status) <= 0: - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - sleep(sleep_time) - sleep_time = sleep_time + 5 - # URi: define status list in HPC Queue Class + while len(job_status) <= 0 and retries >= 0: + retries -= 1 + 
self.send_command(cmd) + job_list_status = self.get_ssh_output() + job_status = self.parse_Alljobs_output(job_list_status, job_id) + if len(job_status) <= 0: + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + sleep(sleep_time) + sleep_time = sleep_time + 5 + # URi: define status list in HPC Queue Class + else: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -595,12 +623,12 @@ class ParamikoPlatform(Platform): elif retries == 0: job_status = Status.COMPLETED job.update_status(remote_logs) - else: job_status = Status.UNKNOWN Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) job.new_status = job_status + reason = str() if self.type == 'slurm' and len(in_queue_jobs) > 0: cmd = self.get_queue_status_cmd(list_queue_jobid) @@ -639,6 +667,8 @@ class ParamikoPlatform(Platform): 'check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status + if slurm_error: + raise AutosubmitError(e_msg, 6000) def get_jobid_by_jobname(self,job_name,retries=2): """ diff --git a/test/regression/tests_runner.py b/test/regression/tests_runner.py index ffd490888..ab186e849 100644 --- a/test/regression/tests_runner.py +++ 
b/test/regression/tests_runner.py @@ -79,6 +79,7 @@ def run(current_experiment_id, only_list=None, exclude_list=None, max_threads=5) tests_parser.optionxform = str tests_parser.read(tests_parser_file) + # Resetting the database clean_database(db_path) create_database() -- GitLab From 8b912ea7b2b65be3aa7128ec8d03d4f730be35ce Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 15:43:21 +0200 Subject: [PATCH 085/213] Remove inline comments working #870 --- autosubmit/autosubmit.py | 13 +++---------- autosubmit/config/config_common.py | 9 +++++++++ autosubmit/config/config_parser.py | 5 ++++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 09ce96335..82e4b44e9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3322,12 +3322,7 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - @staticmethod - def removeInlineComments(cfgparser): - for section in cfgparser.sections(): - for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) - return cfgparser + @staticmethod def describe(experiment_id): """ @@ -3502,7 +3497,6 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") - parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3597,7 +3591,6 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) - parser = Autosubmit.removeInlineComments(parser) if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') @@ -3731,8 +3724,6 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 
'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) - parser = Autosubmit.removeInlineComments(parser) - parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", @@ -5398,10 +5389,12 @@ class Autosubmit: raise AutosubmitCritical('Can not test a RERUN experiment', 7014) content = open(as_conf.experiment_file).read() + if random_select: if hpc is None: platforms_parser = as_conf.get_parser( ConfigParserFactory(), as_conf.platforms_file) + test_platforms = list() for section in platforms_parser.sections(): if platforms_parser.get_option(section, 'TEST_SUITE', 'false').lower() == 'true': diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index cc8aa3e1c..74dcc3e1e 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1759,6 +1759,13 @@ class AutosubmitConfig(object): commit = self.get_git_project_commit() return origin_exists and (branch is not None or commit is not None) + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser + @staticmethod def get_parser(parser_factory, file_path): """ @@ -1794,5 +1801,7 @@ class AutosubmitConfig(object): raise Exception( "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) + #remove inline comments + parser = AutosubmitConfig.removeInlineComments(parser) return parser diff --git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 87b28456a..99d92fd8c 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -14,8 +14,11 @@ class ConfigParserFactory: def __init__(self): pass + + def create_parser(self): - return ConfigParser() + parser = ConfigParser() + return parser class ConfigParser(ConfPar, object): -- GitLab From 
2c5a30087b20ec3e0df2e7d7449ba964ea9d0275 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 16:04:02 +0200 Subject: [PATCH 086/213] setstatus doesn't crash anymore if the id does not exists --- autosubmit/autosubmit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 82e4b44e9..37aa84475 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4467,7 +4467,10 @@ class Autosubmit: if job.status in [Status.SUBMITTED, Status.QUEUING, Status.HELD] and final_status not in [Status.QUEUING, Status.HELD, Status.SUSPENDED]: job.hold = False if job.platform_name and job.platform_name.lower() != "local": - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + try: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + except: + pass elif job.status in [Status.QUEUING, Status.RUNNING, Status.SUBMITTED] and final_status == Status.SUSPENDED: if job.platform_name and job.platform_name.lower() != "local": job.platform.send_command("scontrol hold " + "{0}".format(job.id), ignore_log=True) -- GitLab From 8244e8104c475bfd45b538cb0fe5f52ce2f9f44f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 11:08:39 +0200 Subject: [PATCH 087/213] Fixed e message error --- autosubmit/autosubmit.py | 14 +++++++------- autosubmit/config/config_common.py | 4 ++-- autosubmit/git/autosubmit_git.py | 4 ++-- autosubmit/job/job.py | 8 ++++---- autosubmit/job/job_dict.py | 2 +- autosubmit/job/job_list.py | 6 +++--- autosubmit/monitor/monitor.py | 2 +- autosubmit/platforms/paramiko_platform.py | 16 ++++++++-------- test/regression/tests_utils.py | 2 +- 9 files changed, 29 insertions(+), 29 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 37aa84475..2fca7cb7b 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -590,7 +590,7 @@ class Autosubmit: except Exception as e: 
if type(e) is SystemExit: # Version keyword force an exception in parse arg due and os_exit(0) but the program is succesfully finished - if e.message == 0: + if str(e) == 0: print(Autosubmit.autosubmit_version) os._exit(0) raise AutosubmitCritical( @@ -836,28 +836,28 @@ class Autosubmit: if ret: Log.result("Experiment {0} deleted".format(expid_delete)) except BaseException as e: - error_message += 'Can not delete experiment entry: {0}\n'.format(e.message) + error_message += 'Can not delete experiment entry: {0}\n'.format(str(e)) Log.info("Removing experiment directory...") try: shutil.rmtree(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)) except BaseException as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) try: Log.info("Removing Structure db...") structures_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, "structure_{0}.db".format(expid_delete)) if os.path.exists(structures_path): os.remove(structures_path) except BaseException as e: - error_message += 'Can not delete structure: {0}\n'.format(e.message) + error_message += 'Can not delete structure: {0}\n'.format(str(e)) try: Log.info("Removing job_data db...") job_data_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid_delete)) if os.path.exists(job_data_path): os.remove(job_data_path) except BaseException as e: - error_message += 'Can not delete job_data: {0}\n'.format(e.message) + error_message += 'Can not delete job_data: {0}\n'.format(str(e)) except OSError as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) else: if not eadmin: raise AutosubmitCritical( @@ -1811,7 +1811,7 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - 
consecutive_retrials = 0 + consecutive_retrials = 1 failed_names = {} Log.info("Storing failed job count...") try: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 74dcc3e1e..ddbb04c78 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -513,11 +513,11 @@ class AutosubmitConfig(object): self.reload() except IOError as e: raise AutosubmitError( - "I/O Issues con config files", 6016, e.message) + "I/O Issues con config files", 6016, str(e)) except (AutosubmitCritical, AutosubmitError) as e: raise except BaseException as e: - raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,e.message) + raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,str(e)) # Annotates all errors found in the configuration files in dictionaries self.warn_config and self.wrong_config. self.check_expdef_conf() self.check_platforms_conf() diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 817b5e09b..c191c21df 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -60,7 +60,7 @@ class AutosubmitGit: shell=True) except subprocess.CalledProcessError as e: raise AutosubmitCritical( - "Failed to retrieve git info ...", 7064, e.message) + "Failed to retrieve git info ...", 7064, str(e)) if output: Log.info("Changes not committed detected... 
SKIPPING!") raise AutosubmitCritical("Commit needed!", 7013) @@ -231,7 +231,7 @@ class AutosubmitGit: output_1 = subprocess.check_output(command_1, shell=True) except BaseException as e: submodule_failure = True - Log.printlog("Trace: {0}".format(e.message), 6014) + Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( "Submodule {0} has a wrong configuration".format(submodule), 6014) else: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9365e516f..6653c51f9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -600,13 +600,13 @@ class Job(object): self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - e.message, self.name)) + str(e), self.name)) except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) return @threaded @@ -656,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 0b16d29af..d0aef9f42 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -126,7 +126,7 @@ class DicJobs: except BaseException as e: raise AutosubmitCritical( "Wrong format for {1} parameter in section {0}".format(section,called_from), 7011, - e.message) + str(e)) pass return parsed_list def read_section(self, section, priority, default_job_type, jobs_data=dict()): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index b26f24e74..4c2712267 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -249,7 +249,7 @@ class JobList(object): else: self._ordered_jobs_by_date_member[wrapper_section] = {} except BaseException as e: - raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,e.message) + raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,str(e)) pass @@ -1417,11 +1417,11 @@ class JobList(object): self._persistence_file, self._job_list if self.run_members is None or job_list is None else job_list) pass except BaseException as e: - raise AutosubmitError(e.message,6040,"Failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Failure while saving the job_list") except AutosubmitError as e: raise except BaseException as e: - raise AutosubmitError(e.message,6040,"Unknown failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Unknown failure while saving the job_list") def backup_save(self): diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 55c60156a..9556e7d3d 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -353,7 +353,7 @@ class Monitor: except: pass - 
Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only writted in txt".format(e.message),7014) + Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only written in txt".format(e.message),7014) def generate_output_txt(self, expid, joblist, path, classictxt=False, job_list_object=None): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index fb9059915..1c1177510 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -112,7 +112,7 @@ class ParamikoPlatform(Platform): except EOFError as e: self.connected = False raise AutosubmitError("[{0}] not alive. Host: {1}".format( - self.name, self.host), 6002, e.message) + self.name, self.host), 6002, str(e)) except (AutosubmitError,AutosubmitCritical,IOError): self.connected = False raise @@ -136,7 +136,7 @@ class ParamikoPlatform(Platform): self.host.split(',')[0]), 6002) else: raise AutosubmitCritical( - "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,e.message) + "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: self.connect(True) @@ -155,7 +155,7 @@ class ParamikoPlatform(Platform): raise except Exception as e: raise AutosubmitCritical( - 'Cant connect to this platform due an unknown error', 7050, e.message) + 'Cant connect to this platform due an unknown error', 7050, str(e)) def threaded(fn): def wrapper(*args, **kwargs): @@ -219,12 +219,12 @@ class ParamikoPlatform(Platform): elif "name or service not known" in e.strerror.lower(): raise SSHException(" {0} doesn't accept remote connections. 
Check if there is an typo in the hostname".format(self.host)) else: - raise AutosubmitError("File can't be located due an slow connection", 6016, e.message) + raise AutosubmitError("File can't be located due an slow connection", 6016, str(e)) except BaseException as e: self.connected = False - if "Authentication failed." in e.message: + if "Authentication failed." in str(e): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( - self._host_config['hostname']), 7050, e.message) + self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: self.restore_connection(reconnect=True) else: @@ -284,7 +284,7 @@ class ParamikoPlatform(Platform): return True except IOError as e: raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) + self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -358,7 +358,7 @@ class ParamikoPlatform(Platform): except BaseException as e: Log.error('Could not remove file {0} due a wrong configuration'.format( os.path.join(self.get_files_path(), filename))) - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: raise AutosubmitCritical( "Wrong User or invalid .ssh/config. 
Or invalid user in platform.conf or public key not set ", 7051, e.message) diff --git a/test/regression/tests_utils.py b/test/regression/tests_utils.py index 297fb8f75..53ead0dd5 100644 --- a/test/regression/tests_utils.py +++ b/test/regression/tests_utils.py @@ -23,7 +23,7 @@ def check_cmd(command, path=BIN_PATH, verbose='AS_TEST_VERBOSE' in os.environ): except subprocess.CalledProcessError as e: if verbose: - print e.output + print str(e) return False -- GitLab From 5e068242989d2ae25aa4565901a91fcde1e0bf50 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:16:06 +0200 Subject: [PATCH 088/213] log error --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index ddbb04c78..4b683f1e4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1763,7 +1763,10 @@ class AutosubmitConfig(object): def removeInlineComments(cfgparser): for section in cfgparser.sections(): for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + try: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + except: + pass return cfgparser @staticmethod -- GitLab From ad2a39015db8780ae924e0939effa2ad2473790e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:33:32 +0200 Subject: [PATCH 089/213] CUSTOM directive has # crashing with the removeinlinecomments --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 4b683f1e4..50c4d69e8 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1764,7 +1764,10 @@ class AutosubmitConfig(object): for section in cfgparser.sections(): for item in cfgparser.items(section): try: - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + if 
str(item[0]).upper() == "CUSTOM_DIRECTIVES": + pass + else: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) except: pass return cfgparser -- GitLab From 481dcd9788dc2b1828e3ff8c370cac98ff4737ad Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:01:34 +0200 Subject: [PATCH 090/213] Changed delete message, added complete list of directories --- autosubmit/autosubmit.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 2fca7cb7b..0720672e7 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -818,6 +818,16 @@ class Autosubmit: :return: True if succesfully deleted, False otherwise :rtype: boolean """ + message = "The {0} experiment was removed from the local disk and from the database.".format(expid_delete) + message+= " Note that this action does not delete any data written by the experiment.\n" + message+= "Complete list of files/directories deleted:\n" + for root, dirs, files in os.walk(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): + for dir in dirs: + message += os.path.join(root, dir) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, + "structure_{0}.db".format(expid_delete)) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, + "job_data_{0}.db".format(expid_delete)) + "\n" owner,eadmin,currentOwner = Autosubmit._check_ownership(expid_delete) if expid_delete == '' or expid_delete is None and not os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR,expid_delete)): Log.printlog("Experiment directory does not exist.",Log.WARNING) @@ -865,6 +875,7 @@ class Autosubmit: else: raise AutosubmitCritical( 'Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7012) + Log.printlog(message, Log.RESULT) except Exception as e: # Avoid calling Log at this point since it is possible that tmp folder is already deleted. 
error_message += "Couldn't delete the experiment".format(e.message) -- GitLab From dc6909c7a8c9d9122a9f1ff3207e322f76cb8aca Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:19:47 +0200 Subject: [PATCH 091/213] disable inline delete --- autosubmit/config/config_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 50c4d69e8..63b31483d 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1808,6 +1808,6 @@ class AutosubmitConfig(object): "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) #remove inline comments - parser = AutosubmitConfig.removeInlineComments(parser) + #parser = AutosubmitConfig.removeInlineComments(parser) return parser -- GitLab From a929856cfd3e3cddf888178a6fddc8e0ca2dfb88 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 16:11:47 +0200 Subject: [PATCH 092/213] Fixed node missconfiguration slurm message not being detected correclty --- autosubmit/autosubmit.py | 10 ++++++---- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 4 +++- autosubmit/platforms/slurmplatform.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0720672e7..48e5b2e28 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2191,11 +2191,11 @@ class Autosubmit: platform.cancel_job(id) jobs_id = None platform.connected = False - if type(e.trace) is not None: - has_trace_bad_parameters = e.trace.lower().find("bad parameters") != -1 + if e.trace is not None: + has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 else: has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or 
e.message.lower().find("scheduler is not installed") != -1: + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2206,7 +2206,9 @@ class Autosubmit: else: error_message+="Check that {1} platform has set the correct scheduler. Sections that could be affected: {0}".format( error_msg[:-1], platform.name) - raise AutosubmitCritical(error_message,7014,e.message+"\n"+e.trace) + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) except IOError as e: raise AutosubmitError( "IO issues ", 6016, e.message) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 52afa70cc..a3a6a3b58 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -155,7 +155,7 @@ class JobPackageBase(object): exit=True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 1f577426f..12e1e70bc 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -203,6 +203,8 @@ class ParamikoSubmitter(Submitter): if parser.has_option(section, 'SERIAL_PLATFORM'): platforms[section.lower()].serial_platform = 
platforms[parser.get_option(section, 'SERIAL_PLATFORM', - None).lower()] + None)] + if platforms[section.lower()].serial_platform is not None: + platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 5d31690c4..d757256a4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -466,7 +466,7 @@ class SlurmPlatform(ParamikoPlatform): else: retries = 9999 except BaseException as e: # Unrecoverable error - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 -- GitLab From 93f0a58cf4888cd893b56ca996d46d53b534cacf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 11:16:06 +0200 Subject: [PATCH 093/213] Added include_members and chunks #748 --- autosubmit/job/job_dict.py | 36 ++++++++++++++++++++++++++++++------ test/unit/test_dic_jobs.py | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index d0aef9f42..b7e6b4a6d 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -152,11 +152,19 @@ class DicJobs: elif running == 'date': self._create_jobs_startdate(section, priority, frequency, default_job_type, jobs_data,splits) elif running == 'member': - self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits,self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits, \ + self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + elif running == 'chunk': 
synchronize = self.get_option(section, "SYNCHRONIZE", None) delay = int(self.get_option(section, "DELAY", -1)) - self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data,excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"),excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data, \ + excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"), \ + excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + included_chunks=self.parse_relation(section,False,self.get_option(section, "INCLUDED_CHUNKS", []),"INCLUDED_CHUNKS"), \ + included_members=self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + pass def _create_jobs_once(self, section, priority, default_job_type, jobs_data=dict(),splits=0): @@ -218,7 +226,7 @@ class DicJobs: - def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[]): + def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[],included_members=[]): """ Create jobs to be run once per member @@ -242,11 +250,18 @@ class DicJobs: count = 0 if splits > 0: for member in self._member_list: - if self._member_list.index(member) not in excluded_members: - tmp_dic[section][date][member] = [] + if len(included_members) == 0: + if self._member_list.index(member) not in excluded_members: + tmp_dic[section][date][member] = [] + else: + if self._member_list.index(member) in included_members: + tmp_dic[section][date][member] = [] for member in self._member_list: if 
self._member_list.index(member) in excluded_members: continue + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue count += 1 if count % frequency == 0 or count == len(self._member_list): if splits <= 0: @@ -259,7 +274,7 @@ class DicJobs: - def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[]): + def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]): """ Create jobs to be run once per chunk @@ -282,6 +297,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): @@ -311,6 +329,9 @@ class DicJobs: for date in self._date_list: self._dic[section][date] = dict() for member in self._member_list: + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue if self._member_list.index(member) in excluded_members: continue self._dic[section][date][member] = dict() @@ -318,6 +339,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 5565c9328..39f7690b2 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -123,7 +123,7 @@ class TestDicJobs(TestCase): self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() 
self.dictionary._create_jobs_member.assert_not_called() - self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[]) + self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]) def test_dic_creates_right_jobs_by_startdate(self): # arrange -- GitLab From 7d652f65af6dfd733425417129a1962fe36aa5e9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:17:17 +0200 Subject: [PATCH 094/213] Bugfix timeout #812 --- autosubmit/platforms/locplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 3fe62f5cc..e7734b133 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -83,7 +83,7 @@ class LocalPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From 005443718398f3c7bca61a0eadea85c4339b3987 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:29:32 +0200 Subject: [PATCH 095/213] Erased wrong info about TOTAL_JOBS --- .../html/_sources/usage/new_platform.rst.txt | 2 +- docs/source/userguide/wrappers/index.rst | 73 ++++++------------- 2 files changed, 25 insertions(+), 50 deletions(-) diff --git a/docs/build/html/_sources/usage/new_platform.rst.txt b/docs/build/html/_sources/usage/new_platform.rst.txt index 675d4edc6..971778061 100644 --- a/docs/build/html/_sources/usage/new_platform.rst.txt +++ 
b/docs/build/html/_sources/usage/new_platform.rst.txt @@ -53,7 +53,7 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. +* MAX_WAITING_JOBS: Maximum number of jobs to be queuing or submitted in this platform. * TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. diff --git a/docs/source/userguide/wrappers/index.rst b/docs/source/userguide/wrappers/index.rst index 2ee2a34e1..388c215ef 100644 --- a/docs/source/userguide/wrappers/index.rst +++ b/docs/source/userguide/wrappers/index.rst @@ -1,5 +1,6 @@ +############ Wrappers -======== +############ In order to understand the goal of this feature, please take a look at: https://earth.bsc.es/wiki/lib/exe/fetch.php?media=library:seminars:techniques_to_improve_the_throughput.pptx @@ -13,37 +14,9 @@ At the moment there are 4 types of wrappers that can be used depending on the ex When using the wrapper, it is useful to be able to visualize which packages are being created. So, when executing *autosubmit monitor cxxx*, a dashed box indicates the jobs that are wrapped together in the same job package. -How to configure ----------------- - -In ``autosubmit_cxxx.conf``, regardless of the wrapper type, you need to make sure that the values of the variables **MAXWAITINGJOBS** and **TOTALJOBS** are increased according to the number of jobs expected to be waiting/running at the same time in your experiment. - -For example: - -.. code-block:: ini - - [config] - EXPID = .... - AUTOSUBMIT_VERSION = 3.13.0 - ... - - MAXWAITINGJOBS = 100 - TOTALJOBS = 100 - ... - -and below the [config] block, add the wrapper directive, indicating the wrapper type: - -.. code-block:: ini - - [wrapper] - TYPE = - -You can also specify which job types should be wrapped. 
This can be done using the **JOBS_IN_WRAPPER** parameter. -It is only required for the vertical-mixed type (in which the specified job types will be wrapped together), so if nothing is specified, all jobs will be wrapped. -By default, jobs of the same type will be wrapped together, as long as the constraints are satisfied. Number of jobs in a package -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*************************** .. code-block:: ini @@ -56,7 +29,7 @@ Number of jobs in a package - **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` - - If **MAX_WRAPPED** is not defined, then **TOTALJOBS** is used by default + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. - **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain - If not defined, it considers that **MIN_WRAPPED** is 2. - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. @@ -66,14 +39,14 @@ Number of jobs in a package Wrapper check time -~~~~~~~~~~~~~~~~~~ +********************** It is possible to override the **SAFETYSLEEPTIME** for the wrapper, by using **CHECK_TIME_WRAPPER** and defining a time interval (in seconds) in which the wrapper internal jobs should be checked. .. important:: Note that the **numbers** shown in this documentation are examples. The actual values must be set according to the specific workflow, as well as the platform configurations. Vertical wrapper ----------------- +======================= The vertical wrapper is more appropriate when there are many sequential jobs. 
To use it, set TYPE = vertical: @@ -93,7 +66,7 @@ In order to be able to use the vertical wrapper, in ``platforms_cxxx.conf`` set Remember to add to each job the corresponding WALLCLOCK time. Vertical with multiple sections -------------------------------- +=============================== This is a mode of the vertical wrapper that allows jobs of different types to be wrapped together. Note that the solution considers the order of the sections defined in the ``jobs_cxxx.conf`` file, so the order of the sections given in **JOBS_IN_WRAPPER** is irrelevant. @@ -105,20 +78,23 @@ Additionally, jobs are grouped within the corresponding date, member and chunk h TYPE = vertical JOBS_IN_WRAPPER = SIM&SIM2 # REQUIRED -.. figure:: fig/vertical-mixed.png +.. figure:: ../../workflows/vertical-mixed.png :name: vertical-mixed :width: 100% :align: center :alt: vertical-mixed wrapper Horizontal wrapper ------------------- +========================== The horizontal wrapper is more appropriate when there are multiple ensemble members that can be run in parallel. If the wrapped jobs have an mpirun call, they will need machine files to specify in which nodes each job will run. Different cases may need specific approaches when creating the machine files. For auto-ecearth use COMPONENTS instead of STANDARD. +Horizontal wrapper +********************** + .. code-block:: ini [wrapper] @@ -135,14 +111,14 @@ In order to be able to use the horizontal wrapper, in ``platforms_cxxx.conf`` se ... MAX_PROCESSORS = 2400 -.. figure:: fig/horizontal_remote.png +.. figure:: ../../workflows/horizontal_remote.png :name: horizontal_remote :width: 60% :align: center :alt: horizontally wrapped jobs Shared-memory Experiments -~~~~~~~~~~~~~~~~~~~~~~~~~ +************************* There is also the possibility of setting the option **METHOD** to SRUN in the wrapper directive (**ONLY** for vertical and vertical-horizontal wrappers). 
@@ -155,13 +131,13 @@ This allows to form a wrapper with shared-memory paradigm instead of rely in mac METHOD = srun # default ASTHREAD Hybrid wrapper --------------- +========================== The hybrid wrapper is a wrapper that works both vertically and horizontally at the same time, meaning that members and chunks can be wrapped in one single job. Mixed approach using a combination of horizontal and vertical wrappers and the list of jobs is a list of lists. Horizontal-vertical -------------------- +=========================== - There is a dependency between lists. Each list runs after the previous one finishes; the jobs within the list run in parallel at the same time - It is particularly suitable if there are jobs of different types in the list with different wall clocks, but dependencies between jobs of different lists; it waits for all the jobs in the list to finish before starting the next list @@ -174,7 +150,7 @@ Horizontal-vertical MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM&DA -.. figure:: fig/dasim.png +.. figure:: ../../workflows/dasim.png :name: wrapper_horizontal_vertical :width: 100% :align: center @@ -182,7 +158,7 @@ Horizontal-vertical Vertical-horizontal -------------------- +=========================== - In this approach, each list is independent of each other and run in parallel; jobs within the list run one after the other - It is particularly suitable for running many sequential ensembles @@ -195,15 +171,14 @@ Vertical-horizontal MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM -.. figure:: fig/vertical-horizontal.png +.. figure:: ../../workflows/vertical-horizontal.png :name: wrapper_vertical_horizontal :width: 100% :align: center :alt: hybrid wrapper Multiple wrappers at once -------------------------- - +========================= This is an special mode that allows you to use multiple **independent** wrappers on the same experiment. By using an special variable that allows to define subwrapper sections .. 
code-block:: ini @@ -220,14 +195,14 @@ This is an special mode that allows you to use multiple **independent** wrappers TYPE = vertical JOBS_IN_WRAPPER = DA&REDUCE -.. figure:: fig/multiple_wrappers.png +.. figure:: ../workflows/multiple_wrappers.png :name: :width: 100% :align: center :alt: multi wrapper Summary -------- +========================== In `autosubmit_cxxx.conf`: @@ -238,7 +213,7 @@ In `autosubmit_cxxx.conf`: # JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM # METHOD : Select between MACHINESFILES or Shared-Memory. # MIN_WRAPPED set the minim number of jobs that should be included in the wrapper. DEFAULT = 2 - # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS + # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = 99999999999 # Policy : Select the behaviour of the inner jobs Strict/Flexible/Mixed # EXTEND_WALLCLOCK: Allows to extend the wallclock by the max wallclock of the horizontal package (max inner job). Values are integer units (0,1,2) # RETRIALS : Enables a retrial mechanism for vertical wrappers, or default retrial mechanism for the other wrappers @@ -247,7 +222,7 @@ In `autosubmit_cxxx.conf`: TYPE = Vertical #REQUIRED JOBS_IN_WRAPPER = SIM # Job types (as defined in jobs_cxxx.conf) separated by space. REQUIRED only if vertical-mixed MIN_WRAPPED = 2 - MAX_WRAPPED = 9999 # OPTIONAL. Integer value, overrides TOTALJOBS + MAX_WRAPPED = 999999 # OPTIONAL. Integer value. CHECK_TIME_WRAPPER = # OPTIONAL. 
Time in seconds, overrides SAFETYSLEEPTIME POLICY = flexible # OPTIONAL, Wrapper policy, mixed, flexible, strict QUEUE = bsc_es # If not specified, queue will be the same of the first SECTION specified on JOBS_IN_WRAPPER -- GitLab From 41d91d50d34557006390d826584d68938087effd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:38:51 +0200 Subject: [PATCH 096/213] Added wrapper info under devel_proj -> Controling the number of active concurrent tasks in an experiment #857 --- .../userguide/configure/develop_a_project.rst | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst index 17caddcf5..0dda37b3c 100644 --- a/docs/source/userguide/configure/develop_a_project.rst +++ b/docs/source/userguide/configure/develop_a_project.rst @@ -712,8 +712,34 @@ To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS .. code-block:: ini - # Maximum number of submitted,waiting and running tasks - TOTAL_JOBS = 10 - # Maximum number of submitted and waiting tasks - MAX_WAITING_JOBS = 10 + # Controls the maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Controls the maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 +To control the number of jobs included in a wrapper, you can use the `MAX_WRAPPED_JOBS` and `MIN_WRAPPED_JOBS` variables in the ``conf/autosubmit_cxxx.conf`` file. + +Note that a wrapped job is counted as a single job regardless of the number of tasks it contains. Therefore, `TOTAL_JOBS` and `MAX_WAITING_JOBS` won't have an impact inside a wrapper. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + [wrapper] + TYPE = + MIN_WRAPPED = 2 # Minium amount of jobs that will be wrapped together in any given time. + MIN_WRAPPED_H = 2 # Same as above but only for the horizontal packages. 
+ MIN_WRAPPED_V = 2 # Same as above but only for the vertical packages. + MAX_WRAPPED = 99999 # Maximum amount of jobs that will be wrapped together in any given time. + MAX_WRAPPED_H = 99999 # Same as above but only for the horizontal packages. + MAX_WRAPPED_V = 99999 # Same as above but only for the vertical packages. + +- **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section + - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. +- **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain + - If not defined, it considers that **MIN_WRAPPED** is 2. + - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. + - If **POLICY** is mixed and there are failed jobs inside a wrapper, these jobs will be submitted as individual jobs. + - If **POLICY** is strict and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will not be submitted until there are enough tasks to build a package. + - strict and mixed policies can cause **deadlocks**. 
-- GitLab From b1bb2535949f0da796fe23215be0cf3cebd912e2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 16:07:50 +0200 Subject: [PATCH 097/213] Deleted argcomplete --- autosubmit/autosubmit.py | 4 +--- setup.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 48e5b2e28..ccb1bbac9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# PYTHON_ARGCOMPLETE_OK # Copyright 2015-2020 Earth Sciences Department, BSC-CNS @@ -65,7 +64,7 @@ try: except Exception: dialog = None from time import sleep -import argparse, argcomplete +import argparse import subprocess import json import tarfile @@ -583,7 +582,6 @@ class Autosubmit: # Changelog subparsers.add_parser('changelog', description='show changelog') - argcomplete.autocomplete(parser) args = parser.parse_args() diff --git a/setup.py b/setup.py index 8e56eb8c5..a5a7801ef 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ -- GitLab From b945cdd83dc16d11a0bcd205ae8785a80e47867c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:33:10 +0200 Subject: [PATCH 098/213] Fixed an issue with main_platform = local and no platforms configured --- autosubmit/config/config_common.py | 8 +++++--- autosubmit/history/data_classes/job_data.py | 3 ++- autosubmit/job/job_dict.py | 7 ++++--- 
autosubmit/platforms/psplatform.py | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 63b31483d..26ce6ec50 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -607,9 +607,7 @@ class AutosubmitConfig(object): """ Checks experiment's queues configuration file. """ - if len(self._platforms_parser.sections()) == 0: - self.wrong_config["Platform"] += [["Global", - "Platform file is not well-configured or found"]] + if len(self._platforms_parser.sections()) != len(set(self._platforms_parser.sections())): self.wrong_config["Platform"] += [["Global", @@ -619,7 +617,11 @@ class AutosubmitConfig(object): main_platform_found = True elif self.ignore_undefined_platforms: main_platform_found = True + if len(self._platforms_parser.sections()) == 0 and not main_platform_found: + self.wrong_config["Platform"] += [["Global", + "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): + if section in self.hpcarch: main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index b5249b797..93a88797a 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -57,7 +57,8 @@ class JobData(object): platform) > 0 else "NA" self.job_id = job_id if job_id else 0 try: - self.extra_data_parsed = loads(extra_data) + if extra_data != "": + self.extra_data_parsed = loads(extra_data) except Exception as exp: self.extra_data_parsed = {} # Fail fast self.extra_data = extra_data diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index b7e6b4a6d..29ca59e28 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -402,9 +402,10 @@ class DicJobs: for d in self._date_list: self._get_date(jobs, dic, d, member, 
chunk) try: - if type(jobs[0]) is list: - jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] - jobs = jobs_flattened + if len(jobs) > 0: + if type(jobs[0]) is list: + jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] + jobs = jobs_flattened except BaseException as e: pass return jobs diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index aee3e4eb7..e2c3ede88 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ -76,7 +76,7 @@ class PsPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From f61d45fa86eed705989935f50aaefc2d5b32e828 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:41:04 +0200 Subject: [PATCH 099/213] fixed tests --- requeriments.txt | 1 + test/unit/test_dic_jobs.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requeriments.txt b/requeriments.txt index c34451db2..b5783046b 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +pytest==2.9.2 configparser argparse>=1.2,<2 python-dateutil>2 diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 39f7690b2..f955f96dc 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -81,9 +81,10 @@ class TestDicJobs(TestCase): frequency = 123 splits = 0 excluded_list_m = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='member') - self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m,included_list_m]) 
self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() @@ -95,7 +96,7 @@ class TestDicJobs(TestCase): # assert self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() - self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m) + self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m,included_list_m) self.dictionary._create_jobs_chunk.assert_not_called() def test_read_section_running_chunk_create_jobs_chunk(self): @@ -108,9 +109,11 @@ class TestDicJobs(TestCase): splits = 0 excluded_list_c = [] excluded_list_m = [] + included_list_c = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='chunk') - self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m,included_list_c,included_list_m]) self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() -- GitLab From 9640d066e2f04a1803cbed5bae83ad8840911410 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 7 Oct 2022 13:10:25 +0200 Subject: [PATCH 100/213] updateversion does not need -v, and now stores the change in the db #882 #881 --- autosubmit/autosubmit.py | 25 +++++++++++++------------ autosubmit/config/config_common.py | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ccb1bbac9..be15c0bec 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -755,18 +755,19 @@ class Autosubmit: force_update_version = args.update_version else: 
force_update_version = False - if force_update_version: - if as_conf.get_version() != Autosubmit.autosubmit_version: - Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", - as_conf.get_version(), Autosubmit.autosubmit_version, expid) - as_conf.set_version(Autosubmit.autosubmit_version) - else: - if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: - raise AutosubmitCritical( - "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" - "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), - Autosubmit.autosubmit_version, expid,args.command), - 7067) + if args.command not in ["upgrade","updateversion"]: + if force_update_version: + if as_conf.get_version() != Autosubmit.autosubmit_version: + Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", + as_conf.get_version(), Autosubmit.autosubmit_version, expid) + as_conf.set_version(Autosubmit.autosubmit_version) + else: + if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: + raise AutosubmitCritical( + "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" + "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), + Autosubmit.autosubmit_version, expid,args.command), + 7067) else: if expid == 'None': exp_id = "" diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 26ce6ec50..c0cacf190 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -35,6 +35,7 @@ from log.log import Log, 
AutosubmitError, AutosubmitCritical from autosubmit.config.basicConfig import BasicConfig from collections import defaultdict +from autosubmit.database.db_common import update_experiment_descrip_version class AutosubmitConfig(object): @@ -1369,6 +1370,7 @@ class AutosubmitConfig(object): content = content.replace(re.search('AUTOSUBMIT_VERSION =.*', content).group(0), "AUTOSUBMIT_VERSION = " + autosubmit_version) open(self._conf_parser_file, 'w').write(content) + update_experiment_descrip_version(self.expid, description=None, version=autosubmit_version) def get_version(self): """ -- GitLab From 8f03d5452f9f2286739597833df98d8a21c6b2c5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 11 Oct 2022 15:33:59 +0200 Subject: [PATCH 101/213] NEW RUN RULES changes #847 --- autosubmit/autosubmit.py | 16 ++++++++---- autosubmit/history/experiment_history.py | 32 +++++++++++++----------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be15c0bec..be6c31665 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -59,10 +59,11 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set import sqlite3 -try: - import dialog -except Exception: - dialog = None +#try: +# import dialog +#except Exception: +# dialog = None +dialog = None from time import sleep import argparse import subprocess @@ -4253,7 +4254,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + + #exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + 
exp_history.process_status_changes(job_list.get_job_list(), + chunk_unit=as_conf.get_chunk_size_unit(), + chunk_size=as_conf.get_chunk_size(), + current_config=as_conf.get_full_config_as_json(),create=True) Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index f1e0be68c..ecd06067b 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -190,15 +190,15 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() update_these_changes = self._get_built_list_of_changes(job_list) - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size) + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) if len(update_these_changes) > 0 and should_create_new_run == False: self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: + if should_create_new_run: return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: @@ -217,11 +217,14 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size): - if len(job_list) != current_experiment_run_dc.total: - return True - if changes_count > int(self._get_date_member_completed_count(job_list)): - return True + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): + if create: + return True + elif not create and self.expid[0].lower() == "t": + if len(job_list) != current_experiment_run_dc.total: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)): + return True return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): @@ -274,15 +277,16 @@ class ExperimentHistory(): def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. 
Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {job.name: job for job in job_list} + job_name_to_job = {str(job.name): job for job in job_list} current_job_data_dcs = self.manager.get_all_last_job_data_dcs() differences = [] for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: - if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): - # If the job is not changing from a finalized status to a starting status - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) + if job_dc.job_name in job_name_to_job: + if job_dc.status != job_name_to_job[job_dc.job_name].status_str: + if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): + # If the job is not changing from a finalized status to a starting status + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) return differences def _get_defined_rowtype(self, code): -- GitLab From 99f642cfb19954ced3c9f72c67e15471ac84ed6f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 09:02:40 +0200 Subject: [PATCH 102/213] new run --- autosubmit/history/experiment_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index ecd06067b..96651df99 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -220,7 +220,7 @@ class ExperimentHistory(): def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): if create: return True - elif not create and self.expid[0].lower() == "t": + elif not create and self.expid[0].lower() != "t": if len(job_list) != 
current_experiment_run_dc.total: return True if changes_count > int(self._get_date_member_completed_count(job_list)): -- GitLab From 627f65e5ff5713c0921d03588b3d9c92d66a4080 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 13:05:59 +0200 Subject: [PATCH 103/213] Fixed pipeline tests --- test/unit/test_autosubmit_config.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test/unit/test_autosubmit_config.py b/test/unit/test_autosubmit_config.py index c4c8480df..00e624406 100644 --- a/test/unit/test_autosubmit_config.py +++ b/test/unit/test_autosubmit_config.py @@ -181,13 +181,22 @@ class TestAutosubmitConfig(TestCase): open_mock.assert_any_call(config.experiment_file, 'w') def test_set_version(self): - # arrange + + #ARRAGE + FakeBasicConfig.DB_PATH = 'fake-path' + sys.modules['os'].path.exists = Mock(return_value=True) + connection_mock = Mock() + cursor_mock = Mock() + connection_mock.cursor = Mock(return_value=cursor_mock) + cursor_mock.fetchone = Mock(return_value=[0]) + + sys.modules['sqlite3'].connect = Mock(return_value=connection_mock) config = AutosubmitConfig(self.any_expid, FakeBasicConfig, ConfigParserFactory()) open_mock = mock_open(read_data='AUTOSUBMIT_VERSION = dummy') with patch.object(builtins, "open", open_mock): # act - config.set_version('dummy-vesion') + config.set_version('dummy-version') # assert open_mock.assert_any_call(getattr(config, '_conf_parser_file'), 'w') @@ -461,3 +470,4 @@ class FakeBasicConfig: LOCAL_PROJ_DIR = '/dummy/local/proj/dir' DEFAULT_PLATFORMS_CONF = '' DEFAULT_JOBS_CONF = '' + -- GitLab From 0dbdedd8a79b8cbeaeb33716f6ef21f6d1070dff Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 17 Oct 2022 16:30:31 +0200 Subject: [PATCH 104/213] when creating the experiment, it now also see if there is a folder called as it. 
If there is a folder, the (old) experiment will be registered --- autosubmit/autosubmit.py | 1 - autosubmit/database/db_common.py | 9 ++++++++- autosubmit/experiment/experiment_common.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be6c31665..1ea5d3a97 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -916,7 +916,6 @@ class Autosubmit: os.mkdir(os.path.join( BasicConfig.LOCAL_ROOT_DIR, exp_id, 'conf')) Log.info("Copying config files...") - # autosubmit config and experiment copied from AS. files = resource_listdir('autosubmit.config', 'files') for filename in files: diff --git a/autosubmit/database/db_common.py b/autosubmit/database/db_common.py index aaaf4875a..47cc770eb 100644 --- a/autosubmit/database/db_common.py +++ b/autosubmit/database/db_common.py @@ -24,7 +24,7 @@ import os import sqlite3 import multiprocessing import Queue - +import autosubmit from log.log import Log, AutosubmitCritical, AutosubmitError Log.get_logger("Autosubmit") from autosubmit.config.basicConfig import BasicConfig @@ -319,6 +319,7 @@ def _check_experiment_exists(name, error_on_inexistence=True): :return: If experiment exists returns true, if not returns false :rtype: bool """ + if not check_db(): return False try: @@ -339,6 +340,12 @@ def _check_experiment_exists(name, error_on_inexistence=True): if error_on_inexistence: raise AutosubmitCritical( 'The experiment name "{0}" does not exist yet!!!'.format(name), 7005) + if os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR, name)): + try: + _save_experiment(name, 'No description', "3.14.0") + except BaseException as e: + pass + return True return False return True diff --git a/autosubmit/experiment/experiment_common.py b/autosubmit/experiment/experiment_common.py index 160f15158..3c31346c2 100644 --- a/autosubmit/experiment/experiment_common.py +++ b/autosubmit/experiment/experiment_common.py @@ -58,7 +58,7 @@ def 
new_experiment(description, version, test=False, operational=False): else: new_name = 'a000' else: - new_name = next_experiment_id(last_exp_name) + new_name = last_exp_name if new_name == '': return '' while db_common.check_experiment_exists(new_name, False): -- GitLab From daa18b8d664ea37e56935ee5889d51c158c3899c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 11:44:25 +0200 Subject: [PATCH 105/213] Added more reasons to a job for stop #837 --- autosubmit/autosubmit.py | 6 ++++-- autosubmit/job/job.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1ea5d3a97..27bda288e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1467,8 +1467,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( - )] + try: + job.platform = submitter.platforms[job.platform_name.lower()] + except: + raise AutosubmitCritical("hpcarch={0} not found in the platforms configuration file".format(job.platform_name), 7014) # noinspection PyTypeChecker if job.status not in (Status.COMPLETED, Status.SUSPENDED): platforms_to_test.add(job.platform) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 6653c51f9..08b39d27c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1257,7 +1257,8 @@ class Job(object): 'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: @@ -1639,7 +1640,8 @@ class WrapperJob(Job): 
'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: -- GitLab From b61915c5fad1763bc28cf312fb3da8d8aad1eb6b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 16:25:49 +0200 Subject: [PATCH 106/213] dbfix pipeline --- test/unit/test_expid.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/test_expid.py b/test/unit/test_expid.py index 85e5a012b..7eee22bfc 100644 --- a/test/unit/test_expid.py +++ b/test/unit/test_expid.py @@ -31,21 +31,21 @@ class TestExpid(TestCase): @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "a006" + current_experiment_id = "a007" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version) self.assertEquals("a007", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_test_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "t0ab" + current_experiment_id = "t0ac" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version, True) self.assertEquals("t0ac", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_operational_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "o112" + current_experiment_id = "o113" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = 
new_experiment(self.description, self.version, False, True) self.assertEquals("o113", experiment_id) -- GitLab From ca7f0ec0e24a4ea2e5420cfabe13838f81ff1a1f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 19 Oct 2022 13:32:13 +0200 Subject: [PATCH 107/213] Recursive submodules --- autosubmit/git/autosubmit_git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index c191c21df..8d194de74 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -223,7 +223,7 @@ class AutosubmitGit: else: command_1 += " git submodule init;".format(project_destination) for submodule in git_project_submodules: - command_1 += " git submodule update {0};".format(submodule) + command_1 += " git submodule update --init --recursive {0};".format(submodule) if git_remote_project_path == '': try: command_1 = "cd {0}; {1} ".format(git_path,command_1) -- GitLab From 622ddee1e527ca57ed5e80627c290f35c338e1dc Mon Sep 17 00:00:00 2001 From: jberlin Date: Fri, 7 Oct 2022 11:24:08 +0200 Subject: [PATCH 108/213] Made small changes to documentation concerning the Conda installation - #864 --- docs/source/installation/index.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 7159ac7c0..157f28ecc 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -160,7 +160,7 @@ Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. autosubmit install # Get expid - autosubmit expid -H TEST -d "Test exp." + autosubmit expid -H local -d "Test exp." # Create with -np # Since it was a new install the expid will be a000 @@ -175,7 +175,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git + # Download git (if it is not already installed) apt install git -y -q # Download autosubmit git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b @@ -186,4 +186,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. conda activate autosubmit # Test autosubmit autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc + # Configure autosubmitrc and install the database as indicated in the installation instructions above this section + +.. hint:: + After installing conda, you may need to close the terminal and re-open it so the installation takes effect. \ No newline at end of file -- GitLab From 986bb585a04780d139eb03dabe34daffccbecbd0 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:17:23 +0200 Subject: [PATCH 109/213] Fixed an issue raised when a platform has no project expid user or scratch dir defined --- autosubmit/autosubmit.py | 7 +++++-- autosubmit/config/config_common.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 19 +++++++++++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 27bda288e..ff24f0967 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4385,8 +4385,11 @@ class Autosubmit: submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) - + try: + hpcarch = submitter.platforms[as_conf.get_platform()] + except: + hpcarch = "local" + Log.warning("Remote clone may be disabled due to: "+e.message) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == 
"svn": svn_project_url = as_conf.get_svn_project_url() diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index c0cacf190..ff6f31ea4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -623,7 +623,7 @@ class AutosubmitConfig(object): "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): - if section in self.hpcarch: + if section.lower() in self.hpcarch.lower(): main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): self.wrong_config["Platform"] += [[section, diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 12e1e70bc..92594abdd 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -20,7 +20,8 @@ import os -from log.log import Log + +from log.log import Log,AutosubmitCritical,AutosubmitError from autosubmit.config.basicConfig import BasicConfig from autosubmit.config.config_common import AutosubmitConfig from submitter import Submitter @@ -72,7 +73,7 @@ class ParamikoSubmitter(Submitter): :return: platforms used by the experiment :rtype: dict """ - + raise_message="" platforms_used = list() hpcarch = asconf.get_platform() platforms_used.append(hpcarch) @@ -191,12 +192,16 @@ class ParamikoSubmitter(Submitter): remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) - remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, - remote_platform.user, remote_platform.expid) + try: + remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, + remote_platform.user, remote_platform.expid) + remote_platform.update_cmds() + platforms[section.lower()] = remote_platform + + except: + raise_message = "Error in platform.conf: SCRATCH_DIR, PROJECT, USER, EXPID must be defined for platform 
{0}".format(section) # Executes update_cmds() from corresponding Platform Object - remote_platform.update_cmds() # Save platform into result dictionary - platforms[section.lower()] = remote_platform for section in parser.sections(): # if this section is included in platforms @@ -208,3 +213,5 @@ class ParamikoSubmitter(Submitter): platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms + if raise_message != "": + raise AutosubmitError(raise_message) -- GitLab From 7fa2231d2ed3388a44acadece94971abd886dc77 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:39:37 +0200 Subject: [PATCH 110/213] test local git not working proprly --- autosubmit/autosubmit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ff24f0967..5133dc4ca 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1359,6 +1359,7 @@ class Autosubmit: for wrapper_section in as_conf.get_wrapper_multi(): wrapper_jobs[wrapper_section] = as_conf.get_wrapper_jobs(wrapper_section) wrapper_jobs["wrapper"] = as_conf.get_wrapper_jobs("wrapper") + # Log.warning("Aux Job_list was generated successfully") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) -- GitLab From e5f18727cb3650aaf9af58fe5d6a0d068e6cf9ee Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:39 +0200 Subject: [PATCH 111/213] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ad59f742d..cd56dd095 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0b +#3.14.0b -- GitLab From c99d07e59fe148fcd6ea0ea6404fbc51770bef6f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:49 +0200 Subject: [PATCH 112/213] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index cd56dd095..ad59f742d 100644 --- a/VERSION +++ 
b/VERSION @@ -1 +1 @@ -#3.14.0b +3.14.0b -- GitLab From f166cbdce7d6699c03afa403ac74b04cb8784f58 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 3 Nov 2022 09:42:59 +0100 Subject: [PATCH 113/213] Unbound variable fixes in some messages and job_status #893 Fixed an issue with recovery not cancelling all jobs --- autosubmit/autosubmit.py | 17 ++++---- autosubmit/git/autosubmit_git.py | 4 +- autosubmit/job/job.py | 49 ++++++++++++----------- autosubmit/job/job_list.py | 14 +++---- autosubmit/monitor/diagram.py | 12 ++++-- autosubmit/platforms/paramiko_platform.py | 37 ++++++++--------- 6 files changed, 69 insertions(+), 64 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5133dc4ca..cbb4b142d 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2068,7 +2068,6 @@ class Autosubmit: platform.connected = False Log.printlog("[{1}] Connection failed to host {0}".format( platform.host, platform.name),Log.WARNING) if issues != "": - platform.connected = False raise AutosubmitCritical( "Issues while checking the connectivity of platforms.", 7010, issues+"\n"+ssh_config_issues) @@ -2221,7 +2220,7 @@ class Autosubmit: "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: raise AutosubmitError( - "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(e.message,e.trace), 6015) + "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(e),""), 6015) i = 0 if hold: sleep(10) @@ -2677,9 +2676,9 @@ class Autosubmit: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) for platform in platforms_to_test: platform.test_connection() - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) if not force: raise 
AutosubmitCritical( "Experiment can't be recovered due being {0} active jobs in your experiment, If you want to recover the experiment, please use the flag -f and all active jobs will be cancelled".format( @@ -3235,16 +3234,16 @@ class Autosubmit: # Preparation for section parameters no_load_sections = False no_load_platforms = False - try: - job_list = Autosubmit.load_job_list( - expid, as_conf, notransitive=False) - except Exception as e: - no_load_sections = True + + job_list = Autosubmit.load_job_list( + expid, as_conf, notransitive=False) + try: submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) except Exception as e: no_load_platforms = True + submitter = Autosubmit._get_submitter(as_conf) submitter.load_local_platform(as_conf) try: # Gathering parameters of autosubmit and expdef config files @@ -4049,7 +4048,7 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - Autosubmit.unarchive(expid, uncompress=False) + Autosubmit.unarchive(expid, uncompressed=False) raise AutosubmitCritical( "Can not remove or rename experiments folder", 7012, str(e)) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 8d194de74..493358ed0 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -203,7 +203,7 @@ class AutosubmitGit: command_0 = "cd {0} ; {1}".format(project_path, command_0) output_0 = subprocess.check_output(command_0, shell=True) else: - command_0 = "cd {0} ; {1}".format(git_remote_path, command_0) + command_0 = "cd {0} ; {1}".format(project_path, command_0) hpcarch.send_command(command_0) ##command 1 if os.path.exists(os.path.join(git_path, ".githooks")): @@ -233,7 +233,7 @@ class AutosubmitGit: submodule_failure = True Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( - "Submodule {0} has a wrong configuration".format(submodule), 6014) + "Submodule has a wrong 
configuration.\n{0}".format(command_1), 6014) else: command_1 = "cd {0}; {1} ".format(git_remote_path, command_1) hpcarch.send_command(command_1) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 08b39d27c..739216c4a 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -624,6 +624,7 @@ class Job(object): as_conf.reload() submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) + platform = submitter.platforms[platform_name.lower()] success = True except BaseException as e: error_message = str(e) @@ -632,31 +633,31 @@ class Job(object): count=count+1 if not success: raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) - platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() + else: max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count - if self.wrapper_type is not None and self.wrapper_type == "vertical": - found = False - retrials = 0 - while retrials < 3 and not found: - if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): - found = True - retrials = retrials + 1 - for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i)): - last_log = i - else: - break - remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) + try: + platform.test_connection() + if self.wrapper_type is not None and self.wrapper_type == "vertical": + found = False + retrials = 0 + while retrials < 3 and not found: + if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): + found = True + retrials = retrials + 1 + for i in range(max_logs-1,-1,-1): + if platform.check_stat_file_by_retrials(stat_file + str(i)): + last_log = i + else: + break + remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." 
+ str(last_log)) - else: - remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) + else: + remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except BaseException as e: - Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) + except BaseException as e: + Log.printlog( + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 @@ -1730,7 +1731,7 @@ class WrapperJob(Job): self.name, reason), 6009) # while running jobs? self._check_running_jobs() - self.update_failed_jobs(canceled_wrapper=True) + self.update_failed_jobs() self.cancel_failed_wrapper_job() return @@ -1760,8 +1761,8 @@ class WrapperJob(Job): job.hold = self.hold job.status = self.status if self.status == Status.WAITING: - for job in self.job_list: - job.packed = False + for job2 in self.job_list: + job2.packed = False def _check_inner_job_wallclock(self, job): start_time = self.running_jobs_start[job] diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 4c2712267..424332b47 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -437,13 +437,13 @@ class JobList(object): # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - for parent in jobs_by_section: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + for parent2 in jobs_by_section: + if parent2.chunk in dependency.select_chunks_dest[relation_indx] or len( dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - visited_parents.add(parent) + if parent2 not in visited_parents: + job.add_parent(parent2) + JobList._add_edge(graph, job, parent2) + visited_parents.add(parent2) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) @@ -2040,7 +2040,7 @@ class JobList(object): # root exists if root is not None: - result += self._recursion_print(root, 0) + result += self._recursion_print(root, 0,[]) else: result += "\nCannot find root." diff --git a/autosubmit/monitor/diagram.py b/autosubmit/monitor/diagram.py index 8e8753167..b1f0f6744 100644 --- a/autosubmit/monitor/diagram.py +++ b/autosubmit/monitor/diagram.py @@ -66,14 +66,18 @@ def create_bar_diagram(experiment_id, jobs_list, general_stats, output_file, per exp_stats.calculate_statistics() exp_stats.calculate_summary() exp_stats.make_old_format() - failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + # Stats variables definition + normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) + failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) except Exception as exp: + if not isinstance(normal_plots_count,int): + normal_plots_count = 0 + if not isinstance(failed_jobs_plots_count,int): + failed_jobs_plots_count = 0 print(exp) print(traceback.format_exc()) - # Stats variables definition - normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) - 
failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) total_plots_count = normal_plots_count + failed_jobs_plots_count # num_plots = norma # ind = np.arange(int(MAX_JOBS_PER_PLOT)) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 1c1177510..4b5c2d4b4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -118,7 +118,7 @@ class ParamikoPlatform(Platform): raise except BaseException as e: self.connected = False - raise AutosubmitCritical(message,7051) + raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) def restore_connection(self): @@ -226,7 +226,7 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: - self.restore_connection(reconnect=True) + self.restore_connection() else: raise AutosubmitError( "Couldn't establish a connection to the specified host, wrong configuration?", 6003, e.message) @@ -283,8 +283,8 @@ class ParamikoPlatform(Platform): self._ftpChannel.chmod(remote_path, os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) + + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path,filename), code=6004, trace=str(e))) except BaseException as e: raise AutosubmitError( 'Send file failed. 
Connection seems to no be active', 6004) @@ -594,19 +594,20 @@ class ParamikoPlatform(Platform): sleep_time = sleep_time + 5 # URi: define status list in HPC Queue Class else: - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + job_status = job.status + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -989,7 +990,7 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def parse_queue_reason(self, output): + def parse_queue_reason(self, output, job_id): raise NotImplementedError def get_ssh_output(self): -- GitLab From 8a73de91ebf8e3ba3893b573739ffed5d7a54750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Thu, 3 Nov 2022 11:20:24 +0100 Subject: [PATCH 114/213] Change section title --- docs/source/userguide/manage/index.rst | 2 +- docs/source/userguide/run/index.rst | 2 +- 
docs/source/userguide/wrappers/index.rst | 75 ++++++++++++++++-------- 3 files changed, 52 insertions(+), 27 deletions(-) diff --git a/docs/source/userguide/manage/index.rst b/docs/source/userguide/manage/index.rst index 3f4edea72..23fe97a5a 100644 --- a/docs/source/userguide/manage/index.rst +++ b/docs/source/userguide/manage/index.rst @@ -1,4 +1,4 @@ -Manage experiments +Manage Experiments =================== How to clean the experiment diff --git a/docs/source/userguide/run/index.rst b/docs/source/userguide/run/index.rst index 34f937ed7..90f0180bf 100644 --- a/docs/source/userguide/run/index.rst +++ b/docs/source/userguide/run/index.rst @@ -1,4 +1,4 @@ -Running experiments +Running Experiments =================== Run an experiment diff --git a/docs/source/userguide/wrappers/index.rst b/docs/source/userguide/wrappers/index.rst index 388c215ef..e19ddaa1b 100644 --- a/docs/source/userguide/wrappers/index.rst +++ b/docs/source/userguide/wrappers/index.rst @@ -1,6 +1,5 @@ -############ -Wrappers -############ +Configure Wrappers +================== In order to understand the goal of this feature, please take a look at: https://earth.bsc.es/wiki/lib/exe/fetch.php?media=library:seminars:techniques_to_improve_the_throughput.pptx @@ -14,9 +13,37 @@ At the moment there are 4 types of wrappers that can be used depending on the ex When using the wrapper, it is useful to be able to visualize which packages are being created. So, when executing *autosubmit monitor cxxx*, a dashed box indicates the jobs that are wrapped together in the same job package. +How to configure +---------------- + +In ``autosubmit_cxxx.conf``, regardless of the wrapper type, you need to make sure that the values of the variables **MAXWAITINGJOBS** and **TOTALJOBS** are increased according to the number of jobs expected to be waiting/running at the same time in your experiment. + +For example: + +.. code-block:: ini + + [config] + EXPID = .... + AUTOSUBMIT_VERSION = 3.13.0 + ... 
+ + MAXWAITINGJOBS = 100 + TOTALJOBS = 100 + ... + +and below the [config] block, add the wrapper directive, indicating the wrapper type: + +.. code-block:: ini + + [wrapper] + TYPE = + +You can also specify which job types should be wrapped. This can be done using the **JOBS_IN_WRAPPER** parameter. +It is only required for the vertical-mixed type (in which the specified job types will be wrapped together), so if nothing is specified, all jobs will be wrapped. +By default, jobs of the same type will be wrapped together, as long as the constraints are satisfied. Number of jobs in a package -*************************** +~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: ini @@ -29,7 +56,7 @@ Number of jobs in a package - **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` - - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. + - If **MAX_WRAPPED** is not defined, then **TOTALJOBS** is used by default - **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain - If not defined, it considers that **MIN_WRAPPED** is 2. - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. @@ -39,14 +66,14 @@ Number of jobs in a package Wrapper check time -********************** +~~~~~~~~~~~~~~~~~~ It is possible to override the **SAFETYSLEEPTIME** for the wrapper, by using **CHECK_TIME_WRAPPER** and defining a time interval (in seconds) in which the wrapper internal jobs should be checked. .. important:: Note that the **numbers** shown in this documentation are examples. 
The actual values must be set according to the specific workflow, as well as the platform configurations. Vertical wrapper -======================= +---------------- The vertical wrapper is more appropriate when there are many sequential jobs. To use it, set TYPE = vertical: @@ -66,7 +93,7 @@ In order to be able to use the vertical wrapper, in ``platforms_cxxx.conf`` set Remember to add to each job the corresponding WALLCLOCK time. Vertical with multiple sections -=============================== +------------------------------- This is a mode of the vertical wrapper that allows jobs of different types to be wrapped together. Note that the solution considers the order of the sections defined in the ``jobs_cxxx.conf`` file, so the order of the sections given in **JOBS_IN_WRAPPER** is irrelevant. @@ -78,23 +105,20 @@ Additionally, jobs are grouped within the corresponding date, member and chunk h TYPE = vertical JOBS_IN_WRAPPER = SIM&SIM2 # REQUIRED -.. figure:: ../../workflows/vertical-mixed.png +.. figure:: fig/vertical-mixed.png :name: vertical-mixed :width: 100% :align: center :alt: vertical-mixed wrapper Horizontal wrapper -========================== +------------------ The horizontal wrapper is more appropriate when there are multiple ensemble members that can be run in parallel. If the wrapped jobs have an mpirun call, they will need machine files to specify in which nodes each job will run. Different cases may need specific approaches when creating the machine files. For auto-ecearth use COMPONENTS instead of STANDARD. -Horizontal wrapper -********************** - .. code-block:: ini [wrapper] @@ -111,14 +135,14 @@ In order to be able to use the horizontal wrapper, in ``platforms_cxxx.conf`` se ... MAX_PROCESSORS = 2400 -.. figure:: ../../workflows/horizontal_remote.png +.. 
figure:: fig/horizontal_remote.png :name: horizontal_remote :width: 60% :align: center :alt: horizontally wrapped jobs Shared-memory Experiments -************************* +~~~~~~~~~~~~~~~~~~~~~~~~~ There is also the possibility of setting the option **METHOD** to SRUN in the wrapper directive (**ONLY** for vertical and vertical-horizontal wrappers). @@ -131,13 +155,13 @@ This allows to form a wrapper with shared-memory paradigm instead of rely in mac METHOD = srun # default ASTHREAD Hybrid wrapper -========================== +-------------- The hybrid wrapper is a wrapper that works both vertically and horizontally at the same time, meaning that members and chunks can be wrapped in one single job. Mixed approach using a combination of horizontal and vertical wrappers and the list of jobs is a list of lists. Horizontal-vertical -=========================== +------------------- - There is a dependency between lists. Each list runs after the previous one finishes; the jobs within the list run in parallel at the same time - It is particularly suitable if there are jobs of different types in the list with different wall clocks, but dependencies between jobs of different lists; it waits for all the jobs in the list to finish before starting the next list @@ -150,7 +174,7 @@ Horizontal-vertical MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM&DA -.. figure:: ../../workflows/dasim.png +.. figure:: fig/dasim.png :name: wrapper_horizontal_vertical :width: 100% :align: center @@ -158,7 +182,7 @@ Horizontal-vertical Vertical-horizontal -=========================== +------------------- - In this approach, each list is independent of each other and run in parallel; jobs within the list run one after the other - It is particularly suitable for running many sequential ensembles @@ -171,14 +195,15 @@ Vertical-horizontal MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM -.. figure:: ../../workflows/vertical-horizontal.png +.. 
figure:: fig/vertical-horizontal.png :name: wrapper_vertical_horizontal :width: 100% :align: center :alt: hybrid wrapper Multiple wrappers at once -========================= +------------------------- + This is an special mode that allows you to use multiple **independent** wrappers on the same experiment. By using an special variable that allows to define subwrapper sections .. code-block:: ini @@ -195,14 +220,14 @@ This is an special mode that allows you to use multiple **independent** wrappers TYPE = vertical JOBS_IN_WRAPPER = DA&REDUCE -.. figure:: ../workflows/multiple_wrappers.png +.. figure:: fig/multiple_wrappers.png :name: :width: 100% :align: center :alt: multi wrapper Summary -========================== +------- In `autosubmit_cxxx.conf`: @@ -213,7 +238,7 @@ In `autosubmit_cxxx.conf`: # JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM # METHOD : Select between MACHINESFILES or Shared-Memory. # MIN_WRAPPED set the minim number of jobs that should be included in the wrapper. DEFAULT = 2 - # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = 99999999999 + # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS # Policy : Select the behaviour of the inner jobs Strict/Flexible/Mixed # EXTEND_WALLCLOCK: Allows to extend the wallclock by the max wallclock of the horizontal package (max inner job). Values are integer units (0,1,2) # RETRIALS : Enables a retrial mechanism for vertical wrappers, or default retrial mechanism for the other wrappers @@ -222,7 +247,7 @@ In `autosubmit_cxxx.conf`: TYPE = Vertical #REQUIRED JOBS_IN_WRAPPER = SIM # Job types (as defined in jobs_cxxx.conf) separated by space. REQUIRED only if vertical-mixed MIN_WRAPPED = 2 - MAX_WRAPPED = 999999 # OPTIONAL. Integer value. + MAX_WRAPPED = 9999 # OPTIONAL. Integer value, overrides TOTALJOBS CHECK_TIME_WRAPPER = # OPTIONAL. 
Time in seconds, overrides SAFETYSLEEPTIME POLICY = flexible # OPTIONAL, Wrapper policy, mixed, flexible, strict QUEUE = bsc_es # If not specified, queue will be the same of the first SECTION specified on JOBS_IN_WRAPPER -- GitLab From 188893bc781eeb23d6030e0423aa3fced2e59ba4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 3 Nov 2022 13:51:58 +0100 Subject: [PATCH 115/213] mail notifier changes --- .gitignore | 3 +++ autosubmit/helpers/utils.py | 1 - autosubmit/notifications/mail_notifier.py | 1 + autosubmit/platforms/pbsplatform.py | 3 ++- autosubmit/statistics/utils.py | 3 +-- 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index ea136b3a0..ae96f6d2d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ autosubmit/simple_test.py .vscode/ .vscode autosubmit.egg-info/ +venv/ +.pytest_cache/ +.cache/ \ No newline at end of file diff --git a/autosubmit/helpers/utils.py b/autosubmit/helpers/utils.py index 1a6d8e763..0ce27ab8a 100644 --- a/autosubmit/helpers/utils.py +++ b/autosubmit/helpers/utils.py @@ -7,7 +7,6 @@ from typing import Tuple def check_experiment_ownership(expid, basic_config, raise_error=False, logger=None): #Logger variable is not needed, LOG is global thus it will be read if avaliable - # type: (str, BasicConfig, bool, Log) -> Tuple[bool, bool, str] my_user_ID = os.getuid() current_owner_ID = 0 current_owner_name = "NA" diff --git a/autosubmit/notifications/mail_notifier.py b/autosubmit/notifications/mail_notifier.py index 53048138f..f774d1ed2 100644 --- a/autosubmit/notifications/mail_notifier.py +++ b/autosubmit/notifications/mail_notifier.py @@ -31,6 +31,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] Warning a remote platform is malfunctioning' + for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) try: diff --git a/autosubmit/platforms/pbsplatform.py 
b/autosubmit/platforms/pbsplatform.py index 961bb4534..089856395 100644 --- a/autosubmit/platforms/pbsplatform.py +++ b/autosubmit/platforms/pbsplatform.py @@ -18,6 +18,7 @@ # along with Autosubmit. If not, see . import os +from log.log import Log, AutosubmitCritical, AutosubmitError from autosubmit.platforms.paramiko_platform import ParamikoPlatform from log.log import Log @@ -49,7 +50,7 @@ class PBSPlatform(ParamikoPlatform): self._header = Pbs12Header() else: Log.error('PBS version {0} not supported'.format(version)) - raise HPCPlatformException('PBS version {0} not supported'.format(version)) + raise AutosubmitError('PBS version {0} not supported'.format(version)) self.job_status = dict() self.job_status['COMPLETED'] = ['F', 'E', 'c', 'C'] diff --git a/autosubmit/statistics/utils.py b/autosubmit/statistics/utils.py index 465740187..765994c9e 100644 --- a/autosubmit/statistics/utils.py +++ b/autosubmit/statistics/utils.py @@ -38,8 +38,7 @@ def timedelta2hours(deltatime): def parse_number_processors(processors_str): """ Defaults to 1 in case of error """ - # type: (str) -> int - if ':' in processors_str: + if ':' in processors_str: components = processors_str.split(":") processors = int(sum( [math.ceil(float(x) / 36.0) * 36.0 for x in components])) -- GitLab From f594cc4dff9812a8f7a75c00b20d15b5c99f7ffc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 2 Nov 2022 15:39:18 +0100 Subject: [PATCH 116/213] mail notifier changes (2) --- autosubmit/notifications/mail_notifier.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autosubmit/notifications/mail_notifier.py b/autosubmit/notifications/mail_notifier.py index f774d1ed2..ade319601 100644 --- a/autosubmit/notifications/mail_notifier.py +++ b/autosubmit/notifications/mail_notifier.py @@ -31,6 +31,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] Warning a remote platform is malfunctioning' 
+ message['Date'] = email.utils.formatdate(localtime=True) for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) @@ -43,6 +44,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] The job {0} status has changed to {1}'.format(job_name, str(status)) + message['Date'] = email.utils.formatdate(localtime=True) for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) try: -- GitLab From 8490d2a084a99ff483973067b279679b89e5ac9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Mon, 7 Nov 2022 15:16:45 +0100 Subject: [PATCH 117/213] Correct figure path for wrapper section --- .../userguide/{run => wrappers}/fig/dasim.png | Bin .../{run => wrappers}/fig/horizontal_remote.png | Bin .../{run => wrappers}/fig/multiple_wrappers.png | Bin .../userguide/{run => wrappers}/fig/rerun.png | Bin .../{run => wrappers}/fig/vertical-horizontal.png | Bin .../{run => wrappers}/fig/vertical-mixed.png | Bin 6 files changed, 0 insertions(+), 0 deletions(-) rename docs/source/userguide/{run => wrappers}/fig/dasim.png (100%) rename docs/source/userguide/{run => wrappers}/fig/horizontal_remote.png (100%) rename docs/source/userguide/{run => wrappers}/fig/multiple_wrappers.png (100%) rename docs/source/userguide/{run => wrappers}/fig/rerun.png (100%) rename docs/source/userguide/{run => wrappers}/fig/vertical-horizontal.png (100%) rename docs/source/userguide/{run => wrappers}/fig/vertical-mixed.png (100%) diff --git a/docs/source/userguide/run/fig/dasim.png b/docs/source/userguide/wrappers/fig/dasim.png similarity index 100% rename from docs/source/userguide/run/fig/dasim.png rename to docs/source/userguide/wrappers/fig/dasim.png diff --git a/docs/source/userguide/run/fig/horizontal_remote.png b/docs/source/userguide/wrappers/fig/horizontal_remote.png similarity index 100% rename from 
docs/source/userguide/run/fig/horizontal_remote.png rename to docs/source/userguide/wrappers/fig/horizontal_remote.png diff --git a/docs/source/userguide/run/fig/multiple_wrappers.png b/docs/source/userguide/wrappers/fig/multiple_wrappers.png similarity index 100% rename from docs/source/userguide/run/fig/multiple_wrappers.png rename to docs/source/userguide/wrappers/fig/multiple_wrappers.png diff --git a/docs/source/userguide/run/fig/rerun.png b/docs/source/userguide/wrappers/fig/rerun.png similarity index 100% rename from docs/source/userguide/run/fig/rerun.png rename to docs/source/userguide/wrappers/fig/rerun.png diff --git a/docs/source/userguide/run/fig/vertical-horizontal.png b/docs/source/userguide/wrappers/fig/vertical-horizontal.png similarity index 100% rename from docs/source/userguide/run/fig/vertical-horizontal.png rename to docs/source/userguide/wrappers/fig/vertical-horizontal.png diff --git a/docs/source/userguide/run/fig/vertical-mixed.png b/docs/source/userguide/wrappers/fig/vertical-mixed.png similarity index 100% rename from docs/source/userguide/run/fig/vertical-mixed.png rename to docs/source/userguide/wrappers/fig/vertical-mixed.png -- GitLab From bc2037db1d10b64b922f3c9f669bd8d24db73afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Tue, 8 Nov 2022 11:00:14 +0100 Subject: [PATCH 118/213] Correct figure path for defining a workflow section --- .../fig/dashed.png | Bin .../fig/date-synchronize.png | Bin .../fig/dependencies_previous.png | Bin .../fig/dependencies_running.png | Bin .../fig/experiment_delay_doc.png | Bin .../fig/frequency.png | Bin .../fig/member-synchronize.png | Bin .../fig/no-synchronize.png | Bin .../fig/running.png | Bin .../fig/select_chunks.png | Bin .../fig/select_members.png | Bin .../fig/simple.png | Bin .../{configure => defining workflows}/fig/skip.png | Bin .../{configure => defining workflows}/fig/split.png | Bin 14 files changed, 0 insertions(+), 0 deletions(-) rename 
docs/source/userguide/{configure => defining workflows}/fig/dashed.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/date-synchronize.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/dependencies_previous.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/dependencies_running.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/experiment_delay_doc.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/frequency.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/member-synchronize.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/no-synchronize.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/running.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/select_chunks.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/select_members.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/simple.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/skip.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/split.png (100%) diff --git a/docs/source/userguide/configure/fig/dashed.png b/docs/source/userguide/defining workflows/fig/dashed.png similarity index 100% rename from docs/source/userguide/configure/fig/dashed.png rename to docs/source/userguide/defining workflows/fig/dashed.png diff --git a/docs/source/userguide/configure/fig/date-synchronize.png b/docs/source/userguide/defining workflows/fig/date-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/date-synchronize.png rename to docs/source/userguide/defining workflows/fig/date-synchronize.png diff --git a/docs/source/userguide/configure/fig/dependencies_previous.png b/docs/source/userguide/defining workflows/fig/dependencies_previous.png 
similarity index 100% rename from docs/source/userguide/configure/fig/dependencies_previous.png rename to docs/source/userguide/defining workflows/fig/dependencies_previous.png diff --git a/docs/source/userguide/configure/fig/dependencies_running.png b/docs/source/userguide/defining workflows/fig/dependencies_running.png similarity index 100% rename from docs/source/userguide/configure/fig/dependencies_running.png rename to docs/source/userguide/defining workflows/fig/dependencies_running.png diff --git a/docs/source/userguide/configure/fig/experiment_delay_doc.png b/docs/source/userguide/defining workflows/fig/experiment_delay_doc.png similarity index 100% rename from docs/source/userguide/configure/fig/experiment_delay_doc.png rename to docs/source/userguide/defining workflows/fig/experiment_delay_doc.png diff --git a/docs/source/userguide/configure/fig/frequency.png b/docs/source/userguide/defining workflows/fig/frequency.png similarity index 100% rename from docs/source/userguide/configure/fig/frequency.png rename to docs/source/userguide/defining workflows/fig/frequency.png diff --git a/docs/source/userguide/configure/fig/member-synchronize.png b/docs/source/userguide/defining workflows/fig/member-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/member-synchronize.png rename to docs/source/userguide/defining workflows/fig/member-synchronize.png diff --git a/docs/source/userguide/configure/fig/no-synchronize.png b/docs/source/userguide/defining workflows/fig/no-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/no-synchronize.png rename to docs/source/userguide/defining workflows/fig/no-synchronize.png diff --git a/docs/source/userguide/configure/fig/running.png b/docs/source/userguide/defining workflows/fig/running.png similarity index 100% rename from docs/source/userguide/configure/fig/running.png rename to docs/source/userguide/defining workflows/fig/running.png diff --git 
a/docs/source/userguide/configure/fig/select_chunks.png b/docs/source/userguide/defining workflows/fig/select_chunks.png similarity index 100% rename from docs/source/userguide/configure/fig/select_chunks.png rename to docs/source/userguide/defining workflows/fig/select_chunks.png diff --git a/docs/source/userguide/configure/fig/select_members.png b/docs/source/userguide/defining workflows/fig/select_members.png similarity index 100% rename from docs/source/userguide/configure/fig/select_members.png rename to docs/source/userguide/defining workflows/fig/select_members.png diff --git a/docs/source/userguide/configure/fig/simple.png b/docs/source/userguide/defining workflows/fig/simple.png similarity index 100% rename from docs/source/userguide/configure/fig/simple.png rename to docs/source/userguide/defining workflows/fig/simple.png diff --git a/docs/source/userguide/configure/fig/skip.png b/docs/source/userguide/defining workflows/fig/skip.png similarity index 100% rename from docs/source/userguide/configure/fig/skip.png rename to docs/source/userguide/defining workflows/fig/skip.png diff --git a/docs/source/userguide/configure/fig/split.png b/docs/source/userguide/defining workflows/fig/split.png similarity index 100% rename from docs/source/userguide/configure/fig/split.png rename to docs/source/userguide/defining workflows/fig/split.png -- GitLab From 37dba7a35483362759d0fe9a4fe1e48961fcda4b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 8 Nov 2022 12:58:21 +0100 Subject: [PATCH 119/213] added depth --- autosubmit/config/config_common.py | 20 ++++++++++++++++++++ autosubmit/git/autosubmit_git.py | 25 ++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index ff6f31ea4..42ac5fe9a 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1107,6 +1107,26 @@ class AutosubmitConfig(object): """ return 
self._exp_parser.get_option('git', 'FETCH_SINGLE_BRANCH', 'False').lower() + def get_project_submodules_depth(self): + """ + Returns the max depth of submodule at the moment of cloning + Default is -1 (no limit) + :return: depth + :rtype: list + """ + unparsed_depth = self._exp_parser.get_option('git', 'PROJECT_SUBMODULES_DEPTH', -1) + if "[" in unparsed_depth and "]" in unparsed_depth: + unparsed_depth = unparsed_depth.strip("[]") + depth = [int(x) for x in unparsed_depth.split(",")] + else: + try: + depth = int(unparsed_depth) + depth = [depth] + except: + Log.warning("PROJECT_SUBMODULES_DEPTH is not an integer neither a int. Using default value -1") + depth = [] + return depth + def get_project_destination(self): """ Returns git commit from experiment's config file diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 493358ed0..fe8883d0a 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -146,6 +146,10 @@ class AutosubmitGit: git_project_branch = 'master' git_project_commit = as_conf.get_git_project_commit() git_project_submodules = as_conf.get_submodules_list() + git_project_submodules_depth = as_conf.get_project_submodules_depth() + max_depth = -1 + if len(git_project_submodules_depth) > 0: + max_depth = max(git_project_submodules_depth) if as_conf.get_fetch_single_branch() != "true": git_single_branch = False else: @@ -218,12 +222,27 @@ class AutosubmitGit: command_1 += "git checkout {0};".format(git_project_commit) else: command_1 += "git checkout; " + if git_project_submodules.__len__() <= 0: - command_1 += " git submodule update --init --recursive;" + if len(git_project_submodules_depth) > 0: + command_1 += "git submodule update --init --recursive --depth {0};".format( + max_depth) + else: + command_1 += " git submodule update --init --recursive;" else: command_1 += " git submodule init;".format(project_destination) + index_submodule = 0 for submodule in git_project_submodules: - 
command_1 += " git submodule update --init --recursive {0};".format(submodule) + if len(git_project_submodules_depth) > 0: + if index_submodule < len(git_project_submodules_depth): + command_1 += " git submodule update --init --recursive --depth {0} {1};".format( + git_project_submodules_depth[index_submodule], submodule) + else: + command_1 += " git submodule update --init --recursive --depth {0} {1};".format( + max_depth, submodule) + else: + command_1 += " git submodule update --init --recursive {0};".format(submodule) + index_submodule += 1 if git_remote_project_path == '': try: command_1 = "cd {0}; {1} ".format(git_path,command_1) @@ -235,7 +254,7 @@ class AutosubmitGit: Log.printlog( "Submodule has a wrong configuration.\n{0}".format(command_1), 6014) else: - command_1 = "cd {0}; {1} ".format(git_remote_path, command_1) + command_1 = "cd {0}; {1} ".format(project_path, command_1) hpcarch.send_command(command_1) except subprocess.CalledProcessError as e: shutil.rmtree(project_path) -- GitLab From e4cad10ae7d684e1f214015c8a6a43afb28d578b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 9 Nov 2022 11:07:16 +0100 Subject: [PATCH 120/213] git_project_depth --- autosubmit/config/config_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 42ac5fe9a..94fe15fef 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1114,7 +1114,7 @@ class AutosubmitConfig(object): :return: depth :rtype: list """ - unparsed_depth = self._exp_parser.get_option('git', 'PROJECT_SUBMODULES_DEPTH', -1) + unparsed_depth = str(self._exp_parser.get_option('git', 'PROJECT_SUBMODULES_DEPTH', "-1")) if "[" in unparsed_depth and "]" in unparsed_depth: unparsed_depth = unparsed_depth.strip("[]") depth = [int(x) for x in unparsed_depth.split(",")] -- GitLab From 7d3c7db9cbbf05d18977759a90e30b0210a893d5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 15 Nov 
2022 16:08:57 +0100 Subject: [PATCH 121/213] Added depth, now as is capable to parse depth --- autosubmit/config/config_common.py | 32 ++++++++++++++++++++++++++++++ autosubmit/git/autosubmit_git.py | 32 +++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 94fe15fef..fec2d2e16 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -165,6 +165,38 @@ class AutosubmitConfig(object): """ return self._conf_parser.get_option(wrapper_section_name, 'EXPORT', 'none') + def parse_githooks(self): + """ + Parse githooks section in configuration file + + :return: dictionary with githooks configuration + :rtype: dict + """ + proj_dir = os.path.join( + BasicConfig.LOCAL_ROOT_DIR, self.expid, BasicConfig.LOCAL_PROJ_DIR) + #get project_name + project_name = str(self.get_project_destination()) + + #get githook files from proj_dir + githook_files = [os.path.join(os.path.join(os.path.join(proj_dir,project_name),".githooks"), f) for f in os.listdir(os.path.join(os.path.join(proj_dir,project_name),".githooks")) ] + parameters = self.load_parameters() + + #find all '%(? 
0: - command_1 += "git submodule update --init --recursive --depth {0};".format( - max_depth) + Log.info("Depth is incompatible with --recursive, ignoring recursive option") + command_1 += "git submodule update --init; " else: - command_1 += " git submodule update --init --recursive;" + command_1 += " git submodule update --init --recursive; " else: - command_1 += " git submodule init;".format(project_destination) + command_1 += " git submodule init; ".format(project_destination) index_submodule = 0 for submodule in git_project_submodules: if len(git_project_submodules_depth) > 0: + Log.info("Depth is incompatible with --recursive, ignoring recursive option") + if index_submodule < len(git_project_submodules_depth): - command_1 += " git submodule update --init --recursive --depth {0} {1};".format( + command_1 += " git submodule update --init --depth {0} {1}; ".format( git_project_submodules_depth[index_submodule], submodule) else: - command_1 += " git submodule update --init --recursive --depth {0} {1};".format( + command_1 += " git submodule update --init --depth {0} {1}; ".format( max_depth, submodule) else: - command_1 += " git submodule update --init --recursive {0};".format(submodule) + command_1 += " git submodule update --init --recursive {0}; ".format(submodule) index_submodule += 1 if git_remote_project_path == '': try: + if len(command_githook) > 0: + command_githook = "cd {0} ; {1}".format(git_path, command_githook) + as_conf.parse_githooks() + subprocess.check_output(command_githook, shell=True) command_1 = "cd {0}; {1} ".format(git_path,command_1) Log.debug('Githook + Checkout and Submodules: {0}', command_1) output_1 = subprocess.check_output(command_1, shell=True) @@ -254,6 +264,10 @@ class AutosubmitGit: Log.printlog( "Submodule has a wrong configuration.\n{0}".format(command_1), 6014) else: + if len(command_githook) > 0: + command_githook = "cd {0} ; {1}".format(project_path, command_githook) + as_conf.parse_githooks() + 
hpcarch.send_command(command_githook) command_1 = "cd {0}; {1} ".format(project_path, command_1) hpcarch.send_command(command_1) except subprocess.CalledProcessError as e: -- GitLab From a6c3fea61233769b6be402c02bfb301788e3f5f1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 15 Nov 2022 16:19:41 +0100 Subject: [PATCH 122/213] small fixes --- autosubmit/config/config_common.py | 2 +- autosubmit/git/autosubmit_git.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index fec2d2e16..2b00308aa 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -188,7 +188,7 @@ class AutosubmitConfig(object): matches = re.findall('%(? 0: + if max_depth > 0: Log.info("Depth is incompatible with --recursive, ignoring recursive option") command_1 += "git submodule update --init; " else: @@ -237,7 +237,7 @@ class AutosubmitGit: command_1 += " git submodule init; ".format(project_destination) index_submodule = 0 for submodule in git_project_submodules: - if len(git_project_submodules_depth) > 0: + if max_depth > 0: Log.info("Depth is incompatible with --recursive, ignoring recursive option") if index_submodule < len(git_project_submodules_depth): -- GitLab From bdf1740c7bbaff3c68aff3fa9562b0f39513dcd4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 17 Nov 2022 15:09:35 +0100 Subject: [PATCH 123/213] depth missing when <= 0 submodule --- autosubmit/git/autosubmit_git.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 3c8f04ba7..1f8fd2d77 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -230,7 +230,7 @@ class AutosubmitGit: if git_project_submodules.__len__() <= 0: if max_depth > 0: Log.info("Depth is incompatible with --recursive, ignoring recursive option") - command_1 += "git submodule update --init; " + command_1 += "git 
submodule update --init --depth {0}; ".format(max_depth) else: command_1 += " git submodule update --init --recursive; " else: @@ -239,7 +239,6 @@ class AutosubmitGit: for submodule in git_project_submodules: if max_depth > 0: Log.info("Depth is incompatible with --recursive, ignoring recursive option") - if index_submodule < len(git_project_submodules_depth): command_1 += " git submodule update --init --depth {0} {1}; ".format( git_project_submodules_depth[index_submodule], submodule) -- GitLab From ab89eb7a43039292295f0b6e9c6ba3cb2f398298 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 17 Nov 2022 15:16:39 +0100 Subject: [PATCH 124/213] depth missing when <= 0 submodule --- autosubmit/git/autosubmit_git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 1f8fd2d77..bfbd88610 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -230,7 +230,7 @@ class AutosubmitGit: if git_project_submodules.__len__() <= 0: if max_depth > 0: Log.info("Depth is incompatible with --recursive, ignoring recursive option") - command_1 += "git submodule update --init --depth {0}; ".format(max_depth) + command_1 += " git submodule update --init --depth {0}; ".format(max_depth) else: command_1 += " git submodule update --init --recursive; " else: -- GitLab From 49a6eadc38443ead3eb38cff228cf8bc52d2b4aa Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 17 Nov 2022 15:21:27 +0100 Subject: [PATCH 125/213] depth missing when <= 0 submodule --- requeriments.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requeriments.txt b/requeriments.txt index b5783046b..7f4b2ea14 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -3,7 +3,7 @@ configparser argparse>=1.2,<2 python-dateutil>2 matplotlib -numpy +numpy<1.17 pydotplus>=2 pyparsing>=2.0.1 paramiko diff --git a/setup.py b/setup.py index a5a7801ef..fcf93d1f7 100644 --- 
a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ setup( download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', - 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', + 'numpy<1.17', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] -- GitLab From 4d97440c2368d145c3fca8eb1b22fba599de9a4f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 18 Nov 2022 16:12:05 +0100 Subject: [PATCH 126/213] traceback error --- bin/autosubmit | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/autosubmit b/bin/autosubmit index 39ba33332..c722a9654 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -57,13 +57,13 @@ def main(): except Exception as e: if os.path.exists(os.path.join(Log.file_path, "autosubmit.lock")): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) - Log.error("Trace: {0}", str(e.message)) - if "temporarily unavailable" in str(e.message): + Log.error("Trace: {0}", str(e)) + if "temporarily unavailable" in str(e): Log.critical( "Another instance of autosubmit is running on this experiment. 
If this is not the case, delete autosubmit.lock", 7000) else: Log.critical( - "Unhandled error: If you see this message, please report it in Autosubmit's GitLab project") + "\n{0}Unhandled error: If you see this message, please report it in Autosubmit's GitLab project".format(str(e)), 7000) os._exit(1) -- GitLab From 25a3b3c2cb2a1aa25fbb9f0456802e101c159e91 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 18 Nov 2022 16:12:48 +0100 Subject: [PATCH 127/213] traceback error --- bin/autosubmit | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/autosubmit b/bin/autosubmit index c722a9654..24b99be0a 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -60,10 +60,10 @@ def main(): Log.error("Trace: {0}", str(e)) if "temporarily unavailable" in str(e): Log.critical( - "Another instance of autosubmit is running on this experiment. If this is not the case, delete autosubmit.lock", 7000) + "{0}\nAnother instance of autosubmit is running on this experiment. If this is not the case, delete autosubmit.lock".format(str(e)), 7000) else: Log.critical( - "\n{0}Unhandled error: If you see this message, please report it in Autosubmit's GitLab project".format(str(e)), 7000) + "{0}\nUnhandled error: If you see this message, please report it in Autosubmit's GitLab project".format(str(e)), 7000) os._exit(1) -- GitLab From 867762e680ab5ac3e770fd1906fbbfe0d7a5939b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 12 Jul 2022 17:04:03 +0200 Subject: [PATCH 128/213] locale fix for bschub --- autosubmit/autosubmit.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index cbb4b142d..97f08cc67 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -782,13 +782,19 @@ class Autosubmit: args.command + exp_id + '.log'), "out", log_level) Log.set_file(os.path.join(BasicConfig.GLOBAL_LOG_DIR, args.command + exp_id + '_err.log'), "err") - #Enforce LANG=C + # Enforce 
LANG=UTF-8 try: try: - locale.setlocale(locale.LC_ALL,'C.UTF-8') - except: - locale.setlocale(locale.LC_ALL, 'C.utf8') - except: + locale.setlocale(locale.LC_ALL, 'C.UTF-8') + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'C.utf8') + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'en_GB') + except Exception as e: + locale.setlocale(locale.LC_ALL, 'es_ES') + except Exception as e: Log.info("Locale C.utf8 is not found, using '{0}' as fallback".format("C")) locale.setlocale(locale.LC_ALL, 'C') Log.info( -- GitLab From ed12169cfa10fedffc47c66842569db03ce04e68 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 18 Nov 2022 16:27:01 +0100 Subject: [PATCH 129/213] locale fix for bschub (II) --- autosubmit/job/job_common.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index a4b20ecc4..6a81f64cb 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -111,12 +111,17 @@ class StatisticsSnippetBash: # Autosubmit header ################### locale_to_set=$(locale -a | grep ^C.) - if [ -z "$var" ] ; then + if [ -z "$locale_to_set" ] ; then # locale installed... export LC_ALL=$locale_to_set else # locale not installed... 
- export LC_ALL=C + locale_to_set=$(locale -a | grep ^en_GB.utf8) + if [ -z "$locale_to_set" ] ; then + export LC_ALL=$locale_to_set + else + export LC_ALL=C + fi fi set -xuve @@ -171,9 +176,15 @@ class StatisticsSnippetPython: try: try: locale.setlocale(locale.LC_ALL,'C.utf8') - except: - locale.setlocale(locale.LC_ALL, 'C.UTF-8') - except: + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'C.UTF-8') + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'en_GB') + except Exception as e: + locale.setlocale(locale.LC_ALL, 'es_ES') + except Exception as e: locale.setlocale(locale.LC_ALL, 'C') job_name_ptrn = '%CURRENT_LOGDIR%/%JOBNAME%' stat_file = open(job_name_ptrn + '_STAT', 'w') @@ -225,7 +236,7 @@ class StatisticsSnippetR: oldw <- getOption("warn") options( warn = -1 ) leave = F - langs <- c("C.utf8","C.UTF-8","C") + langs <- c("C.utf8","C.UTF-8","C","en_GB","es_ES") i = 1 e="" while (nchar(e) == 0 || leave) -- GitLab From d75300236c3dccadd4ecacb4b451d44c7764a9e4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 21 Nov 2022 09:50:13 +0100 Subject: [PATCH 130/213] Traceback now is displayed on unhandled errors --- autosubmit/autosubmit.py | 1 - bin/autosubmit | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 97f08cc67..b55e55cf0 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -800,7 +800,6 @@ class Autosubmit: Log.info( "Autosubmit is running with {0}", Autosubmit.autosubmit_version) - @staticmethod def _check_ownership(expid,raise_error=False): """ diff --git a/bin/autosubmit b/bin/autosubmit index 24b99be0a..714d754a3 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -21,6 +21,7 @@ """Script for handling experiment monitoring""" import os import sys +import traceback scriptdir = os.path.abspath(os.path.dirname(sys.argv[0])) sys.path.append(scriptdir) @@ -63,7 +64,8 @@ def main(): "{0}\nAnother instance of autosubmit is running on 
this experiment. If this is not the case, delete autosubmit.lock".format(str(e)), 7000) else: Log.critical( - "{0}\nUnhandled error: If you see this message, please report it in Autosubmit's GitLab project".format(str(e)), 7000) + "{1}{0}\nUnhandled error: If you see this message, please report it in Autosubmit's GitLab project".format(str(e),traceback.print_exc() +), 7000) os._exit(1) -- GitLab From c4681df5f4fbe875cd6cfb1c3cef3275ecd6754e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 09:10:21 +0100 Subject: [PATCH 131/213] Allowed changes: auto check if shortcuts the name if the hostname returns the full name. ex. bscearth000.int.bsc -> bscearth000 --- autosubmit/autosubmit.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b55e55cf0..ad98e6326 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -692,22 +692,28 @@ class Autosubmit: else: expid_less.append("migrate") # pickup import platform - host = platform.node() + fullhost = platform.node() + if "." 
in fullhost: + host = fullhost.split(".")[0] + elif "," in fullhost: + host = fullhost.split(",")[0] + else: + host = fullhost forbidden = BasicConfig.DENIED_HOSTS authorized = BasicConfig.ALLOWED_HOSTS - message = "Command: {0} is not allowed to run in host: {1}.\n".format(args.command.upper(),host) + message = "Command: {0} is not allowed to run in host: {1}.\n".format(args.command.upper(),fullhost) message += "List of permissions as follows:Command | hosts \nAllowed hosts\n" for command in BasicConfig.ALLOWED_HOSTS: message += " {0}:{1} \n".format(command,BasicConfig.ALLOWED_HOSTS[command]) message += "Denied hosts\n" for command in BasicConfig.DENIED_HOSTS: message += " {0}:{1} \n".format(command,BasicConfig.DENIED_HOSTS[command]) - message += "[Command: autosubmit {0}] is not allowed to run in [host: {1}].".format(args.command.upper(), host) + message += "[Command: autosubmit {0}] is not allowed to run in [host: {1}].".format(args.command.upper(), fullhost) if args.command in BasicConfig.DENIED_HOSTS: - if 'all' in BasicConfig.DENIED_HOSTS[args.command] or host in BasicConfig.DENIED_HOSTS[args.command]: + if 'all' in BasicConfig.DENIED_HOSTS[args.command] or host in BasicConfig.DENIED_HOSTS[args.command] or fullhost in BasicConfig.DENIED_HOSTS[args.command]: raise AutosubmitCritical(message, 7071) if args.command in BasicConfig.ALLOWED_HOSTS: - if 'all' not in BasicConfig.ALLOWED_HOSTS[args.command] and host not in BasicConfig.ALLOWED_HOSTS[args.command]: + if 'all' not in BasicConfig.ALLOWED_HOSTS[args.command] and (host not in BasicConfig.ALLOWED_HOSTS[args.command] or fullhost not in BasicConfig.ALLOWED_HOSTS[args.command]): raise AutosubmitCritical(message, 7071) if expid != 'None' and args.command not in expid_less and args.command not in global_log_command: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) -- GitLab From 7cf8ee4862b6bcdd9f997f1ffa6e55712d95fd00 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 09:34:32 
+0100 Subject: [PATCH 132/213] Allowed changes: auto check if shortcuts the name if the hostname returns the full name. ex. bscearth000.int.bsc -> bscearth000 --- autosubmit/autosubmit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ad98e6326..5ab9356be 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -713,7 +713,7 @@ class Autosubmit: if 'all' in BasicConfig.DENIED_HOSTS[args.command] or host in BasicConfig.DENIED_HOSTS[args.command] or fullhost in BasicConfig.DENIED_HOSTS[args.command]: raise AutosubmitCritical(message, 7071) if args.command in BasicConfig.ALLOWED_HOSTS: - if 'all' not in BasicConfig.ALLOWED_HOSTS[args.command] and (host not in BasicConfig.ALLOWED_HOSTS[args.command] or fullhost not in BasicConfig.ALLOWED_HOSTS[args.command]): + if 'all' not in BasicConfig.ALLOWED_HOSTS[args.command] and not (host in BasicConfig.ALLOWED_HOSTS[args.command] or fullhost in BasicConfig.ALLOWED_HOSTS[args.command]): raise AutosubmitCritical(message, 7071) if expid != 'None' and args.command not in expid_less and args.command not in global_log_command: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) -- GitLab From 8174f4ed7967c136fb6001d2c9afe66d793ed1a7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 11:54:34 +0100 Subject: [PATCH 133/213] added git_version checker --- autosubmit/git/autosubmit_git.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index bfbd88610..5d50bb4b6 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -206,7 +206,15 @@ class AutosubmitGit: try: ##command 0 Log.debug('Clone command: {0}', command_0) - + try: + git_version = subprocess.check_output("git --version",shell=True) + git_version = git_version.split(" ")[2].strip("\n") + version_int = "" + for number in 
git_version.split("."): + version_int += number + git_version = int(version_int) + except: + git_version = 2251 if git_remote_project_path == '': command_0 = "cd {0} ; {1}".format(project_path, command_0) output_0 = subprocess.check_output(command_0, shell=True) @@ -214,7 +222,8 @@ class AutosubmitGit: command_0 = "cd {0} ; {1}".format(project_path, command_0) hpcarch.send_command(command_0) ##command 1 - if os.path.exists(os.path.join(git_path, ".githooks")): + + if os.path.exists(os.path.join(git_path, ".githooks")) and git_version > 2136: for root_dir, dirs, files in os.walk(os.path.join(git_path, ".githooks")): for f_dir in dirs: os.chmod(os.path.join(root_dir, f_dir), 0o750) -- GitLab From ad44ec09c3be29dcb03c6b9234d0a961f9018164 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 12:08:23 +0100 Subject: [PATCH 134/213] interpolation=None default value 2.7.9 (office) and 2.7.15 ( hub ) --- autosubmit/config/config_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 99d92fd8c..925bdecb8 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -24,7 +24,7 @@ class ConfigParserFactory: class ConfigParser(ConfPar, object): def __init__(self): - super(ConfigParser, self).__init__() + super(ConfigParser, self).__init__(interpolation=None) def get_option(self, section, option, default=None): """ @@ -41,6 +41,7 @@ class ConfigParser(ConfPar, object): :return: option value :rtype: str """ + if self.has_option(section, option): return self.get(section, option) else: -- GitLab From 0ed087370224aed56ec97968a0464d6cd6beb1ea Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 13:49:36 +0100 Subject: [PATCH 135/213] interpolation=None deleted from master branch created a different branch for hubs while I find a solution --- autosubmit/config/config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 925bdecb8..05a4fa346 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -24,7 +24,7 @@ class ConfigParserFactory: class ConfigParser(ConfPar, object): def __init__(self): - super(ConfigParser, self).__init__(interpolation=None) + super(ConfigParser, self).__init__() def get_option(self, section, option, default=None): """ -- GitLab From c8d09f5f2171458d35735d5b3d3ff1f591ced683 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 14:05:28 +0100 Subject: [PATCH 136/213] configparser added to setup.py --- autosubmit/config/config_parser.py | 2 +- environment.yml | 1 + setup.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 05a4fa346..925bdecb8 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -24,7 +24,7 @@ class ConfigParserFactory: class ConfigParser(ConfPar, object): def __init__(self): - super(ConfigParser, self).__init__() + super(ConfigParser, self).__init__(interpolation=None) def get_option(self, section, option, default=None): """ diff --git a/environment.yml b/environment.yml index bc6e7308b..6429c6982 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: dependencies: - nose +- configparser - nose-cov - paramiko - pydotplus diff --git a/setup.py b/setup.py index fcf93d1f7..69d30f102 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy<1.17', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 
'requests','configparser'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, -- GitLab From 09622a9b6e88d2ceb84e5199da11a8d2f94b0310 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 23 Nov 2022 14:10:35 +0100 Subject: [PATCH 137/213] configparser added to setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 69d30f102..c639b85e5 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy<1.17', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests','configparser'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests','configparser==4.0.2'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, -- GitLab From 7993085b671659b44b094af7f161dcf28eb02ffa Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 24 Nov 2022 08:30:04 +0100 Subject: [PATCH 138/213] githook only will detected .tmpl extension --- autosubmit/config/config_common.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 2b00308aa..6a1e09843 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -183,14 +183,16 @@ class AutosubmitConfig(object): #find all '%(? 
Date: Thu, 24 Nov 2022 09:43:45 +0100 Subject: [PATCH 139/213] added exe permissions to the githook generated file --- autosubmit/config/config_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 6a1e09843..5e3e45294 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -193,6 +193,7 @@ class AutosubmitConfig(object): content = content.replace(match, parameters.get(match[1:-1],"")) with open(f_name, 'w') as f: f.write(content) + os.chmod(f_name, 0o750) pass -- GitLab From d5e0758b8370066be64faa41f9032c3a6b3847c2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 24 Nov 2022 14:51:09 +0100 Subject: [PATCH 140/213] Added debug info related to githook --- autosubmit/git/autosubmit_git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 5d50bb4b6..06a082431 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -264,7 +264,7 @@ class AutosubmitGit: as_conf.parse_githooks() subprocess.check_output(command_githook, shell=True) command_1 = "cd {0}; {1} ".format(git_path,command_1) - Log.debug('Githook + Checkout and Submodules: {0}', command_1) + Log.debug('Githook + Checkout and Submodules: {0}', command_githook, command_1) output_1 = subprocess.check_output(command_1, shell=True) except BaseException as e: submodule_failure = True -- GitLab From c66f4795c101b10ad25aaf07f01ce34c0e466f31 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 5 Dec 2022 14:12:04 +0100 Subject: [PATCH 141/213] delete inline comments --- docs/source/qstartguide/index.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/qstartguide/index.rst b/docs/source/qstartguide/index.rst index 6ec8898bd..b65ec0539 100644 --- a/docs/source/qstartguide/index.rst +++ b/docs/source/qstartguide/index.rst @@ -165,10 +165,13 @@ Now 
open platforms.conf. Note: This will be an example for marenostrum4 [marenostrum4] # Queue type. Options: ps, SGE, LSF, SLURM, PBS, eceaccess - TYPE = slurm # scheduler type + # scheduler type + TYPE = slurm HOST = mn1.bsc.es,mn2.bsc.es,mn3.bsc.es - PROJECT = bsc32 # <- your project - USER = bsc32070 # <- your user + # <- your project ( usually is the user name without the last 3 digits, however check your hpc) + PROJECT = bsc32 + # <- your user name + USER = bsc32070 SCRATCH_DIR = /gpfs/scratch ADD_PROJECT_TO_HOST = False # use 72:00 if you are using a PRACE account, 48:00 for the bsc account -- GitLab From 5c84167f656e8cd28a8352e4ec16b24a1fd00383 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 13 Dec 2022 15:17:01 +0100 Subject: [PATCH 142/213] Release preparation --- VERSION | 2 +- autosubmit/monitor/monitor.py | 11 +++---- autosubmit/platforms/paramiko_platform.py | 36 +++++++++++------------ bin/autosubmit | 2 +- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/VERSION b/VERSION index ad59f742d..f982feb41 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0b +3.14.0 diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 9556e7d3d..bbfbb1018 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -153,7 +153,6 @@ class Monitor: for job in joblist: if job.has_parents(): continue - if not groups or job.name not in groups['jobs'] or (job.name in groups['jobs'] and len(groups['jobs'][job.name]) == 1): node_job = pydotplus.Node(job.name, shape='box', style="filled", fillcolor=self.color_status(job.status)) @@ -299,6 +298,7 @@ class Monitor: :param job_list_object: Object that has the main txt generation method :type job_list_object: JobList object """ + error_msg = "" try: Log.info('Plotting...') now = time.localtime() @@ -347,13 +347,14 @@ class Monitor: raise except BaseException as e: try: - e.message += "\n"+e.value - if "GraphViz" in e.message: - e.message= "Graphviz is not installed. 
Autosubmit need this system package in order to plot the workflow." + if "GraphViz" in str(e): + error_msg="Graphviz is not installed. Autosubmit need this system package in order to plot the workflow." + else: + error_msg = str(e) except: pass - Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only written in txt".format(e.message),7014) + Log.printlog("{0}\nSpecified output doesn't have an available viewer installed. The output was only written in txt".format(error_msg),7014) def generate_output_txt(self, expid, joblist, path, classictxt=False, job_list_object=None): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 4b5c2d4b4..84a7bc7f4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -571,30 +571,29 @@ class ParamikoPlatform(Platform): retries -= 1 sleep(sleep_time) sleep_time = sleep_time + 5 - + if slurm_error: + raise AutosubmitError("Remote pooling failed with error:{0}\n Resetting platforms connections...".format(e_msg)) job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') in_queue_jobs = [] list_queue_jobid = "" for job in job_list: - if not slurm_error: - job_id = job.id + job_id = job.id + job_status = self.parse_Alljobs_output(job_list_status, job_id) + while len(job_status) <= 0 and retries >= 0: + retries -= 1 + self.send_command(cmd) + job_list_status = self.get_ssh_output() job_status = self.parse_Alljobs_output(job_list_status, job_id) - while len(job_status) <= 0 and retries >= 0: - retries -= 1 - self.send_command(cmd) - job_list_status = self.get_ssh_output() - job_status = self.parse_Alljobs_output(job_list_status, job_id) - if len(job_status) <= 0: - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - 
sleep(sleep_time) - sleep_time = sleep_time + 5 - # URi: define status list in HPC Queue Class - else: - job_status = job.status + if len(job_status) <= 0: + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + sleep(sleep_time) + sleep_time = sleep_time + 5 + # URi: define status list in HPC Queue Class + if job.status != Status.RUNNING: job.start_time = datetime.datetime.now() # URi: start time if job.start_time is not None and str(job.wrapper_type).lower() == "none": @@ -668,8 +667,7 @@ class ParamikoPlatform(Platform): 'check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status - if slurm_error: - raise AutosubmitError(e_msg, 6000) + def get_jobid_by_jobname(self,job_name,retries=2): """ diff --git a/bin/autosubmit b/bin/autosubmit index 714d754a3..c949fb2da 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -53,7 +53,7 @@ def main(): if e.trace is not None: Log.error("Trace: {0}", e.trace) Log.critical("{1} [eCode={0}]", e.code, e.message) - Log.info("More info at https://autosubmit.readthedocs.io/en/latest/faq.html") + Log.info("More info at https://autosubmit.readthedocs.io/en/v3.14.0/troubleshooting/error-codes.html ") os._exit(1) except Exception as e: if os.path.exists(os.path.join(Log.file_path, "autosubmit.lock")): -- GitLab From 84dc67a05a77bc2e0471e1c3db66fc1a3ef62e92 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 21 Dec 2022 11:31:04 +0100 Subject: [PATCH 143/213] Some missing non-vertical wrappers fixed on 4.0 were missing in final 3.14.0 --- autosubmit/job/job_packager.py | 5 +++-- autosubmit/job/job_packages.py | 15 ++++++++++++--- autosubmit/platforms/wrappers/wrapper_builder.py | 15 ++++++++++++--- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job_packager.py 
b/autosubmit/job/job_packager.py index cfc1235e8..5a43d5e99 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -652,9 +652,10 @@ class JobPackager(object): horizontal_packager.wrapper_limits["max_by_section"][section] = horizontal_packager.wrapper_limits["max_by_section"][section] - 1 horizontal_packager.wrapper_limits["max"] = horizontal_packager.wrapper_limits["max"] - actual_wrapped_jobs for job in horizontal_package: - job_list = JobPackagerVerticalSimple([job], job.wallclock, horizontal_packager.wrapper_limits["max"], + #jobs_list, total_wallclock, max_jobs, wrapper_limits, max_wallclock, wrapper_info + job_list = JobPackagerVertical([job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, - self._platform.max_wallclock).build_vertical_package(job) + self._platform.max_wallclock,self.wrapper_info).build_vertical_package(job) current_package.append(job_list) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index a3a6a3b58..12994c643 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -663,12 +663,15 @@ class JobPackageHorizontal(JobPackageThread): self._jobs_resources = jobs_resources def _common_script_content(self): + fail_count = 0 + if len(self.jobs) > 0: + fail_count = self.jobs[0].fail_count return self._wrapper_factory.get_wrapper(self._wrapper_factory.horizontal_wrapper, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, rootdir=self.platform.root_dir, - directives=self._custom_directives,threads=self._threads,method=self.method.lower()) + directives=self._custom_directives,threads=self._threads,method=self.method.lower(),fail_count=fail_count) class JobPackageHybrid(JobPackageThread): """ @@ -708,21 +711,27 @@ class 
JobPackageHybrid(JobPackageThread): class JobPackageVerticalHorizontal(JobPackageHybrid): def _common_script_content(self): + fail_count = 0 + if len(self.jobs) > 0: + fail_count = self.jobs[0].fail_count return self._wrapper_factory.get_wrapper(self._wrapper_factory.hybrid_wrapper_vertical_horizontal, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, - rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower()) + rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower(),fail_count=fail_count) class JobPackageHorizontalVertical(JobPackageHybrid): def _common_script_content(self): + fail_count = 0 + if len(self.jobs) > 0: + fail_count = self.jobs[0].fail_count return self._wrapper_factory.get_wrapper(self._wrapper_factory.hybrid_wrapper_horizontal_vertical, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, - rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower()) + rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower(),fail_count=fail_count) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 92ac25662..00214d771 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -44,8 +44,14 @@ class WrapperDirector: return wrapper_script class WrapperBuilder(object): def __init__(self, **kwargs): + # Vertical wrapper if "retrials" 
in kwargs.keys(): self.retrials = kwargs['retrials'] + # rest of wrappers + if "fail_count" in kwargs.keys(): + self.fail_count = kwargs['fail_count'] + else: + self.fail_count = 0 self.header_directive = kwargs['header_directive'] self.job_scripts = kwargs['jobs_scripts'] self.threads = kwargs['threads'] @@ -148,16 +154,19 @@ class PythonWrapperBuilder(WrapperBuilder): Thread.__init__(self) self.template = template self.id_run = id_run + self.fail_count = {0} def run(self): jobname = self.template.replace('.cmd', '') #os.system("echo $(date +%s) > "+jobname+"_STAT") - out = str(self.template) + ".out." + str(self.retrials) - err = str(self.template) + ".err." + str(self.retrials) + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." + str(self.fail_count) print(out+"\\n") command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() (self.status) = getstatusoutput(command + " > " + out + " 2> " + err) - """).format('\n'.ljust(13)) + """).format(self.fail_count,'\n'.ljust(13)) + + # hybrids def build_joblist_thread(self): -- GitLab From 3c27b6a3fefb4c2bb8d23a39fb552e35124d2b07 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 25 Jan 2023 10:45:14 +0100 Subject: [PATCH 144/213] !https://earth.bsc.es/gitlab/es/autosubmitreact/-/issues/62 --- autosubmit/autosubmit.py | 7 ++++--- autosubmit/history/experiment_history.py | 23 ++++++++++++++--------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5ab9356be..831022b1f 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3941,7 +3941,7 @@ class Autosubmit: backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) Log.debug("Backing up jobs_data...") - subprocess.call(command, shell=True) + out = subprocess.call(command, shell=True) Log.debug("Jobs_data database backup 
completed.") except BaseException as e: Log.debug("Jobs_data database backup failed.") @@ -3967,14 +3967,15 @@ class Autosubmit: try: if os.path.exists(database_path): result = os.popen("mv {0} {1}".format(database_path, corrupted_db_path)).read() - time.sleep(10) + time.sleep(1) Log.info("Original database moved.") try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() Log.info("Restoring from sql") result = os.popen(bash_command).read() + exp_history.initialize_database() + except: Log.warning("It was not possible to restore the jobs_data.db file... , a new blank db will be created") result = os.popen("rm {0}".format(database_path)).read() diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 96651df99..e7ae598a1 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -193,16 +193,21 @@ class ExperimentHistory(): def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: - current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() - update_these_changes = self._get_built_list_of_changes(job_list) - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) - if len(update_these_changes) > 0 and should_create_new_run == False: - self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: - return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) - return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) + try: + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + update_these_changes = self._get_built_list_of_changes(job_list) + except: + current_experiment_run_dc = 0 + update_these_changes = [] + #("no runs") + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) + if len(update_these_changes) > 0 and should_create_new_run == False: + self.manager.update_many_job_data_change_status(update_these_changes) + if should_create_new_run: + return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) + return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) + self._log.log(str(exp), traceback.format_exc()) def _get_built_list_of_changes(self, job_list): """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. 
""" -- GitLab From 20652cf57195c6986d9e2c1c5b9c806a0e823a1d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 27 Jan 2023 10:50:21 +0100 Subject: [PATCH 145/213] sshexception not working properly in some cases --- autosubmit/platforms/paramiko_platform.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 84a7bc7f4..968df418e 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -128,8 +128,6 @@ class ParamikoPlatform(Platform): retry = 0 try: self.connect() - except SSHException as e: - raise except Exception as e: if ',' in self.host: Log.printlog("Connection Failed to {0}, will test another host".format( -- GitLab From b4d6efe8504879a40e908013c184835856e0d10b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 1 Feb 2023 12:58:29 +0100 Subject: [PATCH 146/213] conection working 3.15.0 cherry-pick #931 --- autosubmit/autosubmit.py | 18 ++++++----- autosubmit/platforms/paramiko_platform.py | 37 +++++++++++++++-------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 831022b1f..09124077e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -673,7 +673,7 @@ class Autosubmit: print(f.read()) return True return False - elif args.command == 'dbfix': + elif args.command == 'dbfix': return Autosubmit.database_fix(args.expid) elif args.command == 'pklfix': return Autosubmit.pkl_fix(args.expid) @@ -2030,7 +2030,7 @@ class Autosubmit: message = "We have detected that there is another Autosubmit instance using the experiment\n. 
Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) except AutosubmitCritical as e: - raise AutosubmitCritical(e.message, e.code, e.trace) + raise except BaseException as e: raise AutosubmitCritical("This seems like a bug in the code, please contact AS developers", 7070,e.message) @@ -2076,8 +2076,12 @@ class Autosubmit: if platform_issues == "": Log.result("[{1}] Connection successful to host {0}", platform.host, platform.name) else: - platform.connected = False - Log.printlog("[{1}] Connection failed to host {0}".format( platform.host, platform.name),Log.WARNING) + if platform.connected: + platform.connected = False + Log.printlog("[{1}] Connection sucessful to host {0}, however there are issues with %hpcroot%".format(platform.host, platform.name), + Log.WARNING) + else: + Log.printlog("[{1}] Connection failed to host {0}".format(platform.host, platform.name), Log.WARNING) if issues != "": raise AutosubmitCritical( "Issues while checking the connectivity of platforms.", 7010, issues+"\n"+ssh_config_issues) @@ -2550,7 +2554,7 @@ class Autosubmit: job_list = Autosubmit.load_job_list(expid, as_conf, notransitive=notransitive) Log.debug("Job list restored from {0} files", pkl_dir) jobs = StatisticsUtils.filter_by_section(job_list.get_job_list(), filter_type) - jobs, period_ini, period_fi = StatisticsUtils.filter_by_time_period(jobs, filter_period) + jobs, period_ini, period_fi = StatisticsUtils.filter_by_time_period(jobs, filter_period) # Package information job_to_package, package_to_jobs, _, _ = JobList.retrieve_packages(BasicConfig, expid, [job.name for job in job_list.get_job_list()]) queue_time_fixes = {} @@ -3954,7 +3958,7 @@ class Autosubmit: :type expid: str :return: :rtype: - """ + """ os.umask(0) # Overrides user permissions current_time = int(time.time()) corrupted_db_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_corrupted.db".format(expid)) 
@@ -3983,7 +3987,7 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - except Exception as exp: + except Exception as exp: Log.critical(str(exp)) @staticmethod diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 968df418e..5d8794a64 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -106,8 +106,11 @@ class ParamikoPlatform(Platform): except BaseException as e: message = e.message if message.find("t accept remote connections") == -1: - transport = self._ssh.get_transport() - transport.send_ignore() + try: + transport = self._ssh.get_transport() + transport.send_ignore() + except: + message = "Timeout connection" return message except EOFError as e: self.connected = False @@ -147,14 +150,14 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical( 'Experiment cant no continue without unexpected behaviour, Stopping Autosubmit', 7050, trace) - except AutosubmitCritical: + except AutosubmitCritical as e: raise except SSHException as e: raise except Exception as e: raise AutosubmitCritical( 'Cant connect to this platform due an unknown error', 7050, str(e)) - + def threaded(fn): def wrapper(*args, **kwargs): thread = Thread(target=fn, args=args, kwargs=kwargs) @@ -192,15 +195,25 @@ class ParamikoPlatform(Platform): 0] if 'identityfile' in self._host_config: self._host_config_id = self._host_config['identityfile'] - + #pkey = paramiko.Ed25519Key.from_private_key_file(self._host_config_id[0]) + port = int(self._host_config.get('port',22)) if 'proxycommand' in self._host_config: self._proxy = paramiko.ProxyCommand( self._host_config['proxycommand']) - self._ssh.connect(self._host_config['hostname'], 22, username=self.user, - key_filename=self._host_config_id, sock=self._proxy, timeout=120 , banner_timeout=120) + try: + 
self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, sock=self._proxy, timeout=120 , banner_timeout=120) + except Exception as e: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, sock=self._proxy, timeout=120, + banner_timeout=120,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) else: - self._ssh.connect(self._host_config['hostname'], 22, username=self.user, - key_filename=self._host_config_id, timeout=120 , banner_timeout=120) + try: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, timeout=60 , banner_timeout=60) + except Exception as e: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, timeout=60 , banner_timeout=60,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) self.transport = self._ssh.get_transport() #self.transport = paramiko.Transport((self._host_config['hostname'], 22)) #self.transport.connect(username=self.user) @@ -212,12 +225,12 @@ class ParamikoPlatform(Platform): except SSHException as e: raise except IOError as e: - if "refused" in e.strerror.lower(): + if "refused" in str(e.strerror).lower(): raise SSHException(" {0} doesn't accept remote connections. Check if there is an typo in the hostname".format(self.host)) - elif "name or service not known" in e.strerror.lower(): + elif "name or service not known" in str(e.strerror).lower(): raise SSHException(" {0} doesn't accept remote connections. Check if there is an typo in the hostname".format(self.host)) else: - raise AutosubmitError("File can't be located due an slow connection", 6016, str(e)) + raise AutosubmitError("File can't be located due an slow or timeout connection", 6016, str(e)) except BaseException as e: self.connected = False if "Authentication failed." 
in str(e): -- GitLab From a8b1378167d4fa2ba41c2cccb3de07e2a8bc250a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 1 Feb 2023 13:22:51 +0100 Subject: [PATCH 147/213] AS dummy working 3.15.0 cherry-pick #931 --- autosubmit/autosubmit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 09124077e..0e6e54a52 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2078,7 +2078,7 @@ class Autosubmit: else: if platform.connected: platform.connected = False - Log.printlog("[{1}] Connection sucessful to host {0}, however there are issues with %hpcroot%".format(platform.host, platform.name), + Log.printlog("[{1}] Connection sucessful to host {0}, however there are issues with %HPCROOT%".format(platform.host, platform.name), Log.WARNING) else: Log.printlog("[{1}] Connection failed to host {0}".format(platform.host, platform.name), Log.WARNING) -- GitLab From 6f5a5693a3a5b34fed687d16588fdb6d3157e299 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 1 Feb 2023 14:22:05 +0100 Subject: [PATCH 148/213] Added NODES parameter to jobs #931 cherry-pick --- autosubmit/job/job.py | 40 +++++++++++--------- autosubmit/platforms/headers/slurm_header.py | 15 +++++++- autosubmit/platforms/paramiko_platform.py | 3 ++ 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 739216c4a..0f61ccdea 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -95,6 +95,7 @@ class Job(object): self.wallclock = None # type: str self.wchunkinc = None self.tasks = '0' + self.nodes = "" self.threads = '1' self.processors = '1' self.memory = '' @@ -182,7 +183,7 @@ class Job(object): :rtype: set """ return self._parents - + @parents.setter def parents(self, parents): """ @@ -193,13 +194,13 @@ class Job(object): @property def status_str(self): """ - String representation of the current status + String representation of the current status """ return 
Status.VALUE_TO_KEY.get(self.status, "UNKNOWN") - + @property def children_names_str(self): - """ + """ Comma separated list of children's names """ return ",".join([str(child.name) for child in self._children]) @@ -315,7 +316,7 @@ class Job(object): @property def total_processors(self): """ - Number of processors requested by job. + Number of processors requested by job. Reduces ':' separated format if necessary. """ if ':' in self.processors: @@ -536,8 +537,8 @@ class Job(object): """ Returns the retrials of a job, including the last COMPLETED run. The selection stops, and does not include, when the previous COMPLETED job is located or the list of registers is exhausted. - :return: list of list of dates of retrial [submit, start, finish] in datetime format - :rtype: list of list + :return: list of list of dates of retrial [submit, start, finish] in datetime format + :rtype: list of list """ log_name = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') retrials_list = [] @@ -883,7 +884,7 @@ class Job(object): if previous_status != Status.RUNNING and self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN, Status.RUNNING]: self.write_start_time() - if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING]: + if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING]: self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: @@ -1047,6 +1048,8 @@ class Job(object): self.processors = as_conf.get_processors(self.section) self.threads = as_conf.get_threads(self.section) self.tasks = as_conf.get_tasks(self.section) + self.nodes = as_conf.get_nodes(self.section) + self.hyperthreading = as_conf.get_hyperthreading(self.section).lower() if self.hyperthreading is 'none': self.hyperthreading = job_platform.hyperthreading.lower() @@ -1088,6 +1091,7 @@ class Job(object): parameters['CPUS_PER_TASK'] = self.threads 
parameters['NUMTASK'] = self.tasks parameters['TASKS'] = self.tasks + parameters['NODES'] = self.nodes parameters['TASKS_PER_NODE'] = self.tasks parameters['WALLCLOCK'] = self.wallclock parameters['TASKTYPE'] = self.section @@ -1399,7 +1403,7 @@ class Job(object): exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_submit_time(self.name, submit=data_time[1], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) def write_start_time(self, enabled = False): @@ -1428,8 +1432,8 @@ class Job(object): # Writing database exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) return True @@ -1472,7 +1476,7 @@ class Job(object): 
wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) # Launch second as threaded function only for slurm - if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": thread_write_finish = Thread(target=ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR).write_platform_data_after_finish, args=(job_data_dc, self.platform)) thread_write_finish.name = "JOB_data_{}".format(self.name) thread_write_finish.start() @@ -1488,7 +1492,7 @@ class Job(object): Writes all data to TOTAL_STATS file :param total_stats: data gathered by the wrapper :type completed: str - """ + """ if first_retrial: self.write_submit_time(enabled=True) path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') @@ -1504,12 +1508,12 @@ class Job(object): exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_submit_time(self.name, submit=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_start_time(self.name, start=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, 
date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) @@ -1518,7 +1522,7 @@ class Job(object): platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) # Launch second as threaded function only for slurm - if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": thread_write_finish = Thread(target=ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR).write_platform_data_after_finish, args=(job_data_dc, self.platform)) thread_write_finish.name = "JOB_data_{}".format(self.name) thread_write_finish.start() @@ -1743,7 +1747,7 @@ class WrapperJob(Job): for job in self.job_list: job.hold = self.hold job.new_status = Status.QUEUING - job.update_status(self.as_config.get_copy_remote_logs() == 'true') + job.update_status(self.as_config.get_copy_remote_logs() == 'true') Log.info("Job {0} is QUEUING {1}", self.name, reason) else: self.status = Status.HELD diff --git a/autosubmit/platforms/headers/slurm_header.py b/autosubmit/platforms/headers/slurm_header.py index c1eb50a18..a8a3bf811 100644 --- a/autosubmit/platforms/headers/slurm_header.py +++ 
b/autosubmit/platforms/headers/slurm_header.py @@ -54,6 +54,18 @@ class SlurmHeader(object): return "SBATCH -A {0}".format(job.parameters['CURRENT_PROJ']) return "" + def get_nodes_directive(self, job): + """ + Returns nodes directive for the specified job + :param job: job to create nodes directive for + :type job: Job + :return: nodes directive + :rtype: str + """ + # There is no account, so directive is empty + if job.parameters.get('NODES',"") != '': + return "SBATCH -N {0}".format(job.parameters.get('NODES',"")) + return "" # noinspection PyMethodMayBeStatic,PyUnusedLocal def get_memory_directive(self, job): """ @@ -128,9 +140,9 @@ class SlurmHeader(object): #%QUEUE_DIRECTIVE% #%ACCOUNT_DIRECTIVE% #%MEMORY_DIRECTIVE% - #%THREADS_PER_TASK_DIRECTIVE% #%TASKS_PER_NODE_DIRECTIVE% +#%NODES_DIRECTIVE% #SBATCH -n %NUMPROC% #SBATCH -t %WALLCLOCK%:00 #SBATCH -J %JOBNAME% @@ -152,6 +164,7 @@ class SlurmHeader(object): #%MEMORY_DIRECTIVE% #%MEMORY_PER_TASK_DIRECTIVE% #%THREADS_PER_TASK_DIRECTIVE% +#%NODES_DIRECTIVE% #SBATCH -n %NUMPROC% #%TASKS_PER_NODE_DIRECTIVE% #SBATCH -t %WALLCLOCK%:00 diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 5d8794a64..7ef8c24dd 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1125,6 +1125,9 @@ class ParamikoPlatform(Platform): if hasattr(self.header, 'get_account_directive'): header = header.replace( '%ACCOUNT_DIRECTIVE%', self.header.get_account_directive(job)) + if hasattr(self.header, 'get_nodes_directive'): + header = header.replace( + '%NODES_DIRECTIVE%', self.header.get_nodes_directive(job)) if hasattr(self.header, 'get_memory_directive'): header = header.replace( '%MEMORY_DIRECTIVE%', self.header.get_memory_directive(job)) -- GitLab From d07e445134e483325bd04ee6449c4b9f062bf517 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 1 Feb 2023 15:15:58 +0100 Subject: [PATCH 149/213] Update slurm_header.py --- 
autosubmit/platforms/headers/slurm_header.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/headers/slurm_header.py b/autosubmit/platforms/headers/slurm_header.py index a8a3bf811..411f11bef 100644 --- a/autosubmit/platforms/headers/slurm_header.py +++ b/autosubmit/platforms/headers/slurm_header.py @@ -63,8 +63,9 @@ class SlurmHeader(object): :rtype: str """ # There is no account, so directive is empty - if job.parameters.get('NODES',"") != '': - return "SBATCH -N {0}".format(job.parameters.get('NODES',"")) + nodes = job.parameters.get('NODES',"") + if nodes != '': + return "SBATCH -N {0}".format(nodes) return "" # noinspection PyMethodMayBeStatic,PyUnusedLocal def get_memory_directive(self, job): -- GitLab From 167ea5f5a9801efa38b6db3a628d2be0ffdd33b8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 1 Feb 2023 15:20:45 +0100 Subject: [PATCH 150/213] Added nodes to docs --- docs/source/devguide/variables.rst | 1 + docs/source/userguide/configure/index.rst | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/source/devguide/variables.rst b/docs/source/devguide/variables.rst index ede246853..2cab3a71e 100644 --- a/docs/source/devguide/variables.rst +++ b/docs/source/devguide/variables.rst @@ -36,6 +36,7 @@ This variables are relatives to the current job. - **NUMPROC**: Number of processors that the job will use. - **NUMTHREADS**: Number of threads that the job will use. - **NUMTASKS**: Number of tasks that the job will use. +- **NODES**: Number of nodes that the job will use. - **HYPERTHREADING**: Detects if hyperthreading is enabled or not. - **WALLCLOCK**: Number of processors that the job will use. - **SCRATCH_FREE_SPACE**: Percentage of free space required on the ``scratch``. 
diff --git a/docs/source/userguide/configure/index.rst b/docs/source/userguide/configure/index.rst index 2a2b4f0a8..c57ecf29d 100644 --- a/docs/source/userguide/configure/index.rst +++ b/docs/source/userguide/configure/index.rst @@ -138,6 +138,9 @@ To do this use: * TASKS: tasks number to be submitted to the HPC. If not specified, defaults to 1. +* NODES: nodes number to be submitted to the HPC. If not specified, the directive is not added. + + * HYPERTHREADING: Enables Hyper-threading, this will double the max amount of threads. defaults to false. ( Not available on slurm platforms ) * QUEUE: queue to add the job to. If not specified, uses PLATFORM default. -- GitLab From b73b4adcb7c73ef1ff6fc628f1a0d81749f6fbd3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 2 Feb 2023 08:42:57 +0100 Subject: [PATCH 151/213] Meluxia support #931 --- autosubmit/config/config_common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 5e3e45294..68c37d427 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -313,6 +313,16 @@ class AutosubmitConfig(object): """ return str(self._jobs_parser.get_option(section, 'TASKS', '0')) + def get_nodes(self, section): + """ + Gets tasks needed for the given job type + :param section: job type + :type section: str + :return: tasks (processes) per host + :rtype: str + """ + return str(self._jobs_parser.get_option(section, 'NODES', '')) + def get_scratch_free_space(self, section): """ Gets scratch free space needed for the given job type -- GitLab From ef37563274ea7ea67cd951ecde4b101fff5f57ac Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 2 Feb 2023 10:13:48 +0100 Subject: [PATCH 152/213] Allow horizontal wrappers being combined together --- autosubmit/job/job_packager.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py 
index 5a43d5e99..cf632049f 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -226,13 +226,17 @@ class JobPackager(object): for wrapper_section in self.jobs_in_wrapper: if "&" in self.jobs_in_wrapper[wrapper_section]: char = "&" - else: - char = " " - for section_inside_wrapper in self.jobs_in_wrapper[wrapper_section].split(char): - if section == section_inside_wrapper: + if section == self.jobs_in_wrapper[wrapper_section]: wrapper_defined = True self.current_wrapper_section = wrapper_section break + else: + char = " " + for section_inside_wrapper in self.jobs_in_wrapper[wrapper_section].split(char): + if section == section_inside_wrapper: + wrapper_defined = True + self.current_wrapper_section = wrapper_section + break if wrapper_defined and self._platform.allow_wrappers and self.wrapper_type[self.current_wrapper_section] in ['horizontal', 'vertical','vertical-horizontal', 'horizontal-vertical'] : # Trying to find the value in jobs_parser, if not, default to an autosubmit_.conf value (Looks first in [wrapper] section) wrapper_limits = dict() -- GitLab From 555f2ae29226115d50580bf5daa94765e89cae0c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 18 Jan 2023 15:52:25 +0100 Subject: [PATCH 153/213] ECACCESS - WIP #917 (cherry picked from commit 3a4e1645c3a327c802c15ed9eb093e91a7f2ddd7) --- autosubmit/platforms/ecplatform.py | 35 +++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index e5de0c733..eb5fa2195 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -39,6 +39,7 @@ class EcPlatform(ParamikoPlatform): def __init__(self, expid, name, config, scheduler): ParamikoPlatform.__init__(self, expid, name, config) + #version=scheduler if scheduler == 'pbs': self._header = EcCcaHeader() elif scheduler == 'loadleveler': @@ -68,7 +69,8 @@ class EcPlatform(ParamikoPlatform): self.cancel_cmd 
= "eceaccess-job-delete" self._checkjob_cmd = "ecaccess-job-list " self._checkhost_cmd = "ecaccess-certificate-list" - self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + self.host + " " + self.host + ":" + + self._checkvalidcert_cmd = "ecaccess-gateway-connected" + self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + self.queue + " " + self.host + ":" + self.remote_log_dir + "/") self._submit_command_name = "ecaccess-job-submit" self.put_cmd = "ecaccess-file-put" @@ -126,7 +128,11 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.connected = True + output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if output.lower().find("yes") != -1: + self.connected = True + else: + self.connected = False def restore_connection(self): """ In this case, it does nothing because connection is established for each command @@ -134,7 +140,11 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.connected = True + output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if output.lower().find("yes") != -1: + self.connected = True + else: + self.connected = False def test_connection(self): """ In this case, it does nothing because connection is established for each command @@ -142,17 +152,24 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.connected = True + output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if output.lower().find("yes") != -1: + self.connected = True + return "OK" + else: + self.connected = False + return "Invalid certificate" + def check_remote_permissions(self): try: try: - output = subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) - except: + subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=False) + except Exception as e: pass - output = 
subprocess.check_output(self.check_remote_permissions_cmd, shell=True) + subprocess.check_output(self.check_remote_permissions_cmd, shell=True) pass - output = subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) + subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) return True except: return False @@ -161,6 +178,8 @@ class EcPlatform(ParamikoPlatform): try: output = subprocess.check_output(command, shell=True) except subprocess.CalledProcessError as e: + if command.find("ecaccess-job-submit") != -1: + raise AutosubmitError("bad parameters. Error submitting job.") if not ignore_log: raise AutosubmitError('Could not execute command {0} on {1}'.format(e.cmd, self.host),7500,e.message) return False -- GitLab From 6143842347702bd33c451baca17b6b045c3b42cd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 25 Jan 2023 16:07:35 +0100 Subject: [PATCH 154/213] Ec-access working but the tests are randomly crashing due a bug, debugin (cherry picked from commit 76c967a58b090bc66e06d1fa021ab5d19eb43206) --- autosubmit/platforms/ecplatform.py | 4 ++-- autosubmit/platforms/platform.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index eb5fa2195..02fcc172d 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -26,7 +26,7 @@ from autosubmit.platforms.headers.ec_cca_header import EcCcaHeader from autosubmit.platforms.headers.slurm_header import SlurmHeader from autosubmit.platforms.wrappers.wrapper_factory import EcWrapperFactory from time import sleep - +import locale class EcPlatform(ParamikoPlatform): """ Class to manage queues with ecaccess @@ -70,7 +70,7 @@ class EcPlatform(ParamikoPlatform): self._checkjob_cmd = "ecaccess-job-list " self._checkhost_cmd = "ecaccess-certificate-list" self._checkvalidcert_cmd = "ecaccess-gateway-connected" - self._submit_cmd = ("ecaccess-job-submit -distant 
-queueName " + self.queue + " " + self.host + ":" + + self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + self.ec_queue + " " + self.host + ":" + self.remote_log_dir + "/") self._submit_command_name = "ecaccess-job-submit" self.put_cmd = "ecaccess-file-put" diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index acbb20aa7..f441f8cbf 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -26,6 +26,7 @@ class Platform(object): self._serial_queue = None self._default_queue = None self.processors_per_node = None + self.ec_queue = "hpc" self.scratch_free_space = None self.custom_directives = None self.host = '' @@ -124,6 +125,8 @@ class Platform(object): parameters['{0}ARCH'.format(prefix)] = self.name parameters['{0}HOST'.format(prefix)] = self.host parameters['{0}QUEUE'.format(prefix)] = self.queue + parameters['{0}EC_QUEUE'.format(prefix)] = self.ec_queue + parameters['{0}USER'.format(prefix)] = self.user parameters['{0}PROJ'.format(prefix)] = self.project parameters['{0}BUDG'.format(prefix)] = self.budget -- GitLab From 0a3d4395e98d7946ea32eea8897debb726950d47 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 30 Jan 2023 12:58:57 +0100 Subject: [PATCH 155/213] Ecaccess support (cherry picked from commit 50a2a49559087fcb1c6f62b9dbada2c8aa71344e) --- autosubmit/job/job_dict.py | 1 + autosubmit/platforms/ecplatform.py | 4 ++++ autosubmit/platforms/paramiko_submitter.py | 3 +++ 3 files changed, 8 insertions(+) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 29ca59e28..90f3dec2c 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -495,6 +495,7 @@ class DicJobs: job.platform_name = job.platform_name job.file = self.get_option(section, "FILE", None) job.queue = self.get_option(section, "QUEUE", None) + job.ec_queue = self.get_option(section, "EC_QUEUE", "") job.check = str(self.get_option(section, "CHECK", 'True')).lower() job.export = 
str(self.get_option(section, "EXPORT", None)) job.processors = str(self.get_option(section, "PROCESSORS", 1)) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 02fcc172d..87d57a4c5 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -90,6 +90,9 @@ class EcPlatform(ParamikoPlatform): def get_mkdir_cmd(self): return self.mkdir_cmd + def set_submit_cmd(self,ec_queue="hpc"): + self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + ec_queue + " " + self.host + ":" + + self.remote_log_dir + "/") def parse_job_output(self, output): job_state = output.split('\n') @@ -115,6 +118,7 @@ class EcPlatform(ParamikoPlatform): return self._checkjob_cmd + str(job_id) def get_submit_cmd(self, job_script, job, hold=False, export=""): + self.set_submit_cmd(job.ec_queue) if export == "none" or export == "None" or export is None or export == "": export = "" else: diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 92594abdd..de78ee508 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -181,12 +181,15 @@ class ParamikoSubmitter(Submitter): section, 'TEMP_DIR', None) remote_platform._default_queue = parser.get_option( section, 'QUEUE', None) + + remote_platform.ec_queue =parser.get_option('EC_QUEUE', "hpc") remote_platform._serial_queue = parser.get_option( section, 'SERIAL_QUEUE', None) remote_platform.processors_per_node = parser.get_option(section, 'PROCESSORS_PER_NODE', None) remote_platform.custom_directives = parser.get_option(section, 'CUSTOM_DIRECTIVES', None) + if remote_platform.custom_directives is not None and remote_platform.custom_directives != '' and remote_platform.custom_directives != 'None': Log.debug("Custom directives from platform.conf: {0}".format( remote_platform.custom_directives)) -- GitLab From 8dfd8076c896094cc4e1bd2e3d2f80d966194d19 Mon Sep 17 00:00:00 2001 From: 
dbeltran Date: Fri, 3 Feb 2023 15:23:40 +0100 Subject: [PATCH 156/213] 3.15.0 meluxa - ecaccess --- autosubmit/job/job_dict.py | 2 +- autosubmit/platforms/ecplatform.py | 22 +++++++++++++++++++--- autosubmit/platforms/paramiko_submitter.py | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 90f3dec2c..ef98ee576 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -495,7 +495,7 @@ class DicJobs: job.platform_name = job.platform_name job.file = self.get_option(section, "FILE", None) job.queue = self.get_option(section, "QUEUE", None) - job.ec_queue = self.get_option(section, "EC_QUEUE", "") + job.ec_queue = self.get_option(section, "EC_QUEUE", "hpc") job.check = str(self.get_option(section, "CHECK", 'True')).lower() job.export = str(self.get_option(section, "EXPORT", None)) job.processors = str(self.get_option(section, "PROCESSORS", 1)) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 87d57a4c5..960f93f1e 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -40,6 +40,7 @@ class EcPlatform(ParamikoPlatform): def __init__(self, expid, name, config, scheduler): ParamikoPlatform.__init__(self, expid, name, config) #version=scheduler + self.ec_queue = "hpc" if scheduler == 'pbs': self._header = EcCcaHeader() elif scheduler == 'loadleveler': @@ -58,6 +59,21 @@ class EcPlatform(ParamikoPlatform): self._allow_arrays = False self._allow_wrappers = False # TODO self._allow_python_jobs = False + self.root_dir = "" + self.remote_log_dir = "" + self.cancel_cmd = "" + self._checkjob_cmd = "" + self._checkhost_cmd = "" + self._submit_cmd = "" + self._submit_command_name = "" + self.put_cmd = "" + self.get_cmd = "" + self.del_cmd = "" + self.mkdir_cmd = "" + self.check_remote_permissions_cmd = "" + self.check_remote_permissions_remove_cmd = "" + + self.update_cmds() def update_cmds(self): @@ -79,8 
+95,8 @@ class EcPlatform(ParamikoPlatform): self.mkdir_cmd = ("ecaccess-file-mkdir " + self.host + ":" + self.scratch + "/" + self.project + "/" + self.user + "/" + self.expid + "; " + "ecaccess-file-mkdir " + self.host + ":" + self.remote_log_dir) - self.check_remote_permissions_cmd = "ecaccess-file-mkdir " + os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") - self.check_remote_permissions_remove_cmd = "ecaccess-file-rmdir " + os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") + self.check_remote_permissions_cmd = "ecaccess-file-mkdir " + self.host+":"+os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") + self.check_remote_permissions_remove_cmd = "ecaccess-file-rmdir " + self.host+":"+os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") def get_checkhost_cmd(self): return self._checkhost_cmd @@ -175,7 +191,7 @@ class EcPlatform(ParamikoPlatform): pass subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) return True - except: + except Exception as e: return False def send_command(self, command, ignore_log=False, x11 = False): diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index de78ee508..7330f6c0c 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -182,7 +182,7 @@ class ParamikoSubmitter(Submitter): remote_platform._default_queue = parser.get_option( section, 'QUEUE', None) - remote_platform.ec_queue =parser.get_option('EC_QUEUE', "hpc") + remote_platform.ec_queue = parser.get_option(section,'EC_QUEUE', "hpc") remote_platform._serial_queue = parser.get_option( section, 'SERIAL_QUEUE', None) remote_platform.processors_per_node = parser.get_option(section, 'PROCESSORS_PER_NODE', -- GitLab From 8c0492437c60a0724db9b09071f7253ed3f0b99f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 3 Feb 2023 15:35:05 +0100 Subject: 
[PATCH 157/213] Cancel remote job if it is over_wallclock Job_id doesn't depend on retrials anymore when is written --- autosubmit/job/job.py | 6 +++--- autosubmit/platforms/paramiko_platform.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 0f61ccdea..ce66991e9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -611,7 +611,7 @@ class Job(object): return @threaded - def retrieve_logfiles(self, copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = 0): + def retrieve_logfiles(self, copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = 0,job_id=""): max_logs = 0 sleep(5) stat_file = self.script_name[:-4] + "_STAT_" @@ -746,7 +746,7 @@ class Job(object): # Update the logs with Autosubmit Job Id Brand try: for local_log in local_logs: - platform.write_jobid(self.id, os.path.join( + platform.write_jobid(job_id, os.path.join( self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( @@ -899,7 +899,7 @@ class Job(object): if as_conf.get_disable_recovery_threads(self.platform.name) == "true": self.retrieve_logfiles_unthreaded(copy_remote_logs, local_logs) else: - self.retrieve_logfiles(copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = copy.copy(self.fail_count)) + self.retrieve_logfiles(copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = copy.copy(self.fail_count),job_id=self.id) if self.wrapper_type == "vertical": max_logs = int(as_conf.get_retrials()) for i in range(0,max_logs): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 7ef8c24dd..8bbaa96a9 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -616,6 +616,12 @@ class ParamikoPlatform(Platform): try: 
job.platform.get_completed_files(job.name) job_status = job.check_completion(over_wallclock=True) + if job_status is Status.FAILED: + try: + job.platform.send_command( + self.platform.cancel_cmd + " " + str(job.id)) + except: + pass except: job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: -- GitLab From b78b416b241a385fa569f908b53b29e5f1801a09 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 3 Feb 2023 15:39:25 +0100 Subject: [PATCH 158/213] update version --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index f982feb41..454731f6d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0 +3.15.0b -- GitLab From 8be714967277f3e4beb13e37a7ddbba8532a3008 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 6 Feb 2023 10:43:44 +0100 Subject: [PATCH 159/213] error path updated --- bin/autosubmit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/autosubmit b/bin/autosubmit index c949fb2da..8170a19fd 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -53,7 +53,7 @@ def main(): if e.trace is not None: Log.error("Trace: {0}", e.trace) Log.critical("{1} [eCode={0}]", e.code, e.message) - Log.info("More info at https://autosubmit.readthedocs.io/en/v3.14.0/troubleshooting/error-codes.html ") + Log.info("More info at https://autosubmit.readthedocs.io/en/v3.15.0/faq.html") os._exit(1) except Exception as e: if os.path.exists(os.path.join(Log.file_path, "autosubmit.lock")): -- GitLab From d555522ec9a8881ceb0089152686fdba949b46a3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 6 Feb 2023 11:14:40 +0100 Subject: [PATCH 160/213] Updated error codes --- autosubmit/autosubmit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0e6e54a52..c40517658 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -774,7 +774,7 @@ class Autosubmit: "Current experiment uses ({0}) which is not the running 
Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), Autosubmit.autosubmit_version, expid,args.command), - 7067) + 7014) else: if expid == 'None': exp_id = "" @@ -1599,7 +1599,8 @@ class Autosubmit: Log.result("Only jobs with member value in {0} or no member will be allowed in this run. Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) except AutosubmitCritical as e: - raise AutosubmitCritical(e.message, 7067, e.trace) + e.message += " HINT: check the CUSTOM_DIRECTIVE syntax in your jobs configuration files." + raise AutosubmitCritical(e.message, 7014, e.trace) except Exception as e: raise AutosubmitCritical( "Error in run initialization", 7014, str(e)) # Changing default to 7014 -- GitLab From c849ff5dc6bb8517ba037d72c31afa84d3b7242b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 28 Mar 2023 13:09:53 +0200 Subject: [PATCH 161/213] Fixes to cancel jobs --- autosubmit/autosubmit.py | 8 +++++--- autosubmit/job/job.py | 4 ++-- autosubmit/platforms/paramiko_platform.py | 9 ++++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index c40517658..6817b7cec 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2200,7 +2200,9 @@ class Autosubmit: try: jobs_id = platform.submit_Script(hold=hold) except AutosubmitError as e: - jobnames = [job.name for job in valid_packages_to_submit[0].jobs] + jobnames = [] + for package in valid_packages_to_submit: + jobnames += [job.name for job in package.jobs] for jobname in jobnames: jobid = platform.get_jobid_by_jobname(jobname) #cancel bad submitted job if jobid is encountered @@ -2212,7 +2214,7 @@ class Autosubmit: has_trace_bad_parameters = 
str(e.trace).lower().find("bad parameters") != -1 else: has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find("invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2236,7 +2238,7 @@ class Autosubmit: "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: raise AutosubmitError( - "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(e),""), 6015) + "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(platform.name),""), 6015) i = 0 if hold: sleep(10) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index ce66991e9..8aa3cfc3c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -729,7 +729,7 @@ class Job(object): platform.get_logs_files(self.expid, l_log) try: for local_log in l_log: - platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) + platform.write_jobid(job_id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: pass max_logs = max_logs - 1 @@ -1203,7 +1203,7 @@ class Job(object): template_file.close() else: if self.type == Type.BASH: - template = 'sleep 5' + template = 'sleep 360' elif self.type == Type.PYTHON: template = 'time.sleep(5)' elif self.type == Type.R: diff --git a/autosubmit/platforms/paramiko_platform.py 
b/autosubmit/platforms/paramiko_platform.py index 8bbaa96a9..e3535f56e 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -519,6 +519,12 @@ class ParamikoPlatform(Platform): job_status = job.check_completion(over_wallclock=True) except: job_status = Status.FAILED + if job_status == Status.FAILED: + try: + job.platform.send_command( + job.platform.cancel_cmd + " " + str(job.id)) + except: + pass elif job_status in self.job_status['QUEUING'] and job.hold is False: job_status = Status.QUEUING elif job_status in self.job_status['QUEUING'] and job.hold is True: @@ -614,12 +620,13 @@ class ParamikoPlatform(Platform): if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": if job.is_over_wallclock(job.start_time,wallclock): try: + Log.debug("Job {0} is over wallclock, checking completion".format(job.id)) job.platform.get_completed_files(job.name) job_status = job.check_completion(over_wallclock=True) if job_status is Status.FAILED: try: job.platform.send_command( - self.platform.cancel_cmd + " " + str(job.id)) + job.platform.cancel_cmd + " " + str(job.id)) except: pass except: -- GitLab From 5d26492bc17df4629797635a21b9a48cf9e95d6a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 29 Mar 2023 12:11:40 +0200 Subject: [PATCH 162/213] closes #870 --- autosubmit/autosubmit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6817b7cec..e31e122ca 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -833,8 +833,8 @@ class Autosubmit: message+= " Note that this action does not delete any data written by the experiment.\n" message+= "Complete list of files/directories deleted:\n" for root, dirs, files in os.walk(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): - for dir in dirs: - message += os.path.join(root, dir) + "\n" + for dir_ in dirs: + message += os.path.join(root, dir_) + "\n" message += 
os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, "structure_{0}.db".format(expid_delete)) + "\n" message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, @@ -957,7 +957,7 @@ class Autosubmit: Autosubmit._prepare_conf_files( exp_id, hpc, Autosubmit.autosubmit_version, dummy, copy_id) except (OSError, IOError) as e: - Autosubmit._delete_expid(exp_id) + Autosubmit._delete_expid(exp_id, True) raise AutosubmitCritical( "Couldn't create a new experiment, permissions?", 7012, e.message) except BaseException as e: -- GitLab From 71b016cdd37b36cb17d7b9d223ab3398e4421e6a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 29 Mar 2023 13:29:14 +0200 Subject: [PATCH 163/213] Now autosubmit is able to detect if something is wrong with the bashrc #817 --- autosubmit/autosubmit.py | 4 +++ autosubmit/job/job.py | 2 +- autosubmit/platforms/ecplatform.py | 5 ++- autosubmit/platforms/locplatform.py | 8 +++-- autosubmit/platforms/paramiko_platform.py | 39 ++++++++++++----------- autosubmit/platforms/platform.py | 13 ++++++++ 6 files changed, 49 insertions(+), 22 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index e31e122ca..fdbed29f8 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2074,6 +2074,10 @@ class Autosubmit: platform_issues += "\n[{0}] has configuration issues.\n Check that the connection is passwd-less.(ssh {1}@{4})\n Check the parameters that build the root_path are correct:{{scratch_dir/project/user}} = {{{3}/{2}/{1}}}".format( platform.name, platform.user, platform.project, platform.scratch,platform.host) issues += platform_issues + # Checks if bashrc is provinding output that could mess with Autosubmit remote pooling, if so, warns the user but continues as Autosubmit should be able to strip the output + platform.get_bashrc_output() + if platform.bashrc_output != "" or platform.bashrc_err != "": + Log.warning("Bashrc is providing output that could mess with Autosubmit remote 
pooling\nHINT: add [ -z \"$PS1\" ] && return. at the header of {1}:~/.bashrc".format(platform.name,platform.host)) if platform_issues == "": Log.result("[{1}] Connection successful to host {0}", platform.host, platform.name) else: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 8aa3cfc3c..25cb081b2 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1203,7 +1203,7 @@ class Job(object): template_file.close() else: if self.type == Type.BASH: - template = 'sleep 360' + template = 'sleep 5' elif self.type == Type.PYTHON: template = 'time.sleep(5)' elif self.type == Type.R: diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 960f93f1e..ba9eb5828 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -203,7 +203,10 @@ class EcPlatform(ParamikoPlatform): if not ignore_log: raise AutosubmitError('Could not execute command {0} on {1}'.format(e.cmd, self.host),7500,e.message) return False - self._ssh_output = output + if output.startswith(self.bashrc_output): + self._ssh_output = output[len(self.bashrc_output):] + else: + self._ssh_output = output return True def send_file(self, filename, check=True): diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index e7734b133..bbb811a26 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -107,8 +107,12 @@ class LocalPlatform(ParamikoPlatform): if not ignore_log: Log.error('Could not execute command {0} on {1}'.format(e.cmd, self.host)) return False - Log.debug("Command '{0}': {1}", command, output) - self._ssh_output = output + if output.startswith(self.bashrc_output): + self._ssh_output = output[len(self.bashrc_output):] + else: + self._ssh_output = output + Log.debug("Command '{0}': {1}", command, self._ssh_output) + return True def send_file(self, filename): diff --git a/autosubmit/platforms/paramiko_platform.py 
b/autosubmit/platforms/paramiko_platform.py index e3535f56e..08d088cc4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -35,6 +35,7 @@ class ParamikoPlatform(Platform): """ Platform.__init__(self, expid, name, config) + self._ssh_output_err = "" self.connected = False self._default_queue = None @@ -232,6 +233,8 @@ class ParamikoPlatform(Platform): else: raise AutosubmitError("File can't be located due an slow or timeout connection", 6016, str(e)) except BaseException as e: + if "Garbage packet received" in str(e): + Log.error("Couldn't connect to ftp channel due to the stdout given by the {0}:~/.bashrc\nCheck {0}:~/.bashrc for commands that could give output or error".format(self.host)) self.connected = False if "Authentication failed." in str(e): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( @@ -869,7 +872,6 @@ class ParamikoPlatform(Platform): :return: True if executed, False if failed :rtype: bool """ - if "rsync" in command or "find" in command or "convertLink" in command: timeout = None # infinite timeout on migrate command elif "rm" in command: @@ -916,8 +918,6 @@ class ParamikoPlatform(Platform): if not x11: stdout.close() stderr.close() - - self._ssh_output = "" self._ssh_output_err = "" for s in stdout_chunks: @@ -925,23 +925,24 @@ class ParamikoPlatform(Platform): self._ssh_output += s for errorLineCase in stderr_readlines: self._ssh_output_err += errorLineCase - - for errorLineCase in stderr_readlines: - errorLine = errorLineCase.lower() - if "not active" in errorLine: - raise AutosubmitError( - 'SSH Session not active, will restart the platforms', 6005) - if errorLine.find("command not found") != -1: - raise AutosubmitCritical("scheduler is not installed.",7052,self._ssh_output_err) - elif errorLine.find("refused") != -1 or errorLine.find("slurm_persist_conn_open_without_init") != -1 or errorLine.find("slurmdbd") != -1 or errorLine.find("submission 
failed") != -1 or errorLine.find("git clone") != -1 or errorLine.find("sbatch: error: ") != -1 or errorLine.find("not submitted") != -1 or errorLine.find("invalid") != -1: - if (self._submit_command_name == "sbatch" and (errorLine.find("policy") != -1 or errorLine.find("invalid") != -1) ) or (self._submit_command_name == "sbatch" and errorLine.find("argument") != -1) or (self._submit_command_name == "bsub" and errorLine.find("job not submitted") != -1) or self._submit_command_name == "ecaccess-job-submit" or self._submit_command_name == "qsub ": - raise AutosubmitError(errorLine, 7014, "Bad Parameters.") - raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host,self._ssh_output_err, 6005)) + # if self._bashrc_output matchs the start of self.ssh_output, then strip it from self.ssh_output + if self._ssh_output.startswith(self.bashrc_output): + self._ssh_output = self._ssh_output[len(self.bashrc_output):] + if self._ssh_output_err.startswith(self.bashrc_err): + self._ssh_output_err = self._ssh_output_err[len(self.bashrc_err):] + if "not active" in self._ssh_output_err: + raise AutosubmitError( + 'SSH Session not active, will restart the platforms', 6005) + if self._ssh_output_err.find("command not found") != -1: + raise AutosubmitCritical("scheduler is not installed.",7052,self._ssh_output_err) + elif self._ssh_output_err.find("refused") != -1 or self._ssh_output_err.find("slurm_persist_conn_open_without_init") != -1 or self._ssh_output_err.find("slurmdbd") != -1 or self._ssh_output_err.find("submission failed") != -1 or self._ssh_output_err.find("git clone") != -1 or self._ssh_output_err.find("sbatch: error: ") != -1 or self._ssh_output_err.find("not submitted") != -1 or self._ssh_output_err.find("invalid") != -1: + if (self._submit_command_name == "sbatch" and (self._ssh_output_err.find("policy") != -1 or self._ssh_output_err.find("invalid") != -1) ) or (self._submit_command_name == "sbatch" and self._ssh_output_err.find("argument") != -1) 
or (self._submit_command_name == "bsub" and self._ssh_output_err.find("job not submitted") != -1) or self._submit_command_name == "ecaccess-job-submit" or self._submit_command_name == "qsub ": + raise AutosubmitError(self._ssh_output_err, 7014, "Bad Parameters.") + raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host,self._ssh_output_err, 6005)) if not ignore_log: - if len(stderr_readlines) > 0: - Log.printlog('Command {0} in {1} warning: {2}'.format( - command, self.host, '\n'.join(stderr_readlines)), 6006) + if self._ssh_output_err != '': + Log.printlog('Command {0} in {1} warning: {2}'.format(command, self.host, self._ssh_output_err), 6006) else: pass #Log.debug('Command {0} in {1} successful with out message: {2}', command, self.host, self._ssh_output) @@ -1199,6 +1200,8 @@ class ParamikoPlatform(Platform): return True except: return False + + def check_remote_log_dir(self): """ diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index f441f8cbf..11caca815 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -49,6 +49,8 @@ class Platform(object): self._allow_arrays = False self._allow_wrappers = False self._allow_python_jobs = True + self.bashrc_output = "" + self.bashrc_err = "" @property def serial_platform(self): @@ -248,6 +250,17 @@ class Platform(object): return False else: return False + def get_bashrc_output(self): + """ + Checks remote bashrc output/err to strip out any unwanted output + """ + try: + self.send_command("sleep 1") + self.bashrc_output = self.get_ssh_output() + self.bashrc_err = self.get_ssh_output_err() + return True + except: + return False def remove_stat_file(self, job_name): """ -- GitLab From 078f19e6e94ac35b9ba653f5f5529e93e4586e3d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 7 Mar 2023 11:34:00 +0100 Subject: [PATCH 164/213] initial skeleton (cherry picked from commit 37d5a37b6a8759f3b14276318909c5aa6925abd0) --- 
autosubmit/platforms/headers/pjm_header.py | 166 +++++ autosubmit/platforms/pjmplatform.py | 659 ++++++++++++++++++ .../platforms/wrappers/wrapper_factory.py | 38 + 3 files changed, 863 insertions(+) create mode 100644 autosubmit/platforms/headers/pjm_header.py create mode 100644 autosubmit/platforms/pjmplatform.py diff --git a/autosubmit/platforms/headers/pjm_header.py b/autosubmit/platforms/headers/pjm_header.py new file mode 100644 index 000000000..8ce9a5849 --- /dev/null +++ b/autosubmit/platforms/headers/pjm_header.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +# Copyright 2023 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
+ +import textwrap + + +class PJMHeader(object): + """Class to handle the PJM headers of a job""" + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_queue_directive(self, job): + """ + Returns queue directive for the specified job + + :param job: job to create queue directive for + :type job: Job + :return: queue directive + :rtype: str + """ + # There is no queue, so directive is empty + if job.parameters['CURRENT_QUEUE'] == '': + return "" + else: + return "PJM -L rscgrp={0}".format(job.parameters['CURRENT_QUEUE']) + + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_account_directive(self, job): + """ + Returns account directive for the specified job + + :param job: job to create account directive for + :type job: Job + :return: account directive + :rtype: str + """ + # wallet,account group_name. source: nkl.cc.u-tokyo.ac.jp + if job.parameters['CURRENT_PROJ'] != '': + return "PJM -g {0}".format(job.parameters['CURRENT_PROJ']) + return "" + + def get_nodes_directive(self, job): + """ + Returns nodes directive for the specified job + :param job: job to create nodes directive for + :type job: Job + :return: nodes directive + :rtype: str + """ + # There is no account, so directive is empty + nodes = job.parameters.get('NODES',"") + if nodes != '': + return "PJM -L node={0}".format(nodes) + return "" + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_memory_directive(self, job): + """ + Returns memory directive for the specified job + + :param job: job to create memory directive for + :type job: Job + :return: memory directive + :rtype: str + """ + if job.parameters['MEMORY'] != '': + return "PJM --node-mem={0}".format(job.parameters['MEMORY']) + return "" + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_memory_per_task_directive(self, job): + """ + Returns memory per task directive for the specified job + + :param job: job to create memory per task directive for + :type job: Job + :return: memory per task directive + 
:rtype: str + """ + if job.parameters['MEMORY_PER_TASK'] != '': + return "PJM --core-mem={0}".format(job.parameters['MEMORY_PER_TASK']) + return "" + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_custom_directives(self, job): + """ + Returns custom directives for the specified job + + :param job: job to create custom directive for + :type job: Job + :return: custom directives + :rtype: str + """ + # There is no custom directives, so directive is empty + if job.parameters['CUSTOM_DIRECTIVES'] != '': + return '\n'.join(str(s) for s in job.parameters['CUSTOM_DIRECTIVES']) + return "" + + + + def get_tasks_per_node(self, job): + """ + Returns memory per task directive for the specified job + + :param job: job to create tasks per node directive for + :type job: Job + :return: tasks per node directive + :rtype: str + """ + if int(job.parameters['TASKS']) > 1: + return "max-proc-per-node={0}".format(job.parameters['TASKS']) + return "" + + SERIAL = textwrap.dedent("""\ +############################################################################### +# %TASKTYPE% %DEFAULT.EXPID% EXPERIMENT +############################################################################### +# +#PJM -N %JOBNAME% +#PJM -L elapse=%WALLCLOCK%:00 +#%QUEUE_DIRECTIVE% +#%ACCOUNT_DIRECTIVE% +#%MEMORY_DIRECTIVE% +%CUSTOM_DIRECTIVES% +#PJM -o=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% +#%X11% +# +############################################################################### + """) + + PARALLEL = textwrap.dedent("""\ +############################################################################### +# %TASKTYPE% %DEFAULT.EXPID% EXPERIMENT +############################################################################### +# +#PJM -N %JOBNAME% +#%NODES_DIRECTIVE% +#PJM --mpi "proc=%NUMPROC%" +#PJM --mpi 
"%TASKS_PER_NODE_DIRECTIVE%" +#PJM -L elapse=%WALLCLOCK%:00 +#%QUEUE_DIRECTIVE% +#%ACCOUNT_DIRECTIVE% +#%MEMORY_DIRECTIVE% +#%MEMORY_PER_TASK_DIRECTIVE% +#PJM -o=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% +%CUSTOM_DIRECTIVES% +# +############################################################################### + """) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py new file mode 100644 index 000000000..75f2e48a2 --- /dev/null +++ b/autosubmit/platforms/pjmplatform.py @@ -0,0 +1,659 @@ +#!/usr/bin/env python3 + +# Copyright 2017-2020 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
+import locale +import os +from time import sleep +from time import mktime +from time import time +from datetime import datetime +from typing import List, Union + +from xml.dom.minidom import parseString + +from autosubmit.job.job_common import Status, parse_output_number +from autosubmit.job.job_exceptions import WrongTemplateException +from autosubmit.platforms.paramiko_platform import ParamikoPlatform +from autosubmit.platforms.headers.pjm_header import PJMHeader +from autosubmit.platforms.wrappers.wrapper_factory import PJMWrapperFactory +from log.log import AutosubmitCritical, AutosubmitError, Log + + +class PJMPlatform(ParamikoPlatform): + """ + Class to manage jobs to host using PJM scheduler + + :param expid: experiment's identifier + :type expid: str + """ + + + def __init__(self, expid, name, config): + ParamikoPlatform.__init__(self, expid, name, config) + self.mkdir_cmd = None + self.get_cmd = None + self.put_cmd = None + self._submit_hold_cmd = None + self._submit_command_name = None + self._submit_cmd = None + self._checkhost_cmd = None + self.cancel_cmd = None + self._header = PJMHeader() + self._wrapper = SlurmWrapperFactory(self) + + #https://software.fujitsu.com/jp/manual/manualfiles/m220008/j2ul2452/02enz007/j2ul-2452-02enz0.pdf pagina 16 + self.job_status = dict() + self.job_status['COMPLETED'] = ['COMPLETED'] + self.job_status['RUNNING'] = ['RUNNING'] + self.job_status['QUEUING'] = ['PENDING', 'CONFIGURING', 'RESIZING'] + self.job_status['FAILED'] = ['FAILED', 'CANCELLED', 'CANCELLED+', 'NODE_FAIL', + 'PREEMPTED', 'SUSPENDED', 'TIMEOUT', 'OUT_OF_MEMORY', 'OUT_OF_ME+', 'OUT_OF_ME'] + self._pathdir = "\$HOME/LOG_" + self.expid + self._allow_arrays = False + self._allow_wrappers = True + self.update_cmds() + self.config = config + exp_id_path = os.path.join(config.LOCAL_ROOT_DIR, self.expid) + tmp_path = os.path.join(exp_id_path, "tmp") + self._submit_script_path = os.path.join( + tmp_path, config.LOCAL_ASLOG_DIR, "submit_" + self.name + ".sh") + 
self._submit_script_file = open(self._submit_script_path, 'wb').close() + + def process_batch_ready_jobs(self,valid_packages_to_submit,failed_packages,error_message="",hold=False): + """ + Retrieve multiple jobs identifiers. + :param valid_packages_to_submit: + :param failed_packages: + :param error_message: + :param hold: + :return: + """ + try: + valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] + if len(valid_packages_to_submit) > 0: + package = valid_packages_to_submit[0] + try: + jobs_id = self.submit_Script(hold=hold) + except AutosubmitError as e: + jobnames = [job.name for job in valid_packages_to_submit[0].jobs] + for jobname in jobnames: + jobid = self.get_jobid_by_jobname(jobname) + #cancel bad submitted job if jobid is encountered + for id_ in jobid: + self.cancel_job(id_) + jobs_id = None + self.connected = False + if e.trace is not None: + has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 + else: + has_trace_bad_parameters = False + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: + error_msg = "" + for package_tmp in valid_packages_to_submit: + for job_tmp in package_tmp.jobs: + if job_tmp.section not in error_msg: + error_msg += job_tmp.section + "&" + if has_trace_bad_parameters: + error_message+="Check job and queue specified in jobs.conf. Sections that could be affected: {0}".format(error_msg[:-1]) + else: + error_message+="\ncheck that {1} platform has set the correct scheduler. 
Sections that could be affected: {0}".format( + error_msg[:-1], self.name) + + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) + except IOError as e: + raise AutosubmitError( + "IO issues ", 6016, str(e)) + except BaseException as e: + if str(e).find("scheduler") != -1: + raise AutosubmitCritical("Are you sure that [{0}] scheduler is the correct type for platform [{1}]?.\n Please, double check that {0} is loaded for {1} before autosubmit launch any job.".format(self.type.upper(),self.name.upper()),7070) + raise AutosubmitError( + "Submission failed, this can be due a failure on the platform", 6015, str(e)) + if jobs_id is None or len(jobs_id) <= 0: + raise AutosubmitError( + "Submission failed, this can be due a failure on the platform", 6015,"Jobs_id {0}".format(jobs_id)) + i = 0 + if hold: + sleep(10) + + for package in valid_packages_to_submit: + if hold: + retries = 5 + package.jobs[0].id = str(jobs_id[i]) + try: + can_continue = True + while can_continue and retries > 0: + cmd = package.jobs[0].platform.get_queue_status_cmd(jobs_id[i]) + package.jobs[0].platform.send_command(cmd) + queue_status = package.jobs[0].platform._ssh_output + reason = package.jobs[0].platform.parse_queue_reason(queue_status, jobs_id[i]) + if reason == '(JobHeldAdmin)': + can_continue = False + elif reason == '(JobHeldUser)': + can_continue = True + else: + can_continue = False + sleep(5) + retries = retries - 1 + if not can_continue: + package.jobs[0].platform.send_command(package.jobs[0].platform.cancel_cmd + " {0}".format(jobs_id[i])) + i = i + 1 + continue + if not self.hold_job(package.jobs[0]): + i = i + 1 + continue + except Exception as e: + failed_packages.append(jobs_id) + continue + for job in package.jobs: + job.hold = hold + job.id = str(jobs_id[i]) + job.status = Status.SUBMITTED + job.write_submit_time(hold=hold) + i += 1 + if len(failed_packages) > 0: + for job_id in failed_packages: + 
package.jobs[0].platform.send_command( + package.jobs[0].platform.cancel_cmd + " {0}".format(job_id)) + raise AutosubmitError("{0} submission failed, some hold jobs failed to be held".format(self.name), 6015) + save = True + except WrongTemplateException as e: + raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( + e.job_name), 7014, str(e)) + except AutosubmitError as e: + raise + except AutosubmitCritical as e: + raise + except Exception as e: + raise AutosubmitError("{0} submission failed".format(self.name), 6015, str(e)) + return save,valid_packages_to_submit + + def open_submit_script(self): + self._submit_script_file = open(self._submit_script_path, 'wb').close() + self._submit_script_file = open(self._submit_script_path, 'ab') + + def get_submit_script(self): + self._submit_script_file.close() + os.chmod(self._submit_script_path, 0o750) + return os.path.join(self.config.LOCAL_ASLOG_DIR, os.path.basename(self._submit_script_path)) + + def submit_job(self, job, script_name, hold=False, export="none"): + """ + Submit a job from a given job object. 
+ + :param export: + :param job: job object + :type job: autosubmit.job.job.Job + :param script_name: job script's name + :rtype scriptname: str + :param hold: send job hold + :type hold: boolean + :return: job id for the submitted job + :rtype: int + """ + if job is None or not job: + x11 = False + else: + x11 = job.x11 + if not x11: + self.get_submit_cmd(script_name, job, hold=hold, export=export) + return None + else: + cmd = self.get_submit_cmd(script_name, job, hold=hold, export=export) + if cmd is None: + return None + if self.send_command(cmd,x11=x11): + job_id = self.get_submitted_job_id(self.get_ssh_output(),x11=x11) + Log.debug("Job ID: {0}", job_id) + return int(job_id) + else: + return None + + def submit_Script(self, hold=False): + # type: (bool) -> Union[List[str], str] + """ + Sends a Submit file Script, execute it in the platform and retrieves the Jobs_ID of all jobs at once. + + :param hold: if True, the job will be held + :type hold: bool + :return: job id for submitted jobs + :rtype: list(str) + """ + try: + self.send_file(self.get_submit_script(), False) + cmd = os.path.join(self.get_files_path(), + os.path.basename(self._submit_script_path)) + try: + self.send_command(cmd) + except AutosubmitError as e: + raise + except AutosubmitCritical as e: + raise + except Exception as e: + raise + jobs_id = self.get_submitted_job_id(self.get_ssh_output()) + return jobs_id + except IOError as e: + raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, str(e)) + except AutosubmitError as e: + raise + except AutosubmitCritical as e: + raise + except Exception as e: + raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, str(e)) + def check_remote_log_dir(self): + """ + Creates log dir on remote host + """ + + try: + # Test if remote_path exists + self._ftpChannel.chdir(self.remote_log_dir) + except IOError as e: + try: + if self.send_command(self.get_mkdir_cmd()): + Log.debug('{0} has 
been created on {1} .', + self.remote_log_dir, self.host) + else: + raise AutosubmitError("SFTP session not active ", 6007, "Could not create the DIR {0} on HPC {1}'.format(self.remote_log_dir, self.host)".format( + self.remote_log_dir, self.host)) + except BaseException as e: + raise AutosubmitError( + "SFTP session not active ", 6007, str(e)) + + def update_cmds(self): + """ + Updates commands for platforms + """ + self.root_dir = os.path.join( + self.scratch, self.project_dir, self.user, self.expid) + self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) + self.cancel_cmd = "scancel" + self._checkhost_cmd = "echo 1" + self._submit_cmd = 'sbatch -D {1} {1}/'.format( + self.host, self.remote_log_dir) + self._submit_command_name = "sbatch" + self._submit_hold_cmd = 'sbatch -H -D {1} {1}/'.format( + self.host, self.remote_log_dir) + # jobid =$(sbatch WOA_run_mn4.sh 2 > & 1 | grep -o "[0-9]*"); scontrol hold $jobid; + self.put_cmd = "scp" + self.get_cmd = "scp" + self.mkdir_cmd = "mkdir -p " + self.remote_log_dir + + def hold_job(self, job): + try: + cmd = "scontrol release {0} ; sleep 2 ; scontrol hold {0} ".format(job.id) + self.send_command(cmd) + job_status = self.check_job(job, submit_hold_check=True) + if job_status == Status.RUNNING: + self.send_command("scancel {0}".format(job.id)) + return False + elif job_status == Status.FAILED: + return False + cmd = self.get_queue_status_cmd(job.id) + self.send_command(cmd) + + queue_status = self._ssh_output + reason = str() + reason = self.parse_queue_reason(queue_status, job.id) + if reason == '(JobHeldUser)': + return True + else: + self.send_command("scancel {0}".format(job.id)) + return False + except BaseException as e: + try: + self.send_command("scancel {0}".format(job.id)) + raise AutosubmitError( + "Can't hold jobid:{0}, canceling job".format(job.id), 6000, str(e)) + except BaseException as e: + raise AutosubmitError( + "Can't cancel the jobid: {0}".format(job.id), 6000, str(e)) + except 
AutosubmitError as e: + raise + + def get_checkhost_cmd(self): + return self._checkhost_cmd + + def get_mkdir_cmd(self): + return self.mkdir_cmd + + def get_remote_log_dir(self): + return self.remote_log_dir + + def parse_job_output(self, output): + return output.strip().split(' ')[0].strip() + + def parse_job_finish_data(self, output, packed): + """Parses the context of the sacct query to SLURM for a single job. + Only normal jobs return submit, start, finish, joules, ncpus, nnodes. + + When a wrapper has finished, capture finish time. + + :param output: The sacct output + :type output: str + :param packed: true if job belongs to package + :type packed: bool + :return: submit, start, finish, joules, ncpus, nnodes, detailed_data + :rtype: int, int, int, int, int, int, json object (str) + """ + try: + # Setting up: Storing detail for posterity + detailed_data = dict() + steps = [] + # No blank spaces after or before + output = output.strip() if output else None + lines = output.split("\n") if output else [] + is_end_of_wrapper = False + extra_data = None + # If there is output, list exists + if len(lines) > 0: + # Collecting information from all output + for line in lines: + line = line.strip().split() + if len(line) > 0: + # Collecting detailed data + name = str(line[0]) + if packed: + # If it belongs to a wrapper + extra_data = {"ncpus": str(line[2] if len(line) > 2 else "NA"), + "nnodes": str(line[3] if len(line) > 3 else "NA"), + "submit": str(line[4] if len(line) > 4 else "NA"), + "start": str(line[5] if len(line) > 5 else "NA"), + "finish": str(line[6] if len(line) > 6 else "NA"), + "energy": str(line[7] if len(line) > 7 else "NA"), + "MaxRSS": str(line[8] if len(line) > 8 else "NA"), + "AveRSS": str(line[9] if len(line) > 9 else "NA")} + else: + # Normal job + extra_data = {"submit": str(line[4] if len(line) > 4 else "NA"), + "start": str(line[5] if len(line) > 5 else "NA"), + "finish": str(line[6] if len(line) > 6 else "NA"), + "energy": str(line[7] if 
len(line) > 7 else "NA"), + "MaxRSS": str(line[8] if len(line) > 8 else "NA"), + "AveRSS": str(line[9] if len(line) > 9 else "NA")} + # Detailed data will contain the important information from output + detailed_data[name] = extra_data + steps.append(name) + submit = start = finish = energy = nnodes = ncpus = 0 + status = "UNKNOWN" + # Take first line as source + line = lines[0].strip().split() + ncpus = int(line[2] if len(line) > 2 else 0) + nnodes = int(line[3] if len(line) > 3 else 0) + status = str(line[1]) + if packed is False: + # If it is not wrapper job, take first line as source + if status not in ["COMPLETED", "FAILED", "UNKNOWN"]: + # It is not completed, then its error and send default data plus output + return 0, 0, 0, 0, ncpus, nnodes, detailed_data, False + else: + # If it is a wrapped job + # Check if the wrapper has finished + if status in ["COMPLETED", "FAILED", "UNKNOWN"]: + # Wrapper has finished + is_end_of_wrapper = True + # Continue with first line as source + if line: + try: + # Parse submit and start only for normal jobs (not packed) + submit = int(mktime(datetime.strptime( + line[4], "%Y-%m-%dT%H:%M:%S").timetuple())) if not packed else 0 + start = int(mktime(datetime.strptime( + line[5], "%Y-%m-%dT%H:%M:%S").timetuple())) if not packed else 0 + # Assuming the job has been COMPLETED + # If normal job or end of wrapper => Try to get the finish time from the first line of the output, else default to now. 
+ finish = 0 + + if not packed: + # If normal job, take finish time from first line + finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( + ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) + energy = parse_output_number(line[7]) if len( + line) > 7 and len(line[7]) > 0 else 0 + else: + # If it is a wrapper job + # If end of wrapper, take data from first line + if is_end_of_wrapper is True: + finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( + ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) + energy = parse_output_number(line[7]) if len( + line) > 7 and len(line[7]) > 0 else 0 + else: + # If packed but not end of wrapper, try to get info from current data. + if "finish" in list(extra_data.keys()) and extra_data["finish"] != "Unknown": + # finish data exists + finish = int(mktime(datetime.strptime( + extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple())) + else: + # if finish date does not exist, query previous step. 
+ if len(steps) >= 2 and detailed_data.__contains__(steps[-2]): + new_extra_data = detailed_data[steps[-2]] + if "finish" in list(new_extra_data.keys()) and new_extra_data["finish"] != "Unknown": + # This might result in a job finish < start, need to handle that in the caller function + finish = int(mktime(datetime.strptime( + new_extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple())) + else: + finish = int(time()) + else: + finish = int(time()) + if "energy" in list(extra_data.keys()) and extra_data["energy"] != "NA": + # energy exists + energy = parse_output_number( + extra_data["energy"]) + else: + # if energy does not exist, query previous step + if len(steps) >= 2 and detailed_data.__contains__(steps[-2]): + new_extra_data = detailed_data[steps[-2]] + if "energy" in list(new_extra_data.keys()) and new_extra_data["energy"] != "NA": + energy = parse_output_number( + new_extra_data["energy"]) + else: + energy = 0 + else: + energy = 0 + except Exception as exp: + # print(line) + # Log.info(traceback.format_exc()) + Log.info( + "Parsing mishandling.") + # joules = -1 + pass + + detailed_data = detailed_data if not packed or is_end_of_wrapper is True else extra_data + return submit, start, finish, energy, ncpus, nnodes, detailed_data, is_end_of_wrapper + + return 0, 0, 0, 0, 0, 0, dict(), False + except Exception as exp: + Log.warning( + "Autosubmit couldn't parse SLURM energy output. From parse_job_finish_data: {0}".format(str(exp))) + return 0, 0, 0, 0, 0, 0, dict(), False + + def parse_Alljobs_output(self, output, job_id): + status = "" + try: + status = [x.split()[1] for x in output.splitlines() + if x.split()[0] == str(job_id)] + except BaseException as e: + pass + if len(status) == 0: + return status + return status[0] + + def get_submitted_job_id(self, outputlines, x11 = False): + try: + if outputlines.find("failed") != -1: + raise AutosubmitCritical( + "Submission failed. 
Command Failed", 7014) + jobs_id = [] + for output in outputlines.splitlines(): + jobs_id.append(int(output.split(' ')[3])) + if x11 == "true": + return jobs_id[0] + else: + return jobs_id + except IndexError: + raise AutosubmitCritical( + "Submission failed. There are issues on your config file", 7014) + + def jobs_in_queue(self): + dom = parseString('') + jobs_xml = dom.getElementsByTagName("JB_job_number") + return [int(element.firstChild.nodeValue) for element in jobs_xml] + + def get_submit_cmd(self, job_script, job, hold=False, export=""): + if (export is None or export.lower() == "none") or len(export) == 0: + export = "" + else: + export += " ; " + if job is None or not job: + x11 = False + else: + x11 = job.x11 + + if x11 == "true": + if not hold: + return export + self._submit_cmd + job_script + else: + return export + self._submit_hold_cmd + job_script + else: + try: + lang = locale.getlocale()[1] + if lang is None: + lang = locale.getdefaultlocale()[1] + if lang is None: + lang = 'UTF-8' + if not hold: + self._submit_script_file.write((export + self._submit_cmd + job_script + "\n").encode(lang)) + else: + self._submit_script_file.write((export + self._submit_hold_cmd + job_script + "\n").encode(lang)) + except BaseException as e: + pass + + def get_checkjob_cmd(self, job_id): + return 'sacct -n -X --jobs {1} -o "State"'.format(self.host, job_id) + + def get_checkAlljobs_cmd(self, jobs_id): + return "sacct -n -X --jobs {1} -o jobid,State".format(self.host, jobs_id) + + def get_queue_status_cmd(self, job_id): + return 'squeue -j {0} -o %A,%R'.format(job_id) + + def get_jobid_by_jobname_cmd(self, job_name): + return 'squeue -o %A,%.50j -n {0}'.format(job_name) + + + def cancel_job(self, job_id): + return 'scancel {0}'.format(job_id) + + def get_job_energy_cmd(self, job_id): + return 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25'.format(job_id) + + def parse_queue_reason(self, output, job_id): + 
reason = [x.split(',')[1] for x in output.splitlines() + if x.split(',')[0] == str(job_id)] + if len(reason) > 0: + return reason[0] + return reason + + @staticmethod + def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads", partition=""): + if method == 'srun': + language = "#!/bin/bash" + return \ + language + """ +############################################################################### +# {0} +############################################################################### +# +#SBATCH -J {0} +{1} +{8} +#SBATCH -A {2} +#SBATCH --output={0}.out +#SBATCH --error={0}.err +#SBATCH -t {3}:00 +#SBATCH -n {4} +#SBATCH --cpus-per-task={7} +{5} +{6} + +# +############################################################################### + """.format(filename, queue, project, wallclock, num_procs, dependency, + '\n'.ljust(13).join(str(s) for s in directives), threads,partition) + else: + language = "#!/usr/bin/env python3" + return \ + language + """ +############################################################################### +# {0} +############################################################################### +# +#SBATCH -J {0} +{1} +{8} +#SBATCH -A {2} +#SBATCH --output={0}.out +#SBATCH --error={0}.err +#SBATCH -t {3}:00 +#SBATCH --cpus-per-task={7} +#SBATCH -n {4} +{5} +{6} +# +############################################################################### + """.format(filename, queue, project, wallclock, num_procs, dependency, + '\n'.ljust(13).join(str(s) for s in directives), threads,partition) + + @staticmethod + def allocated_nodes(): + return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(node_id))""" + + def check_file_exists(self, filename,wrapper_failed=False): + file_exist = False + sleeptime = 5 + retries = 0 + max_retries = 3 + while not file_exist and retries < max_retries: + try: + # This return IOError if path doesn't exist + 
self._ftpChannel.stat(os.path.join( + self.get_files_path(), filename)) + file_exist = True + except IOError as e: # File doesn't exist, retry in sleeptime + Log.debug("{2} File still no exists.. waiting {0}s for a new retry ( retries left: {1})", sleeptime, + max_retries - retries, os.path.join(self.get_files_path(), filename)) + if not wrapper_failed: + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 + else: + retries = 9999 + except BaseException as e: # Unrecoverable error + if str(e).lower().find("garbage") != -1: + if not wrapper_failed: + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 + else: + Log.printlog("remote logs {0} couldn't be recovered".format(filename), 6001) + file_exist = False # won't exist + retries = 999 # no more retries + return file_exist diff --git a/autosubmit/platforms/wrappers/wrapper_factory.py b/autosubmit/platforms/wrappers/wrapper_factory.py index d0690791a..d28187ac9 100644 --- a/autosubmit/platforms/wrappers/wrapper_factory.py +++ b/autosubmit/platforms/wrappers/wrapper_factory.py @@ -141,3 +141,41 @@ class EcWrapperFactory(WrapperFactory): def dependency_directive(self, dependency): return '#PBS -v depend=afterok:{0}'.format(dependency) + +class PJMWrapperFactory(WrapperFactory): + + def vertical_wrapper(self, **kwargs): + return PythonVerticalWrapperBuilder(**kwargs) + + def horizontal_wrapper(self, **kwargs): + + if kwargs["method"] == 'srun': + return SrunHorizontalWrapperBuilder(**kwargs) + else: + return PythonHorizontalWrapperBuilder(**kwargs) + + def hybrid_wrapper_horizontal_vertical(self, **kwargs): + return PythonHorizontalVerticalWrapperBuilder(**kwargs) + + def hybrid_wrapper_vertical_horizontal(self, **kwargs): + if kwargs["method"] == 'srun': + return SrunVerticalHorizontalWrapperBuilder(**kwargs) + else: + return PythonVerticalHorizontalWrapperBuilder(**kwargs) + + def header_directives(self, **kwargs): + return self.platform.wrapper_header(kwargs['name'], 
kwargs['queue'], kwargs['project'], kwargs['wallclock'], + kwargs['num_processors'], kwargs['dependency'], kwargs['directives'],kwargs['threads'],kwargs['method'],kwargs['partition']) + + def allocated_nodes(self): + return self.platform.allocated_nodes() + + #def dependency_directive(self, dependency): + # # There is no option for afterok in the PJM scheduler, but I think it is not needed. + # return '#PJM --dependency=afterok:{0}'.format(dependency) + + def queue_directive(self, queue): + return '#PJM --qos={0}'.format(queue) + + def partition_directive(self, partition): + return '#PJM --partition={0}'.format(partition) -- GitLab From fdbb9bcbe3e00cde641e8ee3fa196a567ed1e9b6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 8 Mar 2023 12:47:59 +0100 Subject: [PATCH 165/213] Added test (cherry picked from commit e9ee1b1a58ff186c380608236e4ce30021fb277c) --- autosubmit/platforms/paramiko_submitter.py | 4 + autosubmit/platforms/pjmplatform.py | 35 +++++-- .../platforms/wrappers/wrapper_factory.py | 4 +- test/unit/files/fake-jobs.yml | 36 +++++++ test/unit/files/fake-platforms.yml | 75 +++++++++++++++ test/unit/test_pjm.py | 96 +++++++++++++++++++ 6 files changed, 238 insertions(+), 12 deletions(-) create mode 100755 test/unit/files/fake-jobs.yml create mode 100644 test/unit/files/fake-platforms.yml create mode 100644 test/unit/test_pjm.py diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 7330f6c0c..d4df1c036 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -31,6 +31,7 @@ from autosubmit.platforms.pbsplatform import PBSPlatform from autosubmit.platforms.sgeplatform import SgePlatform from autosubmit.platforms.ecplatform import EcPlatform from autosubmit.platforms.slurmplatform import SlurmPlatform +from autosubmit.platforms.pjmplatform import PJMPlatform from autosubmit.platforms.locplatform import LocalPlatform from 
autosubmit.platforms.paramiko_platform import ParamikoPlatformException @@ -134,6 +135,9 @@ class ParamikoSubmitter(Submitter): elif platform_type == 'slurm': remote_platform = SlurmPlatform( asconf.expid, section.lower(), BasicConfig) + elif platform_type == 'pjm': + remote_platform = PJMPlatform( + asconf.expid, section, BasicConfig) else: raise Exception( "Queue type not specified on platform {0}".format(section)) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 75f2e48a2..38e8b1b4e 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -18,6 +18,7 @@ # along with Autosubmit. If not, see . import locale import os +import re from time import sleep from time import mktime from time import time @@ -54,18 +55,16 @@ class PJMPlatform(ParamikoPlatform): self._checkhost_cmd = None self.cancel_cmd = None self._header = PJMHeader() - self._wrapper = SlurmWrapperFactory(self) - + self._wrapper = PJMWrapperFactory(self) #https://software.fujitsu.com/jp/manual/manualfiles/m220008/j2ul2452/02enz007/j2ul-2452-02enz0.pdf pagina 16 self.job_status = dict() - self.job_status['COMPLETED'] = ['COMPLETED'] - self.job_status['RUNNING'] = ['RUNNING'] - self.job_status['QUEUING'] = ['PENDING', 'CONFIGURING', 'RESIZING'] - self.job_status['FAILED'] = ['FAILED', 'CANCELLED', 'CANCELLED+', 'NODE_FAIL', - 'PREEMPTED', 'SUSPENDED', 'TIMEOUT', 'OUT_OF_MEMORY', 'OUT_OF_ME+', 'OUT_OF_ME'] + self.job_status['COMPLETED'] = ['EXT'] + self.job_status['RUNNING'] = ['RNO','RNE','RUN'] + self.job_status['QUEUING'] = ['ACC','QUE', 'RNA', 'RNP','HLD'] # NOT SURE ABOUT HOLD HLD + self.job_status['FAILED'] = ['ERR','CCL','RJT'] self._pathdir = "\$HOME/LOG_" + self.expid self._allow_arrays = False - self._allow_wrappers = True + self._allow_wrappers = True # NOT SURE IF WE NEED WRAPPERS self.update_cmds() self.config = config exp_id_path = os.path.join(config.LOCAL_ROOT_DIR, self.expid) @@ -74,6 +73,15 @@ class 
PJMPlatform(ParamikoPlatform): tmp_path, config.LOCAL_ASLOG_DIR, "submit_" + self.name + ".sh") self._submit_script_file = open(self._submit_script_path, 'wb').close() + def submit_error(self,output): + # Returns true if the output of the submit command indicates an error, false otherwise + if output.lower().find("pjsub".lower()) != -1 and output.lower().find("[INFO] PJM 0000".lower()) != -1: + return False + else: + return True + + + def process_batch_ready_jobs(self,valid_packages_to_submit,failed_packages,error_message="",hold=False): """ Retrieve multiple jobs identifiers. @@ -99,10 +107,14 @@ class PJMPlatform(ParamikoPlatform): jobs_id = None self.connected = False if e.trace is not None: - has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 + has_trace_bad_parameters = self.submit_error(e.trace) else: has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: + if e.message is not None: + has_message_bad_parameters = self.submit_error(e.message) + else: + has_message_bad_parameters = False + if has_trace_bad_parameters or has_message_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -143,6 +155,9 @@ class PJMPlatform(ParamikoPlatform): package.jobs[0].platform.send_command(cmd) queue_status = package.jobs[0].platform._ssh_output reason = package.jobs[0].platform.parse_queue_reason(queue_status, jobs_id[i]) + # pjstat -H shows only COMPLETED,FAILED + # pjstat shows only 
QUEUING,RUNNING + # [bsc32070@armlogin01 ~]$ pjstat -H --filter "jid=167661+167662" if reason == '(JobHeldAdmin)': can_continue = False elif reason == '(JobHeldUser)': diff --git a/autosubmit/platforms/wrappers/wrapper_factory.py b/autosubmit/platforms/wrappers/wrapper_factory.py index d28187ac9..707c2502a 100644 --- a/autosubmit/platforms/wrappers/wrapper_factory.py +++ b/autosubmit/platforms/wrappers/wrapper_factory.py @@ -175,7 +175,7 @@ class PJMWrapperFactory(WrapperFactory): # return '#PJM --dependency=afterok:{0}'.format(dependency) def queue_directive(self, queue): - return '#PJM --qos={0}'.format(queue) + return '#PJM -L rscgrp={0}'.format(queue) def partition_directive(self, partition): - return '#PJM --partition={0}'.format(partition) + return '#PJM -g {0}'.format(partition) diff --git a/test/unit/files/fake-jobs.yml b/test/unit/files/fake-jobs.yml new file mode 100755 index 000000000..93c5a55dc --- /dev/null +++ b/test/unit/files/fake-jobs.yml @@ -0,0 +1,36 @@ +JOBS: + LOCAL_SETUP: + FILE: LOCAL_SETUP.sh + PLATFORM: LOCAL + RUNNING: once + REMOTE_SETUP: + FILE: REMOTE_SETUP.sh + DEPENDENCIES: LOCAL_SETUP + WALLCLOCK: '00:05' + RUNNING: once + INI: + FILE: INI.sh + DEPENDENCIES: REMOTE_SETUP + RUNNING: member + WALLCLOCK: '00:05' + SIM: + FILE: SIM.sh + DEPENDENCIES: INI SIM-1 + RUNNING: chunk + WALLCLOCK: '00:05' + POST: + FILE: POST.sh + DEPENDENCIES: SIM + RUNNING: once + WALLCLOCK: '00:05' + CLEAN: + FILE: CLEAN.sh + DEPENDENCIES: POST + RUNNING: once + WALLCLOCK: '00:05' + TRANSFER: + FILE: TRANSFER.sh + PLATFORM: LOCAL + DEPENDENCIES: CLEAN + RUNNING: member + diff --git a/test/unit/files/fake-platforms.yml b/test/unit/files/fake-platforms.yml new file mode 100644 index 000000000..ba5810c3f --- /dev/null +++ b/test/unit/files/fake-platforms.yml @@ -0,0 +1,75 @@ +PLATFORMS: + ARM: + TYPE: pjm + HOST: armlogin1.bsc.es,armlogin2.bsc.es,armlogin3.bsc.es + PROJECT: bsc32 + USER: bsc32070 + SCRATCH_DIR: /scratch + ADD_PROJECT_TO_HOST: 'False' + 
MAX_WALLCLOCK: 48:00 + MAX_PROCESSORS: '2400' + PROCESSORS_PER_NODE: '48' + SERIAL_QUEUE: small + QUEUE: small + MARENOSTRUM4: + TYPE: slurm + HOST: mn1.bsc.es,mn2.bsc.es,mn3.bsc.es + PROJECT: bsc32 + USER: bsc32070 + SCRATCH_DIR: /gpfs/scratch + ADD_PROJECT_TO_HOST: 'False' + MAX_WALLCLOCK: 48:00 + MAX_PROCESSORS: '2400' + PROCESSORS_PER_NODE: '48' + SERIAL_QUEUE: debug + QUEUE: debug + MARENOSTRUM_ARCHIVE: + TYPE: ps + HOST: dt01.bsc.es + PROJECT: bsc32 + USER: bsc32070 + SCRATCH_DIR: /gpfs/scratch + ADD_PROJECT_TO_HOST: 'False' + TEST_SUITE: 'False' + POWER9: + TYPE: slurm + HOST: plogin1.bsc.es + PROJECT: bsc32 + USER: bsc32070 + SCRATCH_DIR: /gpfs/scratch + ADD_PROJECT_TO_HOST: 'False' + TEST_SUITE: 'False' + SERIAL_QUEUE: debug + QUEUE: debug + NORD3: + TYPE: lsf + HOST: nord1.bsc.es + PROJECT: bsc32 + USER: bsc32070 + ADD_PROJECT_TO_HOST: 'False' + SCRATCH_DIR: /gpfs/scratch + TEST_SUITE: 'False' + MAX_WALLCLOCK: 48:00 + MAX_PROCESSORS: '1024' + PROCESSORS_PER_NODE: '16' + TRANSFER_NODE: + TYPE: ps + HOST: dt01.bsc.es + PROJECT: bsc32 + USER: bsc32070 + ADD_PROJECT_TO_HOST: 'false' + SCRATCH_DIR: /gpfs/scratch + TRANSFER_NODE_BSCEARTH000: + TYPE: ps + HOST: bscearth000 + USER: dbeltran + PROJECT: Earth + ADD_PROJECT_TO_HOST: 'false' + QUEUE: serial + SCRATCH_DIR: /esarchive/scratch + BSCEARTH000: + TYPE: ps + HOST: bscearth000 + PROJECT: Earth + USER: dbeltran + SCRATCH_DIR: /esarchive/scratch \ No newline at end of file diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py new file mode 100644 index 000000000..ecef47439 --- /dev/null +++ b/test/unit/test_pjm.py @@ -0,0 +1,96 @@ +from unittest import TestCase +from unittest.mock import Mock,MagicMock, patch +from autosubmitconfigparser.config.configcommon import AutosubmitConfig +from autosubmitconfigparser.config.yamlparser import YAMLParserFactory +from autosubmit.autosubmit import Autosubmit +import autosubmit.platforms.pjmplatform + +from pathlib import Path +from autosubmit.platforms.platform 
import Platform +from autosubmit.platforms.pjmplatform import PJMPlatform +import autosubmit.platforms.headers.pjm_header +from tempfile import TemporaryDirectory +from datetime import datetime +from autosubmit.job.job import Job, Status + +class FakeBasicConfig: + DB_DIR = '/dummy/db/dir' + DB_FILE = '/dummy/db/file' + DB_PATH = '/dummy/db/path' + LOCAL_ROOT_DIR = '/dummy/local/root/dir' + LOCAL_TMP_DIR = '/dummy/local/temp/dir' + LOCAL_PROJ_DIR = '/dummy/local/proj/dir' + LOCAL_ASLOG_DIR = '/dummy/local/aslog/dir' + DEFAULT_PLATFORMS_CONF = '' + DEFAULT_JOBS_CONF = '' + @staticmethod + def read(): + return +class TestPJM(TestCase): + + def setUp(self) -> None: + self.exp_id = 'a000' + self.as_conf = MagicMock() + self.as_conf = AutosubmitConfig(self.exp_id, FakeBasicConfig, YAMLParserFactory()) + self.as_conf.experiment_data = dict() + self.as_conf.experiment_data["DEFAULT"] = dict() + self.as_conf.experiment_data["DEFAULT"]["HPCARCH"] = "ARM" + yml_file = Path("files/fake-jobs.yml") + yml_file.exists() + factory = YAMLParserFactory() + parser = factory.create_parser() + parser.data = parser.load(yml_file) + self.as_conf.experiment_data.update(parser.data) + yml_file = Path("files/fake-platforms.yml") + yml_file.exists() + factory = YAMLParserFactory() + parser = factory.create_parser() + parser.data = parser.load(yml_file) + self.as_conf.experiment_data.update(parser.data) + self.setUp_pjm() + + + @patch("builtins.open",MagicMock()) + def setUp_pjm(self): + MagicMock().write = MagicMock() + MagicMock().os.path.join = MagicMock() + self.section = 'ARM' + self.submitted_ok = "[INFO] PJM 0000 pjsub Job 167661 submitted." + self.submitted_fail = "[ERR.] PJM 0057 pjsub node=32 is greater than the upper limit (24)." 
+ self.out= """JOB_ID JOB_NAME MD ST USER GROUP START_DATE ELAPSE_TIM ELAPSE_LIM NODE_REQUIRE VNODE CORE V_MEM V_POL E_POL RANK LST EC PC SN PRI ACCEPT RSC_GRP REASON +167687 test NM ACC bsc32070 bsc32 03/08 11:41:07 0000:00:01 0000:01:00 1 - - - - - bychip ACC 0 0 0 127 03/08 11:41:04 small - +167688 test NM RUN bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:05 small - +167689 test NM RNO bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNE 0 0 0 127 03/08 11:41:05 small - +167690 test NM RNA bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RUN 0 0 0 127 03/08 11:41:06 small - +167691 test NM RNP bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNA 0 0 0 127 03/08 11:41:06 small - +167692 test NM HLD bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNP 0 0 0 127 03/08 11:41:06 small - """ + self.queued_jobs = ["167687","167690","167691","167692"] + self.running_jobs = ["167688","167689"] + self.out_h="""JOB_ID JOB_NAME MD ST USER GROUP START_DATE ELAPSE_TIM ELAPSE_LIM NODE_REQUIRE VNODE CORE V_MEM V_POL E_POL RANK LST EC PC SN PRI ACCEPT RSC_GRP REASON +167648 STDIN NM EXT bsc32070 bsc32 03/06 12:14:00 0000:00:02 0001:00:00 1 - - - - - bychip RNO 0 0 0 127 03/06 12:13:57 def_grp - +167661 test NM ERR bsc32070 bsc32 03/06 13:55:02 0000:00:02 0000:01:00 1 - - - - - bychip RNO 127 0 0 127 03/06 13:54:59 small - +167662 test NM CCL bsc32070 bsc32 03/06 14:25:30 0000:00:02 0000:01:00 1 - - - - - bychip RNO 127 0 0 127 03/06 14:25:27 small - +167663 test NM RJT bsc32070 bsc32 03/06 14:25:54 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/06 14:25:52 small - +167677 test NM EXT bsc32070 bsc32 03/07 16:39:54 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/07 16:39:50 small - +167678 test NM EXT bsc32070 bsc32 03/07 16:39:57 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/07 16:39:53 small - +167683 test NM EXT 
bsc32070 bsc32 03/08 11:39:45 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:39:41 small - +167687 test NM EXT bsc32070 bsc32 03/08 11:41:07 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:04 small - +167688 test NM EXT bsc32070 bsc32 03/08 11:41:08 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:05 small - +167689 test NM EXT bsc32070 bsc32 03/08 11:41:09 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:05 small - +167690 test NM EXT bsc32070 bsc32 03/08 11:41:10 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:06 small - +167691 test NM EXT bsc32070 bsc32 03/08 11:41:10 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:06 small - +167692 test NM EXT bsc32070 bsc32 03/08 11:41:10 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:06 small - """ + + self.completed_jobs = ["167677", "167678", "167683", "167687", "167688", "167689", "167690", "167691", "167692"] + self.failed_jobs = ["167661", "167662", "167663"] + self.submitter = Autosubmit._get_submitter(self.as_conf) + self.submitter.load_platforms(self.as_conf) + self.remote_platform = self.submitter.platforms[self.section] + + def test_parse_queue_reason(self): + """Test parsing of queue reason.""" + output = self.remote_platform.parse_queue_reason(self.out_h, self.completed_jobs) + self.assertEqual(output, "") + + + -- GitLab From 229c62bcfe7c8679d3d2855ed7f487d5fc72ea33 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 9 Mar 2023 09:05:41 +0100 Subject: [PATCH 166/213] Fix report, improved the report command (cherry picked from commit 4831fc1ce5c0c823b48ae34d06f8be2ba4bcca04) --- autosubmit/autosubmit.py | 2 +- autosubmit/platforms/headers/pjm_header.py | 4 +- autosubmit/platforms/paramiko_platform.py | 4 +- autosubmit/platforms/pjmplatform.py | 248 ++------------------- test/unit/test_pjm.py | 3 +- 5 files changed, 28 insertions(+), 233 deletions(-) diff --git 
a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index fdbed29f8..c50bababa 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2197,7 +2197,7 @@ class Autosubmit: raise except Exception as e: raise - if platform.type == "slurm" and not inspect and not only_wrappers: # return to == + if platform.type == "slurm" or platform.type == "pjm" and not inspect and not only_wrappers: try: valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] if len(valid_packages_to_submit) > 0: diff --git a/autosubmit/platforms/headers/pjm_header.py b/autosubmit/platforms/headers/pjm_header.py index 8ce9a5849..c8d876ea9 100644 --- a/autosubmit/platforms/headers/pjm_header.py +++ b/autosubmit/platforms/headers/pjm_header.py @@ -137,8 +137,8 @@ class PJMHeader(object): #%ACCOUNT_DIRECTIVE% #%MEMORY_DIRECTIVE% %CUSTOM_DIRECTIVES% -#PJM -o=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% -#PJM -e=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% +#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% #%X11% # ############################################################################### diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 08d088cc4..df06403ef 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -548,7 +548,7 @@ class ParamikoPlatform(Platform): job.new_status = job_status def _check_jobid_in_queue(self, ssh_output, job_list_cmd): - for job in job_list_cmd[:-1].split(','): + for job in job_list_cmd.split('+'): if job not in ssh_output: return False return True @@ -1202,7 +1202,7 @@ class 
ParamikoPlatform(Platform): return False - + def check_remote_log_dir(self): """ Creates log dir on remote host diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 38e8b1b4e..aeb32a22a 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -145,48 +145,12 @@ class PJMPlatform(ParamikoPlatform): sleep(10) for package in valid_packages_to_submit: - if hold: - retries = 5 - package.jobs[0].id = str(jobs_id[i]) - try: - can_continue = True - while can_continue and retries > 0: - cmd = package.jobs[0].platform.get_queue_status_cmd(jobs_id[i]) - package.jobs[0].platform.send_command(cmd) - queue_status = package.jobs[0].platform._ssh_output - reason = package.jobs[0].platform.parse_queue_reason(queue_status, jobs_id[i]) - # pjstat -H shows only COMPLETED,FAILED - # pjstat shows only QUEUING,RUNNING - # [bsc32070@armlogin01 ~]$ pjstat -H --filter "jid=167661+167662" - if reason == '(JobHeldAdmin)': - can_continue = False - elif reason == '(JobHeldUser)': - can_continue = True - else: - can_continue = False - sleep(5) - retries = retries - 1 - if not can_continue: - package.jobs[0].platform.send_command(package.jobs[0].platform.cancel_cmd + " {0}".format(jobs_id[i])) - i = i + 1 - continue - if not self.hold_job(package.jobs[0]): - i = i + 1 - continue - except Exception as e: - failed_packages.append(jobs_id) - continue for job in package.jobs: job.hold = hold job.id = str(jobs_id[i]) job.status = Status.SUBMITTED job.write_submit_time(hold=hold) i += 1 - if len(failed_packages) > 0: - for job_id in failed_packages: - package.jobs[0].platform.send_command( - package.jobs[0].platform.cancel_cmd + " {0}".format(job_id)) - raise AutosubmitError("{0} submission failed, some hold jobs failed to be held".format(self.name), 6015) save = True except WrongTemplateException as e: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( @@ -226,19 +190,9 @@ class 
PJMPlatform(ParamikoPlatform): x11 = False else: x11 = job.x11 - if not x11: - self.get_submit_cmd(script_name, job, hold=hold, export=export) - return None - else: - cmd = self.get_submit_cmd(script_name, job, hold=hold, export=export) - if cmd is None: - return None - if self.send_command(cmd,x11=x11): - job_id = self.get_submitted_job_id(self.get_ssh_output(),x11=x11) - Log.debug("Job ID: {0}", job_id) - return int(job_id) - else: - return None + self.get_submit_cmd(script_name, job, hold=hold, export=export) + return None + def submit_Script(self, hold=False): # type: (bool) -> Union[List[str], str] @@ -299,13 +253,11 @@ class PJMPlatform(ParamikoPlatform): self.root_dir = os.path.join( self.scratch, self.project_dir, self.user, self.expid) self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) - self.cancel_cmd = "scancel" + self.cancel_cmd = "pjdel" self._checkhost_cmd = "echo 1" - self._submit_cmd = 'sbatch -D {1} {1}/'.format( - self.host, self.remote_log_dir) - self._submit_command_name = "sbatch" - self._submit_hold_cmd = 'sbatch -H -D {1} {1}/'.format( - self.host, self.remote_log_dir) + self._submit_cmd = 'cd {0} ; pjsub -j '.format(self.remote_log_dir) + self._submit_command_name = "pjsub" + self._submit_hold_cmd = 'cd {0} ; pjsub -j '.format(self.remote_log_dir) # jobid =$(sbatch WOA_run_mn4.sh 2 > & 1 | grep -o "[0-9]*"); scontrol hold $jobid; self.put_cmd = "scp" self.get_cmd = "scp" @@ -313,28 +265,19 @@ class PJMPlatform(ParamikoPlatform): def hold_job(self, job): try: - cmd = "scontrol release {0} ; sleep 2 ; scontrol hold {0} ".format(job.id) + cmd = "scontrol pjhold {0} ; sleep 2 ; pjhold {0} ".format(job.id) self.send_command(cmd) job_status = self.check_job(job, submit_hold_check=True) if job_status == Status.RUNNING: - self.send_command("scancel {0}".format(job.id)) + self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) return False elif job_status == Status.FAILED: return False cmd = self.get_queue_status_cmd(job.id) 
self.send_command(cmd) - - queue_status = self._ssh_output - reason = str() - reason = self.parse_queue_reason(queue_status, job.id) - if reason == '(JobHeldUser)': - return True - else: - self.send_command("scancel {0}".format(job.id)) - return False except BaseException as e: try: - self.send_command("scancel {0}".format(job.id)) + self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) raise AutosubmitError( "Can't hold jobid:{0}, canceling job".format(job.id), 6000, str(e)) except BaseException as e: @@ -356,149 +299,8 @@ class PJMPlatform(ParamikoPlatform): return output.strip().split(' ')[0].strip() def parse_job_finish_data(self, output, packed): - """Parses the context of the sacct query to SLURM for a single job. - Only normal jobs return submit, start, finish, joules, ncpus, nnodes. + return 0, 0, 0, 0, 0, 0, dict(), False - When a wrapper has finished, capture finish time. - - :param output: The sacct output - :type output: str - :param packed: true if job belongs to package - :type packed: bool - :return: submit, start, finish, joules, ncpus, nnodes, detailed_data - :rtype: int, int, int, int, int, int, json object (str) - """ - try: - # Setting up: Storing detail for posterity - detailed_data = dict() - steps = [] - # No blank spaces after or before - output = output.strip() if output else None - lines = output.split("\n") if output else [] - is_end_of_wrapper = False - extra_data = None - # If there is output, list exists - if len(lines) > 0: - # Collecting information from all output - for line in lines: - line = line.strip().split() - if len(line) > 0: - # Collecting detailed data - name = str(line[0]) - if packed: - # If it belongs to a wrapper - extra_data = {"ncpus": str(line[2] if len(line) > 2 else "NA"), - "nnodes": str(line[3] if len(line) > 3 else "NA"), - "submit": str(line[4] if len(line) > 4 else "NA"), - "start": str(line[5] if len(line) > 5 else "NA"), - "finish": str(line[6] if len(line) > 6 else "NA"), - "energy": str(line[7] if 
len(line) > 7 else "NA"), - "MaxRSS": str(line[8] if len(line) > 8 else "NA"), - "AveRSS": str(line[9] if len(line) > 9 else "NA")} - else: - # Normal job - extra_data = {"submit": str(line[4] if len(line) > 4 else "NA"), - "start": str(line[5] if len(line) > 5 else "NA"), - "finish": str(line[6] if len(line) > 6 else "NA"), - "energy": str(line[7] if len(line) > 7 else "NA"), - "MaxRSS": str(line[8] if len(line) > 8 else "NA"), - "AveRSS": str(line[9] if len(line) > 9 else "NA")} - # Detailed data will contain the important information from output - detailed_data[name] = extra_data - steps.append(name) - submit = start = finish = energy = nnodes = ncpus = 0 - status = "UNKNOWN" - # Take first line as source - line = lines[0].strip().split() - ncpus = int(line[2] if len(line) > 2 else 0) - nnodes = int(line[3] if len(line) > 3 else 0) - status = str(line[1]) - if packed is False: - # If it is not wrapper job, take first line as source - if status not in ["COMPLETED", "FAILED", "UNKNOWN"]: - # It is not completed, then its error and send default data plus output - return 0, 0, 0, 0, ncpus, nnodes, detailed_data, False - else: - # If it is a wrapped job - # Check if the wrapper has finished - if status in ["COMPLETED", "FAILED", "UNKNOWN"]: - # Wrapper has finished - is_end_of_wrapper = True - # Continue with first line as source - if line: - try: - # Parse submit and start only for normal jobs (not packed) - submit = int(mktime(datetime.strptime( - line[4], "%Y-%m-%dT%H:%M:%S").timetuple())) if not packed else 0 - start = int(mktime(datetime.strptime( - line[5], "%Y-%m-%dT%H:%M:%S").timetuple())) if not packed else 0 - # Assuming the job has been COMPLETED - # If normal job or end of wrapper => Try to get the finish time from the first line of the output, else default to now. 
- finish = 0 - - if not packed: - # If normal job, take finish time from first line - finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( - ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) - energy = parse_output_number(line[7]) if len( - line) > 7 and len(line[7]) > 0 else 0 - else: - # If it is a wrapper job - # If end of wrapper, take data from first line - if is_end_of_wrapper is True: - finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( - ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) - energy = parse_output_number(line[7]) if len( - line) > 7 and len(line[7]) > 0 else 0 - else: - # If packed but not end of wrapper, try to get info from current data. - if "finish" in list(extra_data.keys()) and extra_data["finish"] != "Unknown": - # finish data exists - finish = int(mktime(datetime.strptime( - extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple())) - else: - # if finish date does not exist, query previous step. 
- if len(steps) >= 2 and detailed_data.__contains__(steps[-2]): - new_extra_data = detailed_data[steps[-2]] - if "finish" in list(new_extra_data.keys()) and new_extra_data["finish"] != "Unknown": - # This might result in a job finish < start, need to handle that in the caller function - finish = int(mktime(datetime.strptime( - new_extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple())) - else: - finish = int(time()) - else: - finish = int(time()) - if "energy" in list(extra_data.keys()) and extra_data["energy"] != "NA": - # energy exists - energy = parse_output_number( - extra_data["energy"]) - else: - # if energy does not exist, query previous step - if len(steps) >= 2 and detailed_data.__contains__(steps[-2]): - new_extra_data = detailed_data[steps[-2]] - if "energy" in list(new_extra_data.keys()) and new_extra_data["energy"] != "NA": - energy = parse_output_number( - new_extra_data["energy"]) - else: - energy = 0 - else: - energy = 0 - except Exception as exp: - # print(line) - # Log.info(traceback.format_exc()) - Log.info( - "Parsing mishandling.") - # joules = -1 - pass - - detailed_data = detailed_data if not packed or is_end_of_wrapper is True else extra_data - return submit, start, finish, energy, ncpus, nnodes, detailed_data, is_end_of_wrapper - - return 0, 0, 0, 0, 0, 0, dict(), False - except Exception as exp: - Log.warning( - "Autosubmit couldn't parse SLURM energy output. From parse_job_finish_data: {0}".format(str(exp))) - return 0, 0, 0, 0, 0, 0, dict(), False def parse_Alljobs_output(self, output, job_id): status = "" @@ -513,12 +315,10 @@ class PJMPlatform(ParamikoPlatform): def get_submitted_job_id(self, outputlines, x11 = False): try: - if outputlines.find("failed") != -1: - raise AutosubmitCritical( - "Submission failed. 
Command Failed", 7014) jobs_id = [] for output in outputlines.splitlines(): - jobs_id.append(int(output.split(' ')[3])) + if not self.submit_error(output): + jobs_id.append(int(output.split(' ')[5])) if x11 == "true": return jobs_id[0] else: @@ -527,11 +327,6 @@ class PJMPlatform(ParamikoPlatform): raise AutosubmitCritical( "Submission failed. There are issues on your config file", 7014) - def jobs_in_queue(self): - dom = parseString('') - jobs_xml = dom.getElementsByTagName("JB_job_number") - return [int(element.firstChild.nodeValue) for element in jobs_xml] - def get_submit_cmd(self, job_script, job, hold=False, export=""): if (export is None or export.lower() == "none") or len(export) == 0: export = "" @@ -561,24 +356,24 @@ class PJMPlatform(ParamikoPlatform): except BaseException as e: pass - def get_checkjob_cmd(self, job_id): - return 'sacct -n -X --jobs {1} -o "State"'.format(self.host, job_id) def get_checkAlljobs_cmd(self, jobs_id): - return "sacct -n -X --jobs {1} -o jobid,State".format(self.host, jobs_id) + # jobs_id = "jobid1+jobid2+jobid3" + # -H == sacct + return "pjstat -H -v --choose jid,st,ermsg --filter \"jid={0}\"".format(jobs_id) def get_queue_status_cmd(self, job_id): - return 'squeue -j {0} -o %A,%R'.format(job_id) + return 'pjstat -v --choose jid,st,ermsg {0}'.format(job_id) def get_jobid_by_jobname_cmd(self, job_name): - return 'squeue -o %A,%.50j -n {0}'.format(job_name) + return 'pjstat -v --choose jid,st,ermsg --filter \"jnam={0}\"'.format(job_name) def cancel_job(self, job_id): - return 'scancel {0}'.format(job_id) + return '{0} {1}'.format(self.cancel_cmd,job_id) - def get_job_energy_cmd(self, job_id): - return 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25'.format(job_id) + #def get_job_energy_cmd(self, job_id): + # return 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25'.format(job_id) def parse_queue_reason(self, output, 
job_id): reason = [x.split(',')[1] for x in output.splitlines() @@ -587,6 +382,7 @@ class PJMPlatform(ParamikoPlatform): return reason[0] return reason + # Wrapper todo @staticmethod def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads", partition=""): if method == 'srun': diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py index ecef47439..6310a0b0f 100644 --- a/test/unit/test_pjm.py +++ b/test/unit/test_pjm.py @@ -89,8 +89,7 @@ class TestPJM(TestCase): def test_parse_queue_reason(self): """Test parsing of queue reason.""" - output = self.remote_platform.parse_queue_reason(self.out_h, self.completed_jobs) - self.assertEqual(output, "") + output = self.remote_platform.parse_queue_reason(self.out_h, self.completed_jobs[0]) -- GitLab From aa4ca3d29e7295fe225d5729f51693c91bbbab5d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 10 Mar 2023 17:00:10 +0100 Subject: [PATCH 167/213] PJM fixed somethings, added wrapper, added some tests (cherry picked from commit 76aed3ddc02a7227de678c136bd337abe4fc6d2f) --- autosubmit/platforms/headers/pjm_header.py | 4 +- autosubmit/platforms/paramiko_platform.py | 68 +++++------- autosubmit/platforms/pjmplatform.py | 115 +++++++++++++++------ autosubmit/platforms/slurmplatform.py | 23 +++++ test/unit/test_pjm.py | 71 ++++++++----- 5 files changed, 180 insertions(+), 101 deletions(-) diff --git a/autosubmit/platforms/headers/pjm_header.py b/autosubmit/platforms/headers/pjm_header.py index c8d876ea9..e77dfdb03 100644 --- a/autosubmit/platforms/headers/pjm_header.py +++ b/autosubmit/platforms/headers/pjm_header.py @@ -158,8 +158,8 @@ class PJMHeader(object): #%ACCOUNT_DIRECTIVE% #%MEMORY_DIRECTIVE% #%MEMORY_PER_TASK_DIRECTIVE% -#PJM -o=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% -#PJM -e=%CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% 
+#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% %CUSTOM_DIRECTIVES% # ############################################################################### diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index df06403ef..4288cece3 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -548,12 +548,32 @@ class ParamikoPlatform(Platform): job.new_status = job_status def _check_jobid_in_queue(self, ssh_output, job_list_cmd): - for job in job_list_cmd.split('+'): + for job in job_list_cmd[:-1].split(','): if job not in ssh_output: return False return True + def parse_joblist(self, job_list): + """ + Convert a list of job_list to job_list_cmd + :param job_list: list of jobs + :type job_list: list + :param ssh_output: ssh output + :type ssh_output: str + :return: job status + :rtype: str + """ + job_list_cmd = "" + for job,job_prev_status in job_list: + if job.id is None: + job_str = "0" + else: + job_str = str(job.id) + job_list_cmd += job_str+"," + if job_list_cmd[-1] == ",": + job_list_cmd=job_list_cmd[:-1] - def check_Alljobs(self, job_list, job_list_cmd, remote_logs, retries=5): + return job_list_cmd + def check_Alljobs(self, job_list, as_conf, retries=5): """ Checks jobs running status @@ -565,8 +585,9 @@ class ParamikoPlatform(Platform): :return: current job status :rtype: autosubmit.job.job_common.Status """ - if job_list_cmd[-1] == ",": - job_list_cmd=job_list_cmd[:-1] + job_status = Status.UNKNOWN + remote_logs = as_conf.get_copy_remote_logs() + job_list_cmd = self.parse_joblist(job_list) cmd = self.get_checkAlljobs_cmd(job_list_cmd) sleep_time = 5 sleep(sleep_time) @@ -655,38 +676,7 @@ class ParamikoPlatform(Platform): Log.error( 'check_job() The job id ({0}) status is {1}.', 
job_id, job_status) job.new_status = job_status - - reason = str() - if self.type == 'slurm' and len(in_queue_jobs) > 0: - cmd = self.get_queue_status_cmd(list_queue_jobid) - self.send_command(cmd) - queue_status = self._ssh_output - for job in in_queue_jobs: - reason = self.parse_queue_reason(queue_status, job.id) - if job.queuing_reason_cancel(reason): - Log.error( - "Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) - self.send_command( - self.platform.cancel_cmd + " {0}".format(job.id)) - job.new_status = Status.FAILED - job.update_status(remote_logs) - return - elif reason == '(JobHeldUser)': - job.new_status = Status.HELD - if not job.hold: - # SHOULD BE MORE CLASS (GET_scontrol realease but not sure if this can be implemented on others PLATFORMS - self.send_command("scontrol release {0}".format(job.id)) - job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. - else: - pass - # This shouldn't happen anymore TODO delete - elif reason == '(JobHeldAdmin)': - Log.debug( - "Job {0} Failed to be HELD, canceling... 
", job.name) - job.new_status = Status.WAITING - job.platform.send_command( - job.platform.cancel_cmd + " {0}".format(job.id)) - + self.get_queue_status(in_queue_jobs,list_queue_jobid,as_conf) else: for job in job_list: job_status = Status.UNKNOWN @@ -719,12 +709,6 @@ class ParamikoPlatform(Platform): job_ids = [job_id.split(',')[0] for job_id in job_ids_names] return job_ids - - - - - - def get_checkjob_cmd(self, job_id): """ Returns command to check job status on remote platforms diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index aeb32a22a..5c1940b58 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -33,7 +33,7 @@ from autosubmit.platforms.paramiko_platform import ParamikoPlatform from autosubmit.platforms.headers.pjm_header import PJMHeader from autosubmit.platforms.wrappers.wrapper_factory import PJMWrapperFactory from log.log import AutosubmitCritical, AutosubmitError, Log - +import random class PJMPlatform(ParamikoPlatform): """ @@ -74,7 +74,7 @@ class PJMPlatform(ParamikoPlatform): self._submit_script_file = open(self._submit_script_path, 'wb').close() def submit_error(self,output): - # Returns true if the output of the submit command indicates an error, false otherwise + #returns false if the job submission message is not found if output.lower().find("pjsub".lower()) != -1 and output.lower().find("[INFO] PJM 0000".lower()) != -1: return False else: @@ -94,7 +94,6 @@ class PJMPlatform(ParamikoPlatform): try: valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] if len(valid_packages_to_submit) > 0: - package = valid_packages_to_submit[0] try: jobs_id = self.submit_Script(hold=hold) except AutosubmitError as e: @@ -253,19 +252,18 @@ class PJMPlatform(ParamikoPlatform): self.root_dir = os.path.join( self.scratch, self.project_dir, self.user, self.expid) self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) - 
self.cancel_cmd = "pjdel" + self.cancel_cmd = "pjdel " self._checkhost_cmd = "echo 1" - self._submit_cmd = 'cd {0} ; pjsub -j '.format(self.remote_log_dir) + self._submit_cmd = 'cd {0} ; pjsub '.format(self.remote_log_dir) self._submit_command_name = "pjsub" - self._submit_hold_cmd = 'cd {0} ; pjsub -j '.format(self.remote_log_dir) - # jobid =$(sbatch WOA_run_mn4.sh 2 > & 1 | grep -o "[0-9]*"); scontrol hold $jobid; + self._submit_hold_cmd = 'cd {0} ; pjsub '.format(self.remote_log_dir) self.put_cmd = "scp" self.get_cmd = "scp" self.mkdir_cmd = "mkdir -p " + self.remote_log_dir def hold_job(self, job): try: - cmd = "scontrol pjhold {0} ; sleep 2 ; pjhold {0} ".format(job.id) + cmd = "pjrls {0} ; sleep 2 ; pjhold -R ASHOLD {0}".format(job.id) self.send_command(cmd) job_status = self.check_job(job, submit_hold_check=True) if job_status == Status.RUNNING: @@ -296,12 +294,40 @@ class PJMPlatform(ParamikoPlatform): return self.remote_log_dir def parse_job_output(self, output): - return output.strip().split(' ')[0].strip() + return output.strip().split()[0].strip() def parse_job_finish_data(self, output, packed): return 0, 0, 0, 0, 0, 0, dict(), False - + def queuing_reason_cancel(self, reason): + try: + if len(reason.split('(', 1)) > 1: + reason = reason.split('(', 1)[1].split(')')[0] + if 'Invalid' in reason or reason in ['ANOTHER JOB STARTED','DELAY','DEADLINE SCHEDULE STARTED','ELAPSE LIMIT EXCEEDED','FILE IO ERROR','GATE CHECK','IMPOSSIBLE SCHED','INSUFF CPU','INSUFF MEMORY','INSUFF NODE','INSUFF','INTERNAL ERROR','INVALID HOSTFILE','LIMIT OVER MEMORY','LOST COMM','NO CURRENT DIR','NOT EXIST','RSCGRP NOT EXIST','RSCGRP STOP','RSCUNIT','USER','EXCEED','WAIT SCHED']: + return True + return False + except Exception as e: + return False + def get_queue_status(self, in_queue_jobs, list_queue_jobid, as_conf): + if len(in_queue_jobs) <= 0: + return + cmd = self.get_queue_status_cmd(list_queue_jobid) + self.send_command(cmd) + queue_status = self._ssh_output + for job in 
in_queue_jobs: + reason = self.parse_queue_reason(queue_status, job.id) + if job.queuing_reason_cancel(reason): + Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) + self.send_command(self.cancel_cmd + " {0}".format(job.id)) + job.new_status = Status.FAILED + job.update_status(as_conf) + elif reason.find('ASHOLD') != -1: + job.new_status = Status.HELD + if not job.hold: + self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) + job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. + else: + pass def parse_Alljobs_output(self, output, job_id): status = "" try: @@ -313,12 +339,39 @@ class PJMPlatform(ParamikoPlatform): return status return status[0] + def parse_joblist(self, job_list): + """ + Convert a list of job_list to job_list_cmd + :param job_list: list of jobs + :type job_list: list + :param ssh_output: ssh output + :type ssh_output: str + :return: job status + :rtype: str + """ + job_list_cmd = "" + for job, job_prev_status in job_list: + if job.id is None: + job_str = "0" + else: + job_str = str(job.id) + job_list_cmd += job_str + "+" + if job_list_cmd[-1] == "+": + job_list_cmd = job_list_cmd[:-1] + + return job_list_cmd + def _check_jobid_in_queue(self, ssh_output, job_list_cmd): + for job in job_list_cmd.split('+'): + if job not in ssh_output: + return False + return True + def get_submitted_job_id(self, outputlines, x11 = False): try: jobs_id = [] for output in outputlines.splitlines(): if not self.submit_error(output): - jobs_id.append(int(output.split(' ')[5])) + jobs_id.append(int(output.split()[5])) if x11 == "true": return jobs_id[0] else: @@ -360,10 +413,10 @@ class PJMPlatform(ParamikoPlatform): def get_checkAlljobs_cmd(self, jobs_id): # jobs_id = "jobid1+jobid2+jobid3" # -H == sacct - return "pjstat -H -v --choose jid,st,ermsg --filter \"jid={0}\"".format(jobs_id) + return "pjstat -H -v --choose jid,st,ermsg --filter \"jid={0}\" > 
as_checkalljobs.txt ; pjstat -v --choose jid,st,ermsg --filter \"jid={0}\" >> as_checkalljobs.txt ; cat as_checkalljobs.txt ; rm as_checkalljobs.txt".format(jobs_id) def get_queue_status_cmd(self, job_id): - return 'pjstat -v --choose jid,st,ermsg {0}'.format(job_id) + return self.get_checkAlljobs_cmd(job_id) def get_jobid_by_jobname_cmd(self, job_name): return 'pjstat -v --choose jid,st,ermsg --filter \"jnam={0}\"'.format(job_name) @@ -376,13 +429,15 @@ class PJMPlatform(ParamikoPlatform): # return 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25'.format(job_id) def parse_queue_reason(self, output, job_id): - reason = [x.split(',')[1] for x in output.splitlines() - if x.split(',')[0] == str(job_id)] + # split() is used to remove the trailing whitespace but also \t and multiple spaces + # split(" ") is not enough + reason = [x.split()[2] for x in output.splitlines() + if x.split()[0] == str(job_id)] + # In case of duplicates.. we take the first one if len(reason) > 0: return reason[0] return reason - # Wrapper todo @staticmethod def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads", partition=""): if method == 'srun': @@ -393,15 +448,15 @@ class PJMPlatform(ParamikoPlatform): # {0} ############################################################################### # -#SBATCH -J {0} +#PJM -N {0} {1} {8} -#SBATCH -A {2} -#SBATCH --output={0}.out -#SBATCH --error={0}.err -#SBATCH -t {3}:00 -#SBATCH -n {4} -#SBATCH --cpus-per-task={7} +#PJM -g {2} +#PJM -o {0}.out +#PJM -e {0}.err +#PJM -elapse {3}:00 +#PJM --mpi "proc=%NUMPROC%" +#PJM --mpi "max-proc-per-node={7}" {5} {6} @@ -417,15 +472,15 @@ class PJMPlatform(ParamikoPlatform): # {0} ############################################################################### # -#SBATCH -J {0} +#PJM -N {0} {1} {8} -#SBATCH -A {2} -#SBATCH --output={0}.out -#SBATCH --error={0}.err -#SBATCH -t {3}:00 -#SBATCH 
--cpus-per-task={7} -#SBATCH -n {4} +#PJM -g {2} +#PJM -o {0}.out +#PJM -e {0}.err +#PJM -elapse {3}:00 +#PJM --mpi "proc=%NUMPROC%" +#PJM --mpi "max-proc-per-node={7}" {5} {6} # diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index d757256a4..bb0a68a06 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -394,6 +394,29 @@ class SlurmPlatform(ParamikoPlatform): return reason[0] return reason + def get_queue_status(self, in_queue_jobs, list_queue_jobid, as_conf): + if len(in_queue_jobs) <= 0: + return + cmd = self.get_queue_status_cmd(list_queue_jobid) + self.send_command(cmd) + queue_status = self._ssh_output + for job in in_queue_jobs: + reason = self.parse_queue_reason(queue_status, job.id) + if job.queuing_reason_cancel(reason): # this should be a platform method to be implemented + Log.error( + "Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) + self.send_command( + self.platform.cancel_cmd + " {0}".format(job.id)) + job.new_status = Status.FAILED + job.update_status(as_conf) + elif reason == '(JobHeldUser)': + job.new_status = Status.HELD + if not job.hold: + # SHOULD BE MORE CLASS (GET_scontrol release but not sure if this can be implemented on others PLATFORMS + self.send_command("scontrol release {0}".format(job.id)) + job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. 
+ else: + pass @staticmethod def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads"): if method == 'srun': diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py index 6310a0b0f..51377c381 100644 --- a/test/unit/test_pjm.py +++ b/test/unit/test_pjm.py @@ -4,6 +4,7 @@ from autosubmitconfigparser.config.configcommon import AutosubmitConfig from autosubmitconfigparser.config.yamlparser import YAMLParserFactory from autosubmit.autosubmit import Autosubmit import autosubmit.platforms.pjmplatform +import pytest from pathlib import Path from autosubmit.platforms.platform import Platform @@ -57,39 +58,55 @@ class TestPJM(TestCase): self.section = 'ARM' self.submitted_ok = "[INFO] PJM 0000 pjsub Job 167661 submitted." self.submitted_fail = "[ERR.] PJM 0057 pjsub node=32 is greater than the upper limit (24)." - self.out= """JOB_ID JOB_NAME MD ST USER GROUP START_DATE ELAPSE_TIM ELAPSE_LIM NODE_REQUIRE VNODE CORE V_MEM V_POL E_POL RANK LST EC PC SN PRI ACCEPT RSC_GRP REASON -167687 test NM ACC bsc32070 bsc32 03/08 11:41:07 0000:00:01 0000:01:00 1 - - - - - bychip ACC 0 0 0 127 03/08 11:41:04 small - -167688 test NM RUN bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:05 small - -167689 test NM RNO bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNE 0 0 0 127 03/08 11:41:05 small - -167690 test NM RNA bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RUN 0 0 0 127 03/08 11:41:06 small - -167691 test NM RNP bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNA 0 0 0 127 03/08 11:41:06 small - -167692 test NM HLD bsc32070 bsc32 (03/08 11:41) 0000:00:00 0000:01:00 1 - - - - - bychip RNP 0 0 0 127 03/08 11:41:06 small - """ - self.queued_jobs = ["167687","167690","167691","167692"] - self.running_jobs = ["167688","167689"] - self.out_h="""JOB_ID JOB_NAME MD ST USER GROUP START_DATE ELAPSE_TIM ELAPSE_LIM 
NODE_REQUIRE VNODE CORE V_MEM V_POL E_POL RANK LST EC PC SN PRI ACCEPT RSC_GRP REASON -167648 STDIN NM EXT bsc32070 bsc32 03/06 12:14:00 0000:00:02 0001:00:00 1 - - - - - bychip RNO 0 0 0 127 03/06 12:13:57 def_grp - -167661 test NM ERR bsc32070 bsc32 03/06 13:55:02 0000:00:02 0000:01:00 1 - - - - - bychip RNO 127 0 0 127 03/06 13:54:59 small - -167662 test NM CCL bsc32070 bsc32 03/06 14:25:30 0000:00:02 0000:01:00 1 - - - - - bychip RNO 127 0 0 127 03/06 14:25:27 small - -167663 test NM RJT bsc32070 bsc32 03/06 14:25:54 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/06 14:25:52 small - -167677 test NM EXT bsc32070 bsc32 03/07 16:39:54 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/07 16:39:50 small - -167678 test NM EXT bsc32070 bsc32 03/07 16:39:57 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/07 16:39:53 small - -167683 test NM EXT bsc32070 bsc32 03/08 11:39:45 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:39:41 small - -167687 test NM EXT bsc32070 bsc32 03/08 11:41:07 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:04 small - -167688 test NM EXT bsc32070 bsc32 03/08 11:41:08 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:05 small - -167689 test NM EXT bsc32070 bsc32 03/08 11:41:09 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:05 small - -167690 test NM EXT bsc32070 bsc32 03/08 11:41:10 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:06 small - -167691 test NM EXT bsc32070 bsc32 03/08 11:41:10 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:06 small - -167692 test NM EXT bsc32070 bsc32 03/08 11:41:10 0000:00:04 0000:01:00 1 - - - - - bychip RNO 0 0 0 127 03/08 11:41:06 small - """ - - self.completed_jobs = ["167677", "167678", "167683", "167687", "167688", "167689", "167690", "167691", "167692"] - self.failed_jobs = ["167661", "167662", "167663"] + self.out= """JOB_ID ST REASON +167727 EXT COMPLETED +167728 RNO - 
+167729 RNE - +167730 RUN - +167732 ACC - +167733 QUE - +167734 RNA - +167735 RNP - +167736 HLD ASHOLD +167737 ERR - +167738 CCL - +167739 RJT - +""" + self.completed_jobs = ["167727"] + self.running_jobs = ["167728","167729","167730"] + self.queued_jobs = ["167732","167733","167734","167735","167736"] + self.failed_jobs = ["167737","167738","167739"] + self.jobs_that_arent_listed = ["3442432423", "238472364782", "1728362138712"] + self.completed_jobs_cmd = "167727" + self.running_jobs_cmd = "167728+167729+167730" + self.queued_jobs_cmd = "167732+167733+167734+167735+167736" + self.failed_jobs_cmd = "167737+167738+167739" + self.jobs_that_arent_listed_cmd = "3442432423+238472364782+1728362138712" self.submitter = Autosubmit._get_submitter(self.as_conf) self.submitter.load_platforms(self.as_conf) self.remote_platform = self.submitter.platforms[self.section] + def test_parse_Alljobs_output(self): + """Test parsing of all jobs output.""" + for job_id in self.completed_jobs: + assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["COMPLETED"] + for job_id in self.failed_jobs: + assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["FAILED"] + for job_id in self.queued_jobs: + assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["QUEUING"] + for job_id in self.running_jobs: + assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["RUNNING"] + for job_id in self.jobs_that_arent_listed: + assert self.remote_platform.parse_Alljobs_output(self.out,job_id) == [] + + def test_get_submitted_job_id(self): + """Test parsing of submitted job id.""" + output = self.remote_platform.get_submitted_job_id(self.submitted_ok) + assert output == [167661] def test_parse_queue_reason(self): """Test parsing of queue reason.""" - output = self.remote_platform.parse_queue_reason(self.out_h, 
self.completed_jobs[0]) + output = self.remote_platform.parse_queue_reason(self.out, self.completed_jobs[0]) + assert output == "COMPLETED" -- GitLab From cd3a0c67bd9c11a42b0cefc728128dd9ecc7b697 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 10 Mar 2023 17:00:59 +0100 Subject: [PATCH 168/213] tests (cherry picked from commit 1005075efb7fe3b251a4832b7c5f799b53c2d11a) --- test/unit/test_pjm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py index 51377c381..dcda1727e 100644 --- a/test/unit/test_pjm.py +++ b/test/unit/test_pjm.py @@ -103,6 +103,8 @@ class TestPJM(TestCase): """Test parsing of submitted job id.""" output = self.remote_platform.get_submitted_job_id(self.submitted_ok) assert output == [167661] + output = self.remote_platform.get_submitted_job_id(self.submitted_fail) + assert output == [] def test_parse_queue_reason(self): """Test parsing of queue reason.""" output = self.remote_platform.parse_queue_reason(self.out, self.completed_jobs[0]) -- GitLab From 77a340ee346bce84cdd9b2cd41ed66783821a4a7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 10 Mar 2023 17:37:08 +0100 Subject: [PATCH 169/213] paths for conda? 
(cherry picked from commit dfbad793b21a15bb9683984017747ce07df2e1d7) --- test/unit/test_pjm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py index dcda1727e..fed657364 100644 --- a/test/unit/test_pjm.py +++ b/test/unit/test_pjm.py @@ -37,13 +37,17 @@ class TestPJM(TestCase): self.as_conf.experiment_data["DEFAULT"] = dict() self.as_conf.experiment_data["DEFAULT"]["HPCARCH"] = "ARM" yml_file = Path("files/fake-jobs.yml") - yml_file.exists() + if not yml_file.exists(): + yml_file = Path("test/files/fake-jobs.yml") + factory = YAMLParserFactory() parser = factory.create_parser() parser.data = parser.load(yml_file) self.as_conf.experiment_data.update(parser.data) yml_file = Path("files/fake-platforms.yml") yml_file.exists() + if not yml_file.exists(): + yml_file = Path("test/files/fake-platforms.yml") factory = YAMLParserFactory() parser = factory.create_parser() parser.data = parser.load(yml_file) -- GitLab From 6e3f000084bc132564cffb81e0f5710760b5c068 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 15 Mar 2023 12:38:08 +0100 Subject: [PATCH 170/213] bruno review commits (cherry picked from commit e3597eaf62a5fb6b8bc747fff3f98968f8264d34) --- autosubmit/platforms/locplatform.py | 4 ++-- autosubmit/platforms/pjmplatform.py | 14 ++++++-------- autosubmit/platforms/slurmplatform.py | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index bbb811a26..f361eb2c3 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -171,8 +171,8 @@ class LocalPlatform(ParamikoPlatform): while not file_exist and retries < max_retries: try: file_exist = os.path.isfile(os.path.join(self.get_files_path(),src)) - if not file_exist: # File doesn't exist, retry in sleeptime - Log.debug("{2} File still no exists.. 
waiting {0}s for a new retry ( retries left: {1})", sleeptime, + if not file_exist: # File doesn't exist, retry in sleep-time + Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, max_retries - retries, remote_path) if not wrapper_failed: sleep(sleeptime) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 5c1940b58..415e59c7b 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -56,11 +56,11 @@ class PJMPlatform(ParamikoPlatform): self.cancel_cmd = None self._header = PJMHeader() self._wrapper = PJMWrapperFactory(self) - #https://software.fujitsu.com/jp/manual/manualfiles/m220008/j2ul2452/02enz007/j2ul-2452-02enz0.pdf pagina 16 + #https://software.fujitsu.com/jp/manual/manualfiles/m220008/j2ul2452/02enz007/j2ul-2452-02enz0.pdf page 16 self.job_status = dict() self.job_status['COMPLETED'] = ['EXT'] self.job_status['RUNNING'] = ['RNO','RNE','RUN'] - self.job_status['QUEUING'] = ['ACC','QUE', 'RNA', 'RNP','HLD'] # NOT SURE ABOUT HOLD HLD + self.job_status['QUEUING'] = ['ACC','QUE', 'RNA', 'RNP','HLD'] # TODO NOT SURE ABOUT HOLD HLD self.job_status['FAILED'] = ['ERR','CCL','RJT'] self._pathdir = "\$HOME/LOG_" + self.expid self._allow_arrays = False @@ -75,10 +75,8 @@ class PJMPlatform(ParamikoPlatform): def submit_error(self,output): #returns false if the job submission message is not found - if output.lower().find("pjsub".lower()) != -1 and output.lower().find("[INFO] PJM 0000".lower()) != -1: - return False - else: - return True + return all(part in output.lower() for part in ["pjsub", "[INFO] PJM 0000"]) + @@ -309,7 +307,7 @@ class PJMPlatform(ParamikoPlatform): except Exception as e: return False def get_queue_status(self, in_queue_jobs, list_queue_jobid, as_conf): - if len(in_queue_jobs) <= 0: + if not in_queue_jobs: return cmd = self.get_queue_status_cmd(list_queue_jobid) self.send_command(cmd) @@ -504,7 +502,7 @@ class 
PJMPlatform(ParamikoPlatform): self.get_files_path(), filename)) file_exist = True except IOError as e: # File doesn't exist, retry in sleeptime - Log.debug("{2} File still no exists.. waiting {0}s for a new retry ( retries left: {1})", sleeptime, + Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index bb0a68a06..23871f2c7 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -395,7 +395,7 @@ class SlurmPlatform(ParamikoPlatform): return reason def get_queue_status(self, in_queue_jobs, list_queue_jobid, as_conf): - if len(in_queue_jobs) <= 0: + if not in_queue_jobs: return cmd = self.get_queue_status_cmd(list_queue_jobid) self.send_command(cmd) @@ -479,8 +479,8 @@ class SlurmPlatform(ParamikoPlatform): self._ftpChannel.stat(os.path.join( self.get_files_path(), filename)) file_exist = True - except IOError: # File doesn't exist, retry in sleeptime - Log.debug("{2} File still no exists.. waiting {0}s for a new retry ( retries left: {1})", sleeptime, + except IOError as e: # File doesn't exist, retry in sleeptime + Log.debug("{2} File does not exist.. 
waiting {0}s for a new retry (retries left: {1})", sleeptime, max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) -- GitLab From 3fdaacb4254ed339dc0f02edd1507d5c51a542ae Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 15 Mar 2023 15:21:18 +0100 Subject: [PATCH 171/213] Changed all for not all (cherry picked from commit f0037bf2b4678d3da501ab779b7a00175dcb4d01) --- autosubmit/platforms/pjmplatform.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 415e59c7b..79e2db88b 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -74,9 +74,12 @@ class PJMPlatform(ParamikoPlatform): self._submit_script_file = open(self._submit_script_path, 'wb').close() def submit_error(self,output): - #returns false if the job submission message is not found - return all(part in output.lower() for part in ["pjsub", "[INFO] PJM 0000"]) - + """ + Check if the output of the submit command contains an error message. 
+ :param output: output of the submit cmd + :return: boolean + """ + return not all(part.lower() in output.lower() for part in ["pjsub", "[INFO] PJM 0000"]) -- GitLab From 399a570a466d79b15c5156d43a2bb8b4c22ca10b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 21 Mar 2023 11:10:50 +0100 Subject: [PATCH 172/213] added with surpress (cherry picked from commit 191c8719a69d9c07b03c6895cf1917100afc9b8f) --- autosubmit/platforms/pjmplatform.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 79e2db88b..27a7b47de 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -19,6 +19,7 @@ import locale import os import re +from contextlib import suppress from time import sleep from time import mktime from time import time @@ -397,7 +398,7 @@ class PJMPlatform(ParamikoPlatform): else: return export + self._submit_hold_cmd + job_script else: - try: + with suppress(BaseException): lang = locale.getlocale()[1] if lang is None: lang = locale.getdefaultlocale()[1] @@ -407,8 +408,6 @@ class PJMPlatform(ParamikoPlatform): self._submit_script_file.write((export + self._submit_cmd + job_script + "\n").encode(lang)) else: self._submit_script_file.write((export + self._submit_hold_cmd + job_script + "\n").encode(lang)) - except BaseException as e: - pass def get_checkAlljobs_cmd(self, jobs_id): -- GitLab From b692b25821abfb21c58747ee8d03d7967b6bf124 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 21 Mar 2023 11:19:22 +0100 Subject: [PATCH 173/213] cancel_cmd update (cherry picked from commit a85f600813202cd42f3b08452f7208f237524615) --- autosubmit/platforms/slurmplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 23871f2c7..152492f28 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -406,7 
+406,7 @@ class SlurmPlatform(ParamikoPlatform): Log.error( "Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) self.send_command( - self.platform.cancel_cmd + " {0}".format(job.id)) + self.cancel_cmd + " {0}".format(job.id)) job.new_status = Status.FAILED job.update_status(as_conf) elif reason == '(JobHeldUser)': -- GitLab From fcbb8ed23b1fec5f182a5523f345806088284d78 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 21 Mar 2023 11:19:45 +0100 Subject: [PATCH 174/213] cancel_cmd update (cherry picked from commit 55f72c0c70c86df453369a4bda5f9ce936386c36) --- autosubmit/platforms/paramiko_platform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 4288cece3..acb7cbc71 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -649,8 +649,8 @@ class ParamikoPlatform(Platform): job_status = job.check_completion(over_wallclock=True) if job_status is Status.FAILED: try: - job.platform.send_command( - job.platform.cancel_cmd + " " + str(job.id)) + if self.cancel_cmd is not None: + job.platform.send_command(self.cancel_cmd + " " + str(job.id)) except: pass except: -- GitLab From 48e3c4243d4e10ebb15bb3c73a7664151ba48f38 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 21 Mar 2023 11:24:57 +0100 Subject: [PATCH 175/213] cancel_cmd update (cherry picked from commit 51436a52265ff5ac618e5c36d30d8fb75561f443) --- autosubmit/platforms/slurmplatform.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 152492f28..026bb25f4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -410,13 +410,12 @@ class SlurmPlatform(ParamikoPlatform): job.new_status = Status.FAILED job.update_status(as_conf) elif reason == '(JobHeldUser)': - 
job.new_status = Status.HELD if not job.hold: - # SHOULD BE MORE CLASS (GET_scontrol release but not sure if this can be implemented on others PLATFORMS + # should be self.release_cmd or something like that but it is not implemented self.send_command("scontrol release {0}".format(job.id)) job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. else: - pass + job.new_status = Status.HELD @staticmethod def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads"): if method == 'srun': -- GitLab From c6634bb907203271d0e9c55495b7c9f38be45356 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 12:32:20 +0100 Subject: [PATCH 176/213] unused imports (cherry picked from commit 6ed045c7de16dedd5d69beb9b63c3cf7e1667d73) --- autosubmit/platforms/pjmplatform.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 27a7b47de..33a63ba49 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -18,23 +18,16 @@ # along with Autosubmit. If not, see . 
import locale import os -import re from contextlib import suppress from time import sleep -from time import mktime -from time import time -from datetime import datetime from typing import List, Union -from xml.dom.minidom import parseString - -from autosubmit.job.job_common import Status, parse_output_number +from autosubmit.job.job_common import Status from autosubmit.job.job_exceptions import WrongTemplateException from autosubmit.platforms.paramiko_platform import ParamikoPlatform from autosubmit.platforms.headers.pjm_header import PJMHeader from autosubmit.platforms.wrappers.wrapper_factory import PJMWrapperFactory from log.log import AutosubmitCritical, AutosubmitError, Log -import random class PJMPlatform(ParamikoPlatform): """ -- GitLab From a3e859bc2e59998db98ab6cfe43d3bf3e57d00fd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 12:32:49 +0100 Subject: [PATCH 177/213] clean docstring (cherry picked from commit 6c6c671bca8600e0bec923f4556920286c1920ab) --- autosubmit/platforms/pjmplatform.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 33a63ba49..5de0c5d5f 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -339,8 +339,6 @@ class PJMPlatform(ParamikoPlatform): Convert a list of job_list to job_list_cmd :param job_list: list of jobs :type job_list: list - :param ssh_output: ssh output - :type ssh_output: str :return: job status :rtype: str """ -- GitLab From fbd065bf3ab79a54105ebe425fc0d810e1f899ae Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 12:34:26 +0100 Subject: [PATCH 178/213] Clean debug code (cherry picked from commit 05c468178eac5da6d0855a4b455ccf484018d1b8) --- autosubmit/platforms/pjmplatform.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 5de0c5d5f..43a5e60cf 100644 --- a/autosubmit/platforms/pjmplatform.py
+++ b/autosubmit/platforms/pjmplatform.py @@ -321,8 +321,6 @@ class PJMPlatform(ParamikoPlatform): if not job.hold: self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. - else: - pass def parse_Alljobs_output(self, output, job_id): status = "" try: -- GitLab From 984490b63e8bc639913b7c6f9cee6070333bf741 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 12:43:57 +0100 Subject: [PATCH 179/213] printlog (cherry picked from commit de0535d441ceeb19f2d533b24625617370686db5) --- autosubmit/platforms/pjmplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 43a5e60cf..2b585467c 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -312,7 +312,7 @@ class PJMPlatform(ParamikoPlatform): for job in in_queue_jobs: reason = self.parse_queue_reason(queue_status, job.id) if job.queuing_reason_cancel(reason): - Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) + Log.printlog(f"Job {job.name} will be cancelled and set to FAILED as it was queuing due to {reason}",6000) self.send_command(self.cancel_cmd + " {0}".format(job.id)) job.new_status = Status.FAILED job.update_status(as_conf) -- GitLab From 6a858cb96775a7628ba581de2b6991134f72adb1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 12:53:31 +0100 Subject: [PATCH 180/213] added .lower() just in case (cherry picked from commit 1e31c2c8dcfc7ed304e490a9a6dce179e0c2cc82) --- autosubmit/autosubmit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index c50bababa..813c4042c 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2197,7 +2197,7 @@ class Autosubmit: raise except Exception as e: raise - if platform.type == "slurm" 
or platform.type == "pjm" and not inspect and not only_wrappers: + if platform.type.lower() in ["slurm", "pjm"] and not inspect and not only_wrappers: try: valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] if len(valid_packages_to_submit) > 0: -- GitLab From 26e39f6e9c87522104d47672b97e16d093f2e692 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 16:40:46 +0100 Subject: [PATCH 181/213] testing pipeline change (cherry picked from commit 3deeaa46fec6d91f8af793e6061e942da3510e02) --- test/unit/test_pjm.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py index fed657364..d68b88953 100644 --- a/test/unit/test_pjm.py +++ b/test/unit/test_pjm.py @@ -36,18 +36,12 @@ class TestPJM(TestCase): self.as_conf.experiment_data = dict() self.as_conf.experiment_data["DEFAULT"] = dict() self.as_conf.experiment_data["DEFAULT"]["HPCARCH"] = "ARM" - yml_file = Path("files/fake-jobs.yml") - if not yml_file.exists(): - yml_file = Path("test/files/fake-jobs.yml") - + yml_file = Path(__file__).resolve().parent / "files/fake-jobs.yml" factory = YAMLParserFactory() parser = factory.create_parser() parser.data = parser.load(yml_file) self.as_conf.experiment_data.update(parser.data) - yml_file = Path("files/fake-platforms.yml") - yml_file.exists() - if not yml_file.exists(): - yml_file = Path("test/files/fake-platforms.yml") + yml_file = Path(__file__).resolve().parent / "files/fake-platforms.yml" factory = YAMLParserFactory() parser = factory.create_parser() parser.data = parser.load(yml_file) -- GitLab From e521093e3f80bf4fdb2c985a1f25278342d0b02a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 16:51:55 +0100 Subject: [PATCH 182/213] removed x11 from pjm (cherry picked from commit 01783457310d64707b9c66c6f0ecac257c6eff87) --- autosubmit/platforms/pjmplatform.py | 41 +++++++++-------------------- 1 file changed, 13 insertions(+), 28 
deletions(-) diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 2b585467c..57bcb7de3 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -87,7 +87,7 @@ class PJMPlatform(ParamikoPlatform): :return: """ try: - valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] + valid_packages_to_submit = [ package for package in valid_packages_to_submit ] if len(valid_packages_to_submit) > 0: try: jobs_id = self.submit_Script(hold=hold) @@ -180,10 +180,6 @@ class PJMPlatform(ParamikoPlatform): :return: job id for the submitted job :rtype: int """ - if job is None or not job: - x11 = False - else: - x11 = job.x11 self.get_submit_cmd(script_name, job, hold=hold, export=export) return None @@ -357,16 +353,14 @@ class PJMPlatform(ParamikoPlatform): return False return True - def get_submitted_job_id(self, outputlines, x11 = False): + def get_submitted_job_id(self, outputlines): try: jobs_id = [] for output in outputlines.splitlines(): if not self.submit_error(output): jobs_id.append(int(output.split()[5])) - if x11 == "true": - return jobs_id[0] - else: - return jobs_id + + return jobs_id except IndexError: raise AutosubmitCritical( "Submission failed. 
There are issues on your config file", 7014) @@ -376,27 +370,18 @@ class PJMPlatform(ParamikoPlatform): export = "" else: export += " ; " - if job is None or not job: - x11 = False - else: - x11 = job.x11 - if x11 == "true": + + with suppress(BaseException): + lang = locale.getlocale()[1] + if lang is None: + lang = locale.getdefaultlocale()[1] + if lang is None: + lang = 'UTF-8' if not hold: - return export + self._submit_cmd + job_script + self._submit_script_file.write((export + self._submit_cmd + job_script + "\n").encode(lang)) else: - return export + self._submit_hold_cmd + job_script - else: - with suppress(BaseException): - lang = locale.getlocale()[1] - if lang is None: - lang = locale.getdefaultlocale()[1] - if lang is None: - lang = 'UTF-8' - if not hold: - self._submit_script_file.write((export + self._submit_cmd + job_script + "\n").encode(lang)) - else: - self._submit_script_file.write((export + self._submit_hold_cmd + job_script + "\n").encode(lang)) + self._submit_script_file.write((export + self._submit_hold_cmd + job_script + "\n").encode(lang)) def get_checkAlljobs_cmd(self, jobs_id): -- GitLab From efa407fb32173e728c1e9935e983c0c377f8b1ea Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 22 Mar 2023 16:56:54 +0100 Subject: [PATCH 183/213] added abstract functions to paramiko (cherry picked from commit 2cb9eb1b9588b46a533543e5c8b13a655f23d809) --- autosubmit/platforms/paramiko_platform.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index acb7cbc71..43697c170 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -740,6 +740,22 @@ class ParamikoPlatform(Platform): sys.stderr.write(session.recv_stderr(4096)) def x11_handler(self, channel, (src_addr, src_port)): + def get_jobid_by_jobname_cmd(self, job_name): + """ + Returns command to get job id by job name on remote platforms + :param 
job_name: + :return: str + """ + return NotImplementedError + + def get_queue_status_cmd(self, job_name): + """ + Returns command to get queue status on remote platforms + :return: str + """ + return NotImplementedError + + def x11_handler(self, channel, xxx_todo_changeme): '''handler for incoming x11 connections for each x11 incoming connection, - get a connection to the local display -- GitLab From 9768305ab97476a79e114088ba6dc09d2e993d87 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 28 Mar 2023 13:09:53 +0200 Subject: [PATCH 184/213] Fixes to cancel jobs --- autosubmit/job/job.py | 2 +- autosubmit/platforms/paramiko_platform.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 25cb081b2..8aa3cfc3c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1203,7 +1203,7 @@ class Job(object): template_file.close() else: if self.type == Type.BASH: - template = 'sleep 5' + template = 'sleep 360' elif self.type == Type.PYTHON: template = 'time.sleep(5)' elif self.type == Type.R: diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 43697c170..e1555b3df 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -649,8 +649,8 @@ class ParamikoPlatform(Platform): job_status = job.check_completion(over_wallclock=True) if job_status is Status.FAILED: try: - if self.cancel_cmd is not None: - job.platform.send_command(self.cancel_cmd + " " + str(job.id)) + if job.platform.cancel_cmd is not None: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id)) except: pass except: -- GitLab From 11bb532b02522c27604d889e9bddbd091b89ccdf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 28 Mar 2023 13:10:15 +0200 Subject: [PATCH 185/213] sleeptime --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py 
index 8aa3cfc3c..25cb081b2 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1203,7 +1203,7 @@ class Job(object): template_file.close() else: if self.type == Type.BASH: - template = 'sleep 360' + template = 'sleep 5' elif self.type == Type.PYTHON: template = 'time.sleep(5)' elif self.type == Type.R: -- GitLab From 63bdbb87f227033e1f7aafa838ca169b1d7a0897 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 29 Mar 2023 13:54:56 +0200 Subject: [PATCH 186/213] Pjm support in 3.15 #921 (I) --- autosubmit/platforms/paramiko_platform.py | 1 - autosubmit/platforms/pjmplatform.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e1555b3df..63fd6b93e 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -739,7 +739,6 @@ class ParamikoPlatform(Platform): while session.recv_stderr_ready(): sys.stderr.write(session.recv_stderr(4096)) - def x11_handler(self, channel, (src_addr, src_port)): def get_jobid_by_jobname_cmd(self, job_name): """ Returns command to get job id by job name on remote platforms diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 57bcb7de3..96ff0e17a 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -18,7 +18,6 @@ # along with Autosubmit. If not, see . 
import locale import os -from contextlib import suppress from time import sleep from typing import List, Union @@ -308,7 +307,7 @@ class PJMPlatform(ParamikoPlatform): for job in in_queue_jobs: reason = self.parse_queue_reason(queue_status, job.id) if job.queuing_reason_cancel(reason): - Log.printlog(f"Job {job.name} will be cancelled and set to FAILED as it was queuing due to {reason}",6000) + Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(job.name,reason),6000) self.send_command(self.cancel_cmd + " {0}".format(job.id)) job.new_status = Status.FAILED job.update_status(as_conf) @@ -372,7 +371,7 @@ class PJMPlatform(ParamikoPlatform): export += " ; " - with suppress(BaseException): + try: lang = locale.getlocale()[1] if lang is None: lang = locale.getdefaultlocale()[1] @@ -382,7 +381,8 @@ class PJMPlatform(ParamikoPlatform): self._submit_script_file.write((export + self._submit_cmd + job_script + "\n").encode(lang)) else: self._submit_script_file.write((export + self._submit_hold_cmd + job_script + "\n").encode(lang)) - + except: + pass def get_checkAlljobs_cmd(self, jobs_id): # jobs_id = "jobid1+jobid2+jobid3" -- GitLab From f9fd078cb3dd3a6867d2e2568c2f424ccbfeed31 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 29 Mar 2023 16:31:28 +0200 Subject: [PATCH 187/213] Pjm support in 3.15 #921 (II) --- autosubmit/autosubmit.py | 3 +-- autosubmit/platforms/paramiko_platform.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 813c4042c..aa5f9db65 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1770,10 +1770,9 @@ class Autosubmit: # Check slurm single jobs, the other platforms has already been checked. 
for platform_jobs in slurm: platform = platform_jobs[0] - jobs_to_check = platform_jobs[1] Log.debug("Checking all jobs at once") platform.check_Alljobs( - platform_jobs[3], jobs_to_check, as_conf.get_copy_remote_logs()) + platform_jobs[3], as_conf) #Log.info("FD slurm jobs: {0}".format(log.fd_show.fd_table_status_str())) for j_Indx in xrange(0, len(platform_jobs[3])): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 63fd6b93e..eef39d1d6 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -563,7 +563,7 @@ class ParamikoPlatform(Platform): :rtype: str """ job_list_cmd = "" - for job,job_prev_status in job_list: + for job in job_list: if job.id is None: job_str = "0" else: -- GitLab From 8682b494094d78f970cc85c72a4383a26f75c40c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 29 Mar 2023 16:35:29 +0200 Subject: [PATCH 188/213] Pjm support in 3.15 #921 (III) --- test/unit/test_pjm.py | 112 ------------------------------------------ 1 file changed, 112 deletions(-) delete mode 100644 test/unit/test_pjm.py diff --git a/test/unit/test_pjm.py b/test/unit/test_pjm.py deleted file mode 100644 index d68b88953..000000000 --- a/test/unit/test_pjm.py +++ /dev/null @@ -1,112 +0,0 @@ -from unittest import TestCase -from unittest.mock import Mock,MagicMock, patch -from autosubmitconfigparser.config.configcommon import AutosubmitConfig -from autosubmitconfigparser.config.yamlparser import YAMLParserFactory -from autosubmit.autosubmit import Autosubmit -import autosubmit.platforms.pjmplatform -import pytest - -from pathlib import Path -from autosubmit.platforms.platform import Platform -from autosubmit.platforms.pjmplatform import PJMPlatform -import autosubmit.platforms.headers.pjm_header -from tempfile import TemporaryDirectory -from datetime import datetime -from autosubmit.job.job import Job, Status - -class FakeBasicConfig: - DB_DIR = '/dummy/db/dir' - DB_FILE = 
'/dummy/db/file' - DB_PATH = '/dummy/db/path' - LOCAL_ROOT_DIR = '/dummy/local/root/dir' - LOCAL_TMP_DIR = '/dummy/local/temp/dir' - LOCAL_PROJ_DIR = '/dummy/local/proj/dir' - LOCAL_ASLOG_DIR = '/dummy/local/aslog/dir' - DEFAULT_PLATFORMS_CONF = '' - DEFAULT_JOBS_CONF = '' - @staticmethod - def read(): - return -class TestPJM(TestCase): - - def setUp(self) -> None: - self.exp_id = 'a000' - self.as_conf = MagicMock() - self.as_conf = AutosubmitConfig(self.exp_id, FakeBasicConfig, YAMLParserFactory()) - self.as_conf.experiment_data = dict() - self.as_conf.experiment_data["DEFAULT"] = dict() - self.as_conf.experiment_data["DEFAULT"]["HPCARCH"] = "ARM" - yml_file = Path(__file__).resolve().parent / "files/fake-jobs.yml" - factory = YAMLParserFactory() - parser = factory.create_parser() - parser.data = parser.load(yml_file) - self.as_conf.experiment_data.update(parser.data) - yml_file = Path(__file__).resolve().parent / "files/fake-platforms.yml" - factory = YAMLParserFactory() - parser = factory.create_parser() - parser.data = parser.load(yml_file) - self.as_conf.experiment_data.update(parser.data) - self.setUp_pjm() - - - @patch("builtins.open",MagicMock()) - def setUp_pjm(self): - MagicMock().write = MagicMock() - MagicMock().os.path.join = MagicMock() - self.section = 'ARM' - self.submitted_ok = "[INFO] PJM 0000 pjsub Job 167661 submitted." - self.submitted_fail = "[ERR.] PJM 0057 pjsub node=32 is greater than the upper limit (24)." 
- self.out= """JOB_ID ST REASON -167727 EXT COMPLETED -167728 RNO - -167729 RNE - -167730 RUN - -167732 ACC - -167733 QUE - -167734 RNA - -167735 RNP - -167736 HLD ASHOLD -167737 ERR - -167738 CCL - -167739 RJT - -""" - self.completed_jobs = ["167727"] - self.running_jobs = ["167728","167729","167730"] - self.queued_jobs = ["167732","167733","167734","167735","167736"] - self.failed_jobs = ["167737","167738","167739"] - self.jobs_that_arent_listed = ["3442432423", "238472364782", "1728362138712"] - self.completed_jobs_cmd = "167727" - self.running_jobs_cmd = "167728+167729+167730" - self.queued_jobs_cmd = "167732+167733+167734+167735+167736" - self.failed_jobs_cmd = "167737+167738+167739" - self.jobs_that_arent_listed_cmd = "3442432423+238472364782+1728362138712" - self.submitter = Autosubmit._get_submitter(self.as_conf) - self.submitter.load_platforms(self.as_conf) - self.remote_platform = self.submitter.platforms[self.section] - - def test_parse_Alljobs_output(self): - """Test parsing of all jobs output.""" - for job_id in self.completed_jobs: - assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["COMPLETED"] - for job_id in self.failed_jobs: - assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["FAILED"] - for job_id in self.queued_jobs: - assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["QUEUING"] - for job_id in self.running_jobs: - assert self.remote_platform.parse_Alljobs_output(self.out,job_id) in self.remote_platform.job_status["RUNNING"] - for job_id in self.jobs_that_arent_listed: - assert self.remote_platform.parse_Alljobs_output(self.out,job_id) == [] - - def test_get_submitted_job_id(self): - """Test parsing of submitted job id.""" - output = self.remote_platform.get_submitted_job_id(self.submitted_ok) - assert output == [167661] - output = self.remote_platform.get_submitted_job_id(self.submitted_fail) - 
assert output == [] - def test_parse_queue_reason(self): - """Test parsing of queue reason.""" - output = self.remote_platform.parse_queue_reason(self.out, self.completed_jobs[0]) - assert output == "COMPLETED" - - - -- GitLab From 79f3b2b2b64ce5a7c85132dad2988e72d51142ce Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 30 Mar 2023 09:11:10 +0200 Subject: [PATCH 189/213] Pjm support in 3.15 #921 (III) --- autosubmit/autosubmit.py | 40 +++++++++++++++------- autosubmit/platforms/headers/pjm_header.py | 12 +++---- autosubmit/platforms/paramiko_platform.py | 10 +++--- autosubmit/platforms/pjmplatform.py | 6 +++- 4 files changed, 43 insertions(+), 25 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index aa5f9db65..d78ef0746 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1990,7 +1990,7 @@ class Autosubmit: message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) except BaseException as e: # If this happens, there is a bug in the code or an exception not-well caught - raise AutosubmitCritical("There is a bug in the code, please contact via git",7070,e.message) + raise AutosubmitCritical("There is a bug in the code, please contact via gitlab",str(e),7070) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends try: @@ -2203,14 +2203,17 @@ class Autosubmit: try: jobs_id = platform.submit_Script(hold=hold) except AutosubmitError as e: - jobnames = [] - for package in valid_packages_to_submit: - jobnames += [job.name for job in package.jobs] - for jobname in jobnames: - jobid = platform.get_jobid_by_jobname(jobname) - #cancel bad submitted job if jobid is encountered - for id in jobid: - platform.cancel_job(id) + try: + jobnames = [] + for package in valid_packages_to_submit: + 
jobnames += [job.name for job in package.jobs] + for jobname in jobnames: + jobid = platform.get_jobid_by_jobname(jobname) + #cancel bad submitted job if jobid is encountered + for id in jobid: + platform.cancel_job(id) + except: + pass jobs_id = None platform.connected = False if e.trace is not None: @@ -2240,6 +2243,19 @@ class Autosubmit: raise AutosubmitError( "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: + try: + jobnames = [] + for package in valid_packages_to_submit: + jobnames += [job.name for job in package.jobs] + for jobname in jobnames: + jobid = platform.get_jobid_by_jobname(jobname) + #cancel bad submitted job if jobid is encountered + for id in jobid: + platform.cancel_job(id) + except: + pass + platform.connected = False + raise AutosubmitError( "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(platform.name),""), 6015) i = 0 @@ -2280,11 +2296,11 @@ class Autosubmit: job.status = Status.SUBMITTED job.write_submit_time(hold=hold) i += 1 - save = True + if not inspect: + job_list.save() if len(failed_packages) > 0: for job_id in failed_packages: - package.jobs[0].platform.send_command( - package.jobs[0].platform.cancel_cmd + " {0}".format(job_id)) + platform.send_command( platform.cancel_cmd + " {0}".format(job_id)) raise AutosubmitError( "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: diff --git a/autosubmit/platforms/headers/pjm_header.py b/autosubmit/platforms/headers/pjm_header.py index e77dfdb03..886ccdc95 100644 --- a/autosubmit/platforms/headers/pjm_header.py +++ b/autosubmit/platforms/headers/pjm_header.py @@ -128,7 +128,7 @@ class PJMHeader(object): SERIAL = textwrap.dedent("""\ ############################################################################### -# %TASKTYPE% %DEFAULT.EXPID% EXPERIMENT +# %TASKTYPE% %EXPID% EXPERIMENT 
############################################################################### # #PJM -N %JOBNAME% @@ -137,8 +137,8 @@ class PJMHeader(object): #%ACCOUNT_DIRECTIVE% #%MEMORY_DIRECTIVE% %CUSTOM_DIRECTIVES% -#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% -#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% +#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%ERR_LOG_DIRECTIVE% #%X11% # ############################################################################### @@ -146,7 +146,7 @@ class PJMHeader(object): PARALLEL = textwrap.dedent("""\ ############################################################################### -# %TASKTYPE% %DEFAULT.EXPID% EXPERIMENT +# %TASKTYPE% %EXPID% EXPERIMENT ############################################################################### # #PJM -N %JOBNAME% @@ -158,8 +158,8 @@ class PJMHeader(object): #%ACCOUNT_DIRECTIVE% #%MEMORY_DIRECTIVE% #%MEMORY_PER_TASK_DIRECTIVE% -#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%OUT_LOG_DIRECTIVE% -#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ_DIR%/%CURRENT_USER%/%DEFAULT.EXPID%/LOG_%DEFAULT.EXPID%/%ERR_LOG_DIRECTIVE% +#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%ERR_LOG_DIRECTIVE% %CUSTOM_DIRECTIVES% # ############################################################################### diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index eef39d1d6..fe7c0bfe4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -216,12 +216,10 @@ class 
ParamikoPlatform(Platform): self._ssh.connect(self._host_config['hostname'], port, username=self.user, key_filename=self._host_config_id, timeout=60 , banner_timeout=60,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) self.transport = self._ssh.get_transport() - #self.transport = paramiko.Transport((self._host_config['hostname'], 22)) - #self.transport.connect(username=self.user) - window_size = pow(4, 12) # about ~16MB chunks - max_packet_size = pow(4, 12) - #self._ftpChannel = self._ssh.open_sftp() - self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=window_size,max_packet_size=max_packet_size) + self.transport.banner_timeout = 60 + + self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) + self._ftpChannel.get_channel().settimeout(120) self.connected = True except SSHException as e: raise diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 96ff0e17a..f90c83db7 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -240,7 +240,7 @@ class PJMPlatform(ParamikoPlatform): Updates commands for platforms """ self.root_dir = os.path.join( - self.scratch, self.project_dir, self.user, self.expid) + self.scratch, self.project, self.user, self.expid) self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) self.cancel_cmd = "pjdel " self._checkhost_cmd = "echo 1" @@ -388,6 +388,10 @@ class PJMPlatform(ParamikoPlatform): # jobs_id = "jobid1+jobid2+jobid3" # -H == sacct return "pjstat -H -v --choose jid,st,ermsg --filter \"jid={0}\" > as_checkalljobs.txt ; pjstat -v --choose jid,st,ermsg --filter \"jid={0}\" >> as_checkalljobs.txt ; cat as_checkalljobs.txt ; rm as_checkalljobs.txt".format(jobs_id) + def get_checkjob_cmd(self, jobs_id): + # jobs_id = "jobid1+jobid2+jobid3" + # -H == sacct + return self.get_checkAlljobs_cmd(self, jobs_id) def get_queue_status_cmd(self, 
job_id): return self.get_checkAlljobs_cmd(job_id) -- GitLab From a99e6eba7704ce9f36089886f4d4673a613b07c2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 30 Mar 2023 09:48:07 +0200 Subject: [PATCH 190/213] #959 --- autosubmit/config/config_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 68c37d427..c7c96aeee 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -915,6 +915,7 @@ class AutosubmitConfig(object): Creates parser objects for configuration files """ try: + Log.debug("Reloading configuration each Autosubmit iteration") self._conf_parser = AutosubmitConfig.get_parser( self.parser_factory, self._conf_parser_file) self._platforms_parser = AutosubmitConfig.get_parser( -- GitLab From e61c9dfb56d8e827a596a981a9fd83a5b22ec8a0 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Apr 2023 10:20:31 +0200 Subject: [PATCH 191/213] Fix machinefiles, added a node_list id --- autosubmit/autosubmit.py | 6 +- autosubmit/config/config_common.py | 3 +- autosubmit/platforms/pjmplatform.py | 4 +- autosubmit/platforms/slurmplatform.py | 2 +- .../platforms/wrappers/wrapper_builder.py | 9 +- teeeest.py | 181 +++++++++++++++++ test-unthreaded.py | 186 ++++++++++++++++++ 7 files changed, 380 insertions(+), 11 deletions(-) create mode 100644 teeeest.py create mode 100644 test-unthreaded.py diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index d78ef0746..8fe2669cc 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1620,6 +1620,7 @@ class Autosubmit: # AUTOSUBMIT - MAIN LOOP ######################### # Main loop. 
Finishing when all jobs have been submitted + while job_list.get_active(): #Log.info("FD: {0}".format(log.fd_show.fd_table_status_str())) try: @@ -1794,6 +1795,7 @@ class Autosubmit: as_conf, submitter=submitter) job_list.save() if len(job_list.get_ready()) > 0: + Log.debug("Reloading configuration each Autosubmit iteration") save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=False) job_list.update_list(as_conf, submitter=submitter) @@ -1990,7 +1992,7 @@ class Autosubmit: message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) except BaseException as e: # If this happens, there is a bug in the code or an exception not-well caught - raise AutosubmitCritical("There is a bug in the code, please contact via gitlab",str(e),7070) + raise AutosubmitCritical("There is a bug in the code, please contact via gitlab",7070,str(e)) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends try: @@ -2032,7 +2034,7 @@ class Autosubmit: except AutosubmitCritical as e: raise except BaseException as e: - raise AutosubmitCritical("This seems like a bug in the code, please contact AS developers", 7070,e.message) + raise AutosubmitCritical("This seems like a bug in the code, please contact AS developers", 7070,str(e)) @staticmethod def restore_platforms(platform_to_test,mail_notify=False,as_conf=None,expid=expid): diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index c7c96aeee..13243d4e1 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -915,7 +915,6 @@ class AutosubmitConfig(object): Creates parser objects for configuration files """ try: - Log.debug("Reloading configuration each Autosubmit iteration") self._conf_parser 
= AutosubmitConfig.get_parser( self.parser_factory, self._conf_parser_file) self._platforms_parser = AutosubmitConfig.get_parser( @@ -1738,7 +1737,7 @@ class AutosubmitConfig(object): :return: machinefiles function to use :rtype: string """ - return self._conf_parser.get_option(wrapper_section_name, 'MACHINEFILES', '') + return self._conf_parser.get_option(wrapper_section_name, 'MACHINEFILES', 'STANDARD') def get_export(self, section): """ Gets command line for being submitted with diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index f90c83db7..164920b14 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -127,7 +127,7 @@ class PJMPlatform(ParamikoPlatform): "IO issues ", 6016, str(e)) except BaseException as e: if str(e).find("scheduler") != -1: - raise AutosubmitCritical("Are you sure that [{0}] scheduler is the correct type for platform [{1}]?.\n Please, double check that {0} is loaded for {1} before autosubmit launch any job.".format(self.type.upper(),self.name.upper()),7070) + raise AutosubmitCritical("Are you sure that [{0}] scheduler is the correct type for platform [{1}]?.\n Please, double check that {0} is loaded for {1} before autosubmit launch any job.".format(self.type.upper(),self.name.upper()),str(e),7070) raise AutosubmitError( "Submission failed, this can be due a failure on the platform", 6015, str(e)) if jobs_id is None or len(jobs_id) <= 0: @@ -468,7 +468,7 @@ class PJMPlatform(ParamikoPlatform): @staticmethod def allocated_nodes(): - return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(node_id))""" + return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > {0}".format(node_id))""" def check_file_exists(self, filename,wrapper_failed=False): file_exist = False diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 026bb25f4..0792ee64e 100644 --- 
a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -465,7 +465,7 @@ class SlurmPlatform(ParamikoPlatform): @staticmethod def allocated_nodes(): - return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list")""" + return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > {0}".format(node_id))""" def check_file_exists(self, filename,wrapper_failed=False): file_exist = False diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 00214d771..836e86d27 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -115,7 +115,7 @@ class PythonWrapperBuilder(WrapperBuilder): sample_list = list(sample_str) random.shuffle(sample_list) final_string = ''.join(sample_list) - return final_string+"_FAILED" + return final_string def build_imports(self): return textwrap.dedent(""" @@ -142,7 +142,8 @@ class PythonWrapperBuilder(WrapperBuilder): return getattr(self.stream, attr) sys.stdout = Unbuffered(sys.stdout) - wrapper_id = "{1}" + wrapper_id = "{1}_FAILED" + node_id = "node_list_{1}" # Defining scripts to be run scripts= {0} """).format(str(self.job_scripts), self.get_random_alphanumeric_string(5,5),'\n'.ljust(13)) @@ -182,7 +183,7 @@ class PythonWrapperBuilder(WrapperBuilder): {0} os.system("mkdir -p machinefiles") - with open('node_list', 'r') as file: + with open('{{0}}'.format(node_id), 'r') as file: all_nodes = file.read() all_nodes = all_nodes.split("_NEWLINE_") @@ -687,7 +688,7 @@ class SrunWrapperBuilder(WrapperBuilder): {0} os.system("mkdir -p machinefiles") - with open('node_list', 'r') as file: + with open('{{0}}'.format(node_id), 'r') as file: all_nodes = file.read() all_nodes = all_nodes.split("_NEWLINE_") diff --git a/teeeest.py b/teeeest.py new file mode 100644 index 000000000..c7919c429 --- /dev/null +++ b/teeeest.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python2 
+############################################################################### +# a000_ASThread_16801803863908_5_60 +############################################################################### +# +#SBATCH -J a000_ASThread_16801803863908_5_60 +#SBATCH --qos=debug +#SBATCH -A bsc32 +#SBATCH --output=a000_ASThread_16801803863908_5_60.out +#SBATCH --error=a000_ASThread_16801803863908_5_60.err +#SBATCH -t 02:00:00 +#SBATCH --cpus-per-task=1 +#SBATCH -n 8 +############################################################################### + +import os +import sys +# from bscearth.utils.date import date2str +from threading import Thread +from commands import getstatusoutput +from datetime import datetime +import time +from math import ceil +from collections import OrderedDict +import copy + + +class Unbuffered(object): + def __init__(self, stream): + self.stream = stream + + def write(self, data): + self.stream.write(data) + self.stream.flush() + + def writelines(self, datas): + self.stream.writelines(datas) + self.stream.flush() + + def __getattr__(self, attr): + return getattr(self.stream, attr) + + +sys.stdout = Unbuffered(sys.stdout) +wrapper_id = "8aQlI6U962_FAILED" +# Defining scripts to be run +scripts = [[u'a000_19600101_fc0000_1_SIM.cmd', u'a000_19600101_fc0000_2_SIM.cmd', u'a000_19600101_fc0000_3_SIM.cmd', + u'a000_19600101_fc0000_4_SIM.cmd', u'a000_19600101_fc0000_5_SIM.cmd'], [u'a000_19600101_fc0000_POST.cmd'], + [u'a000_19600101_fc0001_4_SIM.cmd', u'a000_19600101_fc0001_2_SIM.cmd', u'a000_19600101_fc0001_5_SIM.cmd', + u'a000_19600101_fc0001_1_SIM.cmd', u'a000_19600101_fc0001_3_SIM.cmd'], [u'a000_19600101_fc0001_POST.cmd'], + [u'a000_19600101_fc0002_4_SIM.cmd', u'a000_19600101_fc0002_2_SIM.cmd', u'a000_19600101_fc0002_1_SIM.cmd', + u'a000_19600101_fc0002_5_SIM.cmd', u'a000_19600101_fc0002_3_SIM.cmd'], [u'a000_19600101_fc0002_POST.cmd'], + [u'a000_19600101_fc0003_4_SIM.cmd', u'a000_19600101_fc0003_3_SIM.cmd', u'a000_19600101_fc0003_5_SIM.cmd', + 
u'a000_19600101_fc0003_1_SIM.cmd', u'a000_19600101_fc0003_2_SIM.cmd'], [u'a000_19600101_fc0003_POST.cmd'], + [u'a000_19600101_fc0004_5_SIM.cmd', u'a000_19600101_fc0004_1_SIM.cmd', u'a000_19600101_fc0004_4_SIM.cmd', + u'a000_19600101_fc0004_3_SIM.cmd', u'a000_19600101_fc0004_2_SIM.cmd'], [u'a000_19600101_fc0004_POST.cmd'], + [u'a000_19600101_fc0005_2_SIM.cmd', u'a000_19600101_fc0005_5_SIM.cmd', u'a000_19600101_fc0005_1_SIM.cmd', + u'a000_19600101_fc0005_4_SIM.cmd', u'a000_19600101_fc0005_3_SIM.cmd'], [u'a000_19600101_fc0005_POST.cmd'], + [u'a000_19600101_fc0006_5_SIM.cmd', u'a000_19600101_fc0006_2_SIM.cmd', u'a000_19600101_fc0006_4_SIM.cmd', + u'a000_19600101_fc0006_1_SIM.cmd', u'a000_19600101_fc0006_3_SIM.cmd'], [u'a000_19600101_fc0006_POST.cmd'], + [u'a000_19600101_fc0007_5_SIM.cmd', u'a000_19600101_fc0007_1_SIM.cmd', u'a000_19600101_fc0007_4_SIM.cmd', + u'a000_19600101_fc0007_2_SIM.cmd', u'a000_19600101_fc0007_3_SIM.cmd'], [u'a000_19600101_fc0007_POST.cmd'], + [u'a000_19600101_fc0008_1_SIM.cmd', u'a000_19600101_fc0008_3_SIM.cmd', u'a000_19600101_fc0008_5_SIM.cmd', + u'a000_19600101_fc0008_4_SIM.cmd', u'a000_19600101_fc0008_2_SIM.cmd'], [u'a000_19600101_fc0008_POST.cmd'], + [u'a000_19600101_fc0009_1_SIM.cmd', u'a000_19600101_fc0009_5_SIM.cmd', u'a000_19600101_fc0009_4_SIM.cmd', + u'a000_19600101_fc0009_2_SIM.cmd', u'a000_19600101_fc0009_3_SIM.cmd'], [u'a000_19600101_fc0009_POST.cmd']] + + +class JobThread(Thread): + def __init__(self, template, id_run): + Thread.__init__(self) + self.template = template + self.id_run = id_run + self.fail_count = 0 + + def run(self): + jobname = self.template.replace('.cmd', '') + print("Thread level {0}".format(jobname)) + # os.system("echo $(date +%s) > "+jobname+"_STAT") + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." 
+ str(self.fail_count) + print(out + "\n") + print("{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd())) + os.environ["MACHINEFILE"] = "{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd()) + command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() + (self.status) = getstatusoutput(command + " > " + out + " 2> " + err) + + +class JobListThread(Thread): + def __init__(self, jobs_list, id_run, node_list): + Thread.__init__(self) + self.jobs_list = jobs_list + self.id_run = id_run + self.node_list = node_list + + def run(self): + pid_list = [] + print("Jobs list: {0}".format(self.jobs_list)) + print("len_jobs_list: {0}".format(len(self.jobs_list))) + print("all_nodes: {0}".format(self.node_list)) + + for i,job in enumerate(self.jobs_list): + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + machines = "" + cores = int(jobs_resources[section]['PROCESSORS']) + tasks = int(jobs_resources[section]['TASKS']) + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + nodes = int(ceil((float(tasks*cores)) / processors_per_node)) + print("Nodes: {0}".format(nodes)) + print("Nodes_remaining_for_this_list: {0}".format(self.node_list)) + while nodes > 0: + node = self.node_list.pop(0) + machines += "{0} 1\n".format(node, processors_per_node) + nodes = nodes - 1 + # machines = "\n".join([s for s in machines.split("\n") if s]) + #machines = "\n".join([s for s in machines.split("\n") if s]) + print("FINAL_MACHINES:{0} ".format(machines)) + with open("machinefiles/machinefile_" + jobname, "w") as machinefile: + machinefile.write(machines) + current = JobThread(job, i + self.id_run) + pid_list.append(current) + current.start() + + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + + completed_filename = self.jobs_list[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = 
self.jobs_list[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + failed_wrapper = os.path.join(os.getcwd(), wrapper_id) + if os.path.exists(completed_path): + print datetime.now(), "The job ", pid.template, " has been COMPLETED" + else: + open(failed_wrapper, 'w').close() + open(failed_path, 'w').close() + print datetime.now(), "The job ", pid.template, " has FAILED" + + +# Getting the list of allocated nodes +os.system("scontrol show hostnames $SLURM_JOB_NODELIST > {0}".format(node_id)) +os.system("mkdir -p machinefiles") + +with open('node_list_{0}'.format(wrapper_id), 'r') as file: + all_nodes = file.read() + +all_nodes = all_nodes.split("\n") + +total_cores = 5 +jobs_resources = {u'POST': {'TASKS': u'12', 'PROCESSORS': '1'}, 'MACHINEFILES': u'STANDARD', + 'PROCESSORS_PER_NODE': u'12', u'SIM': {'TASKS': '1', 'PROCESSORS': '1'}} +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +idx = 0 +all_cores = [] +while total_cores > 0: + if processors_per_node > 0: + processors_per_node -= 1 + total_cores -= 1 + all_cores.append(all_nodes[idx]) + else: + idx += 1 + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + +failed_wrapper = os.path.join(os.getcwd(), wrapper_id) +for i in range(len(scripts)): + current = JobListThread(scripts[i], i * (len(scripts[i])), copy.deepcopy(all_cores)) + current.start() + current.join() + if os.path.exists(failed_wrapper): + os.system("rm -f node_list_{0}".format(wrapper_id)) + os.remove(os.path.join(os.getcwd(), wrapper_id)) + wrapper_failed = os.path.join(os.getcwd(), "WRAPPER_FAILED") + open(wrapper_failed, 'w').close() + os._exit(1) +os.system("rm -f {0}".format(node_list)) + + diff --git a/test-unthreaded.py b/test-unthreaded.py new file mode 100644 index 000000000..27f32e281 --- /dev/null +++ b/test-unthreaded.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python2 
+############################################################################### +# a000_ASThread_16801803863908_5_60 +############################################################################### +# +#SBATCH -J test +#SBATCH --qos=debug +#SBATCH -A bsc32 +#SBATCH --output=test.out +#SBATCH --error=test.err +#SBATCH -t 02:00:00 +#SBATCH --cpus-per-task=1 +#SBATCH -n 8 +############################################################################### + +import os +import sys +import subprocess +# from bscearth.utils.date import date2str +from threading import Thread +from commands import getstatusoutput +from datetime import datetime +import time +from math import ceil +from collections import OrderedDict +import copy + + +class Unbuffered(object): + def __init__(self, stream): + self.stream = stream + + def write(self, data): + self.stream.write(data) + self.stream.flush() + + def writelines(self, datas): + self.stream.writelines(datas) + self.stream.flush() + + def __getattr__(self, attr): + return getattr(self.stream, attr) + + + +class Job: + def __init__(self, template, id_run): + self.template = template + self.id_run = id_run + self.fail_count = 0 + self.process = None + + def launch(self): + jobname = self.template.replace('.cmd', '') + print("Thread level {0}".format(jobname)) + # os.system("echo $(date +%s) > "+jobname+"_STAT") + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." 
+ str(self.fail_count) + print(out + "\n") + print("{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd())) + os.environ["MACHINEFILE"] = "{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd()) + command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() + # Use subprocess to run the command and get the process ID + self.process = subprocess.Popen(command + " > " + out + " 2> " + err, shell=True) + return self + +class JobList: + def __init__(self, jobs_list, id_run, node_list): + """ + + :param jobs_list: + :param id_run: + :param node_list: + """ + self.jobs_list = jobs_list + self.id_run = id_run + self.node_list = node_list + + def launch(self): + """ + Launch the jobs in the wrapper sublist + :return: + """ + pid_list = [] + for i,job in enumerate(self.jobs_list): + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + machines = "" + cores = int(jobs_resources[section]['PROCESSORS']) + tasks = int(jobs_resources[section]['TASKS']) + cores_per_tasks = ceil((float(tasks*cores))) + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + nodes = (1 + int(cores_per_tasks) ) / processors_per_node # 1 for the main process + remaining_processors = abs(cores_per_tasks - (nodes * processors_per_node)) + while nodes > 0: + node = self.node_list.pop(0) + if nodes > 1: + machines += "{0} {1}\n".format(node, processors_per_node) + else: + machines += "{0} {1}\n".format(node, remaining_processors+1) # +1 for the main process + nodes = nodes - 1 + with open("machinefiles/machinefile_" + jobname, "w") as machinefile: + machinefile.write(machines) + pid_list.append(Job(job, self.id_run).launch()) + self.check_status(pid_list) + def check_status(self,pid_list): + for i in range(len(pid_list)): + job = pid_list[i] + #(process_output, process_error) = pid.communicate() + job.process.wait() + completed_filename = self.jobs_list[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), 
completed_filename) + failed_filename = self.jobs_list[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + failed_wrapper = os.path.join(os.getcwd(), wrapper_id) + if os.path.exists(completed_path): + print datetime.now(), "The job ", completed_filename, " has been COMPLETED" + else: + open(failed_wrapper, 'w').close() + open(failed_path, 'w').close() + print datetime.now(), "The job ", completed_filename, " has FAILED" + + +sys.stdout = Unbuffered(sys.stdout) +wrapper_id = "8aQlI6U962_FAILED" +# Defining scripts to be run +scripts = [[u'a000_19600101_fc0000_1_SIM.cmd', u'a000_19600101_fc0000_2_SIM.cmd', u'a000_19600101_fc0000_3_SIM.cmd', + u'a000_19600101_fc0000_4_SIM.cmd', u'a000_19600101_fc0000_5_SIM.cmd'], [u'a000_19600101_fc0000_POST.cmd'], + [u'a000_19600101_fc0001_4_SIM.cmd', u'a000_19600101_fc0001_2_SIM.cmd', u'a000_19600101_fc0001_5_SIM.cmd', + u'a000_19600101_fc0001_1_SIM.cmd', u'a000_19600101_fc0001_3_SIM.cmd'], [u'a000_19600101_fc0001_POST.cmd'], + [u'a000_19600101_fc0002_4_SIM.cmd', u'a000_19600101_fc0002_2_SIM.cmd', u'a000_19600101_fc0002_1_SIM.cmd', + u'a000_19600101_fc0002_5_SIM.cmd', u'a000_19600101_fc0002_3_SIM.cmd'], [u'a000_19600101_fc0002_POST.cmd'], + [u'a000_19600101_fc0003_4_SIM.cmd', u'a000_19600101_fc0003_3_SIM.cmd', u'a000_19600101_fc0003_5_SIM.cmd', + u'a000_19600101_fc0003_1_SIM.cmd', u'a000_19600101_fc0003_2_SIM.cmd'], [u'a000_19600101_fc0003_POST.cmd'], + [u'a000_19600101_fc0004_5_SIM.cmd', u'a000_19600101_fc0004_1_SIM.cmd', u'a000_19600101_fc0004_4_SIM.cmd', + u'a000_19600101_fc0004_3_SIM.cmd', u'a000_19600101_fc0004_2_SIM.cmd'], [u'a000_19600101_fc0004_POST.cmd'], + [u'a000_19600101_fc0005_2_SIM.cmd', u'a000_19600101_fc0005_5_SIM.cmd', u'a000_19600101_fc0005_1_SIM.cmd', + u'a000_19600101_fc0005_4_SIM.cmd', u'a000_19600101_fc0005_3_SIM.cmd'], [u'a000_19600101_fc0005_POST.cmd'], + [u'a000_19600101_fc0006_5_SIM.cmd', u'a000_19600101_fc0006_2_SIM.cmd', u'a000_19600101_fc0006_4_SIM.cmd', + 
u'a000_19600101_fc0006_1_SIM.cmd', u'a000_19600101_fc0006_3_SIM.cmd'], [u'a000_19600101_fc0006_POST.cmd'], + [u'a000_19600101_fc0007_5_SIM.cmd', u'a000_19600101_fc0007_1_SIM.cmd', u'a000_19600101_fc0007_4_SIM.cmd', + u'a000_19600101_fc0007_2_SIM.cmd', u'a000_19600101_fc0007_3_SIM.cmd'], [u'a000_19600101_fc0007_POST.cmd'], + [u'a000_19600101_fc0008_1_SIM.cmd', u'a000_19600101_fc0008_3_SIM.cmd', u'a000_19600101_fc0008_5_SIM.cmd', + u'a000_19600101_fc0008_4_SIM.cmd', u'a000_19600101_fc0008_2_SIM.cmd'], [u'a000_19600101_fc0008_POST.cmd'], + [u'a000_19600101_fc0009_1_SIM.cmd', u'a000_19600101_fc0009_5_SIM.cmd', u'a000_19600101_fc0009_4_SIM.cmd', + u'a000_19600101_fc0009_2_SIM.cmd', u'a000_19600101_fc0009_3_SIM.cmd'], [u'a000_19600101_fc0009_POST.cmd']] + + +# Getting the list of allocated nodes +os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(wrapper_id)) +os.system("mkdir -p machinefiles") + +with open('node_list_{0}'.format(wrapper_id.split("_")[0]), 'r') as file: + all_nodes = file.read() + +all_nodes = all_nodes.split("\n") + +total_cores = 5+1 +jobs_resources = {u'POST': {'TASKS': u'12', 'PROCESSORS': '1'}, 'MACHINEFILES': u'STANDARD', + 'PROCESSORS_PER_NODE': u'12', u'SIM': {'TASKS': '1', 'PROCESSORS': '1'}} +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +idx = 0 +all_cores = [] +while total_cores > 0: + if processors_per_node > 0: + processors_per_node -= 1 + total_cores -= 1 + all_cores.append(all_nodes[idx]) + else: + idx += 1 + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + +failed_wrapper = os.path.join(os.getcwd(), wrapper_id) +for i in range(len(scripts)): + current = JobList(scripts[i], i * (len(scripts[i])), copy.deepcopy(all_cores)) + current.launch() + if os.path.exists(failed_wrapper): + os.system("rm -f node_list_{0}".format(wrapper_id.split("_")[0])) + os.remove(os.path.join(os.getcwd(), wrapper_id)) + wrapper_failed 
= os.path.join(os.getcwd(), "WRAPPER_FAILED") + open(wrapper_failed, 'w').close() + os._exit(1) +os.system("rm -f node_list_{0}".format(wrapper_id.split("_")[0])) + + -- GitLab From 0e9d34984690a4ff01429965d98ccd757393ed1c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Apr 2023 10:30:12 +0200 Subject: [PATCH 192/213] Fix machinefiles, added a node_list id --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ae96f6d2d..7521fdc23 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ autosubmit/simple_test.py autosubmit.egg-info/ venv/ .pytest_cache/ -.cache/ \ No newline at end of file +.cache/ +teeeest.py +test-unthreaded.py \ No newline at end of file -- GitLab From b8b20bad67155f83f948a1e4eaa0794b89db7551 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Apr 2023 16:28:43 +0200 Subject: [PATCH 193/213] fixed sleep for dummy python jobs --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 25cb081b2..1b7d0de6f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1204,7 +1204,7 @@ class Job(object): else: if self.type == Type.BASH: template = 'sleep 5' - elif self.type == Type.PYTHON: + elif self.type == Type.PYTHON or self.type == Type.PYTHON2 or self.type == Type.PYTHON3: template = 'time.sleep(5)' elif self.type == Type.R: template = 'Sys.sleep(5)' -- GitLab From 30b2fe38fa89a03e3ad203552bd3aac6acf19830 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Apr 2023 15:04:41 +0200 Subject: [PATCH 194/213] Ready to update ( II ) --- autosubmit/autosubmit.py | 45 ++++++++++++++++------- autosubmit/platforms/paramiko_platform.py | 13 +++++-- autosubmit/platforms/slurmplatform.py | 4 +- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8fe2669cc..9a528fef5 100644 --- a/autosubmit/autosubmit.py +++ 
b/autosubmit/autosubmit.py @@ -2155,7 +2155,9 @@ class Autosubmit: package.submit(as_conf, job_list.parameters, inspect, hold=hold) save=True if not inspect: - job_list.save() + if platform.type.lower() != "slurm": + job_list.update_list(as_conf) + job_list.save() valid_packages_to_submit.append(package) except (IOError, OSError): if package.jobs[0].id != 0: @@ -2202,20 +2204,32 @@ class Autosubmit: try: valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] if len(valid_packages_to_submit) > 0: + submit_time = int(time.time() / 60) try: jobs_id = platform.submit_Script(hold=hold) except AutosubmitError as e: - try: - jobnames = [] - for package in valid_packages_to_submit: - jobnames += [job.name for job in package.jobs] - for jobname in jobnames: - jobid = platform.get_jobid_by_jobname(jobname) - #cancel bad submitted job if jobid is encountered - for id in jobid: - platform.cancel_job(id) - except: - pass + for package in valid_packages_to_submit: + try: + elapsed_time_minutes = str(int(round(int(time.time() / 60) - submit_time)+2)) + job_historic = platform.get_jobid_by_jobname(package.jobs[0].name,minutes=elapsed_time_minutes) + except: + job_historic = [] + #Recover jobid from jobname + if len(job_historic) > 0 and isinstance(job_historic, list): + job_id = job_historic[-1] + for job_id_historic in job_historic: + if job_id_historic != job_id: + try: + platform.cancel_job(job_id_historic) + except: + pass + for job in package.jobs: + job.hold = hold + job.id = str(job_id) + job.status = Status.SUBMITTED + job.write_submit_time(hold=hold) + #job_list.update_list(as_conf) + job_list.save() jobs_id = None platform.connected = False if e.trace is not None: @@ -2301,8 +2315,11 @@ class Autosubmit: if not inspect: job_list.save() if len(failed_packages) > 0: - for job_id in failed_packages: - platform.send_command( platform.cancel_cmd + " {0}".format(job_id)) + try: + for job_id in failed_packages: + 
platform.send_command( platform.cancel_cmd + " {0}".format(job_id)) + except: + pass raise AutosubmitError( "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index fe7c0bfe4..c75895cce 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -683,8 +683,10 @@ class ParamikoPlatform(Platform): raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status + def get_jobid_by_jobname_cmd(self, job_name,minutes="5"): + return "" - def get_jobid_by_jobname(self,job_name,retries=2): + def get_jobid_by_jobname(self,job_name,retries=2,minutes="5"): """ Get job id by job name :param retries: retries @@ -692,7 +694,7 @@ class ParamikoPlatform(Platform): :return: job id """ #sleep(5) - cmd = self.get_jobid_by_jobname_cmd(job_name) + cmd = self.get_jobid_by_jobname_cmd(job_name,minutes) self.send_command(cmd) job_id_name = self.get_ssh_output() while len(job_id_name) <= 0 and retries > 0: @@ -704,8 +706,11 @@ class ParamikoPlatform(Platform): #get id last line job_ids_names = job_id_name.split('\n')[1:-1] #get all ids by jobname - job_ids = [job_id.split(',')[0] for job_id in job_ids_names] - return job_ids + job_ids = [job_id.split(' ')[0] for job_id in job_ids_names] + return job_ids + else: + return [] + def get_checkjob_cmd(self, job_id): """ diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 0792ee64e..67c073a75 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -377,8 +377,8 @@ class SlurmPlatform(ParamikoPlatform): def get_queue_status_cmd(self, job_id): return 'squeue -j {0} -o %A,%R'.format(job_id) - def get_jobid_by_jobname_cmd(self, job_name): - return 'squeue -o %A,%.50j -n {0}'.format(job_name) + def 
get_jobid_by_jobname_cmd(self, job_name,minutes="5"): + return "sacct --name {0} -o JobID -n -X --starttime=$(date -d '{1} minutes ago' +'%Y-%m-%dT%H:%M:%S')".format(job_name,minutes) def cancel_job(self, job_id): -- GitLab From eb245eefe92ff8cdbeb40bbcb0b1611157019dab Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Apr 2023 16:10:55 +0200 Subject: [PATCH 195/213] Changed submission recovery to a better version that keeps in mind the submission time. Changed to sacct since the job could be finished. Also now the jobs are not cancelled if they're not duplicated. --- autosubmit/autosubmit.py | 45 ++++++++++++----------- autosubmit/platforms/paramiko_platform.py | 33 +++++++++-------- autosubmit/platforms/platform.py | 3 +- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 9a528fef5..2f1b3baa4 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2208,28 +2208,31 @@ class Autosubmit: try: jobs_id = platform.submit_Script(hold=hold) except AutosubmitError as e: - for package in valid_packages_to_submit: - try: - elapsed_time_minutes = str(int(round(int(time.time() / 60) - submit_time)+2)) - job_historic = platform.get_jobid_by_jobname(package.jobs[0].name,minutes=elapsed_time_minutes) - except: - job_historic = [] - #Recover jobid from jobname - if len(job_historic) > 0 and isinstance(job_historic, list): - job_id = job_historic[-1] - for job_id_historic in job_historic: - if job_id_historic != job_id: - try: - platform.cancel_job(job_id_historic) - except: - pass - for job in package.jobs: - job.hold = hold - job.id = str(job_id) - job.status = Status.SUBMITTED - job.write_submit_time(hold=hold) - #job_list.update_list(as_conf) + try: + for package in valid_packages_to_submit: + try: + elapsed_time_minutes = str(int(round(int(time.time() / 60) - submit_time)+1)) + job_historic = platform.get_jobid_by_jobname(package.jobs[0].name,minutes=elapsed_time_minutes) + except: + 
job_historic = [] + #Recover jobid from jobname + if len(job_historic) > 0 and isinstance(job_historic, list): + job_id = job_historic[-1] + for job_id_historic in job_historic: + if job_id_historic != job_id: + try: + platform.send_command(platform.cancel_cmd + " {0}".format(job_id_historic)) + except: + pass + for job in package.jobs: + job.hold = hold + job.id = str(job_id) + job.status = Status.SUBMITTED + job.write_submit_time(hold=hold) + except: + pass job_list.save() + job_list.update_list(as_conf,store_change=True) jobs_id = None platform.connected = False if e.trace is not None: diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index c75895cce..53c2b580f 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -683,8 +683,7 @@ class ParamikoPlatform(Platform): raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status - def get_jobid_by_jobname_cmd(self, job_name,minutes="5"): - return "" + def get_jobid_by_jobname(self,job_name,retries=2,minutes="5"): """ @@ -695,21 +694,23 @@ class ParamikoPlatform(Platform): """ #sleep(5) cmd = self.get_jobid_by_jobname_cmd(job_name,minutes) - self.send_command(cmd) - job_id_name = self.get_ssh_output() - while len(job_id_name) <= 0 and retries > 0: + if cmd != "" and cmd is not None: self.send_command(cmd) - job_id_name = self.get_ssh_output() - retries -= 1 - sleep(2) - if retries >= 0: - #get id last line - job_ids_names = job_id_name.split('\n')[1:-1] - #get all ids by jobname - job_ids = [job_id.split(' ')[0] for job_id in job_ids_names] - return job_ids - else: - return [] + job_id_name = "" + while len(job_id_name) <= 0 and retries > 0: + job_id_name = self.get_ssh_output() + if len(job_id_name) <= 0: + self.send_command(cmd) + else: + break + retries -= 1 + if retries >= 0: + #get id last line + job_ids_names = job_id_name.split('\n') + #get all ids by jobname + job_ids = 
[job_id.split(' ')[0] for job_id in job_ids_names if job_id != ""] + return job_ids + return [] def get_checkjob_cmd(self, job_id): diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 11caca815..09f2d565d 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -160,7 +160,8 @@ class Platform(object): :type dest: str """ raise NotImplementedError - + def get_jobid_by_jobname_cmd(self, job_name,minutes="5"): + return "" def get_file(self, filename, must_exist=True, relative_path='', ignore_log=False, wrapper_failed=False): """ Copies a file from the current platform to experiment's tmp folder -- GitLab From e474bc868da257d44fc078285a3567eee47e3eb6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 14 Apr 2023 09:33:21 +0200 Subject: [PATCH 196/213] chdir was causing long delays, moved the checker from this place to the connectivity check, added keepalive --- autosubmit/job/job.py | 4 ++-- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/ecplatform.py | 4 +++- autosubmit/platforms/locplatform.py | 3 ++- autosubmit/platforms/paramiko_platform.py | 7 +++++-- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1b7d0de6f..b1c4e6938 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1194,7 +1194,7 @@ class Job(object): template = '' if as_conf.get_remote_dependencies(): if self.type == Type.BASH: - template = 'sleep 5' + "\n" + template = 'sleep 360' + "\n" elif self.type == Type.PYTHON: template = 'time.sleep(5)' + "\n" elif self.type == Type.R: @@ -1203,7 +1203,7 @@ class Job(object): template_file.close() else: if self.type == Type.BASH: - template = 'sleep 5' + template = 'sleep 360' elif self.type == Type.PYTHON or self.type == Type.PYTHON2 or self.type == Type.PYTHON3: template = 'time.sleep(5)' elif self.type == Type.R: diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 
12994c643..13c979690 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -427,7 +427,7 @@ class JobPackageThread(JobPackageBase): def _send_files(self): Log.debug("Check remote dir") - self.platform.check_remote_log_dir() + #self.platform.check_remote_log_dir() compress_type = "w" output_filepath = '{0}.tar'.format("wrapper_scripts") if callable(getattr(self.platform, 'remove_multiple_files')): diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index ba9eb5828..1d2b9a90a 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -193,6 +193,8 @@ class EcPlatform(ParamikoPlatform): return True except Exception as e: return False + self.check_remote_log_dir() + def send_command(self, command, ignore_log=False, x11 = False): try: @@ -210,7 +212,7 @@ class EcPlatform(ParamikoPlatform): return True def send_file(self, filename, check=True): - self.check_remote_log_dir() + #self.check_remote_log_dir() self.delete_file(filename) command = '{0} {1} {3}:{2}'.format(self.put_cmd, os.path.join(self.tmp_path, filename), os.path.join(self.get_files_path(), filename), self.host) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index f361eb2c3..7a1d8d580 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -116,7 +116,7 @@ class LocalPlatform(ParamikoPlatform): return True def send_file(self, filename): - self.check_remote_log_dir() + #self.check_remote_log_dir() self.delete_file(filename,del_cmd=True) command = '{0} {1} {2}'.format(self.put_cmd, os.path.join(self.tmp_path, filename), os.path.join(self.tmp_path, 'LOG_' + self.expid, filename)) @@ -151,6 +151,7 @@ class LocalPlatform(ParamikoPlatform): return True def check_remote_permissions(self): + self.check_remote_log_dir() return True # Moves .err .out diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py 
index 53c2b580f..913a54ab3 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -217,6 +217,7 @@ class ParamikoPlatform(Platform): key_filename=self._host_config_id, timeout=60 , banner_timeout=60,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) self.transport = self._ssh.get_transport() self.transport.banner_timeout = 60 + self.transport.set_keepalive(120) self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) self._ftpChannel.get_channel().settimeout(120) @@ -285,7 +286,7 @@ class ParamikoPlatform(Platform): """ if check: - self.check_remote_log_dir() + #self.check_remote_log_dir() self.delete_file(filename) try: local_path = os.path.join(os.path.join(self.tmp_path, filename)) @@ -1203,6 +1204,8 @@ class ParamikoPlatform(Platform): return True except: return False + self.check_remote_log_dir() + @@ -1210,7 +1213,7 @@ class ParamikoPlatform(Platform): """ Creates log dir on remote host """ - + # TODO change it to be in the check_remote_permissions if self.type == "slurm": try: # Test if remote_path exists -- GitLab From 32979c1fe2488e9ea306e1686d92bc9e00e9ef37 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 19 Apr 2023 15:41:10 +0200 Subject: [PATCH 197/213] Fix remote_folder --- autosubmit/platforms/ecplatform.py | 36 ++++++++++++++--------- autosubmit/platforms/locplatform.py | 1 - autosubmit/platforms/paramiko_platform.py | 3 +- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 1d2b9a90a..b633f4f3e 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -149,9 +149,12 @@ class EcPlatform(ParamikoPlatform): :rtype: bool """ output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) - if output.lower().find("yes") != -1: - self.connected = True - else: + 
try: + if output.lower().find("yes") != -1: + self.connected = True + else: + self.connected = False + except: self.connected = False def restore_connection(self): """ @@ -161,9 +164,12 @@ class EcPlatform(ParamikoPlatform): :rtype: bool """ output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) - if output.lower().find("yes") != -1: - self.connected = True - else: + try: + if output.lower().find("yes") != -1: + self.connected = True + else: + self.connected = False + except: self.connected = False def test_connection(self): """ @@ -173,14 +179,17 @@ class EcPlatform(ParamikoPlatform): :rtype: bool """ output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) - if output.lower().find("yes") != -1: - self.connected = True - return "OK" - else: + try: + if output.lower().find("yes") != -1: + self.connected = True + return "OK" + else: + self.connected = False + return "Invalid certificate" + except: self.connected = False return "Invalid certificate" - def check_remote_permissions(self): try: try: @@ -188,13 +197,12 @@ class EcPlatform(ParamikoPlatform): except Exception as e: pass subprocess.check_output(self.check_remote_permissions_cmd, shell=True) - pass subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) + self.check_remote_log_dir() + return True except Exception as e: return False - self.check_remote_log_dir() - def send_command(self, command, ignore_log=False, x11 = False): try: diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 7a1d8d580..0bee9fb70 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -116,7 +116,6 @@ class LocalPlatform(ParamikoPlatform): return True def send_file(self, filename): - #self.check_remote_log_dir() self.delete_file(filename,del_cmd=True) command = '{0} {1} {2}'.format(self.put_cmd, os.path.join(self.tmp_path, filename), 
os.path.join(self.tmp_path, 'LOG_' + self.expid, filename)) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 913a54ab3..6455f1c31 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1201,10 +1201,10 @@ class ParamikoPlatform(Platform): pass self._ftpChannel.mkdir(path) self._ftpChannel.rmdir(path) + self.check_remote_log_dir() return True except: return False - self.check_remote_log_dir() @@ -1213,7 +1213,6 @@ class ParamikoPlatform(Platform): """ Creates log dir on remote host """ - # TODO change it to be in the check_remote_permissions if self.type == "slurm": try: # Test if remote_path exists -- GitLab From 1f04d8d28f6f76a05e81ace495c1e7bbdcd5ae68 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 26 Apr 2023 12:26:36 +0200 Subject: [PATCH 198/213] Fixed splits --- autosubmit/job/job_list.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 2a687dbd8..f2fbe86d2 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -407,14 +407,9 @@ class JobList(object): for parent in parents_jobs: # Generic for all dependencies if dependency.delay == -1 or chunk > dependency.delay: - if isinstance(parent, list): - if job.split is not None: - parent = filter( - lambda _parent: _parent.split == job.split, parent)[0] - else: - if dependency.splits is not None: - parent = filter( - lambda _parent: _parent.split in dependency.splits, parent) + if parent.split is not None and dependency.splits is not None: + if parent.split not in dependency.splits: + continue #Select chunk + select member if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): job.add_parent(parent) -- GitLab From b9d30b1c7449dc4a9c7c609af06b92d51b1d4c84 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 26 Apr 2023 16:04:09 +0200 
Subject: [PATCH 199/213] fixed ec_access --- autosubmit/config/config_common.py | 18 ++++++++++++++++++ autosubmit/job/job.py | 3 ++- autosubmit/platforms/ecplatform.py | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 13243d4e1..e4ce07a66 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -322,7 +322,25 @@ class AutosubmitConfig(object): :rtype: str """ return str(self._jobs_parser.get_option(section, 'NODES', '')) + def get_ec_queue_platform(self, section): + """ + Gets ec_queue needed for the given job type + :param section: job type + :type section: str + :return: tasks (processes) per host + :rtype: str + """ + return str(self._platforms_parser.get_option(section, 'EC_QUEUE', 'hpc')) + def get_ec_queue(self, job): + """ + Gets ec_queue needed for the given job type + :param section: job type + :type section: str + :return: tasks (processes) per host + :rtype: str + """ + return str(self._jobs_parser.get_option(job.section, 'EC_QUEUE', self.get_ec_queue_platform(job.platform_name))) def get_scratch_free_space(self, section): """ Gets scratch free space needed for the given job type diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index b1c4e6938..e2375f9af 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1049,7 +1049,7 @@ class Job(object): self.threads = as_conf.get_threads(self.section) self.tasks = as_conf.get_tasks(self.section) self.nodes = as_conf.get_nodes(self.section) - + self.ec_queue = as_conf.get_ec_queue(self) self.hyperthreading = as_conf.get_hyperthreading(self.section).lower() if self.hyperthreading is 'none': self.hyperthreading = job_platform.hyperthreading.lower() @@ -1103,6 +1103,7 @@ class Job(object): parameters['CURRENT_ARCH'] = job_platform.name parameters['CURRENT_HOST'] = job_platform.host parameters['CURRENT_QUEUE'] = self.queue + parameters['CURRENT_EC_QUEUE'] = 
self.ec_queue parameters['CURRENT_USER'] = job_platform.user parameters['CURRENT_PROJ'] = job_platform.project parameters['CURRENT_BUDG'] = job_platform.budget diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index b633f4f3e..7d7a36ffd 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -82,7 +82,7 @@ class EcPlatform(ParamikoPlatform): """ self.root_dir = os.path.join(self.scratch, self.project, self.user, self.expid) self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) - self.cancel_cmd = "eceaccess-job-delete" + self.cancel_cmd = "ecaccess-job-delete" self._checkjob_cmd = "ecaccess-job-list " self._checkhost_cmd = "ecaccess-certificate-list" self._checkvalidcert_cmd = "ecaccess-gateway-connected" -- GitLab From 023ab7022ae0ae159122713838bb65565bd157ac Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Apr 2023 09:12:22 +0200 Subject: [PATCH 200/213] closes 778 --- docs/source/devguide/variables.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/devguide/variables.rst b/docs/source/devguide/variables.rst index 2cab3a71e..52b0389d4 100644 --- a/docs/source/devguide/variables.rst +++ b/docs/source/devguide/variables.rst @@ -30,6 +30,7 @@ This variables are relatives to the current job. - **Chunk_END_MONTH**: chunk's end month - **Chunk_END_DAY**: chunk's end day - **Chunk_END_HOUR**: chunk's end hour +- **STARTDATES**: List of startdates - **PREV**: days since startdate at the chunk's start - **Chunk_FIRST**: True if the current chunk is the first, false otherwise. - **Chunk_LAST**: True if the current chunk is the last, false otherwise. 
-- GitLab From 95a68f0fedd046ea93fb9be83e5d2d85952f123d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Jun 2023 11:13:42 +0200 Subject: [PATCH 201/213] Trying to fix ecmwf issue --- autosubmit/autosubmit.py | 11 +++++++---- autosubmit/job/job.py | 10 +++++----- autosubmit/job/job_packager.py | 6 +++--- autosubmit/job/job_packages.py | 14 +++++++------- autosubmit/platforms/ecplatform.py | 6 ++++++ autosubmit/platforms/paramiko_platform.py | 2 +- autosubmit/platforms/pjmplatform.py | 6 ++++-- 7 files changed, 33 insertions(+), 22 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 2f1b3baa4..0050026a8 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2155,7 +2155,7 @@ class Autosubmit: package.submit(as_conf, job_list.parameters, inspect, hold=hold) save=True if not inspect: - if platform.type.lower() != "slurm": + if str(platform.type).lower() != "slurm": job_list.update_list(as_conf) job_list.save() valid_packages_to_submit.append(package) @@ -2167,7 +2167,7 @@ class Autosubmit: if package.jobs[0].id != 0: failed_packages.append(package.jobs[0].id) platform.connected = False - if e.trace.lower().find("bad parameters") != -1 or e.message.lower().find("scheduler is not installed") != -1: + if str(e.trace).lower().find("bad parameters") != -1 or str(e.message).lower().find("scheduler is not installed") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2176,7 +2176,7 @@ class Autosubmit: for job_tmp in package.jobs: if job_tmp.section not in error_msg: error_msg += job_tmp.section + "&" - if e.trace.lower().find("bad parameters") != -1: + if str(e.trace).lower().find("bad parameters") != -1: error_message+="\ncheck job and queue specified in jobs.conf. 
Sections that could be affected: {0}".format( error_msg[:-1]) else: @@ -2200,7 +2200,7 @@ class Autosubmit: raise except Exception as e: raise - if platform.type.lower() in ["slurm", "pjm"] and not inspect and not only_wrappers: + if str(platform.type).lower() in ["slurm", "pjm"] and not inspect and not only_wrappers: try: valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] if len(valid_packages_to_submit) > 0: @@ -2208,6 +2208,8 @@ class Autosubmit: try: jobs_id = platform.submit_Script(hold=hold) except AutosubmitError as e: + if not e.message: + e.message = "" try: for package in valid_packages_to_submit: try: @@ -2238,6 +2240,7 @@ class Autosubmit: if e.trace is not None: has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 else: + e.trace = "" has_trace_bad_parameters = False if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find("invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index e2375f9af..09253c49a 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -625,7 +625,7 @@ class Job(object): as_conf.reload() submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) - platform = submitter.platforms[platform_name.lower()] + platform = submitter.platforms[str(platform_name).lower()] success = True except BaseException as e: error_message = str(e) @@ -890,7 +890,7 @@ class Job(object): if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: # New thread, check if file exist expid = copy.deepcopy(self.expid) - platform_name = copy.deepcopy(self.platform_name.lower()) + platform_name = copy.deepcopy(str(self.platform_name).lower()) local_logs = copy.deepcopy(self.local_logs) remote_logs = 
copy.deepcopy(self.remote_logs) as_conf = AutosubmitConfig( @@ -1050,9 +1050,9 @@ class Job(object): self.tasks = as_conf.get_tasks(self.section) self.nodes = as_conf.get_nodes(self.section) self.ec_queue = as_conf.get_ec_queue(self) - self.hyperthreading = as_conf.get_hyperthreading(self.section).lower() + self.hyperthreading = str(as_conf.get_hyperthreading(self.section)).lower() if self.hyperthreading is 'none': - self.hyperthreading = job_platform.hyperthreading.lower() + self.hyperthreading = str(job_platform.hyperthreading).lower() if self.tasks == '0' and job_platform.processors_per_node: self.tasks = job_platform.processors_per_node @@ -1189,7 +1189,7 @@ class Job(object): """ parameters = self.parameters try: # issue in tests with project_type variable while using threads - if as_conf.get_project_type().lower() != "none": + if str(as_conf.get_project_type()).lower() != "none": template_file = open(os.path.join( as_conf.get_project_dir(), self.file), 'r') template = '' diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index cf632049f..ea3e0b9f1 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -87,14 +87,14 @@ class JobPackager(object): self.wrapper_type["wrapper"] = self._as_config.get_wrapper_type() self.wrapper_policy["wrapper"] = self._as_config.get_wrapper_policy() - self.wrapper_method["wrapper"] = self._as_config.get_wrapper_method().lower() + self.wrapper_method["wrapper"] = str(self._as_config.get_wrapper_method()).lower() self.jobs_in_wrapper["wrapper"] = self._as_config.get_wrapper_jobs() self.extensible_wallclock["wrapper"] = self._as_config.get_extensible_wallclock() if self._as_config.get_wrapper_type() == "multi": for wrapper_section in self._as_config.get_wrapper_multi(): self.wrapper_type[wrapper_section] = self._as_config.get_wrapper_type(wrapper_section) self.wrapper_policy[wrapper_section] = self._as_config.get_wrapper_policy(wrapper_section) - 
self.wrapper_method[wrapper_section] = self._as_config.get_wrapper_method(wrapper_section).lower() + self.wrapper_method[wrapper_section] = str(self._as_config.get_wrapper_method(wrapper_section)).lower() self.jobs_in_wrapper[wrapper_section] = self._as_config.get_wrapper_jobs(wrapper_section) self.extensible_wallclock[wrapper_section] = int(self._as_config.get_extensible_wallclock(wrapper_section)) self.wrapper_info = [self.wrapper_type,self.wrapper_policy,self.wrapper_method,self.jobs_in_wrapper,self.extensible_wallclock] # to pass to job_packages @@ -159,7 +159,7 @@ class JobPackager(object): jobs_ready = list() if len(self._jobs_list.jobs_to_run_first) > 0: jobs_ready = [job for job in self._jobs_list.jobs_to_run_first if - ( self._platform is None or job.platform.name.lower() == self._platform.name.lower() ) and + ( self._platform is None or str(job.platform.name).lower() == str(self._platform.name).lower() ) and job.status == Status.READY] if len(jobs_ready) == 0: if self.hold: diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 13c979690..84e2a97c6 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -98,13 +98,13 @@ class JobPackageBase(object): @threaded def check_scripts(self,jobs,configuration, parameters,only_generate,hold): for job in jobs: - if job.check.lower() == Job.CHECK_ON_SUBMISSION.lower(): + if str(job.check).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): if only_generate: exit = True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): lock.acquire() - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical( "Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format( job.name), 7014) @@ -150,7 +150,7 @@ class JobPackageBase(object): try: if len(self.jobs) < thread_number: for job in self.jobs: - if job.check.lower() == 
Job.CHECK_ON_SUBMISSION.lower(): + if str(job.check).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): if only_generate: exit=True break @@ -638,7 +638,7 @@ class JobPackageVertical(JobPackageThread): num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, rootdir=self.platform.root_dir, - directives=self._custom_directives,threads=self._threads,method=self.method.lower(),retrials=self.inner_retrials, wallclock_by_level=wallclock_by_level) + directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),retrials=self.inner_retrials, wallclock_by_level=wallclock_by_level) class JobPackageHorizontal(JobPackageThread): @@ -671,7 +671,7 @@ class JobPackageHorizontal(JobPackageThread): num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, rootdir=self.platform.root_dir, - directives=self._custom_directives,threads=self._threads,method=self.method.lower(),fail_count=fail_count) + directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),fail_count=fail_count) class JobPackageHybrid(JobPackageThread): """ @@ -719,7 +719,7 @@ class JobPackageVerticalHorizontal(JobPackageHybrid): wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, - rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower(),fail_count=fail_count) + rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),fail_count=fail_count) class JobPackageHorizontalVertical(JobPackageHybrid): @@ -733,5 +733,5 @@ class JobPackageHorizontalVertical(JobPackageHybrid): wallclock=self._wallclock, 
num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, - rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower(),fail_count=fail_count) + rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),fail_count=fail_count) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 7d7a36ffd..58848b9cc 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -149,6 +149,8 @@ class EcPlatform(ParamikoPlatform): :rtype: bool """ output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if not output: + output = "" try: if output.lower().find("yes") != -1: self.connected = True @@ -164,6 +166,8 @@ class EcPlatform(ParamikoPlatform): :rtype: bool """ output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if not output: + output = "" try: if output.lower().find("yes") != -1: self.connected = True @@ -179,6 +183,8 @@ class EcPlatform(ParamikoPlatform): :rtype: bool """ output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if not output: + output = "" try: if output.lower().find("yes") != -1: self.connected = True diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 6455f1c31..0f8aa39ba 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1183,7 +1183,7 @@ class ParamikoPlatform(Platform): def check_tmp_exists(self): try: if self.send_command("ls {0}".format(self.temp_dir)): - if "no such file or directory" in self.get_ssh_output_err().lower(): + if "no such file or directory" in str(self.get_ssh_output_err()).lower(): return False else: return 
True diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 164920b14..6ba39d4f8 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -102,12 +102,14 @@ class PJMPlatform(ParamikoPlatform): if e.trace is not None: has_trace_bad_parameters = self.submit_error(e.trace) else: + e.trace = "" has_trace_bad_parameters = False if e.message is not None: has_message_bad_parameters = self.submit_error(e.message) else: + e.message = "" has_message_bad_parameters = False - if has_trace_bad_parameters or has_message_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: + if has_trace_bad_parameters or has_message_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find("invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -365,7 +367,7 @@ class PJMPlatform(ParamikoPlatform): "Submission failed. 
There are issues on your config file", 7014) def get_submit_cmd(self, job_script, job, hold=False, export=""): - if (export is None or export.lower() == "none") or len(export) == 0: + if (export is None or str(export).lower() == "none") or len(export) == 0: export = "" else: export += " ; " -- GitLab From 7600a8d9f49a274683996a75e26693f8403f962a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Wed, 28 Jun 2023 12:36:34 +0200 Subject: [PATCH 202/213] include tailer and header to parameters dict --- autosubmit/job/job.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 09253c49a..2d816a877 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -149,6 +149,8 @@ class Job(object): self.export = "none" self.dependencies = [] self.start_time = None + self.ext_header_path = '' + self.ext_tailer_path = '' def __getstate__(self): odict = self.__dict__ @@ -1098,7 +1100,37 @@ class Job(object): parameters['SCRATCH_FREE_SPACE'] = self.scratch_free_space parameters['CUSTOM_DIRECTIVES'] = self.custom_directives parameters['HYPERTHREADING'] = self.hyperthreading + # we open the files and offload the whole script as a string + # memory issues if the script is too long? Add a check to avoid problems... + if self.ext_header_path != '': + try: + header_script = open(self.ext_tailer_path, 'r').read() + except Exception as e: # add the file not found exception + AutosubmitError("Couldn't fetch extended header script") + # log it! + Log.debug( + "PARAMETER update: Extended Header: {0} doesn't exist".format(e.message)) + # ignore it! 
+ header_script = '' + parameters['EXTENDED_HEADER'] = header_script + else: + # we have no script to include + parameters['EXTENDED_HEADER'] = '' + if self.ext_tailer_path != '': + try: + tailer_script = open(self.ext_tailer_path, 'r').read() + except Exception as e: # add the file not found exception + AutosubmitError("Couldn't fetch extended tailer script") + # log it! + Log.debug( + "PARAMETER update: Extended Tailer: {0} doesn't exist".format(e.message)) + # ignore it! + tailer_script = '' + parameters['EXTENDED_TAILER'] = tailer_script + else: + # we have no script to include + parameters['EXTENDED_TAILER'] = '' parameters['CURRENT_ARCH'] = job_platform.name parameters['CURRENT_HOST'] = job_platform.host -- GitLab From a0d55adf173df21cd7140153e1cf89598f4ab416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 30 Jun 2023 10:49:36 +0200 Subject: [PATCH 203/213] add tailer and header --- autosubmit/job/job.py | 54 +++++++++++++++++------------------- autosubmit/job/job_common.py | 15 ++++++++-- autosubmit/job/job_dict.py | 3 ++ 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 2d816a877..13221c72f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -60,6 +60,29 @@ def threaded(fn): return wrapper +def read_header_tailer_script(script_path): + """ + Opens and reads a BASH script. + + Will ignore lines starting with # + + :param script_path: absolute path to the script + :type script_path: string + """ + script = '' + if script_path == '': + return script + try: + for line in open(script_path, 'r'): + if line[0] != "#": + script += line + except Exception as e: # add the file not found exception + Log.debug( + "PARAMETER update: Extended script: {0} doesn't exist".format(e.message)) + raise AutosubmitError("Couldn't fetch extended script") + return script + + class Job(object): """ Class to handle all the tasks with Jobs at HPC. 
@@ -1102,35 +1125,8 @@ class Job(object): parameters['HYPERTHREADING'] = self.hyperthreading # we open the files and offload the whole script as a string # memory issues if the script is too long? Add a check to avoid problems... - if self.ext_header_path != '': - try: - header_script = open(self.ext_tailer_path, 'r').read() - except Exception as e: # add the file not found exception - AutosubmitError("Couldn't fetch extended header script") - # log it! - Log.debug( - "PARAMETER update: Extended Header: {0} doesn't exist".format(e.message)) - # ignore it! - header_script = '' - parameters['EXTENDED_HEADER'] = header_script - else: - # we have no script to include - parameters['EXTENDED_HEADER'] = '' - - if self.ext_tailer_path != '': - try: - tailer_script = open(self.ext_tailer_path, 'r').read() - except Exception as e: # add the file not found exception - AutosubmitError("Couldn't fetch extended tailer script") - # log it! - Log.debug( - "PARAMETER update: Extended Tailer: {0} doesn't exist".format(e.message)) - # ignore it! - tailer_script = '' - parameters['EXTENDED_TAILER'] = tailer_script - else: - # we have no script to include - parameters['EXTENDED_TAILER'] = '' + parameters['EXTENDED_HEADER'] = read_header_tailer_script(self.ext_header_path) + parameters['EXTENDED_TAILER'] = read_header_tailer_script(self.ext_tailer_path) parameters['CURRENT_ARCH'] = job_platform.name parameters['CURRENT_HOST'] = job_platform.host diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index 6a81f64cb..bfbe2cbac 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -110,6 +110,7 @@ class StatisticsSnippetBash: ################### # Autosubmit header ################### + locale_to_set=$(locale -a | grep ^C.) if [ -z "$locale_to_set" ] ; then # locale installed... 
@@ -127,7 +128,12 @@ class StatisticsSnippetBash: set -xuve job_name_ptrn='%CURRENT_LOGDIR%/%JOBNAME%' echo $(date +%s) > ${job_name_ptrn}_STAT - + + ################### + # Extended header + ################### + %EXTENDED_HEADER% + ################### # Autosubmit job ################### @@ -137,7 +143,11 @@ class StatisticsSnippetBash: @staticmethod def as_tailer(): return textwrap.dedent("""\ - + + ################### + # Extended tailer + ################### + %EXTENDED_TAILER% ################### # Autosubmit tailer ################### @@ -201,7 +211,6 @@ class StatisticsSnippetPython: # expand tailer to use python3 def as_tailer(self): return textwrap.dedent("""\ - ################### # Autosubmit tailer ################### diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index ef98ee576..f2494ee33 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -519,6 +519,9 @@ class DicJobs: job.running = self.get_option(section, 'RUNNING', 'once').lower() job.x11 = bool(self.get_option(section, 'X11', False )) + job.ext_tailer_path = self.get_option(section, 'EXTENDED_TAILER_PATH', '') + job.ext_header_path = self.get_option(section, 'EXTENDED_HEADER_PATH', '') + if self.get_option(section, "SKIPPABLE", "False").lower() == "true": job.skippable = True else: -- GitLab From d1a4f3eb5072ff0fe2168c02c1abe4b288e5fbb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 30 Jun 2023 11:23:19 +0200 Subject: [PATCH 204/213] update template file --- autosubmit/config/files/jobs.conf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/autosubmit/config/files/jobs.conf b/autosubmit/config/files/jobs.conf index e57b8c819..4744113bd 100644 --- a/autosubmit/config/files/jobs.conf +++ b/autosubmit/config/files/jobs.conf @@ -56,6 +56,9 @@ ## Optional. Custom directives for the resource manager of the platform used for that job. ## Put as many as you wish in json formatted array. 
# CUSTOM_DIRECTIVE = ["#PBS -v myvar=value, "#PBS -v othervar=value"] +## Optional. Custom directive to add a custom script at the beginning or end of the autosubmit cmd +# EXTENDED_HEADER_PATH = /path/to/script.sh +# EXTENDED_TAILER_PATH = /path/to/script.sh [LOCAL_SETUP] FILE = LOCAL_SETUP.sh -- GitLab From c9e4ae28151bd6bc1cb6a0f07afc501732a88aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 30 Jun 2023 11:40:21 +0200 Subject: [PATCH 205/213] add error code to input/output issues --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 13221c72f..dcbb09615 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -79,7 +79,7 @@ def read_header_tailer_script(script_path): except Exception as e: # add the file not found exception Log.debug( "PARAMETER update: Extended script: {0} doesn't exist".format(e.message)) - raise AutosubmitError("Couldn't fetch extended script") + raise AutosubmitError("Couldn't fetch extended script", 6004) return script -- GitLab From a127c03e63a8314f25e1eff556814fe0fef72984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 30 Jun 2023 11:40:31 +0200 Subject: [PATCH 206/213] add documentation on header and tailer --- docs/source/userguide/configure/index.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/userguide/configure/index.rst b/docs/source/userguide/configure/index.rst index c57ecf29d..cadaf925e 100644 --- a/docs/source/userguide/configure/index.rst +++ b/docs/source/userguide/configure/index.rst @@ -176,6 +176,10 @@ There are also other, less used features that you can use: * QUEUE: queue to add the job to. If not specified, uses PLATFORM default. +* EXTENDED_HEADER_PATH: path to a script to be appended at the beginning of the .cmd script that Autosubmit generates. Only supports job type BASH.
+ +* EXTENDED_TAILER_PATH: path to a script to be appended at the end of the .cmd script that Autosubmit generates. Only supports job type BASH. + How to configure email notifications ------------------------------------ -- GitLab From cb8e42e75d6b1a82d04a8c2820943f80ae54ebc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 30 Jun 2023 12:29:38 +0200 Subject: [PATCH 207/213] made the path relative to the project dir --- autosubmit/job/job.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index dcbb09615..e3cbf4e92 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -60,7 +60,7 @@ def threaded(fn): return wrapper -def read_header_tailer_script(script_path): +def read_header_tailer_script(script_path, as_conf): """ Opens and reads a BASH script. @@ -68,12 +68,14 @@ def read_header_tailer_script(script_path): :param script_path: absolute path to the script :type script_path: string + :param as_conf: Autosubmit configuration file + :type as_conf: config """ script = '' if script_path == '': return script try: - for line in open(script_path, 'r'): + for line in open(os.path.join(as_conf.get_project_dir(), script_path), 'r'): if line[0] != "#": script += line except Exception as e: # add the file not found exception @@ -1125,8 +1127,8 @@ class Job(object): parameters['HYPERTHREADING'] = self.hyperthreading # we open the files and offload the whole script as a string # memory issues if the script is too long? Add a check to avoid problems... 
- parameters['EXTENDED_HEADER'] = read_header_tailer_script(self.ext_header_path) - parameters['EXTENDED_TAILER'] = read_header_tailer_script(self.ext_tailer_path) + parameters['EXTENDED_HEADER'] = read_header_tailer_script(self.ext_header_path, as_conf) + parameters['EXTENDED_TAILER'] = read_header_tailer_script(self.ext_tailer_path, as_conf) parameters['CURRENT_ARCH'] = job_platform.name parameters['CURRENT_HOST'] = job_platform.host -- GitLab From d307d0992cb6ddf167293d54d576c9fe9b952164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 30 Jun 2023 15:23:04 +0200 Subject: [PATCH 208/213] test if tailer and header are properly substituted onto the script --- test/unit/test_job.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/unit/test_job.py b/test/unit/test_job.py index fae76dccb..59e1f51fc 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -179,6 +179,36 @@ class TestJob(TestCase): write_mock.write.assert_called_with('some-content: 999, 777, 666 % %') chmod_mock.assert_called_with(os.path.join(self.job._tmp_path, self.job.name + '.cmd'), 0o755) + def test_create_header_tailer_script(self): + tailer_script = '#!/usr/bin/bash\necho "Header test"\n' + header_script = '#!/usr/bin/bash\necho "Tailer test"\n' + # arrange + self.job.parameters = dict() + self.job.type = 0 # Type.BASH + self.job.parameters["EXTENDED_HEADER"] = header_script + self.job.parameters["EXTENDED_TAILER"] = tailer_script + + self.job._tmp_path = '/tmp/' + + update_content_mock = Mock(return_value='%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%') + self.job.update_content = update_content_mock + + # fill the rest of the values on the job with something + update_parameters_mock = Mock(return_value=self.job.parameters) + self.job.update_parameters = update_parameters_mock + + # create an autosubmit config + config = Mock(spec=AutosubmitConfig) + + # will create a file on /tmp + 
self.job.create_script(config) + + with open(os.path.join(self.job._tmp_path, self.job.name + '.cmd')) as script_file: + full_script = script_file.read() + assert header_script in full_script + assert tailer_script in full_script + + def test_that_check_script_returns_false_when_there_is_an_unbound_template_variable(self): # arrange update_content_mock = Mock(return_value='some-content: %UNBOUND%') -- GitLab From 9c6c9d4f4f21a755b57aaabc6218ae25a7352cd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Thu, 6 Jul 2023 11:24:00 +0200 Subject: [PATCH 209/213] move to classmethod and added check for the type --- autosubmit/job/job.py | 72 +++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index e3cbf4e92..46a68b62f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -60,31 +60,6 @@ def threaded(fn): return wrapper -def read_header_tailer_script(script_path, as_conf): - """ - Opens and reads a BASH script. - - Will ignore lines starting with # - - :param script_path: absolute path to the script - :type script_path: string - :param as_conf: Autosubmit configuration file - :type as_conf: config - """ - script = '' - if script_path == '': - return script - try: - for line in open(os.path.join(as_conf.get_project_dir(), script_path), 'r'): - if line[0] != "#": - script += line - except Exception as e: # add the file not found exception - Log.debug( - "PARAMETER update: Extended script: {0} doesn't exist".format(e.message)) - raise AutosubmitError("Couldn't fetch extended script", 6004) - return script - - class Job(object): """ Class to handle all the tasks with Jobs at HPC. @@ -637,6 +612,49 @@ class Job(object): str(e), self.name), 6001) return + def read_header_tailer_script(self, script_path, as_conf): + """ + Opens and reads a BASH script. + + Will strip away the line with the hash bang (#!) 
+ + :param script_path: relative to the experiment directory path to the script + :type script_path: string + :param as_conf: Autosubmit configuration file + :type as_conf: config + """ + script = '' + if script_path == '': + return script + try: + for line in open(os.path.join(as_conf.get_project_dir(), script_path), 'r'): + if "!#" not in line: + script += line + else: + # check if the type of the script matches the one in the extended + if "bash" in line: + if self.type != Type.BASH: + Log.error("PARAMETER update: Extended script: script seems BASH but job isn't") + # We stop Autosubmit if we don't find the script + raise AutosubmitCritical("Extended script: script seems BASH but job isn't\n", 7011) + elif "Rscript" in line: + if self.type != Type.R: + Log.error("PARAMETER update: Extended script: script seems Rscript but job isn't") + # We stop Autosubmit if we don't find the script + raise AutosubmitCritical("Extended script: script seems Rscript but job isn't\n", 7011) + elif "python" in line: + if self.type not in (Type.PYTHON, Type.PYTHON2, Type.PYTHON3): + Log.error( + "PARAMETER update: Extended script: script seems Python but job isn't") + # We stop Autosubmit if we don't find the script + raise AutosubmitCritical("Extended script: script seems Python but job isn't\n", 7011) + except Exception as e: # log + Log.error( + "PARAMETER update: Extended script: {0} doesn't exist".format(e.message)) + # We stop Autosubmit if we don't find the script + raise AutosubmitCritical("Extended script: failed to fetch {0} \n".format(str(e)), 7014) + return script + @threaded def retrieve_logfiles(self, copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = 0,job_id=""): max_logs = 0 @@ -1127,8 +1145,8 @@ class Job(object): parameters['HYPERTHREADING'] = self.hyperthreading # we open the files and offload the whole script as a string # memory issues if the script is too long? Add a check to avoid problems... 
- parameters['EXTENDED_HEADER'] = read_header_tailer_script(self.ext_header_path, as_conf) - parameters['EXTENDED_TAILER'] = read_header_tailer_script(self.ext_tailer_path, as_conf) + parameters['EXTENDED_HEADER'] = self.read_header_tailer_script(self.ext_header_path, as_conf) + parameters['EXTENDED_TAILER'] = self.read_header_tailer_script(self.ext_tailer_path, as_conf) parameters['CURRENT_ARCH'] = job_platform.name parameters['CURRENT_HOST'] = job_platform.host -- GitLab From cf5ad01cfd3d95d335b0fb7a90a42508e3ddea5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Thu, 6 Jul 2023 11:43:29 +0200 Subject: [PATCH 210/213] Remove error message when critical error is raised --- autosubmit/job/job.py | 44 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 46a68b62f..6565b8c31 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -614,7 +614,7 @@ class Job(object): def read_header_tailer_script(self, script_path, as_conf): """ - Opens and reads a BASH script. + Opens and reads a script, failing if its shebang does not match the job type. Will strip away the line with the hash bang (#!) 
@@ -623,36 +623,32 @@ class Job(object): :param as_conf: Autosubmit configuration file :type as_conf: config """ + script_name = script_path.rsplit("/")[-1] # pick the name of the script for a more verbose error script = '' if script_path == '': return script + try: - for line in open(os.path.join(as_conf.get_project_dir(), script_path), 'r'): - if "!#" not in line: - script += line - else: - # check if the type of the script matches the one in the extended - if "bash" in line: - if self.type != Type.BASH: - Log.error("PARAMETER update: Extended script: script seems BASH but job isn't") - # We stop Autosubmit if we don't find the script - raise AutosubmitCritical("Extended script: script seems BASH but job isn't\n", 7011) - elif "Rscript" in line: - if self.type != Type.R: - Log.error("PARAMETER update: Extended script: script seems Rscript but job isn't") - # We stop Autosubmit if we don't find the script - raise AutosubmitCritical("Extended script: script seems Rscript but job isn't\n", 7011) - elif "python" in line: - if self.type not in (Type.PYTHON, Type.PYTHON2, Type.PYTHON3): - Log.error( - "PARAMETER update: Extended script: script seems Python but job isn't") - # We stop Autosubmit if we don't find the script - raise AutosubmitCritical("Extended script: script seems Python but job isn't\n", 7011) + script_file = open(os.path.join(as_conf.get_project_dir(), script_path), 'r') except Exception as e: # log - Log.error( - "PARAMETER update: Extended script: {0} doesn't exist".format(e.message)) # We stop Autosubmit if we don't find the script raise AutosubmitCritical("Extended script: failed to fetch {0} \n".format(str(e)), 7014) + + for line in script_file: + if "#!" 
not in line: + script += line + else: + # check if the type of the script matches the one in the extended + if "bash" in line: + if self.type != Type.BASH: + raise AutosubmitCritical("Extended script: script {0} seems BASH but job {1} isn't\n".format(script_name, self.script_name), 7011) + elif "Rscript" in line: + if self.type != Type.R: + raise AutosubmitCritical("Extended script: script {0} seems Rscript but job {1} isn't\n".format(script_name, self.script_name), 7011) + elif "python" in line: + if self.type not in (Type.PYTHON, Type.PYTHON2, Type.PYTHON3): + raise AutosubmitCritical("Extended script: script {0} seems Python but job {1} isn't\n".format(script_name, self.script_name), 7011) + return script @threaded -- GitLab From 9edb501ed7d547ff96e4e6350fb1c6d8671ebe5c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 26 Jul 2023 11:54:55 +0200 Subject: [PATCH 211/213] fixes 1088 --- autosubmit/job/job_packages.py | 50 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 84e2a97c6..4c6c6100a 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -25,16 +25,16 @@ except ImportError: from ConfigParser import SafeConfigParser import os +import random +import time from datetime import timedelta -import time -import random from autosubmit.job.job_common import Status -from log.log import Log,AutosubmitCritical,AutosubmitError +from log.log import Log, AutosubmitCritical + Log.get_logger("Autosubmit") -from autosubmit.job.job_exceptions import WrongTemplateException from autosubmit.job.job import Job -from bscearth.utils.date import sum_str_hours,date2str +from bscearth.utils.date import sum_str_hours from threading import Thread, Lock from typing import List import multiprocessing @@ -100,7 +100,6 @@ class JobPackageBase(object): for job in jobs: if str(job.check).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): if 
only_generate: - exit = True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): lock.acquire() @@ -148,12 +147,21 @@ class JobPackageBase(object): thread_number = thread_number * 5 chunksize = int((len(self.jobs) + thread_number - 1) / thread_number) try: - if len(self.jobs) < thread_number: - for job in self.jobs: - if str(job.check).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): - if only_generate: - exit=True - break + # get one job of each section jobs by section + if only_generate: + sections = configuration.get_wrapper_jobs(self.current_wrapper_section) + if "&" in sections: + sections.split("&") + elif " " in sections: + sections.split(" ") + else: + sections = [sections] + for section in sections: + if str(configuration._jobs_parser.get_option(section, "CHECK", 'True')).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): + exit = True + if exit: + if len(self.jobs) < thread_number: + for job in self.jobs: if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) @@ -162,15 +170,15 @@ class JobPackageBase(object): Log.warning("On submission script has some empty variables") else: Log.result("Script {0} OK",job.name) - job.update_parameters(configuration, parameters) - # looking for directives on jobs - self._custom_directives = self._custom_directives | set(job.custom_directives) - else: - Lhandle = list() - for i in xrange(0, len(self.jobs), chunksize): - Lhandle.append(self.check_scripts(self.jobs[i:i + chunksize], configuration, parameters, only_generate, hold)) - for dataThread in Lhandle: - dataThread.join() + job.update_parameters(configuration, parameters) + # looking for directives on jobs + self._custom_directives = self._custom_directives | set(job.custom_directives) + else: + Lhandle = list() + for i in xrange(0, 
len(self.jobs), chunksize): + Lhandle.append(self.check_scripts(self.jobs[i:i + chunksize], configuration, parameters, only_generate, hold)) + for dataThread in Lhandle: + dataThread.join() except BaseException as e: #should be IOERROR raise AutosubmitCritical( "Error on {1}, template [{0}] still does not exists in running time(check=on_submission actived) ".format(job.file,job.name), 7014) -- GitLab From 4354c2cdab731e022097dfc449675c4d24157365 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 26 Jul 2023 13:07:18 +0200 Subject: [PATCH 212/213] fixed test --- autosubmit/job/job_packages.py | 15 ++++++++++----- test/unit/test_job_package.py | 30 +++++++++++++++++++----------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 4c6c6100a..8e3c3409a 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -159,7 +159,7 @@ class JobPackageBase(object): for section in sections: if str(configuration._jobs_parser.get_option(section, "CHECK", 'True')).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): exit = True - if exit: + if not exit: if len(self.jobs) < thread_number: for job in self.jobs: if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): @@ -170,7 +170,8 @@ class JobPackageBase(object): Log.warning("On submission script has some empty variables") else: Log.result("Script {0} OK",job.name) - job.update_parameters(configuration, parameters) + # called inside check_script + #job.update_parameters(configuration, parameters) # looking for directives on jobs self._custom_directives = self._custom_directives | set(job.custom_directives) else: @@ -179,9 +180,13 @@ class JobPackageBase(object): Lhandle.append(self.check_scripts(self.jobs[i:i + chunksize], configuration, parameters, only_generate, hold)) for dataThread in Lhandle: dataThread.join() - except BaseException as e: #should be IOERROR - raise AutosubmitCritical( - "Error on {1}, 
template [{0}] still does not exists in running time(check=on_submission actived) ".format(job.file,job.name), 7014) + except BaseException as e: + original = e + if not exit: + raise AutosubmitCritical( + "Error on {1}, template [{0}] still does not exists in running time(check=on_submission actived)\n{2} ".format(self.jobs[0].file,self.jobs[0].name,e), 7014) + else: + raise AutosubmitCritical(original,7014) Log.debug("Creating Scripts") if not exit: if len(self.jobs) < thread_number: diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index c6a52166f..a929d6c43 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -1,12 +1,10 @@ from unittest import TestCase -import os -from mock import Mock -from mock import patch +from mock import Mock, patch, MagicMock -from autosubmit.job.job_packages import JobPackageSimple from autosubmit.job.job import Job from autosubmit.job.job_common import Status +from autosubmit.job.job_packages import JobPackageSimple class TestJobPackage(TestCase): @@ -43,24 +41,34 @@ class TestJobPackage(TestCase): def test_job_package_platform_getter(self): self.assertEquals(self.platform, self.job_package.platform) - def test_job_package_submission(self): + @patch('os.path.exists') + def test_job_package_submission(self, os_mock): # arrange write_mock = Mock().write = Mock() - - for job in self.jobs: + os_mock.return_value = True + for job in self.job_package.jobs: job._tmp_path = Mock() + job.name = "fake-name" job._get_paramiko_template = Mock("false","empty") + job.file = "fake-file" + job.update_parameters = MagicMock(return_value="fake-params") + job.parameters = "fake-params" + + self.job_package._create_scripts = Mock() self.job_package._send_files = Mock() self.job_package._do_submission = Mock() - for job in self.jobs: - job.update_parameters = Mock() + configuration = Mock() + configuration.get_project_type = Mock(return_value='fake-type') + configuration.get_project_dir = 
Mock(return_value='fake-dir') + configuration.get_project_name = Mock(return_value='fake-name') + # act - self.job_package.submit('fake-config', 'fake-params') + self.job_package.submit(configuration, 'fake-params') # assert for job in self.jobs: - job.update_parameters.assert_called_once_with('fake-config', 'fake-params') + job.update_parameters.assert_called_once_with(configuration, 'fake-params') self.job_package._create_scripts.is_called_once_with() self.job_package._send_files.is_called_once_with() self.job_package._do_submission.is_called_once_with() -- GitLab From 90c8f0307898b83627f3fa0d0ab5fde9d9e776cc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Jul 2023 08:47:05 +0200 Subject: [PATCH 213/213] fixed test --- autosubmit/job/job_packager.py | 27 +++++++++++++-------------- autosubmit/job/job_packages.py | 21 ++++++++++++--------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index ea3e0b9f1..57d20f275 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -17,17 +17,17 @@ # You should have received a copy of the GNU General Public License # along with Autosubmit. If not, see . 
-from log.log import Log, AutosubmitCritical, AutosubmitError -from autosubmit.job.job_common import Status, Type +import operator from bscearth.utils.date import sum_str_hours -from autosubmit.job.job_packages import JobPackageSimple, JobPackageVertical, JobPackageHorizontal, \ - JobPackageSimpleWrapped, JobPackageHorizontalVertical, JobPackageVerticalHorizontal, JobPackageBase -from operator import attrgetter from math import ceil -import operator -from collections import defaultdict +from operator import attrgetter from typing import List +from autosubmit.job.job_common import Status, Type +from autosubmit.job.job_packages import JobPackageSimple, JobPackageVertical, JobPackageHorizontal, \ + JobPackageSimpleWrapped, JobPackageHorizontalVertical, JobPackageVerticalHorizontal, JobPackageBase +from log.log import Log, AutosubmitCritical + class JobPackager(object): """ @@ -437,10 +437,10 @@ class JobPackager(object): wrapper_limits["min_v"], wrapper_limits["min"], len(active_jobs)), 6013) else: - message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{4} < defined_min:{5}] [wrappeable_h:{1} < defined_min_h:{2}]|[wrappeable_v:{3} < defined_min_v:{4}] ".format( + message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{5} <= defined_min:{6}] [wrappeable_h:{1} <= defined_min_h:{2}]|[wrappeable_v:{3} <= defined_min_v:{4}] ".format( self.wrapper_policy[self.current_wrapper_section], min_h, - wrapper_limits["min_h"], min_v, wrapper_limits["min_v"], - wrapper_limits["min"], len(active_jobs)) + wrapper_limits["min_h"], min_v, wrapper_limits["min_v"], len(p.jobs), + wrapper_limits["min"]) if hard_deadlock: message += "\nCheck your configuration: The next wrappeable job can't be wrapped until some of inner jobs of current packages finishes which is imposible" if min_v > 1: @@ -481,13 +481,12 @@ class JobPackager(object): if len(active_jobs) > 0: if show_log: Log.printlog( - 
"Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappeable:{4} < defined_min:{5}] [wrappeable_h:{0} < defined_min_h:{1}]|[wrappeable_v:{2} < defined_min_v:{3}] waiting until the wrapper can be formed.".format( + "Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappeable:{4} <= defined_min:{5}] [wrappeable_h:{0} <= defined_min_h:{1}]|[wrappeable_v:{2} <= defined_min_v:{3}] waiting until the wrapper can be formed.".format( min_h, wrapper_limits["min_h"], min_v, wrapper_limits["min_v"],wrapper_limits["min"],len(active_jobs)), 6013) else: - message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{4} < defined_min:{5}] [wrappeable_h:{1} < defined_min_h:{2}]|[wrappeable_v:{3} < defined_min_v:{4}] ".format( - self.wrapper_policy[self.current_wrapper_section], min_h, - wrapper_limits["min_h"], min_v, wrapper_limits["min_v"],wrapper_limits["min"],len(active_jobs)) + message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{5} <= defined_min:{6}] [wrappeable_h:{1} <= defined_min_h:{2}]|[wrappeable_v:{3} <= defined_min_v:{4}] ".format( + self.wrapper_policy[self.current_wrapper_section], min_h,wrapper_limits["min_h"],min_v, wrapper_limits["min_v"], len(p.jobs),wrapper_limits["min"]) if hard_deadlock: message += "\nCheck your configuration: The next wrappeable job can't be wrapped until some of inner jobs of current packages finishes which is imposible" if min_v > 1: diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 8e3c3409a..6174ffc69 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -113,9 +113,9 @@ class JobPackageBase(object): Log.warning("On submission script has some empty variables") else: Log.result("Script {0} OK", job.name) - lock.acquire() - job.update_parameters(configuration, parameters) - lock.release() + # lock.acquire() + # 
job.update_parameters(configuration, parameters) + # lock.release() # looking for directives on jobs self._custom_directives = self._custom_directives | set(job.custom_directives) @threaded @@ -149,13 +149,16 @@ class JobPackageBase(object): try: # get one job of each section jobs by section if only_generate: - sections = configuration.get_wrapper_jobs(self.current_wrapper_section) - if "&" in sections: - sections.split("&") - elif " " in sections: - sections.split(" ") + if hasattr(configuration, 'current_wrapper_section'): + sections = configuration.get_wrapper_jobs(self.current_wrapper_section) + if "&" in sections: + sections.split("&") + elif " " in sections: + sections.split(" ") + else: + sections = [sections] else: - sections = [sections] + sections = [self.jobs[0].section] for section in sections: if str(configuration._jobs_parser.get_option(section, "CHECK", 'True')).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): exit = True -- GitLab