From 077cd9ad691f64e9d37bea3d1f951a2633883dad Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 28 Jul 2022 16:19:58 +0200 Subject: [PATCH 001/121] over_wallclock fix --- autosubmit/job/job.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 226b85c37..948269142 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -768,6 +768,22 @@ class Job(object): except BaseException as e: pass return + def parse_time(self,wallclock): + format = "minute" + regex = re.compile(r'(((?P<hours>\d+):)((?P<minutes>\d+)))(:(?P<seconds>\d+))?') + parts = regex.match(wallclock) + if not parts: + return + parts = parts.groupdict() + if int(parts['hours']) > 0 : + format = "hour" + else: + format = "minute" + time_params = {} + for name, param in parts.items(): + if param: + time_params[name] = int(param) + return datetime.timedelta(**time_params),format # Duplicated for wrappers and jobs to fix in 4.0.0 def is_over_wallclock(self, start_time, wallclock): """ :param start_time: :param wallclock: :return: """ elapsed = datetime.datetime.now() - start_time - wallclock = datetime.datetime.strptime(wallclock, '%H:%M') - total = 0.0 - if wallclock.hour > 0: - total = wallclock.hour - format = "hour" - else: - format = "minute" - if format == "hour": - if wallclock.minute > 0: - total += wallclock.minute / 60.0 - if wallclock.second > 0: - total += wallclock.second / 60.0 / 60.0 + wallclock,time_format = self.parse_time(wallclock) + if time_format == "hour": + total = wallclock.days * 24 + wallclock.seconds / 60 / 60 else: - if wallclock.minute > 0: - total += wallclock.minute - if wallclock.second > 0: - total += wallclock.second / 60.0 + total = wallclock.days * 24 + wallclock.seconds / 60 total = total * 1.30 # in this case we only want to avoid slurm issues so the time is increased by 50% - if format == "hour": + if time_format == "hour": hour = int(total) minute = int((total - int(total))
* 60.0) second = int(((total - int(total)) * 60 - -- GitLab From 45008de2246909d4c4c0d441ce2647d234f95089 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 4 Aug 2022 04:21:27 +0200 Subject: [PATCH 002/121] adds support for post+1 --- autosubmit/job/job_list.py | 13 ++++++++++++- docs/source/usage/configuration/new_job.rst | 4 +++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 3d55bb040..d880e3dc4 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -376,6 +376,8 @@ class JobList(object): # Get current job dependency relations. Used for select chunk option. This is the job in where select chunks option is defined if len(dependency.select_chunks_orig) > 0: # find chunk relation other_parents = dic_jobs.get_jobs(dependency.section, date, member, None) + jobs_by_section = [p for p in other_parents if p.section == dependency.section] + chunk_relation_indx = 0 while chunk_relation_indx < len(dependency.select_chunks_orig): if job.running in ["once"] or len(dependency.select_chunks_orig[chunk_relation_indx]) == 0 or job.chunk in dependency.select_chunks_orig[chunk_relation_indx]: @@ -425,7 +427,16 @@ class JobList(object): JobList._add_edge(graph, job, parent) other_parents.remove(parent) visited_parents.add(parent) - + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
+ if len(job.parents) <= 0: + for relation_indx in chunk_relations_to_add: + for parent in jobs_by_section: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + visited_parents.add(parent) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) diff --git a/docs/source/usage/configuration/new_job.rst b/docs/source/usage/configuration/new_job.rst index e8fb39692..b9099e348 100644 --- a/docs/source/usage/configuration/new_job.rst +++ b/docs/source/usage/configuration/new_job.rst @@ -31,10 +31,12 @@ This is the minimum job definition and usually is not enough. You usually will n .. code-block:: ini [jobs] - SELECT_CHUNKS = SIM*[1]*[3] # Enables the dependency of chunk 1 with chunk 3. While chunks 2,4 won't be linked. + SELECT_CHUNKS = SIM*[1:3] # Enables the dependency of chunk 1,2 and 3. While 4 won't be linked. SELECT_CHUNKS = SIM*[1,3] # Enables the dependency of chunk 1 and 3. While 2 and 4 won't be linked SELECT_CHUNKS = SIM*[1] # Enables the dependency of chunk 1. While 2, 3 and 4 won't be linked + SELECT_CHUNKS = SIM*[1]*[3] # Enables the dependency of SIM_1 with CHILD_3. While chunks 2,4 won't be linked. + SELECT_CHUNKS = SIM*[2:4]*[2:4] SIM*[2]*[1] # Links SIM_2:4 with CHILDREN_2:4 and links SIM_2 with CHILD_1 * SELECT_MEMBERS (optional): by default, all sections depend on all jobs the items specified on the DEPENDENCIES parameter. However, with this parameter, you could select the members of a specific job section. At the end of this doc, you will find diverse examples of this feature. Caution, you must pick the member index, not the member name. 
-- GitLab From 1cec0bab0ee05c47f8bc2b5b3bc5de6c6422b6ec Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 8 Aug 2022 16:36:54 +0200 Subject: [PATCH 003/121] fix project_Destination --- autosubmit/autosubmit.py | 3 ++- autosubmit/config/config_common.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 19dc23baf..5bdb10116 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4324,7 +4324,8 @@ class Autosubmit: """ project_destination = as_conf.get_project_destination() if project_destination is None or len(project_destination) == 0: - raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) + if project_type.lower() != "none": + raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": submitter = Autosubmit._get_submitter(as_conf) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 7b2a6a12b..e3e9188a4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1119,11 +1119,14 @@ class AutosubmitConfig(object): elif self.get_project_type().lower() == "git": value = self.get_git_project_origin().split( '/')[-1].split('.')[-2] - return value + if value != "": + return value + else: + return "project_files" except Exception as exp: Log.debug(str(exp)) Log.debug(traceback.format_exc()) - return '' + return "project_files" def set_git_project_commit(self, as_conf): """ -- GitLab From 3fbfdb808c309b83d63192cd78aa2212ce77f191 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:09:33 +0200 Subject: [PATCH 004/121] tkinter --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 2b4a65497..e6112df7d 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -11,7 +11,7 @@ The 
Autosubmit code is maintained in *PyPi*, the main source for python packages .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. -- GitLab From 71cd5a6bed9d30231a562d9664f89d585934b5f5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 14:56:12 +0200 Subject: [PATCH 005/121] tkinter --- docs/source/installation.rst | 4 ++-- requeriments.txt | 1 + setup.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index e6112df7d..5dd60a136 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -7,11 +7,11 @@ How to install The Autosubmit code is maintained in *PyPi*, the main source for python packages. -- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk, python2-dev, graphviz >= 2.41, pip2 +- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk(tkinter in centOS), python2-dev, graphviz >= 2.41, pip2 .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10 .. 
important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. diff --git a/requeriments.txt b/requeriments.txt index f2dfdd0aa..d57974475 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -13,6 +13,7 @@ typing bscearth.utils cryptography==3.3.2 PyNaCl==1.4.0 +six>=1.10.0 requests xlib Pygments \ No newline at end of file diff --git a/setup.py b/setup.py index 35e8f4f4f..7935f7a42 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], extras_require={ -- GitLab From 46b2c19842702a6f82aef43089326fd1d385f86d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:42:36 +0200 Subject: [PATCH 006/121] author change --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7935f7a42..d4d0f0179 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ setup( version=version, description='Autosubmit: a versatile tool to manage Weather and Climate Experiments in diverse ' 'Supercomputing Environments', - author='Domingo Manubens-Gil', - author_email='domingo.manubens@bsc.es', + author='Daniel Beltran Mora', + author_email='daniel.beltran@bsc.es', url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], -- GitLab From f7a014ba3d3b554ddc52e6a27be42de501e0f6ff Mon Sep 17 00:00:00 2001 From: dbeltran 
Date: Wed, 10 Aug 2022 15:08:10 +0200 Subject: [PATCH 007/121] Added requests, improvement exception recovery for wrappers , added more info, bugfixed status appearing in log.out , bug fixed lc level not being able to change --- autosubmit/autosubmit.py | 54 ++++++++++++++--------- autosubmit/platforms/paramiko_platform.py | 32 ++++++++------ autosubmit/platforms/platform.py | 2 +- environment.yml | 1 + log/log.py | 15 ++++++- setup.py | 2 +- 6 files changed, 69 insertions(+), 37 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5bdb10116..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -162,7 +162,7 @@ class Autosubmit: parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='WARNING', type=str, + default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='INFO', type=str, @@ -1659,7 +1659,11 @@ class Autosubmit: Log.debug('Checking Wrapper {0}'.format(str(job_id))) wrapper_job.checked_time = datetime.datetime.now() # This is where wrapper will be checked on the slurm platform, update takes place. - platform.check_job(wrapper_job) + try: + platform.check_job(wrapper_job,is_wrapper=True) + except BaseException as e: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking wrapper {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) #Log.info("FD 3Wrapper checked: {0}".format(log.fd_show.fd_table_status_str())) try: if wrapper_job.status != wrapper_job.new_status: @@ -1671,8 +1675,12 @@ class Autosubmit: "Wrapper is in Unknown Status couldn't get wrapper parameters", 7050) # New status will be saved and inner_jobs will be checked. 
- wrapper_job.check_status( - wrapper_job.new_status) + try: + wrapper_job.check_status(wrapper_job.new_status) + except: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking the inner_jobs of {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) + # Erase from packages if the wrapper failed to be queued ( Hold Admin bug ) if wrapper_job.status == Status.WAITING: for inner_job in wrapper_job.job_list: @@ -1782,9 +1790,18 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - consecutive_retrials = 1 - delay = min(15*consecutive_retrials,120) + consecutive_retrials = 0 + failed_names = {} + Log.info("Storing failed job count...") + try: + for job in job_list.get_job_list(): + if job.fail_count > 0: + failed_names[job.name] = job.fail_count + except BaseException as e: + Log.printlog("Error trying to store failed job count",Log.WARNING) + Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: + delay = min(15 * consecutive_retrials, 120) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1794,6 +1811,7 @@ class Autosubmit: Log.info("Recovering job_list...") job_list = Autosubmit.load_job_list( expid, as_conf, notransitive=notransitive) + Log.info("Recovering job_list... Done") if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1801,26 +1819,20 @@ class Autosubmit: "Only jobs with member value in {0} or no member will be allowed in this run. 
Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) platforms_to_test = set() + Log.info("Recovering platform information...") for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) - #Recover job_list while keeping job.fail_count - failed_names = {} - for job in job_list.get_job_list(): - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] - platforms_to_test.add(job.platform) - if job.fail_count > 0: - failed_names[job.name] = job.fail_count + + Log.info("Recovering platform information... Done") + Log.info("Recovering Failure count...") for job in job_list.get_job_list(): if job.name in failed_names.keys(): job.fail_count = failed_names[job.name] - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] + Log.info("Recovering Failure count... Done") + Log.info("Recovering parameters...") Autosubmit._load_parameters(as_conf, job_list, submitter.platforms) # Recovery wrapper [Packages] @@ -1876,9 +1888,11 @@ class Autosubmit: None, None, jobs[0].platform, as_conf, jobs[0].hold) job_list.job_package_map[jobs[0].id] = wrapper_job + Log.info("Recovering wrappers... Done") job_list.update_list(as_conf) Log.info("Saving recovered job list...") job_list.save() + Log.info("Saving recovered job list... 
Done") recovery = True Log.result("Recover of job_list is completed") except AutosubmitError as e: @@ -1886,10 +1900,10 @@ class Autosubmit: Log.result("Recover of job_list has fail {0}".format(e.message)) except IOError as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) except BaseException as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) # Restore platforms and try again, to avoid endless loop with failed configuration, a hard limit is set. reconnected = False mail_notify = True diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 43adfd5c6..e57512f55 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -452,17 +452,20 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status :param retries: retries :param job: job + :type job: autosubmit.job.job.Job + :param default_status: default status if job is not found :type job: class(job) :param default_status: status to assign if it can be retrieved from the platform :type default_status: autosubmit.job.job_common.Status :return: current job status :rtype: autosubmit.job.job_common.Status + """ job_id = job.id job_status = Status.UNKNOWN @@ -491,19 +494,20 @@ class ParamikoPlatform(Platform): job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: job_status = Status.RUNNING - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - 
wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + if not is_wrapper: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED elif job_status in self.job_status['QUEUING'] and job.hold is False: job_status = Status.QUEUING elif job_status in self.job_status['QUEUING'] and job.hold is True: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index c2ccf3575..acbb20aa7 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -384,7 +384,7 @@ class Platform(object): """ raise NotImplementedError - def check_job(self, jobid, default_status=Status.COMPLETED, retries=5): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status diff --git a/environment.yml b/environment.yml index 4585486d9..bc6e7308b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - portalocker - networkx - python=2.7 +- requests - pip: - bscearth.utils - Xlib diff --git a/log/log.py b/log/log.py index ae3ca5a74..216fc23eb 100644 --- a/log/log.py +++ b/log/log.py @@ -161,7 +161,7 @@ class Log: logging.getLogger(name) 
@staticmethod - def set_file(file_path, type='out', level=WARNING): + def set_file(file_path, type='out', level="WARNING"): """ Configure the file to store the log. If another file was specified earlier, new messages will only go to the new file. @@ -169,6 +169,19 @@ class Log: :param file_path: file to store the log :type file_path: str """ + levels = {} + levels["STATUS_FAILED"] = 500 + levels["STATUS"] = 1000 + levels["DEBUG"] = 2000 + levels["WARNING"] = 3000 + levels["INFO"] = 4000 + levels["RESULT"] = 5000 + levels["ERROR"] = 6000 + levels["CRITICAL"] = 7000 + levels["NO_LOG"] = levels["CRITICAL"] + 1000 + + level = levels.get(str(level).upper(),"DEBUG") + max_retrials = 3 retrials = 0 timeout = 5 diff --git a/setup.py b/setup.py index d4d0f0179..8e56eb8c5 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, -- GitLab From 417305910c1b0331d341cde655b9ad518b6bed96 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 15:53:09 +0200 Subject: [PATCH 008/121] stat fix --- autosubmit/autosubmit.py | 1 + autosubmit/job/job.py | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..03853b178 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,6 +1763,7 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() + if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, 
packages_persistence, hold=False) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 948269142..28c9b2be9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -630,10 +630,9 @@ class Job(object): found = False retrials = 0 while retrials < 3 and not found: - sleep(2) if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): found = True - retrials = retrials - 1 + retrials = retrials + 1 for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i)): last_log = i @@ -1181,18 +1180,18 @@ class Job(object): if self.type == Type.BASH: template = 'sleep 5' + "\n" elif self.type == Type.PYTHON: - template = 'time.sleep(30)' + "\n" + template = 'time.sleep(5)' + "\n" elif self.type == Type.R: - template = 'Sys.sleep(30)' + "\n" + template = 'Sys.sleep(5)' + "\n" template += template_file.read() template_file.close() else: if self.type == Type.BASH: - template = 'sleep 35' + template = 'sleep 5' elif self.type == Type.PYTHON: - template = 'time.sleep(35)' + template = 'time.sleep(5)' elif self.type == Type.R: - template = 'Sys.sleep(35)' + template = 'Sys.sleep(5)' else: template = '' except: -- GitLab From 29676178bca47777f1e5561eedd1f001bf73a7a2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 16:44:23 +0200 Subject: [PATCH 009/121] wrapper_type is now being saved correctly --- autosubmit/autosubmit.py | 1 - autosubmit/job/job_list.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 03853b178..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,7 +1763,6 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() - if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=False) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 
d880e3dc4..395e07467 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -215,6 +215,15 @@ class JobList(object): new, notransitive, update_structure=update_structure) for job in self._job_list: job.parameters = parameters + job_data = jobs_data.get(job.name,"none") + try: + if job_data != "none": + job.wrapper_type = job_data[12] + else: + job.wrapper_type = "none" + except BaseException as e: + job.wrapper_type = "none" + # Checking for member constraints if len(run_only_members) > 0: # Found -- GitLab From 5137b0022be260c23150e66bb153a52d86abf5a9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 08:45:34 +0200 Subject: [PATCH 010/121] erased debug info, changed exception for baseexception --- autosubmit/job/job.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 28c9b2be9..1056b93f6 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -620,10 +620,9 @@ class Job(object): submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() - except: - pass + + platform.test_connection() + max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -643,7 +642,7 @@ class Job(object): else: remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except Exception as e: + except BaseException as e: Log.printlog( "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False -- GitLab From ce7a4b1be6d3a6bd1ca782d1e3fda5ee2545f4ab Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 12:53:18 +0200 Subject: [PATCH 011/121] Fixed delay issue #862 --- autosubmit/job/job_list.py | 60 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 395e07467..395c97e4c 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -415,37 +415,37 @@ class JobList(object): if dependency.splits is not None: parent = filter( lambda _parent: _parent.split in dependency.splits, parent) - #Select chunk + select member - if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - elif len(dependency.select_members_orig) > 0: - for relation_indx in member_relations_to_add: - if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - elif len(dependency.select_chunks_orig) > 0: + #Select chunk + select member + if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + elif len(dependency.select_members_orig) > 0: + for relation_indx in member_relations_to_add: + if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + elif 
len(dependency.select_chunks_orig) > 0: + for relation_indx in chunk_relations_to_add: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. + if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
- if len(job.parents) <= 0: - for relation_indx in chunk_relations_to_add: - for parent in jobs_by_section: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - visited_parents.add(parent) + for parent in jobs_by_section: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + visited_parents.add(parent) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) -- GitLab From 9e68d54234bbbf51a089776dde1db8e9e494e195 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 13:40:57 +0200 Subject: [PATCH 012/121] Added 5min retrial in case that something is wrong while recovering the As_conf info inside a thread. 
--- autosubmit/job/job.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1056b93f6..325564bec 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -614,15 +614,27 @@ class Job(object): max_logs = 0 sleep(5) stat_file = self.script_name[:-4] + "_STAT_" + retries = 2 + count = 0 + success = False + error_message = "" + while count < retries or success: + try: + as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) + as_conf.reload() + submitter = self._get_submitter(as_conf) + submitter.load_platforms(as_conf) + success = True + except BaseException as e: + error_message = str(e) + sleep(60*5) + pass + count=count+1 + if not success: + raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) + platform = submitter.platforms[platform_name.lower()] try: - as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.reload() - submitter = self._get_submitter(as_conf) - submitter.load_platforms(as_conf) - platform = submitter.platforms[platform_name.lower()] - platform.test_connection() - max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -644,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False err_exist = False retries = 3 -- GitLab From e065788458a9e0ec3077463c0ef24844e2eeebcc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:30:45 +0200 Subject: [PATCH 013/121] e --- autosubmit/autosubmit.py | 2 +- autosubmit/job/job.py | 2 +- autosubmit/platforms/slurmplatform.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..b299c7dcc 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2227,7 +2227,7 @@ class Autosubmit: "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( - e.job_name), 7014, e.message) + e.job_name), 7014, str(e)) except AutosubmitError as e: raise except AutosubmitCritical as e: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 325564bec..1068dca65 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while count < retries or success: + while (count < retries) or success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index cd96b21cc..5d31690c4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -362,8 +362,8 @@ class SlurmPlatform(ParamikoPlatform): return export + self._submit_hold_cmd + job_script else: if not hold: - self._submit_script_file.write( - export + self._submit_cmd + job_script + "\n") + write_this = export + self._submit_cmd + job_script +"\n" + self._submit_script_file.write(write_this) else: self._submit_script_file.write( export + self._submit_hold_cmd + job_script + "\n") -- GitLab From 
1d79e5d748f7e7071ec0d4650c1b16d11bff7c96 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:34:19 +0200 Subject: [PATCH 014/121] e --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1068dca65..9365e516f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while (count < retries) or success: + while (count < retries) or not success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() -- GitLab From 4ce7f18eaa5288980bffa600211e8c6cb884675e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:50:30 +0200 Subject: [PATCH 015/121] fixed message --- autosubmit/platforms/paramiko_submitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index c597274f7..acba2bcce 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -184,8 +184,9 @@ class ParamikoSubmitter(Submitter): None) remote_platform.custom_directives = parser.get_option(section, 'CUSTOM_DIRECTIVES', None) - Log.debug("Custom directives from platform.conf: {0}".format( - remote_platform.custom_directives)) + if remote_platform.custom_directives is not None and remote_platform.custom_directives != '' and remote_platform.custom_directives != 'None': + Log.debug("Custom directives from platform.conf: {0}".format( + remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, -- GitLab From e768dde8f1892ce6c5c75712a539f129c21fff7b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 10:53:49 +0200 Subject: [PATCH 016/121] conda fix --- 
docs/source/installation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 5dd60a136..9a90c4e54 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -172,9 +172,9 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. .. code-block:: bash # Download conda - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh./Miniconda3-py39_4.12.0-Linux-x86_64.sh + wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it - ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh # Download git apt install git -y -q # Download autosubmit -- GitLab From 578751f40bf1a910b3adcba0862dec86c455e6be Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 11:01:23 +0200 Subject: [PATCH 017/121] conda fix --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 9a90c4e54..64b314886 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -174,7 +174,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download conda wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it - chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh # Download git apt install git -y -q # Download autosubmit -- GitLab From 3cdfa7f700b99e04217cf77ad570ce332980fb9d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 7 Sep 2022 15:23:52 +0200 Subject: [PATCH 018/121] Wrapper is now fully independent from total and waiting jobs as expected #857 --- autosubmit/autosubmit.py | 5 +-- autosubmit/config/config_common.py | 9 ++-- autosubmit/job/job_packager.py | 52 +++++++++++----------- autosubmit/platforms/paramiko_submitter.py | 4 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b299c7dcc..6fd5932a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1372,8 +1372,8 @@ class Autosubmit: while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) - for job in job_list.get_uncompleted_and_not_waiting(): - job.status = Status.COMPLETED + #for job in job_list.get_uncompleted_and_not_waiting(): + # job.status = Status.COMPLETED job_list.update_list(as_conf, False) @staticmethod @@ -2071,7 +2071,6 @@ class Autosubmit: platform.open_submit_script() valid_packages_to_submit = [] # type: List[JobPackageBase] for package in packages_to_submit: - try: # If called from inspect command or -cw if only_wrappers or inspect: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index e3e9188a4..3f5c39a3b 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1600,7 +1600,9 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - return 
int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', self.get_total_jobs())) + #total_jobs = self.get_total_jobs() + #unlimited because wrapper should count as one + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', 999999999)) def get_max_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ @@ -1609,8 +1611,7 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', -1)) def get_max_wrapped_jobs_horizontal(self, wrapper_section_name="wrapper"): """ @@ -1620,7 +1621,7 @@ class AutosubmitConfig(object): :rtype: int """ max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', -1)) def get_min_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 54a6268c3..cfc1235e8 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -57,7 +57,12 @@ class JobPackager(object): # Submitted + Queuing Jobs for specific Platform queuing_jobs = jobs_list.get_queuing(platform) # We now consider the running jobs count - running_jobs_count = len(jobs_list.get_running(platform)) + running_jobs = jobs_list.get_running(platform) + running_by_id = dict() + for running_job in running_jobs: + running_by_id[running_job.id] = running_job + running_jobs_len = len(running_by_id.keys()) + queued_by_id = dict() for queued_job in queuing_jobs: queued_by_id[queued_job.id] = queued_job @@ -76,10 +81,9 @@ class JobPackager(object): # .total_jobs Maximum number of jobs at the 
same time self._max_jobs_to_submit = platform.total_jobs - queuing_jobs_len # Substracting running jobs - self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_count + self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_len self._max_jobs_to_submit = self._max_jobs_to_submit if self._max_jobs_to_submit > 0 else 0 - self.max_jobs = min(self._max_wait_jobs_to_submit, - self._max_jobs_to_submit) + self.max_jobs = min(self._max_wait_jobs_to_submit,self._max_jobs_to_submit) self.wrapper_type["wrapper"] = self._as_config.get_wrapper_type() self.wrapper_policy["wrapper"] = self._as_config.get_wrapper_policy() @@ -94,24 +98,15 @@ class JobPackager(object): self.jobs_in_wrapper[wrapper_section] = self._as_config.get_wrapper_jobs(wrapper_section) self.extensible_wallclock[wrapper_section] = int(self._as_config.get_extensible_wallclock(wrapper_section)) self.wrapper_info = [self.wrapper_type,self.wrapper_policy,self.wrapper_method,self.jobs_in_wrapper,self.extensible_wallclock] # to pass to job_packages - - - # True or False - - Log.debug( - "Number of jobs available: {0}", self._max_wait_jobs_to_submit) + Log.debug("Number of jobs available: {0}", self._max_wait_jobs_to_submit) if self.hold: - Log.debug("Number of jobs prepared: {0}", len( - jobs_list.get_prepared(platform))) + Log.debug("Number of jobs prepared: {0}", len(jobs_list.get_prepared(platform))) if len(jobs_list.get_prepared(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_prepared(platform))) + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len(jobs_list.get_prepared(platform))) else: - Log.debug("Number of jobs ready: {0}", len( - jobs_list.get_ready(platform, hold=False))) + Log.debug("Number of jobs ready: {0}", len(jobs_list.get_ready(platform, hold=False))) if len(jobs_list.get_ready(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_ready(platform))) + Log.debug("Jobs ready 
for {0}: {1}", self._platform.name, len(jobs_list.get_ready(platform))) self._maxTotalProcessors = 0 def compute_weight(self, job_list): @@ -210,8 +205,7 @@ class JobPackager(object): # Sort by Priority, highest first list_of_available = sorted( available_sorted, key=lambda k: k.priority, reverse=True) - num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len( - jobs_ready), self._max_jobs_to_submit) + num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len(jobs_ready), self._max_jobs_to_submit) # Take the first num_jobs_to_submit from the list of available jobs_to_submit_tmp = list_of_available[0:num_jobs_to_submit] #jobs_to_submit = [ @@ -248,6 +242,10 @@ class JobPackager(object): wrapper_limits["max_h"] = self._as_config.get_max_wrapped_jobs_horizontal(self.current_wrapper_section) if wrapper_limits["max"] < wrapper_limits["max_v"] * wrapper_limits["max_h"]: wrapper_limits["max"] = wrapper_limits["max_v"] * wrapper_limits["max_h"] + if wrapper_limits["max_v"] == -1: + wrapper_limits["max_v"] = wrapper_limits["max"] + if wrapper_limits["max_h"] == -1: + wrapper_limits["max_h"] = wrapper_limits["max"] if '&' not in section: if self._as_config.jobs_parser.has_option(section, 'DEPENDENCIES'): dependencies_keys = self._as_config.jobs_parser.get( @@ -552,7 +550,7 @@ class JobPackager(object): def _build_horizontal_packages(self, section_list, wrapper_limits, section): packages = [] horizontal_packager = JobPackagerHorizontal(section_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) + wrapper_limits["max"], self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) package_jobs = horizontal_packager.build_horizontal_package() @@ -585,11 +583,11 @@ class JobPackager(object): """ packages = [] for job in section_list: - if self.max_jobs > 0: + if wrapper_limits["max"] > 0: if job.packed is False: job.packed = True dict_jobs = 
self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) - job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, self.max_jobs, wrapper_limits, self._platform.max_wallclock) + job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock) jobs_list = job_vertical_packager.build_vertical_package(job) packages.append(JobPackageVertical(jobs_list, configuration=self._as_config,wrapper_section=self.current_wrapper_section,wrapper_info=wrapper_info)) @@ -605,7 +603,7 @@ class JobPackager(object): ## READY JOBS ## ## Create the horizontal ## horizontal_packager = JobPackagerHorizontal(jobs_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node) + wrapper_limits["max"], self._platform.processors_per_node) if self.wrapper_type[self.current_wrapper_section] == 'vertical-horizontal': return self._build_vertical_horizontal_package(horizontal_packager, jobs_resources) @@ -654,7 +652,7 @@ class JobPackager(object): horizontal_packager.wrapper_limits["max_by_section"][section] = horizontal_packager.wrapper_limits["max_by_section"][section] - 1 horizontal_packager.wrapper_limits["max"] = horizontal_packager.wrapper_limits["max"] - actual_wrapped_jobs for job in horizontal_package: - job_list = JobPackagerVerticalSimple([job], job.wallclock, self.max_jobs, + job_list = JobPackagerVerticalSimple([job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, self._platform.max_wallclock).build_vertical_package(job) @@ -706,7 +704,7 @@ class JobPackagerVertical(object): :rtype: List() of Job Object \n """ # self.jobs_list starts as only 1 member, but wrapped jobs are added in the recursion - if len(self.jobs_list) >= self.max_jobs or len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= 
self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: + if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: return self.jobs_list child = self.get_wrappable_child(job) # If not None, it is wrappable @@ -897,7 +895,7 @@ class JobPackagerHorizontal(object): for section in jobs_by_section: current_package_by_section[section] = 0 for job in jobs_by_section[section]: - if self.max_jobs > 0 and len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: + if len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: if int(job.tasks) != 0 and int(job.tasks) != int(self.processors_node) and \ int(job.tasks) < job.total_processors: nodes = int( diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index acba2bcce..1f577426f 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -159,8 +159,10 @@ class ParamikoSubmitter(Submitter): asconf.get_max_processors()) remote_platform.max_waiting_jobs = int(parser.get_option(section, 'MAX_WAITING_JOBS', asconf.get_max_waiting_jobs())) - remote_platform.total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', + totaljobs = int(parser.get_option(section, 'TOTALJOBS', asconf.get_total_jobs())) + total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', asconf.get_total_jobs())) + remote_platform.total_jobs = min(min(totaljobs, total_jobs),asconf.get_total_jobs()) remote_platform.hyperthreading = parser.get_option(section, 'HYPERTHREADING', 'false').lower() remote_platform.project = 
parser.get_option( -- GitLab From f8a51172cb2f483cac5c013cfc090de213de1353 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 11:45:53 +0200 Subject: [PATCH 019/121] error message fix --- autosubmit/platforms/paramiko_platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e57512f55..e1b36f116 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -901,7 +901,7 @@ class ParamikoPlatform(Platform): except AutosubmitError as e: raise except IOError as e: - raise AutosubmitError(e.message,6016) + raise AutosubmitError("IO issues, something seems wrong with {0}".format(self.name),6016,e.message) except BaseException as e: raise AutosubmitError('Command {0} in {1} warning: {2}'.format( command, self.host, '\n'.join(stderr_readlines)), 6005, e.message) -- GitLab From 835215e84ce616477a46c4233eff5a5ec41e7114 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 15:45:24 +0200 Subject: [PATCH 020/121] docs update --- docs/source/devel_proj.rst | 19 ++++++++++++++++++- docs/source/faq.rst | 4 +++- .../usage/configuration/new_platform.rst | 4 ++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/source/devel_proj.rst b/docs/source/devel_proj.rst index 056fb3265..17caddcf5 100644 --- a/docs/source/devel_proj.rst +++ b/docs/source/devel_proj.rst @@ -699,4 +699,21 @@ The custom directives can be used for multiple parameters at the same time using # [test [80] // small [40] // large [1040] MAX_PROCESSORS = 80 # test [40] / small [40] // large [40] - PROCESSORS_PER_NODE = 40 \ No newline at end of file + PROCESSORS_PER_NODE = 40 + +Controling the number of active concurrent tasks in an experiment +---------------------------------------------------------------------- + +In some cases, you may want to control the number of concurrent tasks/jobs that can be active in an 
experiment. + +To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS`` and ``MAX_WAITING_JOBS`` variable in the ``conf/autosubmit_cxxx.conf`` file. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + # Maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 + diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 7d1e31b34..b659c6bdc 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -155,7 +155,9 @@ Minor errors - Error codes [6000+] +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6013 | Configuration issues | Check log output for more info | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ -| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6015 | Submission failed | Automatically, if there aren't bigger issues | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6016 | Temporal connection issues | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ diff --git a/docs/source/usage/configuration/new_platform.rst b/docs/source/usage/configuration/new_platform.rst index 173dafae4..675d4edc6 100644 --- a/docs/source/usage/configuration/new_platform.rst 
+++ b/docs/source/usage/configuration/new_platform.rst @@ -53,9 +53,9 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be waiting in this platform. +* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. -* TOTAL_JOBS: maximum number of jobs to be running at the same time in this platform. +* TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. * CUSTOM_DIRECTIVES: Custom directives for the resource manager of this platform. -- GitLab From 50b2db0ce581933ea7c3f9e2f510ebee574fbc0b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 16 Sep 2022 15:48:55 +0200 Subject: [PATCH 021/121] Now critical issues messages is always shown --- autosubmit/autosubmit.py | 7 ++++--- autosubmit/config/config_common.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6fd5932a3..355260a76 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4341,12 +4341,13 @@ class Autosubmit: raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": - submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) + try: + submitter = Autosubmit._get_submitter(as_conf) + submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform", 7014, e.message) + raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 3f5c39a3b..cc8aa3e1c 100644 --- 
a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -540,6 +540,8 @@ class AutosubmitConfig(object): # In case that there are critical errors in the configuration, Autosubmit won't continue. if running_time is True: raise AutosubmitCritical(e.message, e.code, e.trace) + else: + Log.printlog(e.message+"\n") except Exception as e: raise AutosubmitCritical( "There was an error while showing the config log messages", 7014, str(e)) -- GitLab From 022a881c7c83e2d40665d5267a984206985a4db0 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 21 Sep 2022 16:09:27 +0200 Subject: [PATCH 022/121] Patch for db_fix --- autosubmit/autosubmit.py | 20 ++++++++++++-------- requeriments.txt | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 355260a76..60b064de9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1556,7 +1556,8 @@ class Autosubmit: exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) except Exception as e: # This error is important - raise AutosubmitCritical("Error while processing historical database.", 7005, str(e)) + Log.printlog("Error while processing historical database.", 7005, str(e)) + try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -4224,13 +4225,16 @@ class Autosubmit: except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) - Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) + try: + Autosubmit.database_fix(expid) + 
exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), + as_conf.get_full_config_as_json(), + job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: if group_by: status = list() diff --git a/requeriments.txt b/requeriments.txt index d57974475..c34451db2 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +configparser argparse>=1.2,<2 python-dateutil>2 matplotlib -- GitLab From d680e3652664c7acc4a05a80aef869392667d8c8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 22 Sep 2022 09:53:04 +0200 Subject: [PATCH 023/121] Patch for db_fix (1) --- autosubmit/autosubmit.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 60b064de9..153c0c8a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1776,9 +1776,22 @@ class Autosubmit: job_list.update_list(as_conf, submitter=submitter) job_list.save() # Safe spot to store changes - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - if len(job_changes_tracker) > 0: - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except BaseException as e: + Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", + Log.INFO) + try: + Autosubmit.database_fix(expid) + exp_history 
= ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} if Autosubmit.exit: job_list.save() @@ -1949,8 +1962,16 @@ class Autosubmit: raise AutosubmitCritical("There is a bug in the code, please contact via git",7070,e.message) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.printlog() # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True -- GitLab From 6b7ff9ef2c0f5cd355530e4a3971ae382e5dedb9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Sep 2022 15:05:21 +0200 Subject: [PATCH 024/121] Does an sql dump everytime a change is detected. 
Then db_fix load this sql dump --- autosubmit/autosubmit.py | 102 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 153c0c8a3..75baab6de 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -58,6 +58,7 @@ import locale from distutils.util import strtobool from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set +import sqlite3 try: import dialog @@ -71,6 +72,7 @@ import tarfile import time import copy import os +import glob import pwd import sys import shutil @@ -1553,11 +1555,14 @@ class Autosubmit: # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) except Exception as e: - # This error is important - Log.printlog("Error while processing historical database.", 7005, str(e)) - + try: + Autosubmit.database_fix(expid) + # This error is important + except: + pass try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -1781,6 +1786,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) @@ -1790,6 
+1796,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} @@ -1965,13 +1972,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) except: - Log.printlog() + pass # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True @@ -3901,6 +3907,17 @@ class Autosubmit: raise @staticmethod + def database_backup(expid): + try: + database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) + backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) + Log.info("Backing up jobs_data...") + subprocess.call(command, shell=True) + Log.result("Jobs_data database backup completed.") + except BaseException as e: + Log.info("Jobs_data database backup failed.") + @staticmethod def database_fix(expid): """ Database methods. Performs a sql dump of the database and restores it. 
@@ -3912,52 +3929,31 @@ class Autosubmit: """ os.umask(0) # Overrides user permissions current_time = int(time.time()) + corrupted_db_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_corrupted.db".format(expid)) + database_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) - database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_{1}.db".format(expid, str(current_time))) - dump_file_name = 'job_data_{0}_{1}.sql'.format(expid, current_time) + database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + dump_file_name = 'job_data_{0}.sql'.format(expid, current_time) dump_file_path = os.path.join(BasicConfig.JOBDATA_DIR, dump_file_name) - bash_command = 'sqlite3 {0} .dump > {1}'.format(database_path, dump_file_path) + bash_command = 'cat {1} | sqlite3 {0}'.format(database_path, dump_file_path) try: - if os.path.exists(database_path): + if os.path.exists(database_path): + result = os.popen("mv {0} {1}".format(database_path, corrupted_db_path)).read() + time.sleep(10) + Log.info("Original database moved.") + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + Log.info("Restoring from sql") result = os.popen(bash_command).read() - if result is not None and os.path.exists(dump_file_path): - Log.info("sqldump {0} created".format(dump_file_path)) - Log.info( - "Backing up original database {0}".format(database_path)) - result = os.popen("mv {0} {1}".format(database_path, database_backup_path)).read() - time.sleep(10) - if result is not None and not os.path.exists(database_path): - Log.info("Original database moved.") - Log.info("Restoring from sqldump") - HUtils.create_file_with_full_permissions(database_path) - result = os.popen("cat {0} | sqlite3 {1}".format( - dump_file_path, database_path)).read() - time.sleep(10) - if result is not None and 
os.path.exists(database_path): - Log.info( - "Database {0} restored.".format(database_path)) - Log.info("Deleting sqldump.") - result = os.popen( - "rm {0}".format(dump_file_path)).read() - sleep(5) - if result is not None and not os.path.exists(dump_file_path): - ExperimentHistory(expid).initialize_database() - Log.info("sqldump file deleted.") - Log.result( - "The database {0} has been fixed.".format(database_path)) - else: - raise Exception( - "The sqldump file could not be removed.") - else: - raise Exception( - "It was not possible to restore the sqldump file.") - else: - raise Exception( - "It was not possible to delete the original database.") - else: - raise Exception("The sqldump file couldn't be created.") - else: - raise Exception("The database file doesn't exist.") + except: + Log.warning("It was not possible to restore the jobs_data.db file... , a new blank db will be created") + result = os.popen("rm {0}".format(database_path)).read() + + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() except Exception as exp: Log.critical(str(exp)) @@ -4243,17 +4239,12 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), 
as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: @@ -5018,6 +5009,7 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) -- GitLab From 7213cb18e2abe25090b6a75f440eda6e730b4302 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Sep 2022 09:21:49 +0200 Subject: [PATCH 025/121] database changes #870 --- autosubmit/autosubmit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 75baab6de..337247605 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3912,11 +3912,11 @@ class Autosubmit: database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) - Log.info("Backing up jobs_data...") + Log.debug("Backing up jobs_data...") subprocess.call(command, shell=True) - Log.result("Jobs_data database backup completed.") + Log.debug("Jobs_data database backup completed.") except BaseException as e: - Log.info("Jobs_data database backup failed.") + Log.debug("Jobs_data database backup failed.") @staticmethod def database_fix(expid): """ -- GitLab From f1f3ea23923b2eabb669cdf3ecb517915c9365d9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:03 +0200 Subject: [PATCH 
026/121] #877 conda typo --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 64b314886..4f68c3788 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -183,7 +183,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. # Create conda environment conda env update -f environment.yml -n autosubmit python=2 # Activate env - source activate autosubmit + conda activate autosubmit # Test autosubmit autosubmit -v # Configure autosubmitrc and install database as indicated in this doc -- GitLab From 2867216631fe6d9c1017af331afc13c0635f2dc3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:53 +0200 Subject: [PATCH 027/121] #877 changed version to the lastest one (3.14.0b) --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 4f68c3788..7159ac7c0 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -178,7 +178,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download git apt install git -y -q # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b cd autosubmit # Create conda environment conda env update -f environment.yml -n autosubmit python=2 -- GitLab From e41fab2383d907df115cbeaf1310755e60a0878c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 13:03:40 +0200 Subject: [PATCH 028/121] #inline comments, fixes for slrum --- autosubmit/autosubmit.py | 20 ++++-- autosubmit/platforms/paramiko_platform.py | 74 ++++++++++++++++------- test/regression/tests_runner.py | 1 + 3 files changed, 69 insertions(+), 26 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 337247605..09ce96335 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1584,7 +1584,7 @@ class Autosubmit: if unparsed_two_step_start != "": job_list.parse_jobs_by_filter(unparsed_two_step_start) - main_loop_retrials = 3650 # Hard limit of tries 3650 tries at 15-120seconds sleep each try + main_loop_retrials = 11250*2 # Hard limit of tries ( 48h min 72h max), 2 retrials per stop # establish the connection to all platforms Autosubmit.restore_platforms(platforms_to_test) @@ -1822,7 +1822,7 @@ class Autosubmit: Log.printlog("Error trying to store failed job count",Log.WARNING) Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: - delay = min(15 * consecutive_retrials, 120) + delay = min(15 * consecutive_retrials, 30) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1959,7 +1959,7 @@ class Autosubmit: except BaseException: reconnected = False if main_loop_retrials <= 0: - raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of 4hours reached", 7051, e.message) + raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of {0} 
retrials reached".format(main_loop_retrials), 7051, e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) except portalocker.AlreadyLocked: @@ -3322,7 +3322,12 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser @staticmethod def describe(experiment_id): """ @@ -3497,6 +3502,7 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") + parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3591,6 +3597,8 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) + parser = Autosubmit.removeInlineComments(parser) + if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') if parser.has_option('database', 'filename'): @@ -3723,11 +3731,15 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) + parser = Autosubmit.removeInlineComments(parser) + parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", width=50, height=5) os.system('clear') + + except (IOError, OSError) as e: raise AutosubmitCritical( "Can not write config file", 7012, e.message) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e1b36f116..fb9059915 100644 --- 
a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -550,35 +550,63 @@ class ParamikoPlatform(Platform): cmd = self.get_checkAlljobs_cmd(job_list_cmd) sleep_time = 5 sleep(sleep_time) - self.send_command(cmd) - while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + slurm_error = False + e_msg = "" + try: self.send_command(cmd) - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - retries -= 1 - sleep(sleep_time) - sleep_time = sleep_time + 5 + except AutosubmitError as e: + e_msg = e.trace+" "+e.message + slurm_error = True + if not slurm_error: + while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + try: + self.send_command(cmd) + except AutosubmitError as e: + e_msg = e.trace + " " + e.message + slurm_error = True + break + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + retries -= 1 + sleep(sleep_time) + sleep_time = sleep_time + 5 + job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') in_queue_jobs = [] list_queue_jobid = "" for job in job_list: - job_id = job.id - job_status = self.parse_Alljobs_output(job_list_status, job_id) - while len(job_status) <= 0 and retries >= 0: - retries -= 1 - self.send_command(cmd) - job_list_status = self.get_ssh_output() + if not slurm_error: + job_id = job.id job_status = self.parse_Alljobs_output(job_list_status, job_id) - if len(job_status) <= 0: - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - sleep(sleep_time) - sleep_time = sleep_time + 5 - # URi: define status list in HPC Queue Class + while len(job_status) <= 0 and retries >= 0: + retries -= 1 + 
self.send_command(cmd) + job_list_status = self.get_ssh_output() + job_status = self.parse_Alljobs_output(job_list_status, job_id) + if len(job_status) <= 0: + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + sleep(sleep_time) + sleep_time = sleep_time + 5 + # URi: define status list in HPC Queue Class + else: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -595,12 +623,12 @@ class ParamikoPlatform(Platform): elif retries == 0: job_status = Status.COMPLETED job.update_status(remote_logs) - else: job_status = Status.UNKNOWN Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) job.new_status = job_status + reason = str() if self.type == 'slurm' and len(in_queue_jobs) > 0: cmd = self.get_queue_status_cmd(list_queue_jobid) @@ -639,6 +667,8 @@ class ParamikoPlatform(Platform): 'check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status + if slurm_error: + raise AutosubmitError(e_msg, 6000) def get_jobid_by_jobname(self,job_name,retries=2): """ diff --git a/test/regression/tests_runner.py b/test/regression/tests_runner.py index ffd490888..ab186e849 100644 --- a/test/regression/tests_runner.py +++ 
b/test/regression/tests_runner.py @@ -79,6 +79,7 @@ def run(current_experiment_id, only_list=None, exclude_list=None, max_threads=5) tests_parser.optionxform = str tests_parser.read(tests_parser_file) + # Resetting the database clean_database(db_path) create_database() -- GitLab From 540e8a02d20e486d3302df07ab938aaa778d3eb8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 15:43:21 +0200 Subject: [PATCH 029/121] Remove inline comments working #870 --- autosubmit/autosubmit.py | 13 +++---------- autosubmit/config/config_common.py | 9 +++++++++ autosubmit/config/config_parser.py | 5 ++++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 09ce96335..82e4b44e9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3322,12 +3322,7 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - @staticmethod - def removeInlineComments(cfgparser): - for section in cfgparser.sections(): - for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) - return cfgparser + @staticmethod def describe(experiment_id): """ @@ -3502,7 +3497,6 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") - parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3597,7 +3591,6 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) - parser = Autosubmit.removeInlineComments(parser) if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') @@ -3731,8 +3724,6 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 
'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) - parser = Autosubmit.removeInlineComments(parser) - parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", @@ -5398,10 +5389,12 @@ class Autosubmit: raise AutosubmitCritical('Can not test a RERUN experiment', 7014) content = open(as_conf.experiment_file).read() + if random_select: if hpc is None: platforms_parser = as_conf.get_parser( ConfigParserFactory(), as_conf.platforms_file) + test_platforms = list() for section in platforms_parser.sections(): if platforms_parser.get_option(section, 'TEST_SUITE', 'false').lower() == 'true': diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index cc8aa3e1c..74dcc3e1e 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1759,6 +1759,13 @@ class AutosubmitConfig(object): commit = self.get_git_project_commit() return origin_exists and (branch is not None or commit is not None) + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser + @staticmethod def get_parser(parser_factory, file_path): """ @@ -1794,5 +1801,7 @@ class AutosubmitConfig(object): raise Exception( "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) + #remove inline comments + parser = AutosubmitConfig.removeInlineComments(parser) return parser diff --git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 87b28456a..99d92fd8c 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -14,8 +14,11 @@ class ConfigParserFactory: def __init__(self): pass + + def create_parser(self): - return ConfigParser() + parser = ConfigParser() + return parser class ConfigParser(ConfPar, object): -- GitLab From 
31c925f9c2718cba9f48a62eae37dc589f490eb3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 16:04:02 +0200 Subject: [PATCH 030/121] setstatus doesn't crash anymore if the id does not exists --- autosubmit/autosubmit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 82e4b44e9..37aa84475 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4467,7 +4467,10 @@ class Autosubmit: if job.status in [Status.SUBMITTED, Status.QUEUING, Status.HELD] and final_status not in [Status.QUEUING, Status.HELD, Status.SUSPENDED]: job.hold = False if job.platform_name and job.platform_name.lower() != "local": - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + try: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + except: + pass elif job.status in [Status.QUEUING, Status.RUNNING, Status.SUBMITTED] and final_status == Status.SUSPENDED: if job.platform_name and job.platform_name.lower() != "local": job.platform.send_command("scontrol hold " + "{0}".format(job.id), ignore_log=True) -- GitLab From 25c11e3c0521957fec64cec162ae96001b2bab8a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 11:08:39 +0200 Subject: [PATCH 031/121] Fixed e message error --- autosubmit/autosubmit.py | 14 +++++++------- autosubmit/config/config_common.py | 4 ++-- autosubmit/git/autosubmit_git.py | 4 ++-- autosubmit/job/job.py | 8 ++++---- autosubmit/job/job_dict.py | 2 +- autosubmit/job/job_list.py | 6 +++--- autosubmit/monitor/monitor.py | 2 +- autosubmit/platforms/paramiko_platform.py | 16 ++++++++-------- test/regression/tests_utils.py | 2 +- 9 files changed, 29 insertions(+), 29 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 37aa84475..2fca7cb7b 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -590,7 +590,7 @@ class Autosubmit: except Exception as e: 
if type(e) is SystemExit: # Version keyword force an exception in parse arg due and os_exit(0) but the program is succesfully finished - if e.message == 0: + if str(e) == 0: print(Autosubmit.autosubmit_version) os._exit(0) raise AutosubmitCritical( @@ -836,28 +836,28 @@ class Autosubmit: if ret: Log.result("Experiment {0} deleted".format(expid_delete)) except BaseException as e: - error_message += 'Can not delete experiment entry: {0}\n'.format(e.message) + error_message += 'Can not delete experiment entry: {0}\n'.format(str(e)) Log.info("Removing experiment directory...") try: shutil.rmtree(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)) except BaseException as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) try: Log.info("Removing Structure db...") structures_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, "structure_{0}.db".format(expid_delete)) if os.path.exists(structures_path): os.remove(structures_path) except BaseException as e: - error_message += 'Can not delete structure: {0}\n'.format(e.message) + error_message += 'Can not delete structure: {0}\n'.format(str(e)) try: Log.info("Removing job_data db...") job_data_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid_delete)) if os.path.exists(job_data_path): os.remove(job_data_path) except BaseException as e: - error_message += 'Can not delete job_data: {0}\n'.format(e.message) + error_message += 'Can not delete job_data: {0}\n'.format(str(e)) except OSError as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) else: if not eadmin: raise AutosubmitCritical( @@ -1811,7 +1811,7 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - 
consecutive_retrials = 0 + consecutive_retrials = 1 failed_names = {} Log.info("Storing failed job count...") try: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 74dcc3e1e..ddbb04c78 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -513,11 +513,11 @@ class AutosubmitConfig(object): self.reload() except IOError as e: raise AutosubmitError( - "I/O Issues con config files", 6016, e.message) + "I/O Issues con config files", 6016, str(e)) except (AutosubmitCritical, AutosubmitError) as e: raise except BaseException as e: - raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,e.message) + raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,str(e)) # Annotates all errors found in the configuration files in dictionaries self.warn_config and self.wrong_config. self.check_expdef_conf() self.check_platforms_conf() diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 817b5e09b..c191c21df 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -60,7 +60,7 @@ class AutosubmitGit: shell=True) except subprocess.CalledProcessError as e: raise AutosubmitCritical( - "Failed to retrieve git info ...", 7064, e.message) + "Failed to retrieve git info ...", 7064, str(e)) if output: Log.info("Changes not committed detected... 
SKIPPING!") raise AutosubmitCritical("Commit needed!", 7013) @@ -231,7 +231,7 @@ class AutosubmitGit: output_1 = subprocess.check_output(command_1, shell=True) except BaseException as e: submodule_failure = True - Log.printlog("Trace: {0}".format(e.message), 6014) + Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( "Submodule {0} has a wrong configuration".format(submodule), 6014) else: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9365e516f..6653c51f9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -600,13 +600,13 @@ class Job(object): self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - e.message, self.name)) + str(e), self.name)) except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) return @threaded @@ -656,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 0b16d29af..d0aef9f42 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -126,7 +126,7 @@ class DicJobs: except BaseException as e: raise AutosubmitCritical( "Wrong format for {1} parameter in section {0}".format(section,called_from), 7011, - e.message) + str(e)) pass return parsed_list def read_section(self, section, priority, default_job_type, jobs_data=dict()): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 395c97e4c..ae52a0c78 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -249,7 +249,7 @@ class JobList(object): else: self._ordered_jobs_by_date_member[wrapper_section] = {} except BaseException as e: - raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,e.message) + raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,str(e)) pass @@ -1419,11 +1419,11 @@ class JobList(object): self._persistence_file, self._job_list if self.run_members is None or job_list is None else job_list) pass except BaseException as e: - raise AutosubmitError(e.message,6040,"Failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Failure while saving the job_list") except AutosubmitError as e: raise except BaseException as e: - raise AutosubmitError(e.message,6040,"Unknown failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Unknown failure while saving the job_list") def backup_save(self): diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 55c60156a..9556e7d3d 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -353,7 +353,7 @@ class Monitor: except: pass - 
Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only writted in txt".format(e.message),7014) + Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only written in txt".format(e.message),7014) def generate_output_txt(self, expid, joblist, path, classictxt=False, job_list_object=None): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index fb9059915..1c1177510 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -112,7 +112,7 @@ class ParamikoPlatform(Platform): except EOFError as e: self.connected = False raise AutosubmitError("[{0}] not alive. Host: {1}".format( - self.name, self.host), 6002, e.message) + self.name, self.host), 6002, str(e)) except (AutosubmitError,AutosubmitCritical,IOError): self.connected = False raise @@ -136,7 +136,7 @@ class ParamikoPlatform(Platform): self.host.split(',')[0]), 6002) else: raise AutosubmitCritical( - "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,e.message) + "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: self.connect(True) @@ -155,7 +155,7 @@ class ParamikoPlatform(Platform): raise except Exception as e: raise AutosubmitCritical( - 'Cant connect to this platform due an unknown error', 7050, e.message) + 'Cant connect to this platform due an unknown error', 7050, str(e)) def threaded(fn): def wrapper(*args, **kwargs): @@ -219,12 +219,12 @@ class ParamikoPlatform(Platform): elif "name or service not known" in e.strerror.lower(): raise SSHException(" {0} doesn't accept remote connections. 
Check if there is an typo in the hostname".format(self.host)) else: - raise AutosubmitError("File can't be located due an slow connection", 6016, e.message) + raise AutosubmitError("File can't be located due an slow connection", 6016, str(e)) except BaseException as e: self.connected = False - if "Authentication failed." in e.message: + if "Authentication failed." in str(e): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( - self._host_config['hostname']), 7050, e.message) + self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: self.restore_connection(reconnect=True) else: @@ -284,7 +284,7 @@ class ParamikoPlatform(Platform): return True except IOError as e: raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) + self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -358,7 +358,7 @@ class ParamikoPlatform(Platform): except BaseException as e: Log.error('Could not remove file {0} due a wrong configuration'.format( os.path.join(self.get_files_path(), filename))) - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: raise AutosubmitCritical( "Wrong User or invalid .ssh/config. 
Or invalid user in platform.conf or public key not set ", 7051, e.message) diff --git a/test/regression/tests_utils.py b/test/regression/tests_utils.py index 297fb8f75..53ead0dd5 100644 --- a/test/regression/tests_utils.py +++ b/test/regression/tests_utils.py @@ -23,7 +23,7 @@ def check_cmd(command, path=BIN_PATH, verbose='AS_TEST_VERBOSE' in os.environ): except subprocess.CalledProcessError as e: if verbose: - print e.output + print str(e) return False -- GitLab From 5640508259cc54472d91afa33ea3e2e6eb60e1a9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:16:06 +0200 Subject: [PATCH 032/121] log error --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index ddbb04c78..4b683f1e4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1763,7 +1763,10 @@ class AutosubmitConfig(object): def removeInlineComments(cfgparser): for section in cfgparser.sections(): for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + try: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + except: + pass return cfgparser @staticmethod -- GitLab From 3d42d2e3f4f5af861b9244b88334ed92ee46403f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:33:32 +0200 Subject: [PATCH 033/121] CUSTOM directive has # crashing with the removeinlinecomments --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 4b683f1e4..50c4d69e8 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1764,7 +1764,10 @@ class AutosubmitConfig(object): for section in cfgparser.sections(): for item in cfgparser.items(section): try: - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + if 
str(item[0]).upper() == "CUSTOM_DIRECTIVES": + pass + else: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) except: pass return cfgparser -- GitLab From b6609fb36022d6d6c1a8fceafed01083246b3e54 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:01:34 +0200 Subject: [PATCH 034/121] Changed delete message, added complete list of directories --- autosubmit/autosubmit.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 2fca7cb7b..0720672e7 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -818,6 +818,16 @@ class Autosubmit: :return: True if succesfully deleted, False otherwise :rtype: boolean """ + message = "The {0} experiment was removed from the local disk and from the database.".format(expid_delete) + message+= " Note that this action does not delete any data written by the experiment.\n" + message+= "Complete list of files/directories deleted:\n" + for root, dirs, files in os.walk(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): + for dir in dirs: + message += os.path.join(root, dir) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, + "structure_{0}.db".format(expid_delete)) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, + "job_data_{0}.db".format(expid_delete)) + "\n" owner,eadmin,currentOwner = Autosubmit._check_ownership(expid_delete) if expid_delete == '' or expid_delete is None and not os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR,expid_delete)): Log.printlog("Experiment directory does not exist.",Log.WARNING) @@ -865,6 +875,7 @@ class Autosubmit: else: raise AutosubmitCritical( 'Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7012) + Log.printlog(message, Log.RESULT) except Exception as e: # Avoid calling Log at this point since it is possible that tmp folder is already deleted. 
error_message += "Couldn't delete the experiment".format(e.message) -- GitLab From 70f6711de4d5cedba985e576472bdbf6ed559e8e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:19:47 +0200 Subject: [PATCH 035/121] disable inline delete --- autosubmit/config/config_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 50c4d69e8..63b31483d 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1808,6 +1808,6 @@ class AutosubmitConfig(object): "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) #remove inline comments - parser = AutosubmitConfig.removeInlineComments(parser) + #parser = AutosubmitConfig.removeInlineComments(parser) return parser -- GitLab From 2942e7e6462fa9c136c31a42bf62858c47db592a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 16:11:47 +0200 Subject: [PATCH 036/121] Fixed node missconfiguration slurm message not being detected correclty --- autosubmit/autosubmit.py | 10 ++++++---- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 4 +++- autosubmit/platforms/slurmplatform.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0720672e7..48e5b2e28 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2191,11 +2191,11 @@ class Autosubmit: platform.cancel_job(id) jobs_id = None platform.connected = False - if type(e.trace) is not None: - has_trace_bad_parameters = e.trace.lower().find("bad parameters") != -1 + if e.trace is not None: + has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 else: has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or 
e.message.lower().find("scheduler is not installed") != -1: + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2206,7 +2206,9 @@ class Autosubmit: else: error_message+="Check that {1} platform has set the correct scheduler. Sections that could be affected: {0}".format( error_msg[:-1], platform.name) - raise AutosubmitCritical(error_message,7014,e.message+"\n"+e.trace) + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) except IOError as e: raise AutosubmitError( "IO issues ", 6016, e.message) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 52afa70cc..a3a6a3b58 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -155,7 +155,7 @@ class JobPackageBase(object): exit=True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 1f577426f..12e1e70bc 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -203,6 +203,8 @@ class ParamikoSubmitter(Submitter): if parser.has_option(section, 'SERIAL_PLATFORM'): platforms[section.lower()].serial_platform = 
platforms[parser.get_option(section, 'SERIAL_PLATFORM', - None).lower()] + None)] + if platforms[section.lower()].serial_platform is not None: + platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 5d31690c4..d757256a4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -466,7 +466,7 @@ class SlurmPlatform(ParamikoPlatform): else: retries = 9999 except BaseException as e: # Unrecoverable error - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 -- GitLab From df2165954c91e66e713605cbc1d74644c207abaa Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 11:16:06 +0200 Subject: [PATCH 037/121] Added include_members and chunks #748 --- autosubmit/job/job_dict.py | 36 ++++++++++++++++++++++++++++++------ test/unit/test_dic_jobs.py | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index d0aef9f42..b7e6b4a6d 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -152,11 +152,19 @@ class DicJobs: elif running == 'date': self._create_jobs_startdate(section, priority, frequency, default_job_type, jobs_data,splits) elif running == 'member': - self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits,self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits, \ + self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + elif running == 'chunk': 
synchronize = self.get_option(section, "SYNCHRONIZE", None) delay = int(self.get_option(section, "DELAY", -1)) - self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data,excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"),excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data, \ + excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"), \ + excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + included_chunks=self.parse_relation(section,False,self.get_option(section, "INCLUDED_CHUNKS", []),"INCLUDED_CHUNKS"), \ + included_members=self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + pass def _create_jobs_once(self, section, priority, default_job_type, jobs_data=dict(),splits=0): @@ -218,7 +226,7 @@ class DicJobs: - def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[]): + def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[],included_members=[]): """ Create jobs to be run once per member @@ -242,11 +250,18 @@ class DicJobs: count = 0 if splits > 0: for member in self._member_list: - if self._member_list.index(member) not in excluded_members: - tmp_dic[section][date][member] = [] + if len(included_members) == 0: + if self._member_list.index(member) not in excluded_members: + tmp_dic[section][date][member] = [] + else: + if self._member_list.index(member) in included_members: + tmp_dic[section][date][member] = [] for member in self._member_list: if 
self._member_list.index(member) in excluded_members: continue + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue count += 1 if count % frequency == 0 or count == len(self._member_list): if splits <= 0: @@ -259,7 +274,7 @@ class DicJobs: - def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[]): + def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]): """ Create jobs to be run once per chunk @@ -282,6 +297,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): @@ -311,6 +329,9 @@ class DicJobs: for date in self._date_list: self._dic[section][date] = dict() for member in self._member_list: + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue if self._member_list.index(member) in excluded_members: continue self._dic[section][date][member] = dict() @@ -318,6 +339,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 5565c9328..39f7690b2 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -123,7 +123,7 @@ class TestDicJobs(TestCase): self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() 
self.dictionary._create_jobs_member.assert_not_called() - self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[]) + self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]) def test_dic_creates_right_jobs_by_startdate(self): # arrange -- GitLab From 6127001e9093604718f0f1546b4de2a0eef92bb7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:17:17 +0200 Subject: [PATCH 038/121] Bugfix timeout #812 --- autosubmit/platforms/locplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 3fe62f5cc..e7734b133 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -83,7 +83,7 @@ class LocalPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From 99aec684a83eeafd9db75d2f9f9c0378f23ef9e9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:29:32 +0200 Subject: [PATCH 039/121] Erased wrong info about TOTAL_JOBS --- .../usage/configuration/new_platform.rst | 2 +- docs/source/usage/run_modes/wrappers.rst | 34 ++----------------- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/docs/source/usage/configuration/new_platform.rst b/docs/source/usage/configuration/new_platform.rst index 675d4edc6..971778061 100644 --- a/docs/source/usage/configuration/new_platform.rst +++ b/docs/source/usage/configuration/new_platform.rst @@ -53,7 
+53,7 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. +* MAX_WAITING_JOBS: Maximum number of jobs to be queuing or submitted in this platform. * TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. diff --git a/docs/source/usage/run_modes/wrappers.rst b/docs/source/usage/run_modes/wrappers.rst index 8085e4884..388c215ef 100644 --- a/docs/source/usage/run_modes/wrappers.rst +++ b/docs/source/usage/run_modes/wrappers.rst @@ -14,34 +14,6 @@ At the moment there are 4 types of wrappers that can be used depending on the ex When using the wrapper, it is useful to be able to visualize which packages are being created. So, when executing *autosubmit monitor cxxx*, a dashed box indicates the jobs that are wrapped together in the same job package. -How to configure -======================== - -In ``autosubmit_cxxx.conf``, regardless of the wrapper type, you need to make sure that the values of the variables **MAXWAITINGJOBS** and **TOTALJOBS** are increased according to the number of jobs expected to be waiting/running at the same time in your experiment. - -For example: - -.. code-block:: ini - - [config] - EXPID = .... - AUTOSUBMIT_VERSION = 3.13.0 - ... - - MAXWAITINGJOBS = 100 - TOTALJOBS = 100 - ... - -and below the [config] block, add the wrapper directive, indicating the wrapper type: - -.. code-block:: ini - - [wrapper] - TYPE = - -You can also specify which job types should be wrapped. This can be done using the **JOBS_IN_WRAPPER** parameter. -It is only required for the vertical-mixed type (in which the specified job types will be wrapped together), so if nothing is specified, all jobs will be wrapped. -By default, jobs of the same type will be wrapped together, as long as the constraints are satisfied. 
Number of jobs in a package *************************** @@ -57,7 +29,7 @@ Number of jobs in a package - **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` - - If **MAX_WRAPPED** is not defined, then **TOTALJOBS** is used by default + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. - **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain - If not defined, it considers that **MIN_WRAPPED** is 2. - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. @@ -241,7 +213,7 @@ In `autosubmit_cxxx.conf`: # JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM # METHOD : Select between MACHINESFILES or Shared-Memory. # MIN_WRAPPED set the minim number of jobs that should be included in the wrapper. DEFAULT = 2 - # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS + # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = 99999999999 # Policy : Select the behaviour of the inner jobs Strict/Flexible/Mixed # EXTEND_WALLCLOCK: Allows to extend the wallclock by the max wallclock of the horizontal package (max inner job). Values are integer units (0,1,2) # RETRIALS : Enables a retrial mechanism for vertical wrappers, or default retrial mechanism for the other wrappers @@ -250,7 +222,7 @@ In `autosubmit_cxxx.conf`: TYPE = Vertical #REQUIRED JOBS_IN_WRAPPER = SIM # Job types (as defined in jobs_cxxx.conf) separated by space. REQUIRED only if vertical-mixed MIN_WRAPPED = 2 - MAX_WRAPPED = 9999 # OPTIONAL. 
Integer value, overrides TOTALJOBS + MAX_WRAPPED = 999999 # OPTIONAL. Integer value. CHECK_TIME_WRAPPER = # OPTIONAL. Time in seconds, overrides SAFETYSLEEPTIME POLICY = flexible # OPTIONAL, Wrapper policy, mixed, flexible, strict QUEUE = bsc_es # If not specified, queue will be the same of the first SECTION specified on JOBS_IN_WRAPPER -- GitLab From 03fef134b138167b7ad8bfaeabac899011ccbbf2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:38:51 +0200 Subject: [PATCH 040/121] Added wrapper info under devel_proj -> Controling the number of active concurrent tasks in an experiment #857 --- docs/source/devel_proj.rst | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/docs/source/devel_proj.rst b/docs/source/devel_proj.rst index 17caddcf5..0dda37b3c 100644 --- a/docs/source/devel_proj.rst +++ b/docs/source/devel_proj.rst @@ -712,8 +712,34 @@ To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS .. code-block:: ini - # Maximum number of submitted,waiting and running tasks - TOTAL_JOBS = 10 - # Maximum number of submitted and waiting tasks - MAX_WAITING_JOBS = 10 + # Controls the maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Controls the maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 +To control the number of jobs included in a wrapper, you can use the `MAX_WRAPPED_JOBS` and `MIN_WRAPPED_JOBS` variables in the ``conf/autosubmit_cxxx.conf`` file. + +Note that a wrapped job is counted as a single job regardless of the number of tasks it contains. Therefore, `TOTAL_JOBS` and `MAX_WAITING_JOBS` won't have an impact inside a wrapper. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + [wrapper] + TYPE = + MIN_WRAPPED = 2 # Minium amount of jobs that will be wrapped together in any given time. + MIN_WRAPPED_H = 2 # Same as above but only for the horizontal packages. 
+ MIN_WRAPPED_V = 2 # Same as above but only for the vertical packages. + MAX_WRAPPED = 99999 # Maximum amount of jobs that will be wrapped together in any given time. + MAX_WRAPPED_H = 99999 # Same as above but only for the horizontal packages. + MAX_WRAPPED_V = 99999 # Same as above but only for the vertical packages. + +- **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section + - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. +- **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain + - If not defined, it considers that **MIN_WRAPPED** is 2. + - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. + - If **POLICY** is mixed and there are failed jobs inside a wrapper, these jobs will be submitted as individual jobs. + - If **POLICY** is strict and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will not be submitted until there are enough tasks to build a package. + - strict and mixed policies can cause **deadlocks**. 
-- GitLab From 0408c45ce3d202f46502b837f91e08cde02f082a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 16:07:50 +0200 Subject: [PATCH 041/121] Deleted argcomplete --- autosubmit/autosubmit.py | 4 +--- setup.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 48e5b2e28..ccb1bbac9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# PYTHON_ARGCOMPLETE_OK # Copyright 2015-2020 Earth Sciences Department, BSC-CNS @@ -65,7 +64,7 @@ try: except Exception: dialog = None from time import sleep -import argparse, argcomplete +import argparse import subprocess import json import tarfile @@ -583,7 +582,6 @@ class Autosubmit: # Changelog subparsers.add_parser('changelog', description='show changelog') - argcomplete.autocomplete(parser) args = parser.parse_args() diff --git a/setup.py b/setup.py index 8e56eb8c5..a5a7801ef 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ -- GitLab From c6f88d53a0c5b7d30be2c8187fbae4c46f859404 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:33:10 +0200 Subject: [PATCH 042/121] Fixed an issue with main_platform = local and no platforms configured --- autosubmit/config/config_common.py | 8 +++++--- autosubmit/history/data_classes/job_data.py | 3 ++- autosubmit/job/job_dict.py | 7 ++++--- 
autosubmit/platforms/psplatform.py | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 63b31483d..26ce6ec50 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -607,9 +607,7 @@ class AutosubmitConfig(object): """ Checks experiment's queues configuration file. """ - if len(self._platforms_parser.sections()) == 0: - self.wrong_config["Platform"] += [["Global", - "Platform file is not well-configured or found"]] + if len(self._platforms_parser.sections()) != len(set(self._platforms_parser.sections())): self.wrong_config["Platform"] += [["Global", @@ -619,7 +617,11 @@ class AutosubmitConfig(object): main_platform_found = True elif self.ignore_undefined_platforms: main_platform_found = True + if len(self._platforms_parser.sections()) == 0 and not main_platform_found: + self.wrong_config["Platform"] += [["Global", + "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): + if section in self.hpcarch: main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index b5249b797..93a88797a 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -57,7 +57,8 @@ class JobData(object): platform) > 0 else "NA" self.job_id = job_id if job_id else 0 try: - self.extra_data_parsed = loads(extra_data) + if extra_data != "": + self.extra_data_parsed = loads(extra_data) except Exception as exp: self.extra_data_parsed = {} # Fail fast self.extra_data = extra_data diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index b7e6b4a6d..29ca59e28 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -402,9 +402,10 @@ class DicJobs: for d in self._date_list: self._get_date(jobs, dic, d, member, 
chunk) try: - if type(jobs[0]) is list: - jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] - jobs = jobs_flattened + if len(jobs) > 0: + if type(jobs[0]) is list: + jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] + jobs = jobs_flattened except BaseException as e: pass return jobs diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index aee3e4eb7..e2c3ede88 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ -76,7 +76,7 @@ class PsPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From e0564c48d6230b7fd80bc048dff78a806f30069b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:41:04 +0200 Subject: [PATCH 043/121] fixed tests --- requeriments.txt | 1 + test/unit/test_dic_jobs.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requeriments.txt b/requeriments.txt index c34451db2..b5783046b 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +pytest==2.9.2 configparser argparse>=1.2,<2 python-dateutil>2 diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 39f7690b2..f955f96dc 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -81,9 +81,10 @@ class TestDicJobs(TestCase): frequency = 123 splits = 0 excluded_list_m = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='member') - self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m,included_list_m]) 
self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() @@ -95,7 +96,7 @@ class TestDicJobs(TestCase): # assert self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() - self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m) + self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m,included_list_m) self.dictionary._create_jobs_chunk.assert_not_called() def test_read_section_running_chunk_create_jobs_chunk(self): @@ -108,9 +109,11 @@ class TestDicJobs(TestCase): splits = 0 excluded_list_c = [] excluded_list_m = [] + included_list_c = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='chunk') - self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m,included_list_c,included_list_m]) self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() -- GitLab From 425f667a50ffa4b3577f4405bfbcbff0353e24cf Mon Sep 17 00:00:00 2001 From: jberlin Date: Fri, 7 Oct 2022 11:24:08 +0200 Subject: [PATCH 044/121] Made small changes to documentation concerning the Conda installation - #864 --- docs/source/installation.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 7159ac7c0..157f28ecc 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -160,7 +160,7 @@ Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. 
autosubmit install # Get expid - autosubmit expid -H TEST -d "Test exp." + autosubmit expid -H local -d "Test exp." # Create with -np # Since it was a new install the expid will be a000 @@ -175,7 +175,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git + # Download git (if it is not already installed) apt install git -y -q # Download autosubmit git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b @@ -186,4 +186,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. conda activate autosubmit # Test autosubmit autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc + # Configure autosubmitrc and install the database as indicated in the installation instructions above this section + +.. hint:: + After installing conda, you may need to close the terminal and re-open it so the installation takes effect. 
\ No newline at end of file -- GitLab From f4658dd56bde49fbd72d3eacada1d83a9fbd769d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 7 Oct 2022 13:10:25 +0200 Subject: [PATCH 045/121] updateversion does not need -v, and now stores the change in the db #882 #881 --- autosubmit/autosubmit.py | 25 +++++++++++++------------ autosubmit/config/config_common.py | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ccb1bbac9..be15c0bec 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -755,18 +755,19 @@ class Autosubmit: force_update_version = args.update_version else: force_update_version = False - if force_update_version: - if as_conf.get_version() != Autosubmit.autosubmit_version: - Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", - as_conf.get_version(), Autosubmit.autosubmit_version, expid) - as_conf.set_version(Autosubmit.autosubmit_version) - else: - if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: - raise AutosubmitCritical( - "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" - "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), - Autosubmit.autosubmit_version, expid,args.command), - 7067) + if args.command not in ["upgrade","updateversion"]: + if force_update_version: + if as_conf.get_version() != Autosubmit.autosubmit_version: + Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", + as_conf.get_version(), Autosubmit.autosubmit_version, expid) + as_conf.set_version(Autosubmit.autosubmit_version) + else: + if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: + raise 
AutosubmitCritical( + "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" + "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), + Autosubmit.autosubmit_version, expid,args.command), + 7067) else: if expid == 'None': exp_id = "" diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 26ce6ec50..c0cacf190 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -35,6 +35,7 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from autosubmit.config.basicConfig import BasicConfig from collections import defaultdict +from autosubmit.database.db_common import update_experiment_descrip_version class AutosubmitConfig(object): @@ -1369,6 +1370,7 @@ class AutosubmitConfig(object): content = content.replace(re.search('AUTOSUBMIT_VERSION =.*', content).group(0), "AUTOSUBMIT_VERSION = " + autosubmit_version) open(self._conf_parser_file, 'w').write(content) + update_experiment_descrip_version(self.expid, description=None, version=autosubmit_version) def get_version(self): """ -- GitLab From 1ba44999f5545f29caaaa3c4d98d18348a2c931b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 11 Oct 2022 15:33:59 +0200 Subject: [PATCH 046/121] NEW RUN RULES changes #847 --- autosubmit/autosubmit.py | 16 ++++++++---- autosubmit/history/experiment_history.py | 32 +++++++++++++----------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be15c0bec..be6c31665 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -59,10 +59,11 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set import sqlite3 -try: - import dialog -except Exception: - dialog = None +#try: +# import dialog +#except 
Exception: +# dialog = None +dialog = None from time import sleep import argparse import subprocess @@ -4253,7 +4254,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + + #exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + exp_history.process_status_changes(job_list.get_job_list(), + chunk_unit=as_conf.get_chunk_size_unit(), + chunk_size=as_conf.get_chunk_size(), + current_config=as_conf.get_full_config_as_json(),create=True) Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index f1e0be68c..ecd06067b 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -190,15 +190,15 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() update_these_changes = self._get_built_list_of_changes(job_list) - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size) + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) if len(update_these_changes) > 0 and should_create_new_run == False: self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: + if should_create_new_run: return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: @@ -217,11 +217,14 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size): - if len(job_list) != current_experiment_run_dc.total: - return True - if changes_count > int(self._get_date_member_completed_count(job_list)): - return True + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): + if create: + return True + elif not create and self.expid[0].lower() == "t": + if len(job_list) != current_experiment_run_dc.total: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)): + return True return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): @@ -274,15 +277,16 @@ class ExperimentHistory(): def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. 
Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {job.name: job for job in job_list} + job_name_to_job = {str(job.name): job for job in job_list} current_job_data_dcs = self.manager.get_all_last_job_data_dcs() differences = [] for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: - if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): - # If the job is not changing from a finalized status to a starting status - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) + if job_dc.job_name in job_name_to_job: + if job_dc.status != job_name_to_job[job_dc.job_name].status_str: + if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): + # If the job is not changing from a finalized status to a starting status + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) return differences def _get_defined_rowtype(self, code): -- GitLab From d9c47c385a6511ff1c991bbf96f7eaafe48eb2d7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 09:02:40 +0200 Subject: [PATCH 047/121] new run --- autosubmit/history/experiment_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index ecd06067b..96651df99 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -220,7 +220,7 @@ class ExperimentHistory(): def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): if create: return True - elif not create and self.expid[0].lower() == "t": + elif not create and self.expid[0].lower() != "t": if len(job_list) != 
current_experiment_run_dc.total: return True if changes_count > int(self._get_date_member_completed_count(job_list)): -- GitLab From 522d15890808a9cf67a76bed6b52babc48c8f419 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 13:05:59 +0200 Subject: [PATCH 048/121] Fixed pipeline tests --- test/unit/test_autosubmit_config.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test/unit/test_autosubmit_config.py b/test/unit/test_autosubmit_config.py index c4c8480df..00e624406 100644 --- a/test/unit/test_autosubmit_config.py +++ b/test/unit/test_autosubmit_config.py @@ -181,13 +181,22 @@ class TestAutosubmitConfig(TestCase): open_mock.assert_any_call(config.experiment_file, 'w') def test_set_version(self): - # arrange + + #ARRAGE + FakeBasicConfig.DB_PATH = 'fake-path' + sys.modules['os'].path.exists = Mock(return_value=True) + connection_mock = Mock() + cursor_mock = Mock() + connection_mock.cursor = Mock(return_value=cursor_mock) + cursor_mock.fetchone = Mock(return_value=[0]) + + sys.modules['sqlite3'].connect = Mock(return_value=connection_mock) config = AutosubmitConfig(self.any_expid, FakeBasicConfig, ConfigParserFactory()) open_mock = mock_open(read_data='AUTOSUBMIT_VERSION = dummy') with patch.object(builtins, "open", open_mock): # act - config.set_version('dummy-vesion') + config.set_version('dummy-version') # assert open_mock.assert_any_call(getattr(config, '_conf_parser_file'), 'w') @@ -461,3 +470,4 @@ class FakeBasicConfig: LOCAL_PROJ_DIR = '/dummy/local/proj/dir' DEFAULT_PLATFORMS_CONF = '' DEFAULT_JOBS_CONF = '' + -- GitLab From 288ea4e64e0120eb569de151ceb90d43364b12bc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 17 Oct 2022 16:30:31 +0200 Subject: [PATCH 049/121] when creating the experiment, it now also see if there is a folder called as it. 
If there is a folder, the (old) experiment will be registered --- autosubmit/autosubmit.py | 1 - autosubmit/database/db_common.py | 9 ++++++++- autosubmit/experiment/experiment_common.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be6c31665..1ea5d3a97 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -916,7 +916,6 @@ class Autosubmit: os.mkdir(os.path.join( BasicConfig.LOCAL_ROOT_DIR, exp_id, 'conf')) Log.info("Copying config files...") - # autosubmit config and experiment copied from AS. files = resource_listdir('autosubmit.config', 'files') for filename in files: diff --git a/autosubmit/database/db_common.py b/autosubmit/database/db_common.py index aaaf4875a..47cc770eb 100644 --- a/autosubmit/database/db_common.py +++ b/autosubmit/database/db_common.py @@ -24,7 +24,7 @@ import os import sqlite3 import multiprocessing import Queue - +import autosubmit from log.log import Log, AutosubmitCritical, AutosubmitError Log.get_logger("Autosubmit") from autosubmit.config.basicConfig import BasicConfig @@ -319,6 +319,7 @@ def _check_experiment_exists(name, error_on_inexistence=True): :return: If experiment exists returns true, if not returns false :rtype: bool """ + if not check_db(): return False try: @@ -339,6 +340,12 @@ def _check_experiment_exists(name, error_on_inexistence=True): if error_on_inexistence: raise AutosubmitCritical( 'The experiment name "{0}" does not exist yet!!!'.format(name), 7005) + if os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR, name)): + try: + _save_experiment(name, 'No description', "3.14.0") + except BaseException as e: + pass + return True return False return True diff --git a/autosubmit/experiment/experiment_common.py b/autosubmit/experiment/experiment_common.py index 160f15158..3c31346c2 100644 --- a/autosubmit/experiment/experiment_common.py +++ b/autosubmit/experiment/experiment_common.py @@ -58,7 +58,7 @@ def 
new_experiment(description, version, test=False, operational=False): else: new_name = 'a000' else: - new_name = next_experiment_id(last_exp_name) + new_name = last_exp_name if new_name == '': return '' while db_common.check_experiment_exists(new_name, False): -- GitLab From 1225dc4636e913733a24bb1a15ad5f04d56756f2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 11:44:25 +0200 Subject: [PATCH 050/121] Added more reasons to a job for stop #837 --- autosubmit/autosubmit.py | 6 ++++-- autosubmit/job/job.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1ea5d3a97..27bda288e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1467,8 +1467,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( - )] + try: + job.platform = submitter.platforms[job.platform_name.lower()] + except: + raise AutosubmitCritical("hpcarch={0} not found in the platforms configuration file".format(job.platform_name), 7014) # noinspection PyTypeChecker if job.status not in (Status.COMPLETED, Status.SUSPENDED): platforms_to_test.add(job.platform) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 6653c51f9..08b39d27c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1257,7 +1257,8 @@ class Job(object): 'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: @@ -1639,7 +1640,8 @@ class WrapperJob(Job): 
'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: -- GitLab From 18ca02106a2faa2259b9f5eaddab104089cd4b85 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 16:25:49 +0200 Subject: [PATCH 051/121] dbfix pipeline --- test/unit/test_expid.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/test_expid.py b/test/unit/test_expid.py index 85e5a012b..7eee22bfc 100644 --- a/test/unit/test_expid.py +++ b/test/unit/test_expid.py @@ -31,21 +31,21 @@ class TestExpid(TestCase): @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "a006" + current_experiment_id = "a007" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version) self.assertEquals("a007", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_test_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "t0ab" + current_experiment_id = "t0ac" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version, True) self.assertEquals("t0ac", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_operational_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "o112" + current_experiment_id = "o113" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = 
new_experiment(self.description, self.version, False, True) self.assertEquals("o113", experiment_id) -- GitLab From 27ed656eff50553f9f14742df4e4c9d5f2c5d88e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 19 Oct 2022 13:32:13 +0200 Subject: [PATCH 052/121] Recursive submodules --- autosubmit/git/autosubmit_git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index c191c21df..8d194de74 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -223,7 +223,7 @@ class AutosubmitGit: else: command_1 += " git submodule init;".format(project_destination) for submodule in git_project_submodules: - command_1 += " git submodule update {0};".format(submodule) + command_1 += " git submodule update --init --recursive {0};".format(submodule) if git_remote_project_path == '': try: command_1 = "cd {0}; {1} ".format(git_path,command_1) -- GitLab From bf2246f5617b5959d63cbcfe2037fb8cea02e133 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:17:23 +0200 Subject: [PATCH 053/121] Fixed an issue raised when a platform has no project expid user or scratch dir defined --- autosubmit/autosubmit.py | 7 +++++-- autosubmit/config/config_common.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 19 +++++++++++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 27bda288e..ff24f0967 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4385,8 +4385,11 @@ class Autosubmit: submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) - + try: + hpcarch = submitter.platforms[as_conf.get_platform()] + except: + hpcarch = "local" + Log.warning("Remote clone may be disabled due to: "+e.message) 
return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": svn_project_url = as_conf.get_svn_project_url() diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index c0cacf190..ff6f31ea4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -623,7 +623,7 @@ class AutosubmitConfig(object): "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): - if section in self.hpcarch: + if section.lower() in self.hpcarch.lower(): main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): self.wrong_config["Platform"] += [[section, diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 12e1e70bc..92594abdd 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -20,7 +20,8 @@ import os -from log.log import Log + +from log.log import Log,AutosubmitCritical,AutosubmitError from autosubmit.config.basicConfig import BasicConfig from autosubmit.config.config_common import AutosubmitConfig from submitter import Submitter @@ -72,7 +73,7 @@ class ParamikoSubmitter(Submitter): :return: platforms used by the experiment :rtype: dict """ - + raise_message="" platforms_used = list() hpcarch = asconf.get_platform() platforms_used.append(hpcarch) @@ -191,12 +192,16 @@ class ParamikoSubmitter(Submitter): remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) - remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, - remote_platform.user, remote_platform.expid) + try: + remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, + remote_platform.user, remote_platform.expid) + remote_platform.update_cmds() + platforms[section.lower()] = remote_platform + + except: + raise_message = 
"Error in platform.conf: SCRATCH_DIR, PROJECT, USER, EXPID must be defined for platform {0}".format(section) # Executes update_cmds() from corresponding Platform Object - remote_platform.update_cmds() # Save platform into result dictionary - platforms[section.lower()] = remote_platform for section in parser.sections(): # if this section is included in platforms @@ -208,3 +213,5 @@ class ParamikoSubmitter(Submitter): platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms + if raise_message != "": + raise AutosubmitError(raise_message) -- GitLab From c65ca656d52a27f60e409e06a6a6ca6f9ba63cb4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:39 +0200 Subject: [PATCH 054/121] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ad59f742d..cd56dd095 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0b +#3.14.0b -- GitLab From c41ef89f05ab40a890174f3f7f523077b55cb804 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:49 +0200 Subject: [PATCH 055/121] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index cd56dd095..ad59f742d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -#3.14.0b +3.14.0b -- GitLab From be9bc4a4d4293b9cee2e0f4c6bafe9b30925a6f4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:39:37 +0200 Subject: [PATCH 056/121] test local git not working proprly --- autosubmit/autosubmit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ff24f0967..5133dc4ca 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1359,6 +1359,7 @@ class Autosubmit: for wrapper_section in as_conf.get_wrapper_multi(): wrapper_jobs[wrapper_section] = as_conf.get_wrapper_jobs(wrapper_section) wrapper_jobs["wrapper"] = as_conf.get_wrapper_jobs("wrapper") + # 
Log.warning("Aux Job_list was generated successfully") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) -- GitLab From 330574b835d9571f42645dd4bc6af079a353cff6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 3 Nov 2022 09:42:59 +0100 Subject: [PATCH 057/121] Unbound variable fixes in some messages and job_status #893 Fixed an issue with recovery not cancelling all jobs --- autosubmit/autosubmit.py | 17 ++++---- autosubmit/git/autosubmit_git.py | 4 +- autosubmit/job/job.py | 49 ++++++++++++----------- autosubmit/job/job_list.py | 14 +++---- autosubmit/monitor/diagram.py | 12 ++++-- autosubmit/platforms/paramiko_platform.py | 37 ++++++++--------- 6 files changed, 69 insertions(+), 64 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5133dc4ca..cbb4b142d 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2068,7 +2068,6 @@ class Autosubmit: platform.connected = False Log.printlog("[{1}] Connection failed to host {0}".format( platform.host, platform.name),Log.WARNING) if issues != "": - platform.connected = False raise AutosubmitCritical( "Issues while checking the connectivity of platforms.", 7010, issues+"\n"+ssh_config_issues) @@ -2221,7 +2220,7 @@ class Autosubmit: "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: raise AutosubmitError( - "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(e.message,e.trace), 6015) + "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(e),""), 6015) i = 0 if hold: sleep(10) @@ -2677,9 +2676,9 @@ class Autosubmit: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) for platform in platforms_to_test: platform.test_connection() - 
job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) if not force: raise AutosubmitCritical( "Experiment can't be recovered due being {0} active jobs in your experiment, If you want to recover the experiment, please use the flag -f and all active jobs will be cancelled".format( @@ -3235,16 +3234,16 @@ class Autosubmit: # Preparation for section parameters no_load_sections = False no_load_platforms = False - try: - job_list = Autosubmit.load_job_list( - expid, as_conf, notransitive=False) - except Exception as e: - no_load_sections = True + + job_list = Autosubmit.load_job_list( + expid, as_conf, notransitive=False) + try: submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) except Exception as e: no_load_platforms = True + submitter = Autosubmit._get_submitter(as_conf) submitter.load_local_platform(as_conf) try: # Gathering parameters of autosubmit and expdef config files @@ -4049,7 +4048,7 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - Autosubmit.unarchive(expid, uncompress=False) + Autosubmit.unarchive(expid, uncompressed=False) raise AutosubmitCritical( "Can not remove or rename experiments folder", 7012, str(e)) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 8d194de74..493358ed0 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -203,7 +203,7 @@ class AutosubmitGit: command_0 = "cd {0} ; {1}".format(project_path, command_0) output_0 = subprocess.check_output(command_0, shell=True) else: - command_0 = "cd {0} ; {1}".format(git_remote_path, command_0) + command_0 = "cd {0} ; {1}".format(project_path, command_0) hpcarch.send_command(command_0) ##command 1 if os.path.exists(os.path.join(git_path, ".githooks")): @@ -233,7 +233,7 @@ class AutosubmitGit: submodule_failure = True Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( - 
"Submodule {0} has a wrong configuration".format(submodule), 6014) + "Submodule has a wrong configuration.\n{0}".format(command_1), 6014) else: command_1 = "cd {0}; {1} ".format(git_remote_path, command_1) hpcarch.send_command(command_1) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 08b39d27c..739216c4a 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -624,6 +624,7 @@ class Job(object): as_conf.reload() submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) + platform = submitter.platforms[platform_name.lower()] success = True except BaseException as e: error_message = str(e) @@ -632,31 +633,31 @@ class Job(object): count=count+1 if not success: raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) - platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() + else: max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count - if self.wrapper_type is not None and self.wrapper_type == "vertical": - found = False - retrials = 0 - while retrials < 3 and not found: - if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): - found = True - retrials = retrials + 1 - for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i)): - last_log = i - else: - break - remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." 
+ str(last_log)) + try: + platform.test_connection() + if self.wrapper_type is not None and self.wrapper_type == "vertical": + found = False + retrials = 0 + while retrials < 3 and not found: + if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): + found = True + retrials = retrials + 1 + for i in range(max_logs-1,-1,-1): + if platform.check_stat_file_by_retrials(stat_file + str(i)): + last_log = i + else: + break + remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) - else: - remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) + else: + remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except BaseException as e: - Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) + except BaseException as e: + Log.printlog( + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 @@ -1730,7 +1731,7 @@ class WrapperJob(Job): self.name, reason), 6009) # while running jobs? self._check_running_jobs() - self.update_failed_jobs(canceled_wrapper=True) + self.update_failed_jobs() self.cancel_failed_wrapper_job() return @@ -1760,8 +1761,8 @@ class WrapperJob(Job): job.hold = self.hold job.status = self.status if self.status == Status.WAITING: - for job in self.job_list: - job.packed = False + for job2 in self.job_list: + job2.packed = False def _check_inner_job_wallclock(self, job): start_time = self.running_jobs_start[job] diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index ae52a0c78..2a687dbd8 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -439,13 +439,13 @@ class JobList(object): # If job doesn't have any parent after a first search, search in all dependency.section. 
This is to avoid +1 being added only to the last one. if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - for parent in jobs_by_section: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + for parent2 in jobs_by_section: + if parent2.chunk in dependency.select_chunks_dest[relation_indx] or len( dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - visited_parents.add(parent) + if parent2 not in visited_parents: + job.add_parent(parent2) + JobList._add_edge(graph, job, parent2) + visited_parents.add(parent2) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) @@ -2042,7 +2042,7 @@ class JobList(object): # root exists if root is not None: - result += self._recursion_print(root, 0) + result += self._recursion_print(root, 0,[]) else: result += "\nCannot find root." 
diff --git a/autosubmit/monitor/diagram.py b/autosubmit/monitor/diagram.py index 8e8753167..b1f0f6744 100644 --- a/autosubmit/monitor/diagram.py +++ b/autosubmit/monitor/diagram.py @@ -66,14 +66,18 @@ def create_bar_diagram(experiment_id, jobs_list, general_stats, output_file, per exp_stats.calculate_statistics() exp_stats.calculate_summary() exp_stats.make_old_format() - failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + # Stats variables definition + normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) + failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) except Exception as exp: + if not isinstance(normal_plots_count,int): + normal_plots_count = 0 + if not isinstance(failed_jobs_plots_count,int): + failed_jobs_plots_count = 0 print(exp) print(traceback.format_exc()) - # Stats variables definition - normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) - failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) total_plots_count = normal_plots_count + failed_jobs_plots_count # num_plots = norma # ind = np.arange(int(MAX_JOBS_PER_PLOT)) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 1c1177510..4b5c2d4b4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -118,7 +118,7 @@ class ParamikoPlatform(Platform): raise except BaseException as e: self.connected = False - raise AutosubmitCritical(message,7051) + raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) def restore_connection(self): @@ -226,7 +226,7 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( self._host_config['hostname']), 7050, str(e)) if not 
reconnect and "," in self._host_config['hostname']: - self.restore_connection(reconnect=True) + self.restore_connection() else: raise AutosubmitError( "Couldn't establish a connection to the specified host, wrong configuration?", 6003, e.message) @@ -283,8 +283,8 @@ class ParamikoPlatform(Platform): self._ftpChannel.chmod(remote_path, os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) + + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path,filename), code=6004, trace=str(e))) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -594,19 +594,20 @@ class ParamikoPlatform(Platform): sleep_time = sleep_time + 5 # URi: define status list in HPC Queue Class else: - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + job_status = job.status + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = 
job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -989,7 +990,7 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def parse_queue_reason(self, output): + def parse_queue_reason(self, output, job_id): raise NotImplementedError def get_ssh_output(self): -- GitLab From d1152bffe2dbadb296f598a6e79a4176f8018905 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 28 Jul 2022 16:19:58 +0200 Subject: [PATCH 058/121] over_wallclock fix --- autosubmit/job/job.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 226b85c37..948269142 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -768,6 +768,22 @@ class Job(object): except BaseException as e: pass return + def parse_time(self,wallclock): + format = "minute" + regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') + parts = regex.match(wallclock) + if not parts: + return + parts = parts.groupdict() + if int(parts['hours']) > 0 : + format = "hour" + else: + format = "minute" + time_params = {} + for name, param in parts.items(): + if param: + time_params[name] = int(param) + return datetime.timedelta(**time_params),format # Duplicated for wrappers and jobs to fix in 4.0.0 def is_over_wallclock(self, start_time, wallclock): """ @@ -777,25 +793,13 @@ class Job(object): :return: """ elapsed = datetime.datetime.now() - start_time - wallclock = datetime.datetime.strptime(wallclock, '%H:%M') - total = 0.0 - if wallclock.hour > 0: - total = wallclock.hour - format = "hour" - else: - format = "minute" - if format == "hour": - if wallclock.minute > 0: - total += wallclock.minute / 60.0 - if wallclock.second > 0: - total += wallclock.second / 60.0 / 60.0 + wallclock,time_format = self.parse_time(wallclock) + if time_format == "hour": + 
total = wallclock.days * 24 + wallclock.seconds / 60 / 60 else: - if wallclock.minute > 0: - total += wallclock.minute - if wallclock.second > 0: - total += wallclock.second / 60.0 + total = wallclock.days * 24 + wallclock.seconds / 60 total = total * 1.30 # in this case we only want to avoid slurm issues so the time is increased by 50% - if format == "hour": + if time_format == "hour": hour = int(total) minute = int((total - int(total)) * 60.0) second = int(((total - int(total)) * 60 - -- GitLab From 1129425403695cc7b418a9e305e2fc3641e2e707 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 8 Aug 2022 16:36:54 +0200 Subject: [PATCH 059/121] fix project_Destination --- autosubmit/autosubmit.py | 3 ++- autosubmit/config/config_common.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 19dc23baf..5bdb10116 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4324,7 +4324,8 @@ class Autosubmit: """ project_destination = as_conf.get_project_destination() if project_destination is None or len(project_destination) == 0: - raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) + if project_type.lower() != "none": + raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": submitter = Autosubmit._get_submitter(as_conf) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 7b2a6a12b..e3e9188a4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1119,11 +1119,14 @@ class AutosubmitConfig(object): elif self.get_project_type().lower() == "git": value = self.get_git_project_origin().split( '/')[-1].split('.')[-2] - return value + if value != "": + return value + else: + return "project_files" except Exception as exp: Log.debug(str(exp)) Log.debug(traceback.format_exc()) - return '' + return "project_files" def 
set_git_project_commit(self, as_conf): """ -- GitLab From 746cc1691a97d728ef30036409248140f655d218 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:09:33 +0200 Subject: [PATCH 060/121] tkinter --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 6573f8723..55938efa0 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -8,7 +8,7 @@ The Autosubmit code is maintained in *PyPi*, the main source for python packages .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. -- GitLab From a33c2afea042c8c888a559aaacd21e849c1100fb Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 14:56:12 +0200 Subject: [PATCH 061/121] tkinter --- docs/source/installation/index.rst | 4 ++-- requeriments.txt | 1 + setup.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 55938efa0..f1a9640a9 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -4,11 +4,11 @@ How to Install Autosubmit The Autosubmit code is maintained in *PyPi*, the main source for python packages. 
-- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk, python2-dev, graphviz >= 2.41, pip2 +- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk(tkinter in centOS), python2-dev, graphviz >= 2.41, pip2 .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10, tkinter +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10 .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. diff --git a/requeriments.txt b/requeriments.txt index f2dfdd0aa..d57974475 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -13,6 +13,7 @@ typing bscearth.utils cryptography==3.3.2 PyNaCl==1.4.0 +six>=1.10.0 requests xlib Pygments \ No newline at end of file diff --git a/setup.py b/setup.py index 35e8f4f4f..7935f7a42 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], extras_require={ -- GitLab From 7b2fd57f194a48f4811804d6ed5c2234e00c71e4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Aug 2022 15:42:36 +0200 Subject: [PATCH 062/121] author change --- 
setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7935f7a42..d4d0f0179 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ setup( version=version, description='Autosubmit: a versatile tool to manage Weather and Climate Experiments in diverse ' 'Supercomputing Environments', - author='Domingo Manubens-Gil', - author_email='domingo.manubens@bsc.es', + author='Daniel Beltran Mora', + author_email='daniel.beltran@bsc.es', url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], -- GitLab From 75a6c968042d9f11c86ba297d8c5e2e7a1a72c12 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 15:08:10 +0200 Subject: [PATCH 063/121] Added requests, improvement exception recovery for wrappers , added more info, bugfixed status appearing in log.out , bug fixed lc level not being able to change --- autosubmit/autosubmit.py | 54 ++++++++++++++--------- autosubmit/platforms/paramiko_platform.py | 32 ++++++++------ autosubmit/platforms/platform.py | 2 +- environment.yml | 1 + log/log.py | 15 ++++++- setup.py | 2 +- 6 files changed, 69 insertions(+), 37 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5bdb10116..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -162,7 +162,7 @@ class Autosubmit: parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='WARNING', type=str, + default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='INFO', type=str, @@ -1659,7 +1659,11 @@ class Autosubmit: Log.debug('Checking Wrapper {0}'.format(str(job_id))) wrapper_job.checked_time = datetime.datetime.now() # This 
is where wrapper will be checked on the slurm platform, update takes place. - platform.check_job(wrapper_job) + try: + platform.check_job(wrapper_job,is_wrapper=True) + except BaseException as e: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking wrapper {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) #Log.info("FD 3Wrapper checked: {0}".format(log.fd_show.fd_table_status_str())) try: if wrapper_job.status != wrapper_job.new_status: @@ -1671,8 +1675,12 @@ class Autosubmit: "Wrapper is in Unknown Status couldn't get wrapper parameters", 7050) # New status will be saved and inner_jobs will be checked. - wrapper_job.check_status( - wrapper_job.new_status) + try: + wrapper_job.check_status(wrapper_job.new_status) + except: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking the inner_jobs of {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) + # Erase from packages if the wrapper failed to be queued ( Hold Admin bug ) if wrapper_job.status == Status.WAITING: for inner_job in wrapper_job.job_list: @@ -1782,9 +1790,18 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - consecutive_retrials = 1 - delay = min(15*consecutive_retrials,120) + consecutive_retrials = 0 + failed_names = {} + Log.info("Storing failed job count...") + try: + for job in job_list.get_job_list(): + if job.fail_count > 0: + failed_names[job.name] = job.fail_count + except BaseException as e: + Log.printlog("Error trying to store failed job count",Log.WARNING) + Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: + delay = min(15 * consecutive_retrials, 120) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1794,6 +1811,7 @@ class Autosubmit: Log.info("Recovering job_list...") job_list = 
Autosubmit.load_job_list( expid, as_conf, notransitive=notransitive) + Log.info("Recovering job_list... Done") if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1801,26 +1819,20 @@ class Autosubmit: "Only jobs with member value in {0} or no member will be allowed in this run. Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) platforms_to_test = set() + Log.info("Recovering platform information...") for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) - #Recover job_list while keeping job.fail_count - failed_names = {} - for job in job_list.get_job_list(): - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] - platforms_to_test.add(job.platform) - if job.fail_count > 0: - failed_names[job.name] = job.fail_count + + Log.info("Recovering platform information... Done") + Log.info("Recovering Failure count...") for job in job_list.get_job_list(): if job.name in failed_names.keys(): job.fail_count = failed_names[job.name] - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] + Log.info("Recovering Failure count... Done") + Log.info("Recovering parameters...") Autosubmit._load_parameters(as_conf, job_list, submitter.platforms) # Recovery wrapper [Packages] @@ -1876,9 +1888,11 @@ class Autosubmit: None, None, jobs[0].platform, as_conf, jobs[0].hold) job_list.job_package_map[jobs[0].id] = wrapper_job + Log.info("Recovering wrappers... Done") job_list.update_list(as_conf) Log.info("Saving recovered job list...") job_list.save() + Log.info("Saving recovered job list... 
Done") recovery = True Log.result("Recover of job_list is completed") except AutosubmitError as e: @@ -1886,10 +1900,10 @@ class Autosubmit: Log.result("Recover of job_list has fail {0}".format(e.message)) except IOError as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) except BaseException as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) # Restore platforms and try again, to avoid endless loop with failed configuration, a hard limit is set. reconnected = False mail_notify = True diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 43adfd5c6..e57512f55 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -452,17 +452,20 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status :param retries: retries :param job: job + :type job: autosubmit.job.job.Job + :param default_status: default status if job is not found :type job: class(job) :param default_status: status to assign if it can be retrieved from the platform :type default_status: autosubmit.job.job_common.Status :return: current job status :rtype: autosubmit.job.job_common.Status + """ job_id = job.id job_status = Status.UNKNOWN @@ -491,19 +494,20 @@ class ParamikoPlatform(Platform): job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: job_status = Status.RUNNING - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - 
wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + if not is_wrapper: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED elif job_status in self.job_status['QUEUING'] and job.hold is False: job_status = Status.QUEUING elif job_status in self.job_status['QUEUING'] and job.hold is True: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index c2ccf3575..acbb20aa7 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -384,7 +384,7 @@ class Platform(object): """ raise NotImplementedError - def check_job(self, jobid, default_status=Status.COMPLETED, retries=5): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status diff --git a/environment.yml b/environment.yml index 4585486d9..bc6e7308b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - portalocker - networkx - python=2.7 +- requests - pip: - bscearth.utils - Xlib diff --git a/log/log.py b/log/log.py index ae3ca5a74..216fc23eb 100644 --- a/log/log.py +++ b/log/log.py @@ -161,7 +161,7 @@ class Log: logging.getLogger(name) 
@staticmethod - def set_file(file_path, type='out', level=WARNING): + def set_file(file_path, type='out', level="WARNING"): """ Configure the file to store the log. If another file was specified earlier, new messages will only go to the new file. @@ -169,6 +169,19 @@ class Log: :param file_path: file to store the log :type file_path: str """ + levels = {} + levels["STATUS_FAILED"] = 500 + levels["STATUS"] = 1000 + levels["DEBUG"] = 2000 + levels["WARNING"] = 3000 + levels["INFO"] = 4000 + levels["RESULT"] = 5000 + levels["ERROR"] = 6000 + levels["CRITICAL"] = 7000 + levels["NO_LOG"] = levels["CRITICAL"] + 1000 + + level = levels.get(str(level).upper(),"DEBUG") + max_retrials = 3 retrials = 0 timeout = 5 diff --git a/setup.py b/setup.py index d4d0f0179..8e56eb8c5 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, -- GitLab From 5d1fa462207ce7e05ee8597c9b1ff25ae2164ecf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 15:53:09 +0200 Subject: [PATCH 064/121] stat fix --- autosubmit/autosubmit.py | 1 + autosubmit/job/job.py | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..03853b178 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,6 +1763,7 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() + if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, 
packages_persistence, hold=False) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 948269142..28c9b2be9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -630,10 +630,9 @@ class Job(object): found = False retrials = 0 while retrials < 3 and not found: - sleep(2) if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): found = True - retrials = retrials - 1 + retrials = retrials + 1 for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i)): last_log = i @@ -1181,18 +1180,18 @@ class Job(object): if self.type == Type.BASH: template = 'sleep 5' + "\n" elif self.type == Type.PYTHON: - template = 'time.sleep(30)' + "\n" + template = 'time.sleep(5)' + "\n" elif self.type == Type.R: - template = 'Sys.sleep(30)' + "\n" + template = 'Sys.sleep(5)' + "\n" template += template_file.read() template_file.close() else: if self.type == Type.BASH: - template = 'sleep 35' + template = 'sleep 5' elif self.type == Type.PYTHON: - template = 'time.sleep(35)' + template = 'time.sleep(5)' elif self.type == Type.R: - template = 'Sys.sleep(35)' + template = 'Sys.sleep(5)' else: template = '' except: -- GitLab From c21485af35cbeb4f24fc21dbc2adf1d18f11e127 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Aug 2022 16:44:23 +0200 Subject: [PATCH 065/121] wrapper_type is now being saved correctly --- autosubmit/autosubmit.py | 1 - autosubmit/job/job_list.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 03853b178..8704d27f3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1763,7 +1763,6 @@ class Autosubmit: save2 = job_list.update_list( as_conf, submitter=submitter) job_list.save() - if len(job_list.get_ready()) > 0: save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=False) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 
3d55bb040..d8abc0eff 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -215,6 +215,15 @@ class JobList(object): new, notransitive, update_structure=update_structure) for job in self._job_list: job.parameters = parameters + job_data = jobs_data.get(job.name,"none") + try: + if job_data != "none": + job.wrapper_type = job_data[12] + else: + job.wrapper_type = "none" + except BaseException as e: + job.wrapper_type = "none" + # Checking for member constraints if len(run_only_members) > 0: # Found -- GitLab From 401a1b3e36064aa0f86f4e48f06acc8328916d20 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 08:45:34 +0200 Subject: [PATCH 066/121] erased debug info, changed exception for baseexception --- autosubmit/job/job.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 28c9b2be9..1056b93f6 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -620,10 +620,9 @@ class Job(object): submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() - except: - pass + + platform.test_connection() + max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -643,7 +642,7 @@ class Job(object): else: remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except Exception as e: + except BaseException as e: Log.printlog( "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False -- GitLab From 0c0bcd3c2138f64b9def6c8a8801df211bed08e1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 12:53:18 +0200 Subject: [PATCH 067/121] Fixed delay issue #862 --- autosubmit/job/job_list.py | 51 ++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index d8abc0eff..b26f24e74 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -413,28 +413,37 @@ class JobList(object): if dependency.splits is not None: parent = filter( lambda _parent: _parent.split in dependency.splits, parent) - #Select chunk + select member - if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - elif len(dependency.select_members_orig) > 0: - for relation_indx in member_relations_to_add: - if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - elif len(dependency.select_chunks_orig) > 0: + #Select chunk + select member + if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + elif len(dependency.select_members_orig) > 0: + for relation_indx in member_relations_to_add: + if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + elif 
len(dependency.select_chunks_orig) > 0: + for relation_indx in chunk_relations_to_add: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. + if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - + for parent in jobs_by_section: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + visited_parents.add(parent) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) -- GitLab From fbc02f6ae0962dd5536c0c3c10ab613e86df1b13 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 29 Aug 2022 13:40:57 +0200 Subject: [PATCH 068/121] Added 5min retrial in case that something is wrong while recovering the As_conf info inside a thread. 
--- autosubmit/job/job.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1056b93f6..325564bec 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -614,15 +614,27 @@ class Job(object): max_logs = 0 sleep(5) stat_file = self.script_name[:-4] + "_STAT_" + retries = 2 + count = 0 + success = False + error_message = "" + while count < retries or success: + try: + as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) + as_conf.reload() + submitter = self._get_submitter(as_conf) + submitter.load_platforms(as_conf) + success = True + except BaseException as e: + error_message = str(e) + sleep(60*5) + pass + count=count+1 + if not success: + raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) + platform = submitter.platforms[platform_name.lower()] try: - as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.reload() - submitter = self._get_submitter(as_conf) - submitter.load_platforms(as_conf) - platform = submitter.platforms[platform_name.lower()] - platform.test_connection() - max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count if self.wrapper_type is not None and self.wrapper_type == "vertical": @@ -644,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(e.message, self.name), 6001) out_exist = False err_exist = False retries = 3 -- GitLab From 7dc55ee35d06f663a3c0174cbd6896a3e7613534 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:30:45 +0200 Subject: [PATCH 069/121] e --- autosubmit/autosubmit.py | 2 +- autosubmit/job/job.py | 2 +- autosubmit/platforms/slurmplatform.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 8704d27f3..b299c7dcc 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2227,7 +2227,7 @@ class Autosubmit: "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( - e.job_name), 7014, e.message) + e.job_name), 7014, str(e)) except AutosubmitError as e: raise except AutosubmitCritical as e: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 325564bec..1068dca65 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while count < retries or success: + while (count < retries) or success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index cd96b21cc..5d31690c4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -362,8 +362,8 @@ class SlurmPlatform(ParamikoPlatform): return export + self._submit_hold_cmd + job_script else: if not hold: - self._submit_script_file.write( - export + self._submit_cmd + job_script + "\n") + write_this = export + self._submit_cmd + job_script +"\n" + self._submit_script_file.write(write_this) else: self._submit_script_file.write( export + self._submit_hold_cmd + job_script + "\n") -- GitLab From 
4690246861edd1b3809440933a5929008506b7a6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:34:19 +0200 Subject: [PATCH 070/121] e --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 1068dca65..9365e516f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -618,7 +618,7 @@ class Job(object): count = 0 success = False error_message = "" - while (count < retries) or success: + while (count < retries) or not success: try: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() -- GitLab From 359b1eb147af2ee3c5392e1c0d64ff5d1ef74e5e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 31 Aug 2022 15:50:30 +0200 Subject: [PATCH 071/121] fixed message --- autosubmit/platforms/paramiko_submitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index c597274f7..acba2bcce 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -184,8 +184,9 @@ class ParamikoSubmitter(Submitter): None) remote_platform.custom_directives = parser.get_option(section, 'CUSTOM_DIRECTIVES', None) - Log.debug("Custom directives from platform.conf: {0}".format( - remote_platform.custom_directives)) + if remote_platform.custom_directives is not None and remote_platform.custom_directives != '' and remote_platform.custom_directives != 'None': + Log.debug("Custom directives from platform.conf: {0}".format( + remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, -- GitLab From 54dbb37052d2516572d1ebdfd6a9c0d3e78b9698 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 10:53:49 +0200 Subject: [PATCH 072/121] conda fix --- 
docs/source/installation/index.rst | 154 +++++++++++++++-------------- 1 file changed, 79 insertions(+), 75 deletions(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index f1a9640a9..9a90c4e54 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -1,6 +1,9 @@ -######################### -How to Install Autosubmit -######################### +############ +Installation +############ + +How to install +============== The Autosubmit code is maintained in *PyPi*, the main source for python packages. @@ -35,76 +38,8 @@ or download, unpack and: .. hint:: To see the changelog, use ``autosubmit changelog`` -Examples -======== - -Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. ------------------------------------------------------------------------------- - -.. code-block:: bash - - - # Update repositories - apt update - - # Avoid interactive stuff - export DEBIAN_FRONTEND=noninteractive - - # Dependencies - apt install wget curl python2 python-tk python2-dev graphviz -y -q - - # Additional dependencies related with pycrypto - apt install build-essential libssl-dev libffi-dev -y -q - - # Download get pip script and launch it - wget https://bootstrap.pypa.io/pip/2.7/get-pip.py - python2 get-pip.py - - # Install autosubmit using pip - pip2 install autosubmit - - # Check that we can execute autosubmit commands - autosubmit -h - - # Configure - autosubmit configure - - # Install - autosubmit install - - # Get expid - autosubmit expid -H TEST -d "Test exp." - - # Create with -np - # Since it was a new install the expid will be a000 - autosubmit create a000 -np - -Sequence of instructions to install Autosubmit and its dependencies with conda. -------------------------------------------------------------------------------- - -.. 
code-block:: bash - - # Download conda - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Launch it - ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git - apt install git -y -q - # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 - cd autosubmit - # Create conda environment - conda env update -f environment.yml -n autosubmit python=2 - # Activate env - source activate autosubmit - # Test autosubmit - autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc - - -################ How to configure -################ +================ After installation, you have to configure database and path for Autosubmit. In order to use the default settings, just create a directory called `autosubmit` in your home directory before running the configure command. @@ -136,9 +71,11 @@ For installing the database for Autosubmit on the configured folder, when no dat autosubmit install -.. important:: Be careful ! autosubmit install will create a blank database. +.. danger:: Be careful ! autosubmit install will create a blank database. -Lastly, if autosubmit configure doesn't work for you or you need to configure additional info create or modify /etc/autosubmitrc file or ~/.autosubmitrc with the information as follows: +Lastly, if autosubmit configure doesn't work for you or you need to configure additional info create: + +Create or modify /etc/autosubmitrc file or ~/.autosubmitrc with the information as follows: .. code-block:: ini @@ -182,4 +119,71 @@ From 3.14+ onwards, autosubmit commands can be tailored to run on specific machi * If no commands are defined, all commands are authorized. * If no machines are defined, all machines are authorized. -Now you are ready to use Autosubmit ! \ No newline at end of file +Now you are ready to use Autosubmit ! 
+ + +Examples +======== + +Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. +------------------------------------------------------------------------------ + +.. code-block:: bash + + + # Update repositories + apt update + + # Avoid interactive stuff + export DEBIAN_FRONTEND=noninteractive + + # Dependencies + apt install wget curl python2 python-tk python2-dev graphviz -y -q + + # Additional dependencies related with pycrypto + apt install build-essential libssl-dev libffi-dev -y -q + + # Download get pip script and launch it + wget https://bootstrap.pypa.io/pip/2.7/get-pip.py + python2 get-pip.py + + # Install autosubmit using pip + pip2 install autosubmit + + # Check that we can execute autosubmit commands + autosubmit -h + + # Configure + autosubmit configure + + # Install + autosubmit install + + # Get expid + autosubmit expid -H TEST -d "Test exp." + + # Create with -np + # Since it was a new install the expid will be a000 + autosubmit create a000 -np + +Sequence of instructions to install Autosubmit and its dependencies with conda. +------------------------------------------------------------------------------- + +.. 
code-block:: bash + + # Download conda + wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh + # Launch it + chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + # Download git + apt install git -y -q + # Download autosubmit + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 + cd autosubmit + # Create conda environment + conda env update -f environment.yml -n autosubmit python=2 + # Activate env + source activate autosubmit + # Test autosubmit + autosubmit -v + # Configure autosubmitrc and install database as indicated in this doc -- GitLab From 21b77c1153765fe83e22e6f2e6be769808b1dcd9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 6 Sep 2022 11:01:23 +0200 Subject: [PATCH 073/121] conda fix --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 9a90c4e54..64b314886 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -174,7 +174,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download conda wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it - chmod + x ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh # Download git apt install git -y -q # Download autosubmit -- GitLab From e9da166cbfb546c31afeb70eaed9066a5d32ad2c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 7 Sep 2022 15:23:52 +0200 Subject: [PATCH 074/121] Wrapper is now fully independent from total and waiting jobs as expected #857 --- autosubmit/autosubmit.py | 5 +-- autosubmit/config/config_common.py | 9 ++-- autosubmit/job/job_packager.py | 52 +++++++++++----------- autosubmit/platforms/paramiko_submitter.py | 4 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b299c7dcc..6fd5932a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1372,8 +1372,8 @@ class Autosubmit: while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) - for job in job_list.get_uncompleted_and_not_waiting(): - job.status = Status.COMPLETED + #for job in job_list.get_uncompleted_and_not_waiting(): + # job.status = Status.COMPLETED job_list.update_list(as_conf, False) @staticmethod @@ -2071,7 +2071,6 @@ class Autosubmit: platform.open_submit_script() valid_packages_to_submit = [] # type: List[JobPackageBase] for package in packages_to_submit: - try: # If called from inspect command or -cw if only_wrappers or inspect: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index e3e9188a4..3f5c39a3b 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1600,7 +1600,9 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - return 
int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', self.get_total_jobs())) + #total_jobs = self.get_total_jobs() + #unlimited because wrapper should count as one + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED', 999999999)) def get_max_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ @@ -1609,8 +1611,7 @@ class AutosubmitConfig(object): :return: maximum number of jobs (or total jobs) :rtype: int """ - max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_V', -1)) def get_max_wrapped_jobs_horizontal(self, wrapper_section_name="wrapper"): """ @@ -1620,7 +1621,7 @@ class AutosubmitConfig(object): :rtype: int """ max_wrapped = self.get_max_wrapped_jobs(wrapper_section_name) - return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', max_wrapped)) + return int(self._conf_parser.get_option(wrapper_section_name, 'MAX_WRAPPED_H', -1)) def get_min_wrapped_jobs_vertical(self, wrapper_section_name="wrapper"): """ diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 54a6268c3..cfc1235e8 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -57,7 +57,12 @@ class JobPackager(object): # Submitted + Queuing Jobs for specific Platform queuing_jobs = jobs_list.get_queuing(platform) # We now consider the running jobs count - running_jobs_count = len(jobs_list.get_running(platform)) + running_jobs = jobs_list.get_running(platform) + running_by_id = dict() + for running_job in running_jobs: + running_by_id[running_job.id] = running_job + running_jobs_len = len(running_by_id.keys()) + queued_by_id = dict() for queued_job in queuing_jobs: queued_by_id[queued_job.id] = queued_job @@ -76,10 +81,9 @@ class JobPackager(object): # .total_jobs Maximum number of jobs at the 
same time self._max_jobs_to_submit = platform.total_jobs - queuing_jobs_len # Substracting running jobs - self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_count + self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_len self._max_jobs_to_submit = self._max_jobs_to_submit if self._max_jobs_to_submit > 0 else 0 - self.max_jobs = min(self._max_wait_jobs_to_submit, - self._max_jobs_to_submit) + self.max_jobs = min(self._max_wait_jobs_to_submit,self._max_jobs_to_submit) self.wrapper_type["wrapper"] = self._as_config.get_wrapper_type() self.wrapper_policy["wrapper"] = self._as_config.get_wrapper_policy() @@ -94,24 +98,15 @@ class JobPackager(object): self.jobs_in_wrapper[wrapper_section] = self._as_config.get_wrapper_jobs(wrapper_section) self.extensible_wallclock[wrapper_section] = int(self._as_config.get_extensible_wallclock(wrapper_section)) self.wrapper_info = [self.wrapper_type,self.wrapper_policy,self.wrapper_method,self.jobs_in_wrapper,self.extensible_wallclock] # to pass to job_packages - - - # True or False - - Log.debug( - "Number of jobs available: {0}", self._max_wait_jobs_to_submit) + Log.debug("Number of jobs available: {0}", self._max_wait_jobs_to_submit) if self.hold: - Log.debug("Number of jobs prepared: {0}", len( - jobs_list.get_prepared(platform))) + Log.debug("Number of jobs prepared: {0}", len(jobs_list.get_prepared(platform))) if len(jobs_list.get_prepared(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_prepared(platform))) + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len(jobs_list.get_prepared(platform))) else: - Log.debug("Number of jobs ready: {0}", len( - jobs_list.get_ready(platform, hold=False))) + Log.debug("Number of jobs ready: {0}", len(jobs_list.get_ready(platform, hold=False))) if len(jobs_list.get_ready(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_ready(platform))) + Log.debug("Jobs ready 
for {0}: {1}", self._platform.name, len(jobs_list.get_ready(platform))) self._maxTotalProcessors = 0 def compute_weight(self, job_list): @@ -210,8 +205,7 @@ class JobPackager(object): # Sort by Priority, highest first list_of_available = sorted( available_sorted, key=lambda k: k.priority, reverse=True) - num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len( - jobs_ready), self._max_jobs_to_submit) + num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len(jobs_ready), self._max_jobs_to_submit) # Take the first num_jobs_to_submit from the list of available jobs_to_submit_tmp = list_of_available[0:num_jobs_to_submit] #jobs_to_submit = [ @@ -248,6 +242,10 @@ class JobPackager(object): wrapper_limits["max_h"] = self._as_config.get_max_wrapped_jobs_horizontal(self.current_wrapper_section) if wrapper_limits["max"] < wrapper_limits["max_v"] * wrapper_limits["max_h"]: wrapper_limits["max"] = wrapper_limits["max_v"] * wrapper_limits["max_h"] + if wrapper_limits["max_v"] == -1: + wrapper_limits["max_v"] = wrapper_limits["max"] + if wrapper_limits["max_h"] == -1: + wrapper_limits["max_h"] = wrapper_limits["max"] if '&' not in section: if self._as_config.jobs_parser.has_option(section, 'DEPENDENCIES'): dependencies_keys = self._as_config.jobs_parser.get( @@ -552,7 +550,7 @@ class JobPackager(object): def _build_horizontal_packages(self, section_list, wrapper_limits, section): packages = [] horizontal_packager = JobPackagerHorizontal(section_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) + wrapper_limits["max"], self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) package_jobs = horizontal_packager.build_horizontal_package() @@ -585,11 +583,11 @@ class JobPackager(object): """ packages = [] for job in section_list: - if self.max_jobs > 0: + if wrapper_limits["max"] > 0: if job.packed is False: job.packed = True dict_jobs = 
self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) - job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, self.max_jobs, wrapper_limits, self._platform.max_wallclock) + job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock) jobs_list = job_vertical_packager.build_vertical_package(job) packages.append(JobPackageVertical(jobs_list, configuration=self._as_config,wrapper_section=self.current_wrapper_section,wrapper_info=wrapper_info)) @@ -605,7 +603,7 @@ class JobPackager(object): ## READY JOBS ## ## Create the horizontal ## horizontal_packager = JobPackagerHorizontal(jobs_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node) + wrapper_limits["max"], self._platform.processors_per_node) if self.wrapper_type[self.current_wrapper_section] == 'vertical-horizontal': return self._build_vertical_horizontal_package(horizontal_packager, jobs_resources) @@ -654,7 +652,7 @@ class JobPackager(object): horizontal_packager.wrapper_limits["max_by_section"][section] = horizontal_packager.wrapper_limits["max_by_section"][section] - 1 horizontal_packager.wrapper_limits["max"] = horizontal_packager.wrapper_limits["max"] - actual_wrapped_jobs for job in horizontal_package: - job_list = JobPackagerVerticalSimple([job], job.wallclock, self.max_jobs, + job_list = JobPackagerVerticalSimple([job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, self._platform.max_wallclock).build_vertical_package(job) @@ -706,7 +704,7 @@ class JobPackagerVertical(object): :rtype: List() of Job Object \n """ # self.jobs_list starts as only 1 member, but wrapped jobs are added in the recursion - if len(self.jobs_list) >= self.max_jobs or len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= 
self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: + if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: return self.jobs_list child = self.get_wrappable_child(job) # If not None, it is wrappable @@ -897,7 +895,7 @@ class JobPackagerHorizontal(object): for section in jobs_by_section: current_package_by_section[section] = 0 for job in jobs_by_section[section]: - if self.max_jobs > 0 and len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: + if len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: if int(job.tasks) != 0 and int(job.tasks) != int(self.processors_node) and \ int(job.tasks) < job.total_processors: nodes = int( diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index acba2bcce..1f577426f 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -159,8 +159,10 @@ class ParamikoSubmitter(Submitter): asconf.get_max_processors()) remote_platform.max_waiting_jobs = int(parser.get_option(section, 'MAX_WAITING_JOBS', asconf.get_max_waiting_jobs())) - remote_platform.total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', + totaljobs = int(parser.get_option(section, 'TOTALJOBS', asconf.get_total_jobs())) + total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', asconf.get_total_jobs())) + remote_platform.total_jobs = min(min(totaljobs, total_jobs),asconf.get_total_jobs()) remote_platform.hyperthreading = parser.get_option(section, 'HYPERTHREADING', 'false').lower() remote_platform.project = 
parser.get_option( -- GitLab From 68b2a800cd2fa822f11eb746d993ff59df459439 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 11:45:53 +0200 Subject: [PATCH 075/121] error message fix --- autosubmit/platforms/paramiko_platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e57512f55..e1b36f116 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -901,7 +901,7 @@ class ParamikoPlatform(Platform): except AutosubmitError as e: raise except IOError as e: - raise AutosubmitError(e.message,6016) + raise AutosubmitError("IO issues, something seems wrong with {0}".format(self.name),6016,e.message) except BaseException as e: raise AutosubmitError('Command {0} in {1} warning: {2}'.format( command, self.host, '\n'.join(stderr_readlines)), 6005, e.message) -- GitLab From 5c84e366d4ce509f37357194ce0a45fe7d199739 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Sep 2022 15:45:24 +0200 Subject: [PATCH 076/121] docs update --- .../html/_sources/usage/new_platform.rst.txt | 4 +- docs/source/troubleshooting/error-codes.rst | 4 +- .../userguide/configure/develop_a_project.rst | 48 +++++++++++++------ 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/docs/build/html/_sources/usage/new_platform.rst.txt b/docs/build/html/_sources/usage/new_platform.rst.txt index 173dafae4..675d4edc6 100644 --- a/docs/build/html/_sources/usage/new_platform.rst.txt +++ b/docs/build/html/_sources/usage/new_platform.rst.txt @@ -53,9 +53,9 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be waiting in this platform. +* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. 
-* TOTAL_JOBS: maximum number of jobs to be running at the same time in this platform. +* TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. * CUSTOM_DIRECTIVES: Custom directives for the resource manager of this platform. diff --git a/docs/source/troubleshooting/error-codes.rst b/docs/source/troubleshooting/error-codes.rst index c92ba38ad..ed9154997 100644 --- a/docs/source/troubleshooting/error-codes.rst +++ b/docs/source/troubleshooting/error-codes.rst @@ -155,7 +155,9 @@ Minor errors - Error codes [6000+] +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6013 | Configuration issues | Check log output for more info | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ -| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6015 | Submission failed | Automatically, if there aren't bigger issues | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6016 | Temporal connection issues | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst index 39960413a..17caddcf5 100644 --- a/docs/source/userguide/configure/develop_a_project.rst 
+++ b/docs/source/userguide/configure/develop_a_project.rst @@ -1,6 +1,7 @@ .. _develproject: +==================== Developing a project ==================== @@ -8,14 +9,14 @@ This section contains some examples on how to develop a new project. All files, with the exception of user-defined scripts, are located in the ``/conf`` directory. -Configuration files are written in ``ini`` format. Autosubmit supports user-defined scripts are written in ``bash``, ``python``, and ``R``. +Configuration files are written in ``ini`` format. In the other hand, the user-defined scripts are written in ``bash/python or R`` format. -To configure the experiment, edit ``autosubmit_cxxx.conf``, ``expdef_cxxx.conf``, ``jobs_cxxx.conf`` , ``platforms_cxxx.conf``, and ``proj_cxxx.conf``` in the ``conf`` folder of the experiment. +To configure the experiment, edit ``autosubmit_cxxx.conf``, ``expdef_cxxx.conf``, ``jobs_cxxx.conf`` , ``platforms_cxxx.conf`` and ``proj_cxxx.conf``` in the ``conf`` folder of the experiment. Expdef configuration --------------------- +==================== - ``vi /cxxx/conf/expdef_cxxx.conf`` + vi /cxxx/conf/expdef_cxxx.conf .. code-block:: ini @@ -92,9 +93,9 @@ Expdef configuration FILE_JOBS_CONF = templates/common/jobs.conf Autosubmit configuration ------------------------- +======================== - ``vi /cxxx/conf/autosubmit_cxxx.conf`` + vi /cxxx/conf/autosubmit_cxxx.conf .. code-block:: ini @@ -128,9 +129,9 @@ Autosubmit configuration # [wrappers] Jobs configuration ------------------- +================== - ``vi /cxxx/conf/jobs_cxxx.conf`` + vi /cxxx/conf/jobs_cxxx.conf .. code-block:: ini @@ -234,9 +235,9 @@ Jobs configuration RUNNING = member Platform configuration ----------------------- +====================== - ``vi /cxxx/conf/platforms_cxxx.conf`` + vi /cxxx/conf/platforms_cxxx.conf .. 
code-block:: ini @@ -291,9 +292,9 @@ Platform configuration TEST_SUITE = True Proj configuration ------------------- +================== -After filling the experiment configuration and executing ``autosubmit create cxxx -np``, a copy of the model is stored in ``proj``. +After filling the experiment configuration and promt ``autosubmit create cxxx -np`` create, user can go into ``proj`` which has a copy of the model. The experiment project contains the scripts specified in ``jobs_cxxx.conf`` and a copy of model source code and data specified in ``expdef_xxxx.conf``. @@ -511,7 +512,7 @@ Example: PISCES_timestep = 3600 Proj configuration:: Full example -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------- This section contains a full example of a valid proj file with a valid user script. @@ -560,7 +561,7 @@ Final script, which is generated by `autosubmit run` or ``autosubmit inspect`` (...) Detailed platform configuration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------- In this section, we describe the platform configuration using `-QOS` and also `PARTITION` @@ -673,7 +674,7 @@ In this section, we describe the platform configuration using `-QOS` and also `P The custom directives can be used for multiple parameters at the same time using the follow syntax. - `vi /conf/platform_cxxx.conf` + vi /conf/platform_cxxx.conf .. code-block:: ini @@ -699,3 +700,20 @@ The custom directives can be used for multiple parameters at the same time using MAX_PROCESSORS = 80 # test [40] / small [40] // large [40] PROCESSORS_PER_NODE = 40 + +Controling the number of active concurrent tasks in an experiment +---------------------------------------------------------------------- + +In some cases, you may want to control the number of concurrent tasks/jobs that can be active in an experiment. + +To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS`` and ``MAX_WAITING_JOBS`` variable in the ``conf/autosubmit_cxxx.conf`` file. 
+ + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + # Maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 + -- GitLab From ba4ea0319f5338708d1ff51b2a1f084f8f2e6e2c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 16 Sep 2022 15:48:55 +0200 Subject: [PATCH 077/121] Now critical issues messages is always shown --- autosubmit/autosubmit.py | 7 ++++--- autosubmit/config/config_common.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6fd5932a3..355260a76 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4341,12 +4341,13 @@ class Autosubmit: raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": - submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) + try: + submitter = Autosubmit._get_submitter(as_conf) + submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform", 7014, e.message) + raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 3f5c39a3b..cc8aa3e1c 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -540,6 +540,8 @@ class AutosubmitConfig(object): # In case that there are critical errors in the configuration, Autosubmit won't continue. 
if running_time is True: raise AutosubmitCritical(e.message, e.code, e.trace) + else: + Log.printlog(e.message+"\n") except Exception as e: raise AutosubmitCritical( "There was an error while showing the config log messages", 7014, str(e)) -- GitLab From 6a9d1f01c731241d11cf1010f37fb7d4ecf07240 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 21 Sep 2022 16:09:27 +0200 Subject: [PATCH 078/121] Patch for db_fix --- autosubmit/autosubmit.py | 20 ++++++++++++-------- requeriments.txt | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 355260a76..60b064de9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1556,7 +1556,8 @@ class Autosubmit: exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) except Exception as e: # This error is important - raise AutosubmitCritical("Error while processing historical database.", 7005, str(e)) + Log.printlog("Error while processing historical database.", 7005, str(e)) + try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -4224,13 +4225,16 @@ class Autosubmit: except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) - Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + 
exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), + as_conf.get_full_config_as_json(), + job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: if group_by: status = list() diff --git a/requeriments.txt b/requeriments.txt index d57974475..c34451db2 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +configparser argparse>=1.2,<2 python-dateutil>2 matplotlib -- GitLab From 46d9cafcdecff1fc614af30ef13aac6fa5021193 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 22 Sep 2022 09:53:04 +0200 Subject: [PATCH 079/121] Patch for db_fix (1) --- autosubmit/autosubmit.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 60b064de9..153c0c8a3 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1776,9 +1776,22 @@ class Autosubmit: job_list.update_list(as_conf, submitter=submitter) job_list.save() # Safe spot to store changes - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - if len(job_changes_tracker) > 0: - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except BaseException as e: + Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", + Log.INFO) + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + 
exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} if Autosubmit.exit: job_list.save() @@ -1949,8 +1962,16 @@ class Autosubmit: raise AutosubmitCritical("There is a bug in the code, please contact via git",7070,e.message) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + except: + Log.printlog() # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True -- GitLab From fd14bb03d995652453b27427c33e82b3938c8803 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Sep 2022 15:05:21 +0200 Subject: [PATCH 080/121] Does an sql dump everytime a change is detected. 
Then db_fix load this sql dump --- autosubmit/autosubmit.py | 102 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 153c0c8a3..75baab6de 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -58,6 +58,7 @@ import locale from distutils.util import strtobool from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set +import sqlite3 try: import dialog @@ -71,6 +72,7 @@ import tarfile import time import copy import os +import glob import pwd import sys import shutil @@ -1553,11 +1555,14 @@ class Autosubmit: # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) except Exception as e: - # This error is important - Log.printlog("Error while processing historical database.", 7005, str(e)) - + try: + Autosubmit.database_fix(expid) + # This error is important + except: + pass try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -1781,6 +1786,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) @@ -1790,6 
+1796,7 @@ class Autosubmit: historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) if len(job_changes_tracker) > 0: exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} @@ -1965,13 +1972,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) except: try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) except: - Log.printlog() + pass # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True @@ -3901,6 +3907,17 @@ class Autosubmit: raise @staticmethod + def database_backup(expid): + try: + database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) + backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) + Log.info("Backing up jobs_data...") + subprocess.call(command, shell=True) + Log.result("Jobs_data database backup completed.") + except BaseException as e: + Log.info("Jobs_data database backup failed.") + @staticmethod def database_fix(expid): """ Database methods. Performs a sql dump of the database and restores it. 
@@ -3912,52 +3929,31 @@ class Autosubmit: """ os.umask(0) # Overrides user permissions current_time = int(time.time()) + corrupted_db_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_corrupted.db".format(expid)) + database_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) - database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_{1}.db".format(expid, str(current_time))) - dump_file_name = 'job_data_{0}_{1}.sql'.format(expid, current_time) + database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + dump_file_name = 'job_data_{0}.sql'.format(expid, current_time) dump_file_path = os.path.join(BasicConfig.JOBDATA_DIR, dump_file_name) - bash_command = 'sqlite3 {0} .dump > {1}'.format(database_path, dump_file_path) + bash_command = 'cat {1} | sqlite3 {0}'.format(database_path, dump_file_path) try: - if os.path.exists(database_path): + if os.path.exists(database_path): + result = os.popen("mv {0} {1}".format(database_path, corrupted_db_path)).read() + time.sleep(10) + Log.info("Original database moved.") + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + Log.info("Restoring from sql") result = os.popen(bash_command).read() - if result is not None and os.path.exists(dump_file_path): - Log.info("sqldump {0} created".format(dump_file_path)) - Log.info( - "Backing up original database {0}".format(database_path)) - result = os.popen("mv {0} {1}".format(database_path, database_backup_path)).read() - time.sleep(10) - if result is not None and not os.path.exists(database_path): - Log.info("Original database moved.") - Log.info("Restoring from sqldump") - HUtils.create_file_with_full_permissions(database_path) - result = os.popen("cat {0} | sqlite3 {1}".format( - dump_file_path, database_path)).read() - time.sleep(10) - if result is not None and 
os.path.exists(database_path): - Log.info( - "Database {0} restored.".format(database_path)) - Log.info("Deleting sqldump.") - result = os.popen( - "rm {0}".format(dump_file_path)).read() - sleep(5) - if result is not None and not os.path.exists(dump_file_path): - ExperimentHistory(expid).initialize_database() - Log.info("sqldump file deleted.") - Log.result( - "The database {0} has been fixed.".format(database_path)) - else: - raise Exception( - "The sqldump file could not be removed.") - else: - raise Exception( - "It was not possible to restore the sqldump file.") - else: - raise Exception( - "It was not possible to delete the original database.") - else: - raise Exception("The sqldump file couldn't be created.") - else: - raise Exception("The database file doesn't exist.") + except: + Log.warning("It was not possible to restore the jobs_data.db file... , a new blank db will be created") + result = os.popen("rm {0}".format(database_path)).read() + + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() except Exception as exp: Log.critical(str(exp)) @@ -4243,17 +4239,12 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) try: Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), 
as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) except: Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: @@ -5018,6 +5009,7 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) -- GitLab From a1a2492b864712a050ac2cf7556ccf4fa0d4a791 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Sep 2022 09:21:49 +0200 Subject: [PATCH 081/121] database changes #870 --- autosubmit/autosubmit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 75baab6de..337247605 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3912,11 +3912,11 @@ class Autosubmit: database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) - Log.info("Backing up jobs_data...") + Log.debug("Backing up jobs_data...") subprocess.call(command, shell=True) - Log.result("Jobs_data database backup completed.") + Log.debug("Jobs_data database backup completed.") except BaseException as e: - Log.info("Jobs_data database backup failed.") + Log.debug("Jobs_data database backup failed.") @staticmethod def database_fix(expid): """ -- GitLab From 61a5b9cba349381c6cd421f219a7f8e4e959b34a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:03 +0200 Subject: [PATCH 
082/121] #877 conda typo --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 64b314886..4f68c3788 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -183,7 +183,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. # Create conda environment conda env update -f environment.yml -n autosubmit python=2 # Activate env - source activate autosubmit + conda activate autosubmit # Test autosubmit autosubmit -v # Configure autosubmitrc and install database as indicated in this doc -- GitLab From 315b55c2eef0be1a3cebf0203e8746c9a4dd89a9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 30 Sep 2022 13:50:53 +0200 Subject: [PATCH 083/121] #877 changed version to the latest one (3.14.0b) --- docs/source/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 4f68c3788..7159ac7c0 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -178,7 +178,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
# Download git apt install git -y -q # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b cd autosubmit # Create conda environment conda env update -f environment.yml -n autosubmit python=2 -- GitLab From 1ebc81cab2c26df63628e956ab4a27a683adc229 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 13:03:40 +0200 Subject: [PATCH 084/121] #inline comments, fixes for slurm --- autosubmit/autosubmit.py | 20 ++++-- autosubmit/platforms/paramiko_platform.py | 74 ++++++++++++++++------- test/regression/tests_runner.py | 1 + 3 files changed, 69 insertions(+), 26 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 337247605..09ce96335 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1584,7 +1584,7 @@ class Autosubmit: if unparsed_two_step_start != "": job_list.parse_jobs_by_filter(unparsed_two_step_start) - main_loop_retrials = 3650 # Hard limit of tries 3650 tries at 15-120seconds sleep each try + main_loop_retrials = 11250*2 # Hard limit of tries ( 48h min 72h max), 2 retrials per stop # establish the connection to all platforms Autosubmit.restore_platforms(platforms_to_test) @@ -1822,7 +1822,7 @@ class Autosubmit: Log.printlog("Error trying to store failed job count",Log.WARNING) Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: - delay = min(15 * consecutive_retrials, 120) + delay = min(15 * consecutive_retrials, 30) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1959,7 +1959,7 @@ class Autosubmit: except BaseException: reconnected = False if main_loop_retrials <= 0: - raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of 4hours reached", 7051, e.message) + raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of {0} 
retrials reached".format(main_loop_retrials), 7051, e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) except portalocker.AlreadyLocked: @@ -3322,7 +3322,12 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser @staticmethod def describe(experiment_id): """ @@ -3497,6 +3502,7 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") + parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3591,6 +3597,8 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) + parser = Autosubmit.removeInlineComments(parser) + if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') if parser.has_option('database', 'filename'): @@ -3723,11 +3731,15 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) + parser = Autosubmit.removeInlineComments(parser) + parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", width=50, height=5) os.system('clear') + + except (IOError, OSError) as e: raise AutosubmitCritical( "Can not write config file", 7012, e.message) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e1b36f116..fb9059915 100644 --- 
a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -550,35 +550,63 @@ class ParamikoPlatform(Platform): cmd = self.get_checkAlljobs_cmd(job_list_cmd) sleep_time = 5 sleep(sleep_time) - self.send_command(cmd) - while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + slurm_error = False + e_msg = "" + try: self.send_command(cmd) - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - retries -= 1 - sleep(sleep_time) - sleep_time = sleep_time + 5 + except AutosubmitError as e: + e_msg = e.trace+" "+e.message + slurm_error = True + if not slurm_error: + while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + try: + self.send_command(cmd) + except AutosubmitError as e: + e_msg = e.trace + " " + e.message + slurm_error = True + break + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + retries -= 1 + sleep(sleep_time) + sleep_time = sleep_time + 5 + job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') in_queue_jobs = [] list_queue_jobid = "" for job in job_list: - job_id = job.id - job_status = self.parse_Alljobs_output(job_list_status, job_id) - while len(job_status) <= 0 and retries >= 0: - retries -= 1 - self.send_command(cmd) - job_list_status = self.get_ssh_output() + if not slurm_error: + job_id = job.id job_status = self.parse_Alljobs_output(job_list_status, job_id) - if len(job_status) <= 0: - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - sleep(sleep_time) - sleep_time = sleep_time + 5 - # URi: define status list in HPC Queue Class + while len(job_status) <= 0 and retries >= 0: + retries -= 1 + 
self.send_command(cmd) + job_list_status = self.get_ssh_output() + job_status = self.parse_Alljobs_output(job_list_status, job_id) + if len(job_status) <= 0: + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + sleep(sleep_time) + sleep_time = sleep_time + 5 + # URi: define status list in HPC Queue Class + else: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -595,12 +623,12 @@ class ParamikoPlatform(Platform): elif retries == 0: job_status = Status.COMPLETED job.update_status(remote_logs) - else: job_status = Status.UNKNOWN Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) job.new_status = job_status + reason = str() if self.type == 'slurm' and len(in_queue_jobs) > 0: cmd = self.get_queue_status_cmd(list_queue_jobid) @@ -639,6 +667,8 @@ class ParamikoPlatform(Platform): 'check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status + if slurm_error: + raise AutosubmitError(e_msg, 6000) def get_jobid_by_jobname(self,job_name,retries=2): """ diff --git a/test/regression/tests_runner.py b/test/regression/tests_runner.py index ffd490888..ab186e849 100644 --- a/test/regression/tests_runner.py +++ 
b/test/regression/tests_runner.py @@ -79,6 +79,7 @@ def run(current_experiment_id, only_list=None, exclude_list=None, max_threads=5) tests_parser.optionxform = str tests_parser.read(tests_parser_file) + # Resetting the database clean_database(db_path) create_database() -- GitLab From 8b912ea7b2b65be3aa7128ec8d03d4f730be35ce Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 15:43:21 +0200 Subject: [PATCH 085/121] Remove inline comments working #870 --- autosubmit/autosubmit.py | 13 +++---------- autosubmit/config/config_common.py | 9 +++++++++ autosubmit/config/config_parser.py | 5 ++++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 09ce96335..82e4b44e9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -3322,12 +3322,7 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("Unknown error while reporting the parameters list, likely it is due IO issues",7040,e.message) - @staticmethod - def removeInlineComments(cfgparser): - for section in cfgparser.sections(): - for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) - return cfgparser + @staticmethod def describe(experiment_id): """ @@ -3502,7 +3497,6 @@ class Autosubmit: parser.set("autosubmitapi", "url", autosubmitapi_url) #parser.add_section("hosts") #parser.set("hosts", "whitelist", " localhost # Add your machine names") - parser = Autosubmit.removeInlineComments(parser) parser.write(config_file) config_file.close() Log.result("Configuration file written successfully: \n\t{0}".format(rc_path)) @@ -3597,7 +3591,6 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) - parser = Autosubmit.removeInlineComments(parser) if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') @@ -3731,8 +3724,6 @@ class Autosubmit: parser.add_section('mail') parser.set('mail', 
'smtp_server', smtp_hostname) parser.set('mail', 'mail_from', mail_from) - parser = Autosubmit.removeInlineComments(parser) - parser.write(config_file) config_file.close() d.msgbox("Configuration file written successfully", @@ -5398,10 +5389,12 @@ class Autosubmit: raise AutosubmitCritical('Can not test a RERUN experiment', 7014) content = open(as_conf.experiment_file).read() + if random_select: if hpc is None: platforms_parser = as_conf.get_parser( ConfigParserFactory(), as_conf.platforms_file) + test_platforms = list() for section in platforms_parser.sections(): if platforms_parser.get_option(section, 'TEST_SUITE', 'false').lower() == 'true': diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index cc8aa3e1c..74dcc3e1e 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1759,6 +1759,13 @@ class AutosubmitConfig(object): commit = self.get_git_project_commit() return origin_exists and (branch is not None or commit is not None) + @staticmethod + def removeInlineComments(cfgparser): + for section in cfgparser.sections(): + for item in cfgparser.items(section): + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + return cfgparser + @staticmethod def get_parser(parser_factory, file_path): """ @@ -1794,5 +1801,7 @@ class AutosubmitConfig(object): raise Exception( "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) + #remove inline comments + parser = AutosubmitConfig.removeInlineComments(parser) return parser diff --git a/autosubmit/config/config_parser.py b/autosubmit/config/config_parser.py index 87b28456a..99d92fd8c 100644 --- a/autosubmit/config/config_parser.py +++ b/autosubmit/config/config_parser.py @@ -14,8 +14,11 @@ class ConfigParserFactory: def __init__(self): pass + + def create_parser(self): - return ConfigParser() + parser = ConfigParser() + return parser class ConfigParser(ConfPar, object): -- GitLab From 
2c5a30087b20ec3e0df2e7d7449ba964ea9d0275 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 3 Oct 2022 16:04:02 +0200 Subject: [PATCH 086/121] setstatus doesn't crash anymore if the id does not exists --- autosubmit/autosubmit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 82e4b44e9..37aa84475 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4467,7 +4467,10 @@ class Autosubmit: if job.status in [Status.SUBMITTED, Status.QUEUING, Status.HELD] and final_status not in [Status.QUEUING, Status.HELD, Status.SUSPENDED]: job.hold = False if job.platform_name and job.platform_name.lower() != "local": - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + try: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + except: + pass elif job.status in [Status.QUEUING, Status.RUNNING, Status.SUBMITTED] and final_status == Status.SUSPENDED: if job.platform_name and job.platform_name.lower() != "local": job.platform.send_command("scontrol hold " + "{0}".format(job.id), ignore_log=True) -- GitLab From 8244e8104c475bfd45b538cb0fe5f52ce2f9f44f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 11:08:39 +0200 Subject: [PATCH 087/121] Fixed e message error --- autosubmit/autosubmit.py | 14 +++++++------- autosubmit/config/config_common.py | 4 ++-- autosubmit/git/autosubmit_git.py | 4 ++-- autosubmit/job/job.py | 8 ++++---- autosubmit/job/job_dict.py | 2 +- autosubmit/job/job_list.py | 6 +++--- autosubmit/monitor/monitor.py | 2 +- autosubmit/platforms/paramiko_platform.py | 16 ++++++++-------- test/regression/tests_utils.py | 2 +- 9 files changed, 29 insertions(+), 29 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 37aa84475..2fca7cb7b 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -590,7 +590,7 @@ class Autosubmit: except Exception as e: 
if type(e) is SystemExit: # Version keyword force an exception in parse arg due and os_exit(0) but the program is succesfully finished - if e.message == 0: + if str(e) == 0: print(Autosubmit.autosubmit_version) os._exit(0) raise AutosubmitCritical( @@ -836,28 +836,28 @@ class Autosubmit: if ret: Log.result("Experiment {0} deleted".format(expid_delete)) except BaseException as e: - error_message += 'Can not delete experiment entry: {0}\n'.format(e.message) + error_message += 'Can not delete experiment entry: {0}\n'.format(str(e)) Log.info("Removing experiment directory...") try: shutil.rmtree(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)) except BaseException as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) try: Log.info("Removing Structure db...") structures_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, "structure_{0}.db".format(expid_delete)) if os.path.exists(structures_path): os.remove(structures_path) except BaseException as e: - error_message += 'Can not delete structure: {0}\n'.format(e.message) + error_message += 'Can not delete structure: {0}\n'.format(str(e)) try: Log.info("Removing job_data db...") job_data_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid_delete)) if os.path.exists(job_data_path): os.remove(job_data_path) except BaseException as e: - error_message += 'Can not delete job_data: {0}\n'.format(e.message) + error_message += 'Can not delete job_data: {0}\n'.format(str(e)) except OSError as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) else: if not eadmin: raise AutosubmitCritical( @@ -1811,7 +1811,7 @@ class Autosubmit: # No need to wait until the remote platform reconnection recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - 
consecutive_retrials = 0 + consecutive_retrials = 1 failed_names = {} Log.info("Storing failed job count...") try: diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 74dcc3e1e..ddbb04c78 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -513,11 +513,11 @@ class AutosubmitConfig(object): self.reload() except IOError as e: raise AutosubmitError( - "I/O Issues con config files", 6016, e.message) + "I/O Issues con config files", 6016, str(e)) except (AutosubmitCritical, AutosubmitError) as e: raise except BaseException as e: - raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,e.message) + raise AutosubmitCritical("Unknown issue while checking the configulation files (check_conf_files)",7040,str(e)) # Annotates all errors found in the configuration files in dictionaries self.warn_config and self.wrong_config. self.check_expdef_conf() self.check_platforms_conf() diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 817b5e09b..c191c21df 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -60,7 +60,7 @@ class AutosubmitGit: shell=True) except subprocess.CalledProcessError as e: raise AutosubmitCritical( - "Failed to retrieve git info ...", 7064, e.message) + "Failed to retrieve git info ...", 7064, str(e)) if output: Log.info("Changes not committed detected... 
SKIPPING!") raise AutosubmitCritical("Commit needed!", 7013) @@ -231,7 +231,7 @@ class AutosubmitGit: output_1 = subprocess.check_output(command_1, shell=True) except BaseException as e: submodule_failure = True - Log.printlog("Trace: {0}".format(e.message), 6014) + Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( "Submodule {0} has a wrong configuration".format(submodule), 6014) else: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9365e516f..6653c51f9 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -600,13 +600,13 @@ class Job(object): self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - e.message, self.name)) + str(e), self.name)) except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) return @threaded @@ -656,7 +656,7 @@ class Job(object): except BaseException as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. 
".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 0b16d29af..d0aef9f42 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -126,7 +126,7 @@ class DicJobs: except BaseException as e: raise AutosubmitCritical( "Wrong format for {1} parameter in section {0}".format(section,called_from), 7011, - e.message) + str(e)) pass return parsed_list def read_section(self, section, priority, default_job_type, jobs_data=dict()): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index b26f24e74..4c2712267 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -249,7 +249,7 @@ class JobList(object): else: self._ordered_jobs_by_date_member[wrapper_section] = {} except BaseException as e: - raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,e.message) + raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,str(e)) pass @@ -1417,11 +1417,11 @@ class JobList(object): self._persistence_file, self._job_list if self.run_members is None or job_list is None else job_list) pass except BaseException as e: - raise AutosubmitError(e.message,6040,"Failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Failure while saving the job_list") except AutosubmitError as e: raise except BaseException as e: - raise AutosubmitError(e.message,6040,"Unknown failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Unknown failure while saving the job_list") def backup_save(self): diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 55c60156a..9556e7d3d 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -353,7 +353,7 @@ class Monitor: except: pass - 
Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only writted in txt".format(e.message),7014) + Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only written in txt".format(e.message),7014) def generate_output_txt(self, expid, joblist, path, classictxt=False, job_list_object=None): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index fb9059915..1c1177510 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -112,7 +112,7 @@ class ParamikoPlatform(Platform): except EOFError as e: self.connected = False raise AutosubmitError("[{0}] not alive. Host: {1}".format( - self.name, self.host), 6002, e.message) + self.name, self.host), 6002, str(e)) except (AutosubmitError,AutosubmitCritical,IOError): self.connected = False raise @@ -136,7 +136,7 @@ class ParamikoPlatform(Platform): self.host.split(',')[0]), 6002) else: raise AutosubmitCritical( - "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,e.message) + "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: self.connect(True) @@ -155,7 +155,7 @@ class ParamikoPlatform(Platform): raise except Exception as e: raise AutosubmitCritical( - 'Cant connect to this platform due an unknown error', 7050, e.message) + 'Cant connect to this platform due an unknown error', 7050, str(e)) def threaded(fn): def wrapper(*args, **kwargs): @@ -219,12 +219,12 @@ class ParamikoPlatform(Platform): elif "name or service not known" in e.strerror.lower(): raise SSHException(" {0} doesn't accept remote connections. 
Check if there is an typo in the hostname".format(self.host)) else: - raise AutosubmitError("File can't be located due an slow connection", 6016, e.message) + raise AutosubmitError("File can't be located due an slow connection", 6016, str(e)) except BaseException as e: self.connected = False - if "Authentication failed." in e.message: + if "Authentication failed." in str(e): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( - self._host_config['hostname']), 7050, e.message) + self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: self.restore_connection(reconnect=True) else: @@ -284,7 +284,7 @@ class ParamikoPlatform(Platform): return True except IOError as e: raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) + self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -358,7 +358,7 @@ class ParamikoPlatform(Platform): except BaseException as e: Log.error('Could not remove file {0} due a wrong configuration'.format( os.path.join(self.get_files_path(), filename))) - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: raise AutosubmitCritical( "Wrong User or invalid .ssh/config. 
Or invalid user in platform.conf or public key not set ", 7051, e.message) diff --git a/test/regression/tests_utils.py b/test/regression/tests_utils.py index 297fb8f75..53ead0dd5 100644 --- a/test/regression/tests_utils.py +++ b/test/regression/tests_utils.py @@ -23,7 +23,7 @@ def check_cmd(command, path=BIN_PATH, verbose='AS_TEST_VERBOSE' in os.environ): except subprocess.CalledProcessError as e: if verbose: - print e.output + print str(e) return False -- GitLab From 5e068242989d2ae25aa4565901a91fcde1e0bf50 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:16:06 +0200 Subject: [PATCH 088/121] log error --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index ddbb04c78..4b683f1e4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1763,7 +1763,10 @@ class AutosubmitConfig(object): def removeInlineComments(cfgparser): for section in cfgparser.sections(): for item in cfgparser.items(section): - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + try: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + except: + pass return cfgparser @staticmethod -- GitLab From ad2a39015db8780ae924e0939effa2ad2473790e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 13:33:32 +0200 Subject: [PATCH 089/121] CUSTOM directive has # crashing with the removeinlinecomments --- autosubmit/config/config_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 4b683f1e4..50c4d69e8 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1764,7 +1764,10 @@ class AutosubmitConfig(object): for section in cfgparser.sections(): for item in cfgparser.items(section): try: - cfgparser.set(section, item[0], item[1].split("#")[0].strip()) + if 
str(item[0]).upper() == "CUSTOM_DIRECTIVES": + pass + else: + cfgparser.set(section, item[0], item[1].split("#")[0].strip()) except: pass return cfgparser -- GitLab From 481dcd9788dc2b1828e3ff8c370cac98ff4737ad Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:01:34 +0200 Subject: [PATCH 090/121] Changed delete message, added complete list of directories --- autosubmit/autosubmit.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 2fca7cb7b..0720672e7 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -818,6 +818,16 @@ class Autosubmit: :return: True if succesfully deleted, False otherwise :rtype: boolean """ + message = "The {0} experiment was removed from the local disk and from the database.".format(expid_delete) + message+= " Note that this action does not delete any data written by the experiment.\n" + message+= "Complete list of files/directories deleted:\n" + for root, dirs, files in os.walk(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): + for dir in dirs: + message += os.path.join(root, dir) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, + "structure_{0}.db".format(expid_delete)) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, + "job_data_{0}.db".format(expid_delete)) + "\n" owner,eadmin,currentOwner = Autosubmit._check_ownership(expid_delete) if expid_delete == '' or expid_delete is None and not os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR,expid_delete)): Log.printlog("Experiment directory does not exist.",Log.WARNING) @@ -865,6 +875,7 @@ class Autosubmit: else: raise AutosubmitCritical( 'Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7012) + Log.printlog(message, Log.RESULT) except Exception as e: # Avoid calling Log at this point since it is possible that tmp folder is already deleted. 
error_message += "Couldn't delete the experiment".format(e.message) -- GitLab From dc6909c7a8c9d9122a9f1ff3207e322f76cb8aca Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 15:19:47 +0200 Subject: [PATCH 091/121] disable inline delete --- autosubmit/config/config_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 50c4d69e8..63b31483d 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1808,6 +1808,6 @@ class AutosubmitConfig(object): "{}\n This file and the correctness of its content are necessary.".format(str(exp))) # parser.read(file_path) #remove inline comments - parser = AutosubmitConfig.removeInlineComments(parser) + #parser = AutosubmitConfig.removeInlineComments(parser) return parser -- GitLab From a929856cfd3e3cddf888178a6fddc8e0ca2dfb88 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 4 Oct 2022 16:11:47 +0200 Subject: [PATCH 092/121] Fixed node missconfiguration slurm message not being detected correclty --- autosubmit/autosubmit.py | 10 ++++++---- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 4 +++- autosubmit/platforms/slurmplatform.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0720672e7..48e5b2e28 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2191,11 +2191,11 @@ class Autosubmit: platform.cancel_job(id) jobs_id = None platform.connected = False - if type(e.trace) is not None: - has_trace_bad_parameters = e.trace.lower().find("bad parameters") != -1 + if e.trace is not None: + has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 else: has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or 
e.message.lower().find("scheduler is not installed") != -1: + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2206,7 +2206,9 @@ class Autosubmit: else: error_message+="Check that {1} platform has set the correct scheduler. Sections that could be affected: {0}".format( error_msg[:-1], platform.name) - raise AutosubmitCritical(error_message,7014,e.message+"\n"+e.trace) + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) except IOError as e: raise AutosubmitError( "IO issues ", 6016, e.message) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 52afa70cc..a3a6a3b58 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -155,7 +155,7 @@ class JobPackageBase(object): exit=True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 1f577426f..12e1e70bc 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -203,6 +203,8 @@ class ParamikoSubmitter(Submitter): if parser.has_option(section, 'SERIAL_PLATFORM'): platforms[section.lower()].serial_platform = 
platforms[parser.get_option(section, 'SERIAL_PLATFORM', - None).lower()] + None)] + if platforms[section.lower()].serial_platform is not None: + platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 5d31690c4..d757256a4 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -466,7 +466,7 @@ class SlurmPlatform(ParamikoPlatform): else: retries = 9999 except BaseException as e: # Unrecoverable error - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 -- GitLab From 93f0a58cf4888cd893b56ca996d46d53b534cacf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 11:16:06 +0200 Subject: [PATCH 093/121] Added include_members and chunks #748 --- autosubmit/job/job_dict.py | 36 ++++++++++++++++++++++++++++++------ test/unit/test_dic_jobs.py | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index d0aef9f42..b7e6b4a6d 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -152,11 +152,19 @@ class DicJobs: elif running == 'date': self._create_jobs_startdate(section, priority, frequency, default_job_type, jobs_data,splits) elif running == 'member': - self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits,self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits, \ + self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + elif running == 'chunk': 
synchronize = self.get_option(section, "SYNCHRONIZE", None) delay = int(self.get_option(section, "DELAY", -1)) - self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data,excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"),excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data, \ + excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"), \ + excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + included_chunks=self.parse_relation(section,False,self.get_option(section, "INCLUDED_CHUNKS", []),"INCLUDED_CHUNKS"), \ + included_members=self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + pass def _create_jobs_once(self, section, priority, default_job_type, jobs_data=dict(),splits=0): @@ -218,7 +226,7 @@ class DicJobs: - def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[]): + def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[],included_members=[]): """ Create jobs to be run once per member @@ -242,11 +250,18 @@ class DicJobs: count = 0 if splits > 0: for member in self._member_list: - if self._member_list.index(member) not in excluded_members: - tmp_dic[section][date][member] = [] + if len(included_members) == 0: + if self._member_list.index(member) not in excluded_members: + tmp_dic[section][date][member] = [] + else: + if self._member_list.index(member) in included_members: + tmp_dic[section][date][member] = [] for member in self._member_list: if 
self._member_list.index(member) in excluded_members: continue + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue count += 1 if count % frequency == 0 or count == len(self._member_list): if splits <= 0: @@ -259,7 +274,7 @@ class DicJobs: - def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[]): + def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]): """ Create jobs to be run once per chunk @@ -282,6 +297,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): @@ -311,6 +329,9 @@ class DicJobs: for date in self._date_list: self._dic[section][date] = dict() for member in self._member_list: + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue if self._member_list.index(member) in excluded_members: continue self._dic[section][date][member] = dict() @@ -318,6 +339,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 5565c9328..39f7690b2 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -123,7 +123,7 @@ class TestDicJobs(TestCase): self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() 
self.dictionary._create_jobs_member.assert_not_called() - self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[]) + self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]) def test_dic_creates_right_jobs_by_startdate(self): # arrange -- GitLab From 7d652f65af6dfd733425417129a1962fe36aa5e9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:17:17 +0200 Subject: [PATCH 094/121] Bugfix timeout #812 --- autosubmit/platforms/locplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 3fe62f5cc..e7734b133 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -83,7 +83,7 @@ class LocalPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From 005443718398f3c7bca61a0eadea85c4339b3987 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:29:32 +0200 Subject: [PATCH 095/121] Erased wrong info about TOTAL_JOBS --- .../html/_sources/usage/new_platform.rst.txt | 2 +- docs/source/userguide/wrappers/index.rst | 73 ++++++------------- 2 files changed, 25 insertions(+), 50 deletions(-) diff --git a/docs/build/html/_sources/usage/new_platform.rst.txt b/docs/build/html/_sources/usage/new_platform.rst.txt index 675d4edc6..971778061 100644 --- a/docs/build/html/_sources/usage/new_platform.rst.txt +++ 
b/docs/build/html/_sources/usage/new_platform.rst.txt @@ -53,7 +53,7 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be queuing or submitted in this platform. +* MAX_WAITING_JOBS: Maximum number of jobs to be queuing or submitted in this platform. * TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. diff --git a/docs/source/userguide/wrappers/index.rst b/docs/source/userguide/wrappers/index.rst index 2ee2a34e1..388c215ef 100644 --- a/docs/source/userguide/wrappers/index.rst +++ b/docs/source/userguide/wrappers/index.rst @@ -1,5 +1,6 @@ +############ Wrappers -======== +############ In order to understand the goal of this feature, please take a look at: https://earth.bsc.es/wiki/lib/exe/fetch.php?media=library:seminars:techniques_to_improve_the_throughput.pptx @@ -13,37 +14,9 @@ At the moment there are 4 types of wrappers that can be used depending on the ex When using the wrapper, it is useful to be able to visualize which packages are being created. So, when executing *autosubmit monitor cxxx*, a dashed box indicates the jobs that are wrapped together in the same job package. -How to configure ----------------- - -In ``autosubmit_cxxx.conf``, regardless of the wrapper type, you need to make sure that the values of the variables **MAXWAITINGJOBS** and **TOTALJOBS** are increased according to the number of jobs expected to be waiting/running at the same time in your experiment. - -For example: - -.. code-block:: ini - - [config] - EXPID = .... - AUTOSUBMIT_VERSION = 3.13.0 - ... - - MAXWAITINGJOBS = 100 - TOTALJOBS = 100 - ... - -and below the [config] block, add the wrapper directive, indicating the wrapper type: - -.. code-block:: ini - - [wrapper] - TYPE = - -You can also specify which job types should be wrapped. 
This can be done using the **JOBS_IN_WRAPPER** parameter. -It is only required for the vertical-mixed type (in which the specified job types will be wrapped together), so if nothing is specified, all jobs will be wrapped. -By default, jobs of the same type will be wrapped together, as long as the constraints are satisfied. Number of jobs in a package -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*************************** .. code-block:: ini @@ -56,7 +29,7 @@ Number of jobs in a package - **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` - - If **MAX_WRAPPED** is not defined, then **TOTALJOBS** is used by default + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. - **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain - If not defined, it considers that **MIN_WRAPPED** is 2. - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. @@ -66,14 +39,14 @@ Number of jobs in a package Wrapper check time -~~~~~~~~~~~~~~~~~~ +********************** It is possible to override the **SAFETYSLEEPTIME** for the wrapper, by using **CHECK_TIME_WRAPPER** and defining a time interval (in seconds) in which the wrapper internal jobs should be checked. .. important:: Note that the **numbers** shown in this documentation are examples. The actual values must be set according to the specific workflow, as well as the platform configurations. Vertical wrapper ----------------- +======================= The vertical wrapper is more appropriate when there are many sequential jobs. 
To use it, set TYPE = vertical: @@ -93,7 +66,7 @@ In order to be able to use the vertical wrapper, in ``platforms_cxxx.conf`` set Remember to add to each job the corresponding WALLCLOCK time. Vertical with multiple sections -------------------------------- +=============================== This is a mode of the vertical wrapper that allows jobs of different types to be wrapped together. Note that the solution considers the order of the sections defined in the ``jobs_cxxx.conf`` file, so the order of the sections given in **JOBS_IN_WRAPPER** is irrelevant. @@ -105,20 +78,23 @@ Additionally, jobs are grouped within the corresponding date, member and chunk h TYPE = vertical JOBS_IN_WRAPPER = SIM&SIM2 # REQUIRED -.. figure:: fig/vertical-mixed.png +.. figure:: ../../workflows/vertical-mixed.png :name: vertical-mixed :width: 100% :align: center :alt: vertical-mixed wrapper Horizontal wrapper ------------------- +========================== The horizontal wrapper is more appropriate when there are multiple ensemble members that can be run in parallel. If the wrapped jobs have an mpirun call, they will need machine files to specify in which nodes each job will run. Different cases may need specific approaches when creating the machine files. For auto-ecearth use COMPONENTS instead of STANDARD. +Horizontal wrapper +********************** + .. code-block:: ini [wrapper] @@ -135,14 +111,14 @@ In order to be able to use the horizontal wrapper, in ``platforms_cxxx.conf`` se ... MAX_PROCESSORS = 2400 -.. figure:: fig/horizontal_remote.png +.. figure:: ../../workflows/horizontal_remote.png :name: horizontal_remote :width: 60% :align: center :alt: horizontally wrapped jobs Shared-memory Experiments -~~~~~~~~~~~~~~~~~~~~~~~~~ +************************* There is also the possibility of setting the option **METHOD** to SRUN in the wrapper directive (**ONLY** for vertical and vertical-horizontal wrappers). 
@@ -155,13 +131,13 @@ This allows to form a wrapper with shared-memory paradigm instead of rely in mac METHOD = srun # default ASTHREAD Hybrid wrapper --------------- +========================== The hybrid wrapper is a wrapper that works both vertically and horizontally at the same time, meaning that members and chunks can be wrapped in one single job. Mixed approach using a combination of horizontal and vertical wrappers and the list of jobs is a list of lists. Horizontal-vertical -------------------- +=========================== - There is a dependency between lists. Each list runs after the previous one finishes; the jobs within the list run in parallel at the same time - It is particularly suitable if there are jobs of different types in the list with different wall clocks, but dependencies between jobs of different lists; it waits for all the jobs in the list to finish before starting the next list @@ -174,7 +150,7 @@ Horizontal-vertical MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM&DA -.. figure:: fig/dasim.png +.. figure:: ../../workflows/dasim.png :name: wrapper_horizontal_vertical :width: 100% :align: center @@ -182,7 +158,7 @@ Horizontal-vertical Vertical-horizontal -------------------- +=========================== - In this approach, each list is independent of each other and run in parallel; jobs within the list run one after the other - It is particularly suitable for running many sequential ensembles @@ -195,15 +171,14 @@ Vertical-horizontal MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM -.. figure:: fig/vertical-horizontal.png +.. figure:: ../../workflows/vertical-horizontal.png :name: wrapper_vertical_horizontal :width: 100% :align: center :alt: hybrid wrapper Multiple wrappers at once -------------------------- - +========================= This is an special mode that allows you to use multiple **independent** wrappers on the same experiment. By using an special variable that allows to define subwrapper sections .. 
code-block:: ini @@ -220,14 +195,14 @@ This is an special mode that allows you to use multiple **independent** wrappers TYPE = vertical JOBS_IN_WRAPPER = DA&REDUCE -.. figure:: fig/multiple_wrappers.png +.. figure:: ../workflows/multiple_wrappers.png :name: :width: 100% :align: center :alt: multi wrapper Summary -------- +========================== In `autosubmit_cxxx.conf`: @@ -238,7 +213,7 @@ In `autosubmit_cxxx.conf`: # JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM # METHOD : Select between MACHINESFILES or Shared-Memory. # MIN_WRAPPED set the minim number of jobs that should be included in the wrapper. DEFAULT = 2 - # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS + # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = 99999999999 # Policy : Select the behaviour of the inner jobs Strict/Flexible/Mixed # EXTEND_WALLCLOCK: Allows to extend the wallclock by the max wallclock of the horizontal package (max inner job). Values are integer units (0,1,2) # RETRIALS : Enables a retrial mechanism for vertical wrappers, or default retrial mechanism for the other wrappers @@ -247,7 +222,7 @@ In `autosubmit_cxxx.conf`: TYPE = Vertical #REQUIRED JOBS_IN_WRAPPER = SIM # Job types (as defined in jobs_cxxx.conf) separated by space. REQUIRED only if vertical-mixed MIN_WRAPPED = 2 - MAX_WRAPPED = 9999 # OPTIONAL. Integer value, overrides TOTALJOBS + MAX_WRAPPED = 999999 # OPTIONAL. Integer value. CHECK_TIME_WRAPPER = # OPTIONAL. 
Time in seconds, overrides SAFETYSLEEPTIME POLICY = flexible # OPTIONAL, Wrapper policy, mixed, flexible, strict QUEUE = bsc_es # If not specified, queue will be the same of the first SECTION specified on JOBS_IN_WRAPPER -- GitLab From 41d91d50d34557006390d826584d68938087effd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 13:38:51 +0200 Subject: [PATCH 096/121] Added wrapper info under devel_proj -> Controling the number of active concurrent tasks in an experiment #857 --- .../userguide/configure/develop_a_project.rst | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst index 17caddcf5..0dda37b3c 100644 --- a/docs/source/userguide/configure/develop_a_project.rst +++ b/docs/source/userguide/configure/develop_a_project.rst @@ -712,8 +712,34 @@ To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS .. code-block:: ini - # Maximum number of submitted,waiting and running tasks - TOTAL_JOBS = 10 - # Maximum number of submitted and waiting tasks - MAX_WAITING_JOBS = 10 + # Controls the maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Controls the maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 +To control the number of jobs included in a wrapper, you can use the `MAX_WRAPPED_JOBS` and `MIN_WRAPPED_JOBS` variables in the ``conf/autosubmit_cxxx.conf`` file. + +Note that a wrapped job is counted as a single job regardless of the number of tasks it contains. Therefore, `TOTAL_JOBS` and `MAX_WAITING_JOBS` won't have an impact inside a wrapper. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + [wrapper] + TYPE = + MIN_WRAPPED = 2 # Minium amount of jobs that will be wrapped together in any given time. + MIN_WRAPPED_H = 2 # Same as above but only for the horizontal packages. 
+ MIN_WRAPPED_V = 2 # Same as above but only for the vertical packages. + MAX_WRAPPED = 99999 # Maximum amount of jobs that will be wrapped together in any given time. + MAX_WRAPPED_H = 99999 # Same as above but only for the horizontal packages. + MAX_WRAPPED_V = 99999 # Same as above but only for the vertical packages. + +- **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section + - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. +- **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain + - If not defined, it considers that **MIN_WRAPPED** is 2. + - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. + - If **POLICY** is mixed and there are failed jobs inside a wrapper, these jobs will be submitted as individual jobs. + - If **POLICY** is strict and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will not be submitted until there are enough tasks to build a package. + - strict and mixed policies can cause **deadlocks**. 
-- GitLab From b1bb2535949f0da796fe23215be0cf3cebd912e2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 5 Oct 2022 16:07:50 +0200 Subject: [PATCH 097/121] Deleted argcomplete --- autosubmit/autosubmit.py | 4 +--- setup.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 48e5b2e28..ccb1bbac9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# PYTHON_ARGCOMPLETE_OK # Copyright 2015-2020 Earth Sciences Department, BSC-CNS @@ -65,7 +64,7 @@ try: except Exception: dialog = None from time import sleep -import argparse, argcomplete +import argparse import subprocess import json import tarfile @@ -583,7 +582,6 @@ class Autosubmit: # Changelog subparsers.add_parser('changelog', description='show changelog') - argcomplete.autocomplete(parser) args = parser.parse_args() diff --git a/setup.py b/setup.py index 8e56eb8c5..a5a7801ef 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','six>=1.10.0','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests'], extras_require={ -- GitLab From b945cdd83dc16d11a0bcd205ae8785a80e47867c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:33:10 +0200 Subject: [PATCH 098/121] Fixed an issue with main_platform = local and no platforms configured --- autosubmit/config/config_common.py | 8 +++++--- autosubmit/history/data_classes/job_data.py | 3 ++- autosubmit/job/job_dict.py | 7 ++++--- 
autosubmit/platforms/psplatform.py | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 63b31483d..26ce6ec50 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -607,9 +607,7 @@ class AutosubmitConfig(object): """ Checks experiment's queues configuration file. """ - if len(self._platforms_parser.sections()) == 0: - self.wrong_config["Platform"] += [["Global", - "Platform file is not well-configured or found"]] + if len(self._platforms_parser.sections()) != len(set(self._platforms_parser.sections())): self.wrong_config["Platform"] += [["Global", @@ -619,7 +617,11 @@ class AutosubmitConfig(object): main_platform_found = True elif self.ignore_undefined_platforms: main_platform_found = True + if len(self._platforms_parser.sections()) == 0 and not main_platform_found: + self.wrong_config["Platform"] += [["Global", + "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): + if section in self.hpcarch: main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index b5249b797..93a88797a 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -57,7 +57,8 @@ class JobData(object): platform) > 0 else "NA" self.job_id = job_id if job_id else 0 try: - self.extra_data_parsed = loads(extra_data) + if extra_data != "": + self.extra_data_parsed = loads(extra_data) except Exception as exp: self.extra_data_parsed = {} # Fail fast self.extra_data = extra_data diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index b7e6b4a6d..29ca59e28 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -402,9 +402,10 @@ class DicJobs: for d in self._date_list: self._get_date(jobs, dic, d, member, 
chunk) try: - if type(jobs[0]) is list: - jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] - jobs = jobs_flattened + if len(jobs) > 0: + if type(jobs[0]) is list: + jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] + jobs = jobs_flattened except BaseException as e: pass return jobs diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index aee3e4eb7..e2c3ede88 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ -76,7 +76,7 @@ class PsPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: -- GitLab From f61d45fa86eed705989935f50aaefc2d5b32e828 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 6 Oct 2022 14:41:04 +0200 Subject: [PATCH 099/121] fixed tests --- requeriments.txt | 1 + test/unit/test_dic_jobs.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requeriments.txt b/requeriments.txt index c34451db2..b5783046b 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,3 +1,4 @@ +pytest==2.9.2 configparser argparse>=1.2,<2 python-dateutil>2 diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py index 39f7690b2..f955f96dc 100644 --- a/test/unit/test_dic_jobs.py +++ b/test/unit/test_dic_jobs.py @@ -81,9 +81,10 @@ class TestDicJobs(TestCase): frequency = 123 splits = 0 excluded_list_m = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='member') - self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m,included_list_m]) 
self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() @@ -95,7 +96,7 @@ class TestDicJobs(TestCase): # assert self.dictionary._create_jobs_once.assert_not_called() self.dictionary._create_jobs_startdate.assert_not_called() - self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m) + self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m,included_list_m) self.dictionary._create_jobs_chunk.assert_not_called() def test_read_section_running_chunk_create_jobs_chunk(self): @@ -108,9 +109,11 @@ class TestDicJobs(TestCase): splits = 0 excluded_list_c = [] excluded_list_m = [] + included_list_c = [] + included_list_m = [] self.parser_mock.has_option = Mock(return_value=True) self.parser_mock.get = Mock(return_value='chunk') - self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m]) + self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m,included_list_c,included_list_m]) self.dictionary._create_jobs_once = Mock() self.dictionary._create_jobs_startdate = Mock() self.dictionary._create_jobs_member = Mock() -- GitLab From 9640d066e2f04a1803cbed5bae83ad8840911410 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 7 Oct 2022 13:10:25 +0200 Subject: [PATCH 100/121] updateversion does not need -v, and now stores the change in the db #882 #881 --- autosubmit/autosubmit.py | 25 +++++++++++++------------ autosubmit/config/config_common.py | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ccb1bbac9..be15c0bec 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -755,18 +755,19 @@ class Autosubmit: force_update_version = args.update_version else: 
force_update_version = False - if force_update_version: - if as_conf.get_version() != Autosubmit.autosubmit_version: - Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", - as_conf.get_version(), Autosubmit.autosubmit_version, expid) - as_conf.set_version(Autosubmit.autosubmit_version) - else: - if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: - raise AutosubmitCritical( - "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" - "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), - Autosubmit.autosubmit_version, expid,args.command), - 7067) + if args.command not in ["upgrade","updateversion"]: + if force_update_version: + if as_conf.get_version() != Autosubmit.autosubmit_version: + Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", + as_conf.get_version(), Autosubmit.autosubmit_version, expid) + as_conf.set_version(Autosubmit.autosubmit_version) + else: + if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: + raise AutosubmitCritical( + "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" + "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), + Autosubmit.autosubmit_version, expid,args.command), + 7067) else: if expid == 'None': exp_id = "" diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 26ce6ec50..c0cacf190 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -35,6 +35,7 @@ from log.log import Log, 
AutosubmitError, AutosubmitCritical from autosubmit.config.basicConfig import BasicConfig from collections import defaultdict +from autosubmit.database.db_common import update_experiment_descrip_version class AutosubmitConfig(object): @@ -1369,6 +1370,7 @@ class AutosubmitConfig(object): content = content.replace(re.search('AUTOSUBMIT_VERSION =.*', content).group(0), "AUTOSUBMIT_VERSION = " + autosubmit_version) open(self._conf_parser_file, 'w').write(content) + update_experiment_descrip_version(self.expid, description=None, version=autosubmit_version) def get_version(self): """ -- GitLab From 8f03d5452f9f2286739597833df98d8a21c6b2c5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 11 Oct 2022 15:33:59 +0200 Subject: [PATCH 101/121] NEW RUN RULES changes #847 --- autosubmit/autosubmit.py | 16 ++++++++---- autosubmit/history/experiment_history.py | 32 +++++++++++++----------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be15c0bec..be6c31665 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -59,10 +59,11 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set import sqlite3 -try: - import dialog -except Exception: - dialog = None +#try: +# import dialog +#except Exception: +# dialog = None +dialog = None from time import sleep import argparse import subprocess @@ -4253,7 +4254,12 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + + #exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + 
exp_history.process_status_changes(job_list.get_job_list(), + chunk_unit=as_conf.get_chunk_size_unit(), + chunk_size=as_conf.get_chunk_size(), + current_config=as_conf.get_full_config_as_json(),create=True) Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index f1e0be68c..ecd06067b 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -190,15 +190,15 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() update_these_changes = self._get_built_list_of_changes(job_list) - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size) + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) if len(update_these_changes) > 0 and should_create_new_run == False: self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: + if should_create_new_run: return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: @@ -217,11 +217,14 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size): - if len(job_list) != current_experiment_run_dc.total: - return True - if changes_count > int(self._get_date_member_completed_count(job_list)): - return True + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): + if create: + return True + elif not create and self.expid[0].lower() == "t": + if len(job_list) != current_experiment_run_dc.total: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)): + return True return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): @@ -274,15 +277,16 @@ class ExperimentHistory(): def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. 
Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {job.name: job for job in job_list} + job_name_to_job = {str(job.name): job for job in job_list} current_job_data_dcs = self.manager.get_all_last_job_data_dcs() differences = [] for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: - if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): - # If the job is not changing from a finalized status to a starting status - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) + if job_dc.job_name in job_name_to_job: + if job_dc.status != job_name_to_job[job_dc.job_name].status_str: + if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): + # If the job is not changing from a finalized status to a starting status + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) return differences def _get_defined_rowtype(self, code): -- GitLab From 99f642cfb19954ced3c9f72c67e15471ac84ed6f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 09:02:40 +0200 Subject: [PATCH 102/121] new run --- autosubmit/history/experiment_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index ecd06067b..96651df99 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -220,7 +220,7 @@ class ExperimentHistory(): def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): if create: return True - elif not create and self.expid[0].lower() == "t": + elif not create and self.expid[0].lower() != "t": if len(job_list) != 
current_experiment_run_dc.total: return True if changes_count > int(self._get_date_member_completed_count(job_list)): -- GitLab From 627f65e5ff5713c0921d03588b3d9c92d66a4080 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 13 Oct 2022 13:05:59 +0200 Subject: [PATCH 103/121] Fixed pipeline tests --- test/unit/test_autosubmit_config.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test/unit/test_autosubmit_config.py b/test/unit/test_autosubmit_config.py index c4c8480df..00e624406 100644 --- a/test/unit/test_autosubmit_config.py +++ b/test/unit/test_autosubmit_config.py @@ -181,13 +181,22 @@ class TestAutosubmitConfig(TestCase): open_mock.assert_any_call(config.experiment_file, 'w') def test_set_version(self): - # arrange + + #ARRANGE + FakeBasicConfig.DB_PATH = 'fake-path' + sys.modules['os'].path.exists = Mock(return_value=True) + connection_mock = Mock() + cursor_mock = Mock() + connection_mock.cursor = Mock(return_value=cursor_mock) + cursor_mock.fetchone = Mock(return_value=[0]) + + sys.modules['sqlite3'].connect = Mock(return_value=connection_mock) config = AutosubmitConfig(self.any_expid, FakeBasicConfig, ConfigParserFactory()) open_mock = mock_open(read_data='AUTOSUBMIT_VERSION = dummy') with patch.object(builtins, "open", open_mock): # act - config.set_version('dummy-vesion') + config.set_version('dummy-version') # assert open_mock.assert_any_call(getattr(config, '_conf_parser_file'), 'w') @@ -461,3 +470,4 @@ class FakeBasicConfig: LOCAL_PROJ_DIR = '/dummy/local/proj/dir' DEFAULT_PLATFORMS_CONF = '' DEFAULT_JOBS_CONF = '' + -- GitLab From 0dbdedd8a79b8cbeaeb33716f6ef21f6d1070dff Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 17 Oct 2022 16:30:31 +0200 Subject: [PATCH 104/121] when creating the experiment, it now also sees if there is a folder named after it.
If there is a folder, the (old) experiment will be registered --- autosubmit/autosubmit.py | 1 - autosubmit/database/db_common.py | 9 ++++++++- autosubmit/experiment/experiment_common.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index be6c31665..1ea5d3a97 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -916,7 +916,6 @@ class Autosubmit: os.mkdir(os.path.join( BasicConfig.LOCAL_ROOT_DIR, exp_id, 'conf')) Log.info("Copying config files...") - # autosubmit config and experiment copied from AS. files = resource_listdir('autosubmit.config', 'files') for filename in files: diff --git a/autosubmit/database/db_common.py b/autosubmit/database/db_common.py index aaaf4875a..47cc770eb 100644 --- a/autosubmit/database/db_common.py +++ b/autosubmit/database/db_common.py @@ -24,7 +24,7 @@ import os import sqlite3 import multiprocessing import Queue - +import autosubmit from log.log import Log, AutosubmitCritical, AutosubmitError Log.get_logger("Autosubmit") from autosubmit.config.basicConfig import BasicConfig @@ -319,6 +319,7 @@ def _check_experiment_exists(name, error_on_inexistence=True): :return: If experiment exists returns true, if not returns false :rtype: bool """ + if not check_db(): return False try: @@ -339,6 +340,12 @@ def _check_experiment_exists(name, error_on_inexistence=True): if error_on_inexistence: raise AutosubmitCritical( 'The experiment name "{0}" does not exist yet!!!'.format(name), 7005) + if os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR, name)): + try: + _save_experiment(name, 'No description', "3.14.0") + except BaseException as e: + pass + return True return False return True diff --git a/autosubmit/experiment/experiment_common.py b/autosubmit/experiment/experiment_common.py index 160f15158..3c31346c2 100644 --- a/autosubmit/experiment/experiment_common.py +++ b/autosubmit/experiment/experiment_common.py @@ -58,7 +58,7 @@ def 
new_experiment(description, version, test=False, operational=False): else: new_name = 'a000' else: - new_name = next_experiment_id(last_exp_name) + new_name = last_exp_name if new_name == '': return '' while db_common.check_experiment_exists(new_name, False): -- GitLab From daa18b8d664ea37e56935ee5889d51c158c3899c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 11:44:25 +0200 Subject: [PATCH 105/121] Added more reasons to a job for stop #837 --- autosubmit/autosubmit.py | 6 ++++-- autosubmit/job/job.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1ea5d3a97..27bda288e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1467,8 +1467,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( - )] + try: + job.platform = submitter.platforms[job.platform_name.lower()] + except: + raise AutosubmitCritical("hpcarch={0} not found in the platforms configuration file".format(job.platform_name), 7014) # noinspection PyTypeChecker if job.status not in (Status.COMPLETED, Status.SUSPENDED): platforms_to_test.add(job.platform) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 6653c51f9..08b39d27c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1257,7 +1257,8 @@ class Job(object): 'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: @@ -1639,7 +1640,8 @@ class WrapperJob(Job): 
'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: -- GitLab From b61915c5fad1763bc28cf312fb3da8d8aad1eb6b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 18 Oct 2022 16:25:49 +0200 Subject: [PATCH 106/121] dbfix pipeline --- test/unit/test_expid.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/test_expid.py b/test/unit/test_expid.py index 85e5a012b..7eee22bfc 100644 --- a/test/unit/test_expid.py +++ b/test/unit/test_expid.py @@ -31,21 +31,21 @@ class TestExpid(TestCase): @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "a006" + current_experiment_id = "a007" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version) self.assertEquals("a007", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_test_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "t0ab" + current_experiment_id = "t0ac" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version, True) self.assertEquals("t0ac", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_operational_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "o112" + current_experiment_id = "o113" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = 
new_experiment(self.description, self.version, False, True) self.assertEquals("o113", experiment_id) -- GitLab From ca7f0ec0e24a4ea2e5420cfabe13838f81ff1a1f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 19 Oct 2022 13:32:13 +0200 Subject: [PATCH 107/121] Recursive submodules --- autosubmit/git/autosubmit_git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index c191c21df..8d194de74 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -223,7 +223,7 @@ class AutosubmitGit: else: command_1 += " git submodule init;".format(project_destination) for submodule in git_project_submodules: - command_1 += " git submodule update {0};".format(submodule) + command_1 += " git submodule update --init --recursive {0};".format(submodule) if git_remote_project_path == '': try: command_1 = "cd {0}; {1} ".format(git_path,command_1) -- GitLab From 622ddee1e527ca57ed5e80627c290f35c338e1dc Mon Sep 17 00:00:00 2001 From: jberlin Date: Fri, 7 Oct 2022 11:24:08 +0200 Subject: [PATCH 108/121] Made small changes to documentation concerning the Conda installation - #864 --- docs/source/installation/index.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 7159ac7c0..157f28ecc 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -160,7 +160,7 @@ Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. autosubmit install # Get expid - autosubmit expid -H TEST -d "Test exp." + autosubmit expid -H local -d "Test exp." # Create with -np # Since it was a new install the expid will be a000 @@ -175,7 +175,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. 
wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh # Launch it chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git + # Download git (if it is not already installed) apt install git -y -q # Download autosubmit git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b @@ -186,4 +186,7 @@ Sequence of instructions to install Autosubmit and its dependencies with conda. conda activate autosubmit # Test autosubmit autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc + # Configure autosubmitrc and install the database as indicated in the installation instructions above this section + +.. hint:: + After installing conda, you may need to close the terminal and re-open it so the installation takes effect. \ No newline at end of file -- GitLab From 986bb585a04780d139eb03dabe34daffccbecbd0 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:17:23 +0200 Subject: [PATCH 109/121] Fixed an issue raised when a platform has no project expid user or scratch dir defined --- autosubmit/autosubmit.py | 7 +++++-- autosubmit/config/config_common.py | 2 +- autosubmit/platforms/paramiko_submitter.py | 19 +++++++++++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 27bda288e..ff24f0967 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -4385,8 +4385,11 @@ class Autosubmit: submitter.load_platforms(as_conf) hpcarch = submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform\nCheck the hpcarch platform configuration inside platform.conf", 7014) - + try: + hpcarch = submitter.platforms[as_conf.get_platform()] + except: + hpcarch = "local" + Log.warning("Remote clone may be disabled due to: "+e.message) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == 
"svn": svn_project_url = as_conf.get_svn_project_url() diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index c0cacf190..ff6f31ea4 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -623,7 +623,7 @@ class AutosubmitConfig(object): "Platform file is not well-configured or found"]] for section in self._platforms_parser.sections(): - if section in self.hpcarch: + if section.lower() in self.hpcarch.lower(): main_platform_found = True if not self._platforms_parser.check_exists(section, 'TYPE'): self.wrong_config["Platform"] += [[section, diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 12e1e70bc..92594abdd 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -20,7 +20,8 @@ import os -from log.log import Log + +from log.log import Log,AutosubmitCritical,AutosubmitError from autosubmit.config.basicConfig import BasicConfig from autosubmit.config.config_common import AutosubmitConfig from submitter import Submitter @@ -72,7 +73,7 @@ class ParamikoSubmitter(Submitter): :return: platforms used by the experiment :rtype: dict """ - + raise_message="" platforms_used = list() hpcarch = asconf.get_platform() platforms_used.append(hpcarch) @@ -191,12 +192,16 @@ class ParamikoSubmitter(Submitter): remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) - remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, - remote_platform.user, remote_platform.expid) + try: + remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, + remote_platform.user, remote_platform.expid) + remote_platform.update_cmds() + platforms[section.lower()] = remote_platform + + except: + raise_message = "Error in platform.conf: SCRATCH_DIR, PROJECT, USER, EXPID must be defined for platform 
{0}".format(section) # Executes update_cmds() from corresponding Platform Object - remote_platform.update_cmds() # Save platform into result dictionary - platforms[section.lower()] = remote_platform for section in parser.sections(): # if this section is included in platforms @@ -208,3 +213,5 @@ class ParamikoSubmitter(Submitter): platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms + if raise_message != "": + raise AutosubmitError(raise_message) -- GitLab From 7fa2231d2ed3388a44acadece94971abd886dc77 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:39:37 +0200 Subject: [PATCH 110/121] test local git not working properly --- autosubmit/autosubmit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ff24f0967..5133dc4ca 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1359,6 +1359,7 @@ class Autosubmit: for wrapper_section in as_conf.get_wrapper_multi(): wrapper_jobs[wrapper_section] = as_conf.get_wrapper_jobs(wrapper_section) wrapper_jobs["wrapper"] = as_conf.get_wrapper_jobs("wrapper") + # Log.warning("Aux Job_list was generated successfully") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) -- GitLab From e5f18727cb3650aaf9af58fe5d6a0d068e6cf9ee Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:39 +0200 Subject: [PATCH 111/121] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ad59f742d..cd56dd095 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0b +#3.14.0b -- GitLab From c99d07e59fe148fcd6ea0ea6404fbc51770bef6f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 27 Oct 2022 16:30:49 +0200 Subject: [PATCH 112/121] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index cd56dd095..ad59f742d 100644 --- a/VERSION +++
b/VERSION @@ -1 +1 @@ -#3.14.0b +3.14.0b -- GitLab From f166cbdce7d6699c03afa403ac74b04cb8784f58 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 3 Nov 2022 09:42:59 +0100 Subject: [PATCH 113/121] Unbound variable fixes in some messages and job_status #893 Fixed an issue with recovery not cancelling all jobs --- autosubmit/autosubmit.py | 17 ++++---- autosubmit/git/autosubmit_git.py | 4 +- autosubmit/job/job.py | 49 ++++++++++++----------- autosubmit/job/job_list.py | 14 +++---- autosubmit/monitor/diagram.py | 12 ++++-- autosubmit/platforms/paramiko_platform.py | 37 ++++++++--------- 6 files changed, 69 insertions(+), 64 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5133dc4ca..cbb4b142d 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2068,7 +2068,6 @@ class Autosubmit: platform.connected = False Log.printlog("[{1}] Connection failed to host {0}".format( platform.host, platform.name),Log.WARNING) if issues != "": - platform.connected = False raise AutosubmitCritical( "Issues while checking the connectivity of platforms.", 7010, issues+"\n"+ssh_config_issues) @@ -2221,7 +2220,7 @@ class Autosubmit: "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: raise AutosubmitError( - "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(e.message,e.trace), 6015) + "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(e),""), 6015) i = 0 if hold: sleep(10) @@ -2677,9 +2676,9 @@ class Autosubmit: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) for platform in platforms_to_test: platform.test_connection() - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) if not force: raise 
AutosubmitCritical( "Experiment can't be recovered due being {0} active jobs in your experiment, If you want to recover the experiment, please use the flag -f and all active jobs will be cancelled".format( @@ -3235,16 +3234,16 @@ class Autosubmit: # Preparation for section parameters no_load_sections = False no_load_platforms = False - try: - job_list = Autosubmit.load_job_list( - expid, as_conf, notransitive=False) - except Exception as e: - no_load_sections = True + + job_list = Autosubmit.load_job_list( + expid, as_conf, notransitive=False) + try: submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) except Exception as e: no_load_platforms = True + submitter = Autosubmit._get_submitter(as_conf) submitter.load_local_platform(as_conf) try: # Gathering parameters of autosubmit and expdef config files @@ -4049,7 +4048,7 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - Autosubmit.unarchive(expid, uncompress=False) + Autosubmit.unarchive(expid, uncompressed=False) raise AutosubmitCritical( "Can not remove or rename experiments folder", 7012, str(e)) diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index 8d194de74..493358ed0 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -203,7 +203,7 @@ class AutosubmitGit: command_0 = "cd {0} ; {1}".format(project_path, command_0) output_0 = subprocess.check_output(command_0, shell=True) else: - command_0 = "cd {0} ; {1}".format(git_remote_path, command_0) + command_0 = "cd {0} ; {1}".format(project_path, command_0) hpcarch.send_command(command_0) ##command 1 if os.path.exists(os.path.join(git_path, ".githooks")): @@ -233,7 +233,7 @@ class AutosubmitGit: submodule_failure = True Log.printlog("Trace: {0}".format(str(e)), 6014) Log.printlog( - "Submodule {0} has a wrong configuration".format(submodule), 6014) + "Submodule has a wrong 
configuration.\n{0}".format(command_1), 6014) else: command_1 = "cd {0}; {1} ".format(git_remote_path, command_1) hpcarch.send_command(command_1) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 08b39d27c..739216c4a 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -624,6 +624,7 @@ class Job(object): as_conf.reload() submitter = self._get_submitter(as_conf) submitter.load_platforms(as_conf) + platform = submitter.platforms[platform_name.lower()] success = True except BaseException as e: error_message = str(e) @@ -632,31 +633,31 @@ class Job(object): count=count+1 if not success: raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) - platform = submitter.platforms[platform_name.lower()] - try: - platform.test_connection() + else: max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count - if self.wrapper_type is not None and self.wrapper_type == "vertical": - found = False - retrials = 0 - while retrials < 3 and not found: - if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): - found = True - retrials = retrials + 1 - for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i)): - last_log = i - else: - break - remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) + try: + platform.test_connection() + if self.wrapper_type is not None and self.wrapper_type == "vertical": + found = False + retrials = 0 + while retrials < 3 and not found: + if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): + found = True + retrials = retrials + 1 + for i in range(max_logs-1,-1,-1): + if platform.check_stat_file_by_retrials(stat_file + str(i)): + last_log = i + else: + break + remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." 
+ str(last_log)) - else: - remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) + else: + remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except BaseException as e: - Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) + except BaseException as e: + Log.printlog( + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 @@ -1730,7 +1731,7 @@ class WrapperJob(Job): self.name, reason), 6009) # while running jobs? self._check_running_jobs() - self.update_failed_jobs(canceled_wrapper=True) + self.update_failed_jobs() self.cancel_failed_wrapper_job() return @@ -1760,8 +1761,8 @@ class WrapperJob(Job): job.hold = self.hold job.status = self.status if self.status == Status.WAITING: - for job in self.job_list: - job.packed = False + for job2 in self.job_list: + job2.packed = False def _check_inner_job_wallclock(self, job): start_time = self.running_jobs_start[job] diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 4c2712267..424332b47 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -437,13 +437,13 @@ class JobList(object): # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - for parent in jobs_by_section: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + for parent2 in jobs_by_section: + if parent2.chunk in dependency.select_chunks_dest[relation_indx] or len( dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - visited_parents.add(parent) + if parent2 not in visited_parents: + job.add_parent(parent2) + JobList._add_edge(graph, job, parent2) + visited_parents.add(parent2) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) @@ -2040,7 +2040,7 @@ class JobList(object): # root exists if root is not None: - result += self._recursion_print(root, 0) + result += self._recursion_print(root, 0,[]) else: result += "\nCannot find root." diff --git a/autosubmit/monitor/diagram.py b/autosubmit/monitor/diagram.py index 8e8753167..b1f0f6744 100644 --- a/autosubmit/monitor/diagram.py +++ b/autosubmit/monitor/diagram.py @@ -66,14 +66,18 @@ def create_bar_diagram(experiment_id, jobs_list, general_stats, output_file, per exp_stats.calculate_statistics() exp_stats.calculate_summary() exp_stats.make_old_format() - failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + # Stats variables definition + normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) + failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) except Exception as exp: + if not isinstance(normal_plots_count,int): + normal_plots_count = 0 + if not isinstance(failed_jobs_plots_count,int): + failed_jobs_plots_count = 0 print(exp) print(traceback.format_exc()) - # Stats variables definition - normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) - 
failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) total_plots_count = normal_plots_count + failed_jobs_plots_count # num_plots = norma # ind = np.arange(int(MAX_JOBS_PER_PLOT)) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 1c1177510..4b5c2d4b4 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -118,7 +118,7 @@ class ParamikoPlatform(Platform): raise except BaseException as e: self.connected = False - raise AutosubmitCritical(message,7051) + raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) def restore_connection(self): @@ -226,7 +226,7 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: - self.restore_connection(reconnect=True) + self.restore_connection() else: raise AutosubmitError( "Couldn't establish a connection to the specified host, wrong configuration?", 6003, e.message) @@ -283,8 +283,8 @@ class ParamikoPlatform(Platform): self._ftpChannel.chmod(remote_path, os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, str(e)) + + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path,filename), code=6004, trace=str(e))) except BaseException as e: raise AutosubmitError( 'Send file failed. 
Connection seems to no be active', 6004) @@ -594,19 +594,20 @@ class ParamikoPlatform(Platform): sleep_time = sleep_time + 5 # URi: define status list in HPC Queue Class else: - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + job_status = job.status + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -989,7 +990,7 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def parse_queue_reason(self, output): + def parse_queue_reason(self, output, job_id): raise NotImplementedError def get_ssh_output(self): -- GitLab From 8a73de91ebf8e3ba3893b573739ffed5d7a54750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Thu, 3 Nov 2022 11:20:24 +0100 Subject: [PATCH 114/121] Change section title --- docs/source/userguide/manage/index.rst | 2 +- docs/source/userguide/run/index.rst | 2 +- 
docs/source/userguide/wrappers/index.rst | 75 ++++++++++++++++-------- 3 files changed, 52 insertions(+), 27 deletions(-) diff --git a/docs/source/userguide/manage/index.rst b/docs/source/userguide/manage/index.rst index 3f4edea72..23fe97a5a 100644 --- a/docs/source/userguide/manage/index.rst +++ b/docs/source/userguide/manage/index.rst @@ -1,4 +1,4 @@ -Manage experiments +Manage Experiments =================== How to clean the experiment diff --git a/docs/source/userguide/run/index.rst b/docs/source/userguide/run/index.rst index 34f937ed7..90f0180bf 100644 --- a/docs/source/userguide/run/index.rst +++ b/docs/source/userguide/run/index.rst @@ -1,4 +1,4 @@ -Running experiments +Running Experiments =================== Run an experiment diff --git a/docs/source/userguide/wrappers/index.rst b/docs/source/userguide/wrappers/index.rst index 388c215ef..e19ddaa1b 100644 --- a/docs/source/userguide/wrappers/index.rst +++ b/docs/source/userguide/wrappers/index.rst @@ -1,6 +1,5 @@ -############ -Wrappers -############ +Configure Wrappers +================== In order to understand the goal of this feature, please take a look at: https://earth.bsc.es/wiki/lib/exe/fetch.php?media=library:seminars:techniques_to_improve_the_throughput.pptx @@ -14,9 +13,37 @@ At the moment there are 4 types of wrappers that can be used depending on the ex When using the wrapper, it is useful to be able to visualize which packages are being created. So, when executing *autosubmit monitor cxxx*, a dashed box indicates the jobs that are wrapped together in the same job package. +How to configure +---------------- + +In ``autosubmit_cxxx.conf``, regardless of the wrapper type, you need to make sure that the values of the variables **MAXWAITINGJOBS** and **TOTALJOBS** are increased according to the number of jobs expected to be waiting/running at the same time in your experiment. + +For example: + +.. code-block:: ini + + [config] + EXPID = .... + AUTOSUBMIT_VERSION = 3.13.0 + ... 
+ + MAXWAITINGJOBS = 100 + TOTALJOBS = 100 + ... + +and below the [config] block, add the wrapper directive, indicating the wrapper type: + +.. code-block:: ini + + [wrapper] + TYPE = + +You can also specify which job types should be wrapped. This can be done using the **JOBS_IN_WRAPPER** parameter. +It is only required for the vertical-mixed type (in which the specified job types will be wrapped together), so if nothing is specified, all jobs will be wrapped. +By default, jobs of the same type will be wrapped together, as long as the constraints are satisfied. Number of jobs in a package -*************************** +~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: ini @@ -29,7 +56,7 @@ Number of jobs in a package - **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` - - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be final factor. + - If **MAX_WRAPPED** is not defined, then **TOTALJOBS** is used by default - **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain - If not defined, it considers that **MIN_WRAPPED** is 2. - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs, as long as the condition is not satisfied. @@ -39,14 +66,14 @@ Number of jobs in a package Wrapper check time -********************** +~~~~~~~~~~~~~~~~~~ It is possible to override the **SAFETYSLEEPTIME** for the wrapper, by using **CHECK_TIME_WRAPPER** and defining a time interval (in seconds) in which the wrapper internal jobs should be checked. .. important:: Note that the **numbers** shown in this documentation are examples. 
The actual values must be set according to the specific workflow, as well as the platform configurations. Vertical wrapper -======================= +---------------- The vertical wrapper is more appropriate when there are many sequential jobs. To use it, set TYPE = vertical: @@ -66,7 +93,7 @@ In order to be able to use the vertical wrapper, in ``platforms_cxxx.conf`` set Remember to add to each job the corresponding WALLCLOCK time. Vertical with multiple sections -=============================== +------------------------------- This is a mode of the vertical wrapper that allows jobs of different types to be wrapped together. Note that the solution considers the order of the sections defined in the ``jobs_cxxx.conf`` file, so the order of the sections given in **JOBS_IN_WRAPPER** is irrelevant. @@ -78,23 +105,20 @@ Additionally, jobs are grouped within the corresponding date, member and chunk h TYPE = vertical JOBS_IN_WRAPPER = SIM&SIM2 # REQUIRED -.. figure:: ../../workflows/vertical-mixed.png +.. figure:: fig/vertical-mixed.png :name: vertical-mixed :width: 100% :align: center :alt: vertical-mixed wrapper Horizontal wrapper -========================== +------------------ The horizontal wrapper is more appropriate when there are multiple ensemble members that can be run in parallel. If the wrapped jobs have an mpirun call, they will need machine files to specify in which nodes each job will run. Different cases may need specific approaches when creating the machine files. For auto-ecearth use COMPONENTS instead of STANDARD. -Horizontal wrapper -********************** - .. code-block:: ini [wrapper] @@ -111,14 +135,14 @@ In order to be able to use the horizontal wrapper, in ``platforms_cxxx.conf`` se ... MAX_PROCESSORS = 2400 -.. figure:: ../../workflows/horizontal_remote.png +.. 
figure:: fig/horizontal_remote.png :name: horizontal_remote :width: 60% :align: center :alt: horizontally wrapped jobs Shared-memory Experiments -************************* +~~~~~~~~~~~~~~~~~~~~~~~~~ There is also the possibility of setting the option **METHOD** to SRUN in the wrapper directive (**ONLY** for vertical and vertical-horizontal wrappers). @@ -131,13 +155,13 @@ This allows to form a wrapper with shared-memory paradigm instead of rely in mac METHOD = srun # default ASTHREAD Hybrid wrapper -========================== +-------------- The hybrid wrapper is a wrapper that works both vertically and horizontally at the same time, meaning that members and chunks can be wrapped in one single job. Mixed approach using a combination of horizontal and vertical wrappers and the list of jobs is a list of lists. Horizontal-vertical -=========================== +------------------- - There is a dependency between lists. Each list runs after the previous one finishes; the jobs within the list run in parallel at the same time - It is particularly suitable if there are jobs of different types in the list with different wall clocks, but dependencies between jobs of different lists; it waits for all the jobs in the list to finish before starting the next list @@ -150,7 +174,7 @@ Horizontal-vertical MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM&DA -.. figure:: ../../workflows/dasim.png +.. figure:: fig/dasim.png :name: wrapper_horizontal_vertical :width: 100% :align: center @@ -158,7 +182,7 @@ Horizontal-vertical Vertical-horizontal -=========================== +------------------- - In this approach, each list is independent of each other and run in parallel; jobs within the list run one after the other - It is particularly suitable for running many sequential ensembles @@ -171,14 +195,15 @@ Vertical-horizontal MACHINEFILES = STANDARD JOBS_IN_WRAPPER = SIM -.. figure:: ../../workflows/vertical-horizontal.png +.. 
figure:: fig/vertical-horizontal.png :name: wrapper_vertical_horizontal :width: 100% :align: center :alt: hybrid wrapper Multiple wrappers at once -========================= +------------------------- + This is an special mode that allows you to use multiple **independent** wrappers on the same experiment. By using an special variable that allows to define subwrapper sections .. code-block:: ini @@ -195,14 +220,14 @@ This is an special mode that allows you to use multiple **independent** wrappers TYPE = vertical JOBS_IN_WRAPPER = DA&REDUCE -.. figure:: ../workflows/multiple_wrappers.png +.. figure:: fig/multiple_wrappers.png :name: :width: 100% :align: center :alt: multi wrapper Summary -========================== +------- In `autosubmit_cxxx.conf`: @@ -213,7 +238,7 @@ In `autosubmit_cxxx.conf`: # JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM # METHOD : Select between MACHINESFILES or Shared-Memory. # MIN_WRAPPED set the minim number of jobs that should be included in the wrapper. DEFAULT = 2 - # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = 99999999999 + # MAX_WRAPPED set the maxim number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS # Policy : Select the behaviour of the inner jobs Strict/Flexible/Mixed # EXTEND_WALLCLOCK: Allows to extend the wallclock by the max wallclock of the horizontal package (max inner job). Values are integer units (0,1,2) # RETRIALS : Enables a retrial mechanism for vertical wrappers, or default retrial mechanism for the other wrappers @@ -222,7 +247,7 @@ In `autosubmit_cxxx.conf`: TYPE = Vertical #REQUIRED JOBS_IN_WRAPPER = SIM # Job types (as defined in jobs_cxxx.conf) separated by space. REQUIRED only if vertical-mixed MIN_WRAPPED = 2 - MAX_WRAPPED = 999999 # OPTIONAL. Integer value. + MAX_WRAPPED = 9999 # OPTIONAL. Integer value, overrides TOTALJOBS CHECK_TIME_WRAPPER = # OPTIONAL. 
Time in seconds, overrides SAFETYSLEEPTIME POLICY = flexible # OPTIONAL, Wrapper policy, mixed, flexible, strict QUEUE = bsc_es # If not specified, queue will be the same of the first SECTION specified on JOBS_IN_WRAPPER -- GitLab From 188893bc781eeb23d6030e0423aa3fced2e59ba4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 3 Nov 2022 13:51:58 +0100 Subject: [PATCH 115/121] mail notifier changes --- .gitignore | 3 +++ autosubmit/helpers/utils.py | 1 - autosubmit/notifications/mail_notifier.py | 1 + autosubmit/platforms/pbsplatform.py | 3 ++- autosubmit/statistics/utils.py | 3 +-- 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index ea136b3a0..ae96f6d2d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ autosubmit/simple_test.py .vscode/ .vscode autosubmit.egg-info/ +venv/ +.pytest_cache/ +.cache/ \ No newline at end of file diff --git a/autosubmit/helpers/utils.py b/autosubmit/helpers/utils.py index 1a6d8e763..0ce27ab8a 100644 --- a/autosubmit/helpers/utils.py +++ b/autosubmit/helpers/utils.py @@ -7,7 +7,6 @@ from typing import Tuple def check_experiment_ownership(expid, basic_config, raise_error=False, logger=None): #Logger variable is not needed, LOG is global thus it will be read if avaliable - # type: (str, BasicConfig, bool, Log) -> Tuple[bool, bool, str] my_user_ID = os.getuid() current_owner_ID = 0 current_owner_name = "NA" diff --git a/autosubmit/notifications/mail_notifier.py b/autosubmit/notifications/mail_notifier.py index 53048138f..f774d1ed2 100644 --- a/autosubmit/notifications/mail_notifier.py +++ b/autosubmit/notifications/mail_notifier.py @@ -31,6 +31,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] Warning a remote platform is malfunctioning' + for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) try: diff --git a/autosubmit/platforms/pbsplatform.py 
b/autosubmit/platforms/pbsplatform.py index 961bb4534..089856395 100644 --- a/autosubmit/platforms/pbsplatform.py +++ b/autosubmit/platforms/pbsplatform.py @@ -18,6 +18,7 @@ # along with Autosubmit. If not, see . import os +from log.log import Log, AutosubmitCritical, AutosubmitError from autosubmit.platforms.paramiko_platform import ParamikoPlatform from log.log import Log @@ -49,7 +50,7 @@ class PBSPlatform(ParamikoPlatform): self._header = Pbs12Header() else: Log.error('PBS version {0} not supported'.format(version)) - raise HPCPlatformException('PBS version {0} not supported'.format(version)) + raise AutosubmitError('PBS version {0} not supported'.format(version)) self.job_status = dict() self.job_status['COMPLETED'] = ['F', 'E', 'c', 'C'] diff --git a/autosubmit/statistics/utils.py b/autosubmit/statistics/utils.py index 465740187..765994c9e 100644 --- a/autosubmit/statistics/utils.py +++ b/autosubmit/statistics/utils.py @@ -38,8 +38,7 @@ def timedelta2hours(deltatime): def parse_number_processors(processors_str): """ Defaults to 1 in case of error """ - # type: (str) -> int - if ':' in processors_str: + if ':' in processors_str: components = processors_str.split(":") processors = int(sum( [math.ceil(float(x) / 36.0) * 36.0 for x in components])) -- GitLab From f594cc4dff9812a8f7a75c00b20d15b5c99f7ffc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 2 Nov 2022 15:39:18 +0100 Subject: [PATCH 116/121] mail notifier changes (2) --- autosubmit/notifications/mail_notifier.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autosubmit/notifications/mail_notifier.py b/autosubmit/notifications/mail_notifier.py index f774d1ed2..ade319601 100644 --- a/autosubmit/notifications/mail_notifier.py +++ b/autosubmit/notifications/mail_notifier.py @@ -31,6 +31,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] Warning a remote platform is malfunctioning' 
+ message['Date'] = email.utils.formatdate(localtime=True) for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) @@ -43,6 +44,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] The job {0} status has changed to {1}'.format(job_name, str(status)) + message['Date'] = email.utils.formatdate(localtime=True) for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) try: -- GitLab From 8490d2a084a99ff483973067b279679b89e5ac9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Mon, 7 Nov 2022 15:16:45 +0100 Subject: [PATCH 117/121] Correct figure path for wrapper section --- .../userguide/{run => wrappers}/fig/dasim.png | Bin .../{run => wrappers}/fig/horizontal_remote.png | Bin .../{run => wrappers}/fig/multiple_wrappers.png | Bin .../userguide/{run => wrappers}/fig/rerun.png | Bin .../{run => wrappers}/fig/vertical-horizontal.png | Bin .../{run => wrappers}/fig/vertical-mixed.png | Bin 6 files changed, 0 insertions(+), 0 deletions(-) rename docs/source/userguide/{run => wrappers}/fig/dasim.png (100%) rename docs/source/userguide/{run => wrappers}/fig/horizontal_remote.png (100%) rename docs/source/userguide/{run => wrappers}/fig/multiple_wrappers.png (100%) rename docs/source/userguide/{run => wrappers}/fig/rerun.png (100%) rename docs/source/userguide/{run => wrappers}/fig/vertical-horizontal.png (100%) rename docs/source/userguide/{run => wrappers}/fig/vertical-mixed.png (100%) diff --git a/docs/source/userguide/run/fig/dasim.png b/docs/source/userguide/wrappers/fig/dasim.png similarity index 100% rename from docs/source/userguide/run/fig/dasim.png rename to docs/source/userguide/wrappers/fig/dasim.png diff --git a/docs/source/userguide/run/fig/horizontal_remote.png b/docs/source/userguide/wrappers/fig/horizontal_remote.png similarity index 100% rename from 
docs/source/userguide/run/fig/horizontal_remote.png rename to docs/source/userguide/wrappers/fig/horizontal_remote.png diff --git a/docs/source/userguide/run/fig/multiple_wrappers.png b/docs/source/userguide/wrappers/fig/multiple_wrappers.png similarity index 100% rename from docs/source/userguide/run/fig/multiple_wrappers.png rename to docs/source/userguide/wrappers/fig/multiple_wrappers.png diff --git a/docs/source/userguide/run/fig/rerun.png b/docs/source/userguide/wrappers/fig/rerun.png similarity index 100% rename from docs/source/userguide/run/fig/rerun.png rename to docs/source/userguide/wrappers/fig/rerun.png diff --git a/docs/source/userguide/run/fig/vertical-horizontal.png b/docs/source/userguide/wrappers/fig/vertical-horizontal.png similarity index 100% rename from docs/source/userguide/run/fig/vertical-horizontal.png rename to docs/source/userguide/wrappers/fig/vertical-horizontal.png diff --git a/docs/source/userguide/run/fig/vertical-mixed.png b/docs/source/userguide/wrappers/fig/vertical-mixed.png similarity index 100% rename from docs/source/userguide/run/fig/vertical-mixed.png rename to docs/source/userguide/wrappers/fig/vertical-mixed.png -- GitLab From bc2037db1d10b64b922f3c9f669bd8d24db73afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Tue, 8 Nov 2022 11:00:14 +0100 Subject: [PATCH 118/121] Correct figure path for defining a workflow section --- .../fig/dashed.png | Bin .../fig/date-synchronize.png | Bin .../fig/dependencies_previous.png | Bin .../fig/dependencies_running.png | Bin .../fig/experiment_delay_doc.png | Bin .../fig/frequency.png | Bin .../fig/member-synchronize.png | Bin .../fig/no-synchronize.png | Bin .../fig/running.png | Bin .../fig/select_chunks.png | Bin .../fig/select_members.png | Bin .../fig/simple.png | Bin .../{configure => defining workflows}/fig/skip.png | Bin .../{configure => defining workflows}/fig/split.png | Bin 14 files changed, 0 insertions(+), 0 deletions(-) rename 
docs/source/userguide/{configure => defining workflows}/fig/dashed.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/date-synchronize.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/dependencies_previous.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/dependencies_running.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/experiment_delay_doc.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/frequency.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/member-synchronize.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/no-synchronize.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/running.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/select_chunks.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/select_members.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/simple.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/skip.png (100%) rename docs/source/userguide/{configure => defining workflows}/fig/split.png (100%) diff --git a/docs/source/userguide/configure/fig/dashed.png b/docs/source/userguide/defining workflows/fig/dashed.png similarity index 100% rename from docs/source/userguide/configure/fig/dashed.png rename to docs/source/userguide/defining workflows/fig/dashed.png diff --git a/docs/source/userguide/configure/fig/date-synchronize.png b/docs/source/userguide/defining workflows/fig/date-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/date-synchronize.png rename to docs/source/userguide/defining workflows/fig/date-synchronize.png diff --git a/docs/source/userguide/configure/fig/dependencies_previous.png b/docs/source/userguide/defining workflows/fig/dependencies_previous.png 
similarity index 100% rename from docs/source/userguide/configure/fig/dependencies_previous.png rename to docs/source/userguide/defining workflows/fig/dependencies_previous.png diff --git a/docs/source/userguide/configure/fig/dependencies_running.png b/docs/source/userguide/defining workflows/fig/dependencies_running.png similarity index 100% rename from docs/source/userguide/configure/fig/dependencies_running.png rename to docs/source/userguide/defining workflows/fig/dependencies_running.png diff --git a/docs/source/userguide/configure/fig/experiment_delay_doc.png b/docs/source/userguide/defining workflows/fig/experiment_delay_doc.png similarity index 100% rename from docs/source/userguide/configure/fig/experiment_delay_doc.png rename to docs/source/userguide/defining workflows/fig/experiment_delay_doc.png diff --git a/docs/source/userguide/configure/fig/frequency.png b/docs/source/userguide/defining workflows/fig/frequency.png similarity index 100% rename from docs/source/userguide/configure/fig/frequency.png rename to docs/source/userguide/defining workflows/fig/frequency.png diff --git a/docs/source/userguide/configure/fig/member-synchronize.png b/docs/source/userguide/defining workflows/fig/member-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/member-synchronize.png rename to docs/source/userguide/defining workflows/fig/member-synchronize.png diff --git a/docs/source/userguide/configure/fig/no-synchronize.png b/docs/source/userguide/defining workflows/fig/no-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/no-synchronize.png rename to docs/source/userguide/defining workflows/fig/no-synchronize.png diff --git a/docs/source/userguide/configure/fig/running.png b/docs/source/userguide/defining workflows/fig/running.png similarity index 100% rename from docs/source/userguide/configure/fig/running.png rename to docs/source/userguide/defining workflows/fig/running.png diff --git 
a/docs/source/userguide/configure/fig/select_chunks.png b/docs/source/userguide/defining workflows/fig/select_chunks.png similarity index 100% rename from docs/source/userguide/configure/fig/select_chunks.png rename to docs/source/userguide/defining workflows/fig/select_chunks.png diff --git a/docs/source/userguide/configure/fig/select_members.png b/docs/source/userguide/defining workflows/fig/select_members.png similarity index 100% rename from docs/source/userguide/configure/fig/select_members.png rename to docs/source/userguide/defining workflows/fig/select_members.png diff --git a/docs/source/userguide/configure/fig/simple.png b/docs/source/userguide/defining workflows/fig/simple.png similarity index 100% rename from docs/source/userguide/configure/fig/simple.png rename to docs/source/userguide/defining workflows/fig/simple.png diff --git a/docs/source/userguide/configure/fig/skip.png b/docs/source/userguide/defining workflows/fig/skip.png similarity index 100% rename from docs/source/userguide/configure/fig/skip.png rename to docs/source/userguide/defining workflows/fig/skip.png diff --git a/docs/source/userguide/configure/fig/split.png b/docs/source/userguide/defining workflows/fig/split.png similarity index 100% rename from docs/source/userguide/configure/fig/split.png rename to docs/source/userguide/defining workflows/fig/split.png -- GitLab From 9b32745ff8eae7f953ada2e27a2a05105a91035b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Fri, 28 Jul 2023 10:15:18 +0200 Subject: [PATCH 119/121] translate to as4 extended header/tailer --- autosubmit/job/job.py | 45 +++++++++++++++++++++++++++++++++++- autosubmit/job/job_common.py | 10 ++++++++ autosubmit/job/job_dict.py | 3 +++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 47565d3b9..7b38eb744 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -211,6 +211,8 @@ class Job(object): self._export = 
"none" self._dependencies = [] self.running = "once" + self.ext_header_path = "" + self.ext_tailer_path = "" self.start_time = None self.edge_info = dict() self.total_jobs = None @@ -837,6 +839,43 @@ class Job(object): retrials_list.insert(0, retrial_dates) return retrials_list + def read_header_tailer_script(self, script_path: str, as_conf: AutosubmitConfig): + """ + Opens and reads a script. If it is not a BASH script it will fail :( + + Will strip away the line with the hash bang (#!) + + :param script_path: relative to the experiment directory path to the script + :param as_conf: Autosubmit configuration file + """ + script_name = script_path.rsplit("/")[-1] # pick the name of the script for a more verbose error + script = '' + if script_path == '': + return script + + try: + script_file = open(os.path.join(as_conf.get_project_dir(), script_path), 'r') + except Exception as e: # log + # We stop Autosubmit if we don't find the script + raise AutosubmitCritical("Extended script: failed to fetch {0} \n".format(str(e)), 7014) + + for line in script_file: + if "#!" 
not in line: + script += line + else: + # check if the type of the script matches the one in the extended + if "bash" in line: + if self.type != Type.BASH: + raise AutosubmitCritical("Extended script: script {0} seems BASH but job {1} isn't\n".format(script_name, self.script_name), 7011) + elif "Rscript" in line: + if self.type != Type.R: + raise AutosubmitCritical("Extended script: script {0} seems Rscript but job {1} isn't\n".format(script_name, self.script_name), 7011) + elif "python" in line: + if self.type not in (Type.PYTHON, Type.PYTHON2, Type.PYTHON3): + raise AutosubmitCritical("Extended script: script {0} seems Python but job {1} isn't\n".format(script_name, self.script_name), 7011) + + return script + def retrieve_logfiles_unthreaded(self, copy_remote_logs, local_logs): remote_logs = (self.script_name + ".out."+str(self.fail_count), self.script_name + ".err."+str(self.fail_count)) out_exist = False @@ -1251,7 +1290,7 @@ class Job(object): parameters['CURRENT_LOGDIR'] = job_platform.get_files_path() return parameters - def update_platform_associated_parameters(self,as_conf, parameters, job_platform, chunk): + def update_platform_associated_parameters(self, as_conf, parameters, job_platform, chunk): self.executable = str(as_conf.jobs_data[self.section].get("EXECUTABLE", as_conf.platforms_data.get(job_platform.name,{}).get("EXECUTABLE",""))) self.total_jobs = int(as_conf.jobs_data[self.section].get("TOTALJOBS", job_platform.total_jobs)) self.max_waiting_jobs = int(as_conf.jobs_data[self.section].get("MAXWAITINGJOBS", job_platform.max_waiting_jobs)) @@ -1313,6 +1352,10 @@ class Job(object): parameters['SCRATCH_FREE_SPACE'] = self.scratch_free_space parameters['CUSTOM_DIRECTIVES'] = self.custom_directives parameters['HYPERTHREADING'] = self.hyperthreading + # memory issues? 
We are storing the whole extended script as a string + if as_conf.get_project_type() != "none": + parameters['EXTENDED_HEADER'] = self.read_header_tailer_script(self.ext_header_path, as_conf) + parameters['EXTENDED_TAILER'] = self.read_header_tailer_script(self.ext_tailer_path, as_conf) parameters['CURRENT_QUEUE'] = self.queue return parameters diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index 4d05d985c..e5294a1b6 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -127,6 +127,11 @@ class StatisticsSnippetBash: set -xuve job_name_ptrn='%CURRENT_LOGDIR%/%JOBNAME%' echo $(date +%s) > ${job_name_ptrn}_STAT + + ################## + # Extended header + ################## + %EXTENDED_HEADER% ################### # Autosubmit job @@ -138,6 +143,11 @@ class StatisticsSnippetBash: def as_tailer(): return textwrap.dedent("""\ + ################### + # Extended tailer + ################### + %EXTENDED_TAILER% + ################### # Autosubmit tailer ################### diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index e2f673563..51c34023c 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -425,6 +425,9 @@ class DicJobs: job.running = str(parameters[section].get( 'RUNNING', 'once')) job.x11 = str(parameters[section].get( 'X11', False )).lower() job.skippable = str(parameters[section].get( "SKIPPABLE", False)).lower() + # we make the empty string as default value, case is not present in the config + job.ext_header_path = str(parameters[section].get("EXTENDED_HEADER_PATH", "")) + job.ext_tailer_path = str(parameters[section].get("EXTENDED_TAILER_PATH", "")) self._jobs_list.get_job_list().append(job) return job -- GitLab From e21d6d28caca4be28451ba41a5a46d2bc21174c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Wed, 9 Aug 2023 15:22:31 +0200 Subject: [PATCH 120/121] Test header and tailer on Autosubmit 4 and mocking write --- 
test/unit/test_job.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/unit/test_job.py b/test/unit/test_job.py index caaf9c60a..a4e3b6cf6 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -191,6 +191,33 @@ class TestJob(TestCase): write_mock.write.assert_called_with(b'some-content: 999, 777, 666 % %') chmod_mock.assert_called_with(os.path.join(self.job._tmp_path, self.job.name + '.cmd'), 0o755) + def test_create_header_tailer_script(self): + # arrange + header_script = '#!/usr/bin/bash\necho "Header test"\n' + tailer_script = '#!/usr/bin/bash\necho "Tailer test"\n' + # the full script in its binary form + expected_script = b'#!/usr/bin/bash\necho "Header test"\n\nsome-content\n#!/usr/bin/bash\necho "Tailer test"\n' + self.job.parameters = dict() + self.job.parameters['EXTENDED_HEADER'] = header_script + self.job.parameters['EXTENDED_TAILER'] = tailer_script + self.job.type = 0 # Type.BASH + self.job.update_content = Mock(return_value=('%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%', ['%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%'])) + # self.job.update_parameters = Mock(return_value=self.job.parameters) + # create an autosubmit config + config = Mock(spec=AutosubmitConfig) + + # mock parts to write the file + sys.modules['os'].chmod = Mock() + write_mock = Mock().write = Mock() + open_mock = Mock(return_value=write_mock) + # here we replace (patch) the "open" function with our mocked one >:) + with patch.object(builtins, "open", open_mock): + # act + self.job.create_script(config) + + # assert + write_mock.write.assert_called_with(expected_script) + def test_that_check_script_returns_false_when_there_is_an_unbound_template_variable(self): # arrange update_content_mock = Mock(return_value=('some-content: %UNBOUND%','some-content: %UNBOUND%')) -- GitLab From 985510c98ddd45513b44395a2b037686b9ebe669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Gim=C3=A9nez=20de=20Castro?= Date: Wed, 9 Aug 2023 
15:41:19 +0200 Subject: [PATCH 121/121] Test header and tailer on Autosubmit 4 and mocking write --- test/unit/test_job.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/test/unit/test_job.py b/test/unit/test_job.py index a4e3b6cf6..02d906983 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -192,6 +192,12 @@ class TestJob(TestCase): chmod_mock.assert_called_with(os.path.join(self.job._tmp_path, self.job.name + '.cmd'), 0o755) def test_create_header_tailer_script(self): + """ + Here we are testing if the parameters from the dictionary are properly read and then regex acts correctly. + + The internal logic to avoid header/tailer of a different type on a R or Python script is not tested. We skip + all of that when we do the content_update mock. + """ # arrange header_script = '#!/usr/bin/bash\necho "Header test"\n' tailer_script = '#!/usr/bin/bash\necho "Tailer test"\n' @@ -200,16 +206,17 @@ class TestJob(TestCase): self.job.parameters = dict() self.job.parameters['EXTENDED_HEADER'] = header_script self.job.parameters['EXTENDED_TAILER'] = tailer_script - self.job.type = 0 # Type.BASH - self.job.update_content = Mock(return_value=('%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%', ['%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%'])) + # We mock the function that returns the template script it is tuple because that is what the + # function outputs. 
We don't care about additional templates, hence the empty list :D + self.job.update_content = Mock(return_value=('%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%', [])) # self.job.update_parameters = Mock(return_value=self.job.parameters) # create an autosubmit config config = Mock(spec=AutosubmitConfig) # mock parts to write the file - sys.modules['os'].chmod = Mock() + sys.modules['os'].chmod = Mock() # needed cuz AS changes the permission on file write_mock = Mock().write = Mock() - open_mock = Mock(return_value=write_mock) + open_mock = Mock(return_value=write_mock) # so that we don't try to open a file that does not exist # here we replace (patch) the "open" function with our mocked one >:) with patch.object(builtins, "open", open_mock): # act -- GitLab