From 7a1b443915926cce22440997a2c14c74be38d66d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 9 Oct 2020 16:14:00 +0200 Subject: [PATCH 01/26] More changes --- autosubmit/job/job.py | 61 +++++++++++++++--------------- autosubmit/job/job_packager.py | 3 +- autosubmit/platforms/psplatform.py | 24 ------------ docs/source/usage/wrappers.rst | 10 ++--- 4 files changed, 37 insertions(+), 61 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index b768766a4..746c6c56d 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -525,55 +525,49 @@ class Job(object): Log.printlog("{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message,self.name), 6001) out_exist = False err_exist = False - retries = 3 + retries = 20 sleeptime = 0 i = 0 sleep(20) + no_continue = False try: while (not out_exist and not err_exist) and i < retries: try: - try: - out_exist = self.platform.check_file_exists(remote_logs[0]) # will do 5 retries - except IOError as e: - out_exist = False - try: - err_exist = self.platform.check_file_exists(remote_logs[1]) # will do 5 retries - except IOError as e: - err_exists = False - except Exception as e: + out_exist = self.platform.check_file_exists(remote_logs[0]) # will do 5 retries + except IOError as e: out_exist = False - err_exist = False - pass + try: + err_exist = self.platform.check_file_exists(remote_logs[1]) # will do 5 retries + except IOError as e: + err_exists = False if not out_exist or not err_exist: sleeptime = sleeptime + 5 i = i + 1 sleep(sleeptime) if i >= retries: if not out_exist or not err_exist: - Log.printlog("Retries = {0}, Failed to retrieve log files {1} and {2}".format(retries,remote_logs[0],remote_logs[1]), 6001) - - + raise AutosubmitError("Failed to retrieve log files {1} and {2}".format(retries,remote_logs[0],remote_logs[1]), 6001) if copy_remote_logs: if local_logs != remote_logs: # unifying names for log files self.synchronize_logs(self.platform, 
remote_logs, local_logs) remote_logs = local_logs self.platform.get_logs_files(self.expid, remote_logs) - # Update the logs with Autosubmit Job Id Brand - try: - for local_log in local_logs: - self.platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.printlog("Trace {0} \n Failed to write the {1}".format(e.message,self.name), 6001) + # Update the logs with Autosubmit Job Id Brand + try: + for local_log in local_logs: + self.platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except BaseException as e: + raise AutosubmitError("Trace {0} \n Failed to write the {1}".format(e.message,self.name), 6001) except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format(e.message,self.name), 6001) + sleep(5) # safe wait before end a thread + return except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format(e.message,self.name), 6001) - try: - self.platform.closeConnection() - except: - pass + sleep(5) # safe wait before end a thread + return sleep(5) # safe wait before end a thread return @@ -611,7 +605,6 @@ class Job(object): self.platform.get_completed_files(self.name) self.check_completion() if self.status == Status.COMPLETED: - Log.printlog(" there is a COMPLETED file.",3000) Log.result("Job {0} is COMPLETED", self.name) else: self.update_children_status() @@ -836,7 +829,7 @@ class Job(object): template = template_file.read() else: if self.type == Type.BASH: - template = 'sleep 60' + template = 'sleep 70' elif self.type == Type.PYTHON: template = 'time.sleep(5)' elif self.type == Type.R: @@ -1151,6 +1144,7 @@ class WrapperJob(Job): # save start time, wallclock and processors?! 
self.checked_time = datetime.datetime.now() self.hold = hold + self.inner_jobs_running = dict() def _queuing_reason_cancel(self, reason): try: @@ -1189,8 +1183,14 @@ class WrapperJob(Job): # Fail can come from check function or running/completed checkers. if self.status in [Status.FAILED, Status.UNKNOWN]: self.status = Status.FAILED - self.cancel_failed_wrapper_job() - self.update_failed_jobs() + self._check_running_jobs() + still_running = False + for job in self.inner_jobs_running.keys(): + if job.status == Status.RUNNING: + still_running = True + if not still_running: + self.cancel_failed_wrapper_job() + self.update_failed_jobs() def check_inner_jobs_completed(self, jobs): not_completed_jobs = [ @@ -1344,7 +1344,7 @@ done if retries == 0 or over_wallclock: self.status = Status.FAILED - + self.inner_jobs_running = not_finished_jobs_dict def _check_finished_job(self, job): wait = 2 retries = 5 @@ -1374,6 +1374,7 @@ done Log.printlog("Cancelling job with id {0}".format(self.id),6009) self.platform.send_command( self.platform.cancel_cmd + " " + str(self.id)) + # If there are jobs running, let them finish TODO def _update_completed_jobs(self): for job in self.job_list: diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 88075d990..005e17e78 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -343,8 +343,7 @@ class JobPackager(object): jobs_section = dict() for job in jobs_list: # This iterator will always return None if there is no '&' defined in the section name - section = next( - (s for s in sections_split if job.section in s and '&' in s), None) + section = next((s for s in sections_split if job.section in s and '&' in s), None) if section is None: section = job.section if section not in jobs_section: diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index e8981eec8..9861db07e 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ 
-79,27 +79,3 @@ class PsPlatform(ParamikoPlatform): def get_checkjob_cmd(self, job_id): return self.get_pscall(job_id) - # def connect(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True - # def restore_connection(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True - # def test_connection(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True \ No newline at end of file diff --git a/docs/source/usage/wrappers.rst b/docs/source/usage/wrappers.rst index c961f1a19..6f52f0b2c 100644 --- a/docs/source/usage/wrappers.rst +++ b/docs/source/usage/wrappers.rst @@ -101,7 +101,7 @@ Additionally, jobs are grouped within the corresponding date, member and chunk h [wrapper] TYPE = vertical-mixed - JOBS_IN_WRAPPER = # REQUIRED + JOBS_IN_WRAPPER = # REQUIRED Horizontal wrapper @@ -157,7 +157,7 @@ Horizontal-vertical [wrapper] TYPE = horizontal-vertical MACHINEFILES = STANDARD - JOBS_IN_WRAPPER = SIM POST + JOBS_IN_WRAPPER = SIM&POST .. figure:: ../workflows/horizontal-vertical.png :name: wrapper_horizontal_vertical @@ -178,7 +178,7 @@ Vertical-horizontal [wrapper] TYPE = vertical-horizontal MACHINEFILES = STANDARD - JOBS_IN_WRAPPER = SIM POST + JOBS_IN_WRAPPER = SIM&POST .. figure:: ../workflows/vertical-horizontal.png :name: wrapper_vertical_horizontal @@ -234,7 +234,7 @@ Considering a very simple workflow with the configurations as follows: [wrapper] TYPE = vertical-mixed - JOBS_IN_WRAPPER = SIM POST + JOBS_IN_WRAPPER = SIM&POST .. figure:: ../workflows/wrapper.png @@ -250,7 +250,7 @@ Horizontal wrapper with remote dependencies [wrapper] TYPE = horizontal - JOBS_IN_WRAPPER = SIM POST + JOBS_IN_WRAPPER = SIM&POST .. 
figure:: ../workflows/horizontal_remote.png :name: horizontal_remote -- GitLab From 8d2e2634dca0f398b9db0e705d14e6cf38abaa00 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 13 Oct 2020 09:45:08 +0200 Subject: [PATCH 02/26] fixed checker while working with local templates --- autosubmit/autosubmit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 94b0da491..d86a9ffc9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1129,6 +1129,7 @@ class Autosubmit: :return: Nothing\n :rtype: \n """ + job_list._job_list = jobs_filtered # Current choice is Paramiko Submitter submitter = Autosubmit._get_submitter(as_conf) @@ -3149,7 +3150,9 @@ class Autosubmit: expand_list=expand, expanded_status=status) groups_dict = job_grouping.group_jobs() # WRAPPERS + if as_conf.get_wrapper_type() != 'none' and check_wrappers: + as_conf.check_conf_files(True) packages_persistence = JobPackagePersistence( os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid) packages_persistence.reset_table(True) -- GitLab From 0c2b860d562db4183b9eaafe5ffde07c76236435 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 14 Oct 2020 14:55:03 +0200 Subject: [PATCH 03/26] Changed the way of handle failed jobs --- autosubmit/job/job.py | 2 +- autosubmit/platforms/wrappers/wrapper_builder.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 746c6c56d..44719402e 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1185,7 +1185,7 @@ class WrapperJob(Job): self.status = Status.FAILED self._check_running_jobs() still_running = False - for job in self.inner_jobs_running.keys(): + for job in self.inner_jobs_running.values(): if job.status == Status.RUNNING: still_running = True if not still_running: diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py 
index 4d295bb80..b19a1d3d6 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -130,6 +130,19 @@ class PythonWrapperBuilder(WrapperBuilder): from math import ceil from collections import OrderedDict import copy + class Unbuffered(object): + def __init__(self, stream): + self.stream = stream + def write(self, data): + self.stream.write(data) + self.stream.flush() + def writelines(self, datas): + self.stream.writelines(datas) + self.stream.flush() + def __getattr__(self, attr): + return getattr(self.stream, attr) + + sys.stdout = Unbuffered(sys.stdout) # Defining scripts to be run scripts= {0} @@ -314,7 +327,8 @@ class PythonWrapperBuilder(WrapperBuilder): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: print datetime.now(), "The job ", pid.template," has FAILED" - {1} + + #{1} """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher -- GitLab From 92ca984865e85bf43dd6a31a8f29f09f2df8675d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 15 Oct 2020 15:03:10 +0200 Subject: [PATCH 04/26] Job retrials fix --- autosubmit/autosubmit.py | 63 +++++++++---------- autosubmit/job/job.py | 56 +++++++++-------- autosubmit/job/job_list.py | 49 +++++++++------ autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/headers/lsf_header.py | 2 +- autosubmit/platforms/locplatform.py | 2 +- autosubmit/platforms/paramiko_platform.py | 2 +- autosubmit/platforms/slurmplatform.py | 2 + .../platforms/wrappers/wrapper_builder.py | 8 +-- simple_test.py | 2 +- test/unit/test_job.py | 12 ++-- test/unit/test_wrappers.py | 2 +- 12 files changed, 109 insertions(+), 93 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index d86a9ffc9..30640d305 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1144,10 +1144,10 @@ class Autosubmit: job.platform_name = hpcarch # Assign platform objects to each job # noinspection 
PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower()] + job._platform = submitter.platforms[job._platform_name.lower()] # Add object to set # noinspection PyTypeChecker - platforms_to_test.add(job.platform) + platforms_to_test.add(job._platform) # case setstatus job_list.check_scripts(as_conf) job_list.update_list(as_conf, False) @@ -1268,10 +1268,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( + job._platform = submitter.platforms[job.platform_name.lower( )] # noinspection PyTypeChecker - platforms_to_test.add(job.platform) + platforms_to_test.add(job._platform) try: job_list.check_scripts(as_conf) except Exception as e: @@ -1450,8 +1450,7 @@ class Autosubmit: save = True if platform.type == "slurm" and list_jobid != "": - slurm.append( - [platform, list_jobid, list_prevStatus, completed_joblist]) + slurm.append([platform, list_jobid, list_prevStatus, completed_joblist]) # END Normal jobs + wrappers # CHECK ALL JOBS at once if they're from slurm ( wrappers non contempled) for platform_jobs in slurm: @@ -1474,7 +1473,7 @@ class Autosubmit: as_conf.get_mails_to()) save = True # End Check Current jobs - save2 = job_list.update_list(as_conf) + save2 = job_list.update_list(as_conf,submitter=submitter) if save or save2: job_list.save() if len(job_list.get_ready()) > 0: @@ -1483,7 +1482,7 @@ class Autosubmit: if as_conf.get_remote_dependencies() and len(job_list.get_prepared()) > 0: Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=True) - save = job_list.update_list(as_conf) + save = job_list.update_list(as_conf,submitter=submitter) if save: job_list.save() # Safe spot to store changes @@ -1509,7 +1508,7 @@ class Autosubmit: for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] 
+ job._platform = submitter.platforms[job.platform_name.lower()] packages_persistence = JobPackagePersistence(os.path.join( BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid) @@ -1538,14 +1537,16 @@ class Autosubmit: try: Autosubmit.restore_platforms(platforms_to_test) platforms_to_test = set() + Autosubmit.restore_platforms(platforms_to_test) + for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( + job._platform = submitter.platforms[job.platform_name.lower( )] # noinspection PyTypeChecker - platforms_to_test.add(job.platform) + platforms_to_test.add(job._platform) except BaseException: raise AutosubmitCritical( "Autosubmit couldn't recover the platforms", 7050, e.message) @@ -1626,14 +1627,13 @@ class Autosubmit: save = False for platform in platforms_to_test: if not hold: - Log.debug("\nJobs ready for {1}: {0}", len( - job_list.get_ready(platform, hold=hold)), platform.name) + Log.debug("\nJobs ready for {1}: {0}", len(job_list.get_ready(platform, hold=hold)), platform.name) + ready_jobs = job_list.get_ready(platform, hold=hold) else: Log.debug("\nJobs prepared for {1}: {0}", len( job_list.get_prepared(platform)), platform.name) - packages_to_submit = JobPackager( - as_conf, platform, job_list, hold=hold).build_packages() + packages_to_submit = JobPackager(as_conf, platform, job_list, hold=hold).build_packages() if not inspect: platform.open_submit_script() @@ -1659,8 +1659,7 @@ class Autosubmit: # If called from RUN or inspect command if not only_wrappers: try: - package.submit( - as_conf, job_list.parameters, inspect, hold=hold) + package.submit(as_conf, job_list.parameters, inspect, hold=hold) valid_packages_to_submit.append(package) except (IOError, OSError): continue @@ -2099,7 +2098,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = 
platforms[job.platform_name.lower()] + job._platform = platforms[job.platform_name.lower()] # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name.lower()]) # establish the connection to all platforms @@ -2116,9 +2115,9 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = platforms[job.platform_name.lower()] + job._platform = platforms[job.platform_name.lower()] - if job.platform.get_completed_files(job.name, 0, True): + if job._platform.get_completed_files(job.name, 0, True): job.status = Status.COMPLETED Log.info( "CHANGED job '{0}' status to COMPLETED".format(job.name)) @@ -2126,7 +2125,7 @@ class Autosubmit: if not no_recover_logs: try: - job.platform.get_logs_files(expid, job.remote_logs) + job._platform.get_logs_files(expid, job.remote_logs) except: pass elif job.status != Status.SUSPENDED: @@ -2444,7 +2443,7 @@ class Autosubmit: for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpc_architecture - job.platform = submitter.platforms[job.platform_name.lower()] + job._platform = submitter.platforms[job.platform_name.lower()] job.update_parameters(as_conf, job_list.parameters) return job_list.check_scripts(as_conf) @@ -3324,15 +3323,15 @@ class Autosubmit: if (job.status == Status.QUEUING or job.status == Status.HELD) and save and (final_status != Status.QUEUING and final_status != Status.HELD and final_status != Status.SUSPENDED): job.hold = False if job.platform_name is not None and job.platform_name.lower() != "local": - job.platform.send_command( - job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + job._platform.send_command( + job._platform.cancel_cmd + " " + str(job.id), ignore_log=True) elif (job.status == Status.QUEUING or job.status == Status.RUNNING or job.status == Status.SUBMITTED) and save and (final_status == Status.SUSPENDED): if job.platform_name is not None and job.platform_name.lower() != "local": - 
job.platform.send_command( + job._platform.send_command( "scontrol hold " + "{0}".format(job.id), ignore_log=True) elif (final_status == Status.QUEUING or final_status == Status.RUNNING) and save and (job.status == Status.SUSPENDED): if job.platform_name is not None and job.platform_name.lower() != "local": - job.platform.send_command( + job._platform.send_command( "scontrol release " + "{0}".format(job.id), ignore_log=True) job.status = final_status Log.info("CHANGED: job: " + job.name + " status to: " + final) @@ -3434,7 +3433,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( + job._platform = submitter.platforms[job.platform_name.lower( )] platforms_to_test = set() platforms = submitter.platforms @@ -3443,7 +3442,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = platforms[job.platform_name.lower()] + job._platform = platforms[job.platform_name.lower()] # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name.lower()]) # establish the connection to all platforms @@ -4334,7 +4333,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = platforms[job.platform_name.lower()] + job._platform = platforms[job.platform_name.lower()] # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name.lower()]) rerun_names = [] @@ -4349,12 +4348,12 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = platforms[job.platform_name.lower()] + job._platform = platforms[job.platform_name.lower()] - if job.platform.get_completed_files(job.name, 0): + if job._platform.get_completed_files(job.name, 0): job.status = Status.COMPLETED Log.info( "CHANGED job '{0}' status to COMPLETED".format(job.name)) - 
job.platform.get_logs_files(expid, job.remote_logs) + job._platform.get_logs_files(expid, job.remote_logs) return job_list diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 44719402e..d13c05a84 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -519,8 +519,10 @@ class Job(object): if self.platform_name is None: self.platform_name = hpcarch self.platform = submitter.platforms[self.platform_name.lower()] + + self._platform = submitter.platforms[self.platform_name.lower()] try: - self.platform.restore_connection() + self._platform.restore_connection() except Exception as e: Log.printlog("{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message,self.name), 6001) out_exist = False @@ -552,11 +554,11 @@ class Job(object): # unifying names for log files self.synchronize_logs(self.platform, remote_logs, local_logs) remote_logs = local_logs - self.platform.get_logs_files(self.expid, remote_logs) + self._platform.get_logs_files(self.expid, remote_logs) # Update the logs with Autosubmit Job Id Brand try: for local_log in local_logs: - self.platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) + self._platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: raise AutosubmitError("Trace {0} \n Failed to write the {1}".format(e.message,self.name), 6001) @@ -585,7 +587,7 @@ class Job(object): if new_status == Status.COMPLETED: Log.debug("{0} job seems to have completed: checking...".format(self.name)) - if not self.platform.get_completed_files(self.name): + if not self._platform.get_completed_files(self.name): log_name = os.path.join(self._tmp_path, self.name + '_COMPLETED') self.check_completion() @@ -602,7 +604,7 @@ class Job(object): Log.result("Job {0} is COMPLETED", self.name) elif self.status == Status.FAILED: Log.printlog("Job {0} is FAILED. 
Checking completed files to confirm the failure...".format(self.name),3000) - self.platform.get_completed_files(self.name) + self._platform.get_completed_files(self.name) self.check_completion() if self.status == Status.COMPLETED: Log.result("Job {0} is COMPLETED", self.name) @@ -610,7 +612,7 @@ class Job(object): self.update_children_status() elif self.status == Status.UNKNOWN: Log.printlog("Job {0} is UNKNOWN. Checking completed files to confirm the failure...".format(self.name),3000) - self.platform.get_completed_files(self.name) + self._platform.get_completed_files(self.name) self.check_completion(Status.UNKNOWN) if self.status == Status.UNKNOWN: Log.printlog("Job {0} is UNKNOWN. Checking completed files to confirm the failure...".format(self.name),6009) @@ -753,7 +755,7 @@ class Job(object): else: parameters['Chunk_LAST'] = 'FALSE' - job_platform = self.platform + job_platform = self._platform self.processors = as_conf.get_processors(self.section) self.threads = as_conf.get_threads(self.section) self.tasks = as_conf.get_tasks(self.section) @@ -867,7 +869,7 @@ class Job(object): raise AutosubmitCritical("Job {0} does not have an correct template// template not found".format(self.name),7014) def _get_paramiko_template(self, snippet, template): - current_platform = self.platform + current_platform = self._platform return ''.join([snippet.as_header(current_platform.get_header(self)), template, snippet.as_tailer()]) @@ -992,7 +994,7 @@ class Job(object): :return: True if succesful, False otherwise :rtype: bool """ - if self.platform.get_stat_file(self.name, retries=5): + if self._platform.get_stat_file(self.name, retries=5): start_time = self.check_start_time() else: Log.printlog('Could not get start time for {0}. 
Using current time as an approximation'.format(self.name),3000) @@ -1014,7 +1016,7 @@ class Job(object): :param completed: True if job was completed successfully, False otherwise :type completed: bool """ - self.platform.get_stat_file(self.name, retries=0) + self._platform.get_stat_file(self.name, retries=0) end_time = self.check_end_time() path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') f = open(path, 'a') @@ -1139,7 +1141,7 @@ class WrapperJob(Job): self.num_processors = num_processors self.running_jobs_start = OrderedDict() - self.platform = platform + self._platform = platform self.as_config = as_config # save start time, wallclock and processors?! self.checked_time = datetime.datetime.now() @@ -1183,14 +1185,14 @@ class WrapperJob(Job): # Fail can come from check function or running/completed checkers. if self.status in [Status.FAILED, Status.UNKNOWN]: self.status = Status.FAILED - self._check_running_jobs() + #self._check_running_jobs() still_running = False for job in self.inner_jobs_running.values(): if job.status == Status.RUNNING: still_running = True if not still_running: self.cancel_failed_wrapper_job() - self.update_failed_jobs() + self.update_failed_jobs() def check_inner_jobs_completed(self, jobs): not_completed_jobs = [ @@ -1198,7 +1200,7 @@ class WrapperJob(Job): not_completed_job_names = [job.name for job in not_completed_jobs] job_names = ' '.join(not_completed_job_names) if job_names: - completed_files = self.platform.check_completed_files(job_names) + completed_files = self._platform.check_completed_files(job_names) completed_jobs = [] for job in not_completed_jobs: if completed_files and len(completed_files) > 0: @@ -1217,11 +1219,11 @@ class WrapperJob(Job): def _check_inner_jobs_queue(self, prev_status): reason = str() - if self.platform.type == 'slurm': - self.platform.send_command( - self.platform.get_queue_status_cmd(self.id)) - reason = self.platform.parse_queue_reason( - self.platform._ssh_output, self.id) + if 
self._platform.type == 'slurm': + self._platform.send_command( + self._platform.get_queue_status_cmd(self.id)) + reason = self._platform.parse_queue_reason( + self._platform._ssh_output, self.id) if self._queuing_reason_cancel(reason): Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(self.name,reason),6009) self.cancel_failed_wrapper_job() @@ -1230,7 +1232,7 @@ class WrapperJob(Job): if reason == '(JobHeldUser)': if self.hold is False: # SHOULD BE MORE CLASS (GET_scontrol realease but not sure if this can be implemented on others PLATFORMS - self.platform.send_command( + self._platform.send_command( "scontrol release " + "{0}".format(self.id)) self.status = Status.QUEUING for job in self.job_list: @@ -1243,8 +1245,8 @@ class WrapperJob(Job): elif reason == '(JobHeldAdmin)': Log.debug( "Job {0} Failed to be HELD, canceling... ", self.name) - self.platform.send_command( - self.platform.cancel_cmd + " {0}".format(self.id)) + self._platform.send_command( + self._platform.cancel_cmd + " {0}".format(self.id)) self.status = Status.WAITING else: Log.info("Job {0} is QUEUING {1}", self.name, reason) @@ -1277,7 +1279,7 @@ class WrapperJob(Job): not_finished_jobs_dict[job.name] = job if len(not_finished_jobs_dict.keys()) > 0: # Only running jobs will enter there not_finished_jobs_names = ' '.join(not_finished_jobs_dict.keys()) - remote_log_dir = self.platform.get_remote_log_dir() + remote_log_dir = self._platform.get_remote_log_dir() # PREPARE SCRIPT TO SEND command = textwrap.dedent(""" cd {1} @@ -1308,7 +1310,7 @@ done content = '' while content == '' and retries > 0: self._platform.send_command(command, False) - content = self.platform._ssh_output.split('\n') + content = self._platform._ssh_output.split('\n') # content.reverse() for line in content[:-1]: out = line.split() @@ -1350,7 +1352,7 @@ done retries = 5 output = '' while output == '' and retries > 0: - output = self.platform.check_completed_files(job.name) + output = 
self._platform.check_completed_files(job.name) if output is None or output == '': sleep(wait) retries = retries - 1 @@ -1372,8 +1374,8 @@ done def cancel_failed_wrapper_job(self): Log.printlog("Cancelling job with id {0}".format(self.id),6009) - self.platform.send_command( - self.platform.cancel_cmd + " " + str(self.id)) + self._platform.send_command( + self._platform.cancel_cmd + " " + str(self.id)) # If there are jobs running, let them finish TODO def _update_completed_jobs(self): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 559370558..4c053f91b 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -647,7 +647,7 @@ class JobList: :rtype: list """ - completed_jobs = [job for job in self._job_list if (platform is None or job.platform == platform) and + completed_jobs = [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and job.status == Status.COMPLETED] if wrapper: return [job for job in completed_jobs if job.packed is False] @@ -664,7 +664,7 @@ class JobList: :return: completed jobs :rtype: list """ - uncompleted_jobs = [job for job in self._job_list if (platform is None or job.platform == platform) and + uncompleted_jobs = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status != Status.COMPLETED] if wrapper: @@ -683,10 +683,10 @@ class JobList: """ submitted = list() if hold: - submitted = [job for job in self._job_list if (platform is None or job.platform == platform) and + submitted = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.SUBMITTED and job.hold == hold] else: - submitted = [job for job in self._job_list if (platform is None or job.platform == platform) and + submitted = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.SUBMITTED] if wrapper: return [job for job in submitted if 
job.packed is False] @@ -702,7 +702,7 @@ class JobList: :return: running jobs :rtype: list """ - running = [job for job in self._job_list if (platform is None or job.platform == platform) and + running = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.RUNNING] if wrapper: return [job for job in running if job.packed is False] @@ -718,7 +718,7 @@ class JobList: :return: queuedjobs :rtype: list """ - queuing = [job for job in self._job_list if (platform is None or job.platform == platform) and + queuing = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.QUEUING] if wrapper: return [job for job in queuing if job.packed is False] @@ -734,7 +734,7 @@ class JobList: :return: failed jobs :rtype: list """ - failed = [job for job in self._job_list if (platform is None or job.platform == platform) and + failed = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.FAILED] if wrapper: return [job for job in failed if job.packed is False] @@ -750,7 +750,7 @@ class JobList: :return: all jobs :rtype: list """ - unsubmitted = [job for job in self._job_list if (platform is None or job.platform == platform) and + unsubmitted = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and (job.status != Status.SUBMITTED and job.status != Status.QUEUING and job.status == Status.RUNNING and job.status == Status.COMPLETED)] if wrapper: @@ -783,7 +783,7 @@ class JobList: :return: ready jobs :rtype: list """ - ready = [job for job in self._job_list if (platform is None or job.platform == platform) and + ready = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.READY and job.hold is hold] if wrapper: @@ -800,7 +800,7 @@ class JobList: :return: prepared jobs :rtype: list """ - prepared = [job for 
job in self._job_list if (platform is None or job.platform == platform) and + prepared = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.PREPARED] return prepared @@ -813,7 +813,7 @@ class JobList: :return: waiting jobs :rtype: list """ - waiting_jobs = [job for job in self._job_list if (platform is None or job.platform == platform) and + waiting_jobs = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and job.status == Status.WAITING] if wrapper: return [job for job in waiting_jobs if job.packed is False] @@ -829,7 +829,7 @@ class JobList: :return: waiting jobs :rtype: list """ - waiting_jobs = [job for job in self._job_list if (job.platform.type == platform_type and job.status == Status.WAITING)] + waiting_jobs = [job for job in self._job_list if (job._platform.type == platform_type and job.status == Status.WAITING)] return waiting_jobs def get_held_jobs(self, platform=None): @@ -841,7 +841,7 @@ class JobList: :return: jobs in platforms :rtype: list """ - return [job for job in self._job_list if (platform is None or job.platform == platform) and + return [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and job.status == Status.HELD] def get_unknown(self, platform=None, wrapper=False): @@ -853,7 +853,7 @@ class JobList: :return: unknown state jobs :rtype: list """ - submitted = [job for job in self._job_list if (platform is None or job.platform == platform) and + submitted = [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and job.status == Status.UNKNOWN] if wrapper: return [job for job in submitted if job.packed is False] @@ -869,7 +869,7 @@ class JobList: :return: unknown state jobs :rtype: list """ - suspended = [job for job in self._job_list if (platform is None or job.platform == platform) and + suspended = [job for job in self._job_list if (platform is None or 
job._platform.name == platform.name) and job.status == Status.SUSPENDED] if wrapper: return [job for job in suspended if job.packed is False] @@ -1073,7 +1073,7 @@ class JobList: queue = "no-scheduler" else: queue = job.queue - Log.status("{0:<35}{1:<15}{2:<15}{3:<20}{4:<15}", job.name, job.id, Status().VALUE_TO_KEY[job.status],job.platform.name,queue) + Log.status("{0:<35}{1:<15}{2:<15}{3:<20}{4:<15}", job.name, job.id, Status().VALUE_TO_KEY[job.status],job._platform.name,queue) def update_from_file(self, store_change=True): @@ -1111,7 +1111,7 @@ class JobList: def parameters(self, value): self._parameters = value - def update_list(self, as_conf, store_change=True, fromSetStatus=False): + def update_list(self, as_conf, store_change=True, fromSetStatus=False,submitter=None): """ Updates job list, resetting failed jobs and changing to READY all WAITING jobs with all parents COMPLETED @@ -1140,6 +1140,14 @@ class JobList: parent for parent in job.parents if parent.status == Status.COMPLETED] if len(tmp) == len(job.parents): job.status = Status.READY + if submitter is not None: + job.platform = submitter.platforms[job.platform_name.lower()] + job.platform.test_connection() + job._platform = submitter.platforms[job.platform_name.lower()] + job._platform.test_connection() + + job.id = None + job.packed = False save = True Log.debug( @@ -1150,7 +1158,12 @@ class JobList: job.packed = False Log.debug( "Resetting job: {0} status to: WAITING for parents completion...".format(job.name)) - + else: + job.status = Status.FAILED + job.packed = False + save = True + Log.debug( + "Job is failed".format(job.name)) # if waiting jobs has all parents completed change its State to READY for job in self.get_completed(): if job.synchronize is not None: diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 9f33f892c..61e47c580 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -49,7 +49,7 @@ class JobPackageBase(object): 
self._platform = jobs[0].platform self._custom_directives = set() for job in jobs: - if job.platform.name != self._platform.name or job.platform is None: + if job._platform.name != self._platform.name or job._platform is None: raise Exception('Only one valid platform per package') except IndexError: raise Exception('No jobs given') diff --git a/autosubmit/platforms/headers/lsf_header.py b/autosubmit/platforms/headers/lsf_header.py index af22da2ca..0ea44b77f 100644 --- a/autosubmit/platforms/headers/lsf_header.py +++ b/autosubmit/platforms/headers/lsf_header.py @@ -54,7 +54,7 @@ class LsfHeader(object): # noinspection PyMethodMayBeStatic def get_exclusivity(self, job): - if job.platform.exclusivity == 'true': + if job._platform.exclusivity == 'true': return "#BSUB -x" else: return "" diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 81c969f82..aa9968f39 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -164,7 +164,7 @@ class LocalPlatform(ParamikoPlatform): sleeptime = sleeptime + 5 retries = retries + 1 except BaseException as e: # Unrecoverable error - Log.printlog("Crashed while retrieving logs",6001) + Log.printlog("File does not exist, logs {0} {1}".format(self.get_files_path(),src),6001) file_exist = False # won't exist retries = 999 # no more retries diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index e6b4891b9..39bf0b9a7 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -475,7 +475,7 @@ class ParamikoPlatform(Platform): elif reason == '(JobHeldAdmin)': #This shouldn't happen anymore TODO delete Log.debug("Job {0} Failed to be HELD, canceling... 
", job.name) job.new_status = Status.WAITING - job.platform.send_command(job.platform.cancel_cmd + " {0}".format(job.id)) + job._platform.send_command(job._platform.cancel_cmd + " {0}".format(job.id)) else: for job in job_list: diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 051820af1..f806578ce 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -323,9 +323,11 @@ class SlurmPlatform(ParamikoPlatform): if not hold: self._submit_script_file.write( self._submit_cmd + job_script + "\n") + self._submit_script_file.close() else: self._submit_script_file.write( self._submit_hold_cmd + job_script + "\n") + self._submit_script_file.close() def get_checkjob_cmd(self, job_id): return 'sacct -n -X -j {1} -o "State"'.format(self.host, job_id) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index b19a1d3d6..daabf854b 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -293,7 +293,7 @@ class PythonWrapperBuilder(WrapperBuilder): print datetime.now(), "The job ", current.template," has been COMPLETED" else: print datetime.now(), "The job ", current.template," has FAILED" - {1} + #{1} """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) return sequential_threads_launcher @@ -361,7 +361,7 @@ class PythonWrapperBuilder(WrapperBuilder): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: print datetime.now(), "The job ", pid.template," has FAILED" - {1} + #{1} """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher @@ -394,7 +394,7 @@ class PythonWrapperBuilder(WrapperBuilder): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: print datetime.now(), "The job ", pid.template," has FAILED" - {1} + #{1} """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) return 
parallel_threads_launcher @@ -484,7 +484,7 @@ class PythonHorizontalVerticalWrapperBuilder(PythonWrapperBuilder): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: print datetime.now(), "The job ", pid.template," has FAILED" - {1} + #{1} """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher diff --git a/simple_test.py b/simple_test.py index 5fbbc0d55..df1bee2d6 100644 --- a/simple_test.py +++ b/simple_test.py @@ -33,7 +33,7 @@ # if job.platform_name is None: # job.platform_name = "marenostrum4" # # noinspection PyTypeChecker -# job.platform = submitter.platforms[job.platform_name.lower( +# job._platform = submitter.platforms[job.platform_name.lower( # )] # list_jobs = job_list.get_job_list() diff --git a/test/unit/test_job.py b/test/unit/test_job.py index 61967ae8b..7c3a43f41 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -34,7 +34,7 @@ class TestJob(TestCase): self.job._platform = platform self.job.processors = 999 - returned_platform = self.job.platform + returned_platform = self.job._platform self.assertEquals(platform, returned_platform) @@ -45,7 +45,7 @@ class TestJob(TestCase): self.job._platform = platform self.job.processors = '1' - returned_platform = self.job.platform + returned_platform = self.job._platform self.assertEquals('serial-platform', returned_platform) @@ -53,9 +53,9 @@ class TestJob(TestCase): dummy_platform = Platform('whatever', 'rand-name', FakeBasicConfig) self.assertNotEquals(dummy_platform, self.job._platform) - self.job.platform = dummy_platform + self.job._platform = dummy_platform - self.assertEquals(dummy_platform, self.job.platform) + self.assertEquals(dummy_platform, self.job._platform) def test_when_the_job_has_a_queue_returns_that_queue(self): dummy_queue = 'whatever' @@ -69,7 +69,7 @@ class TestJob(TestCase): dummy_queue = 'whatever-parallel' dummy_platform = Platform('whatever', 'rand-name', FakeBasicConfig) dummy_platform.queue = dummy_queue - 
self.job.platform = dummy_platform + self.job._platform = dummy_platform self.assertIsNone(self.job._queue) @@ -89,7 +89,7 @@ class TestJob(TestCase): dummy_platform.serial_platform = dummy_serial_platform dummy_platform.queue = parallel_queue - self.job.platform = dummy_platform + self.job._platform = dummy_platform self.job.processors = '1' self.assertIsNone(self.job._queue) diff --git a/test/unit/test_wrappers.py b/test/unit/test_wrappers.py index ebb9c1960..54e7027d4 100644 --- a/test/unit/test_wrappers.py +++ b/test/unit/test_wrappers.py @@ -1311,7 +1311,7 @@ class TestWrappers(TestCase): job.packed = False job.hold = False job.wallclock = total_wallclock - job.platform = self.platform + job._platform = self.platform job.date = date job.member = member -- GitLab From 6a5fe5d233921a83063bba7eb793310f4e227b93 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Sun, 18 Oct 2020 23:43:32 +0200 Subject: [PATCH 05/26] Mayor changes, to try unify the failed wrappers --- autosubmit/autosubmit.py | 2 +- autosubmit/job/job.py | 41 +++--- autosubmit/platforms/ecplatform.py | 2 +- autosubmit/platforms/locplatform.py | 15 ++- autosubmit/platforms/paramiko_platform.py | 2 +- autosubmit/platforms/platform.py | 10 +- autosubmit/platforms/slurmplatform.py | 15 ++- .../platforms/wrappers/wrapper_builder.py | 123 +++++++++++------- 8 files changed, 126 insertions(+), 84 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 30640d305..81eec00b8 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1144,7 +1144,7 @@ class Autosubmit: job.platform_name = hpcarch # Assign platform objects to each job # noinspection PyTypeChecker - job._platform = submitter.platforms[job._platform_name.lower()] + job._platform = submitter.platforms[job.platform_name.lower()] # Add object to set # noinspection PyTypeChecker platforms_to_test.add(job._platform) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index d13c05a84..3eb4f867d 100644 --- 
a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -587,7 +587,7 @@ class Job(object): if new_status == Status.COMPLETED: Log.debug("{0} job seems to have completed: checking...".format(self.name)) - if not self._platform.get_completed_files(self.name): + if not self._platform.get_completed_files(self.name,wrapper_failed=self.packed): log_name = os.path.join(self._tmp_path, self.name + '_COMPLETED') self.check_completion() @@ -604,7 +604,7 @@ class Job(object): Log.result("Job {0} is COMPLETED", self.name) elif self.status == Status.FAILED: Log.printlog("Job {0} is FAILED. Checking completed files to confirm the failure...".format(self.name),3000) - self._platform.get_completed_files(self.name) + self._platform.get_completed_files(self.name,wrapper_failed=self.packed) self.check_completion() if self.status == Status.COMPLETED: Log.result("Job {0} is COMPLETED", self.name) @@ -612,7 +612,7 @@ class Job(object): self.update_children_status() elif self.status == Status.UNKNOWN: Log.printlog("Job {0} is UNKNOWN. Checking completed files to confirm the failure...".format(self.name),3000) - self._platform.get_completed_files(self.name) + self._platform.get_completed_files(self.name,wrapper_failed=self.packed) self.check_completion(Status.UNKNOWN) if self.status == Status.UNKNOWN: Log.printlog("Job {0} is UNKNOWN. Checking completed files to confirm the failure...".format(self.name),6009) @@ -1146,7 +1146,7 @@ class WrapperJob(Job): # save start time, wallclock and processors?! self.checked_time = datetime.datetime.now() self.hold = hold - self.inner_jobs_running = dict() + self.inner_jobs_running = list() def _queuing_reason_cancel(self, reason): try: @@ -1185,11 +1185,13 @@ class WrapperJob(Job): # Fail can come from check function or running/completed checkers. 
if self.status in [Status.FAILED, Status.UNKNOWN]: self.status = Status.FAILED - #self._check_running_jobs() - still_running = False - for job in self.inner_jobs_running.values(): - if job.status == Status.RUNNING: + if self.prev_status not in [Status.FAILED, Status.UNKNOWN]: + sleep(1) + self._check_running_jobs() + if len(self.inner_jobs_running) > 0: still_running = True + else: + still_running = False if not still_running: self.cancel_failed_wrapper_job() self.update_failed_jobs() @@ -1220,10 +1222,8 @@ class WrapperJob(Job): def _check_inner_jobs_queue(self, prev_status): reason = str() if self._platform.type == 'slurm': - self._platform.send_command( - self._platform.get_queue_status_cmd(self.id)) - reason = self._platform.parse_queue_reason( - self._platform._ssh_output, self.id) + self._platform.send_command(self._platform.get_queue_status_cmd(self.id)) + reason = self._platform.parse_queue_reason(self._platform._ssh_output, self.id) if self._queuing_reason_cancel(reason): Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(self.name,reason),6009) self.cancel_failed_wrapper_job() @@ -1270,6 +1270,7 @@ class WrapperJob(Job): def _check_running_jobs(self): not_finished_jobs_dict = OrderedDict() + self.inner_jobs_running = list() not_finished_jobs = [job for job in self.job_list if job.status not in [ Status.COMPLETED, Status.FAILED]] for job in not_finished_jobs: @@ -1277,6 +1278,7 @@ class WrapperJob(Job): Status.COMPLETED or self.status == Status.COMPLETED] if job.parents is None or len(tmp) == len(job.parents): not_finished_jobs_dict[job.name] = job + self.inner_jobs_running.append(job) if len(not_finished_jobs_dict.keys()) > 0: # Only running jobs will enter there not_finished_jobs_names = ' '.join(not_finished_jobs_dict.keys()) remote_log_dir = self._platform.get_remote_log_dir() @@ -1346,10 +1348,9 @@ done if retries == 0 or over_wallclock: self.status = Status.FAILED - self.inner_jobs_running = 
not_finished_jobs_dict def _check_finished_job(self, job): wait = 2 - retries = 5 + retries = 2 output = '' while output == '' and retries > 0: output = self._platform.check_completed_files(job.name) @@ -1367,9 +1368,17 @@ done self.running_jobs_start.pop(job, None) def update_failed_jobs(self): - not_finished_jobs = [job for job in self.job_list if job.status not in [ - Status.FAILED, Status.COMPLETED]] + + not_finished_jobs = [job for job in self.job_list if job.status not in [Status.FAILED, Status.COMPLETED]] + running_jobs = list() for job in not_finished_jobs: + tmp = [parent for parent in job.parents if parent.status == + Status.COMPLETED or self.status == Status.COMPLETED] + if job.parents is None or len(tmp) == len(job.parents): + running_jobs.append(job) + else: + job.status = Status.WAITING + for job in running_jobs: self._check_finished_job(job) def cancel_failed_wrapper_job(self): diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index e1ec6d7da..e27923c74 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -157,7 +157,7 @@ class EcPlatform(ParamikoPlatform): raise AutosubmitError('Could not send file {0} to {1}'.format(os.path.join(self.tmp_path, filename),os.path.join(self.get_files_path(), filename)),6005,e.message) return True - def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False): + def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False,wrapper_failed=False): local_path = os.path.join(self.tmp_path, relative_path) if not os.path.exists(local_path): os.makedirs(local_path) diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index aa9968f39..56936ca4e 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -118,10 +118,10 @@ class LocalPlatform(ParamikoPlatform): raise return True - def check_file_exists(self,filename): + def 
check_file_exists(self,filename,wrapper_failed=False): return True - def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False): + def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False,wrapper_failed=False): local_path = os.path.join(self.tmp_path, relative_path) if not os.path.exists(local_path): os.makedirs(local_path) @@ -140,7 +140,7 @@ class LocalPlatform(ParamikoPlatform): return True # Moves .err .out - def check_file_exists(self, src): + def check_file_exists(self, src,wrapper_failed=False): """ Moves a file on the platform :param src: source name @@ -160,9 +160,12 @@ class LocalPlatform(ParamikoPlatform): if not file_exist: # File doesn't exist, retry in sleeptime Log.debug("{2} File still no exists.. waiting {0}s for a new retry ( retries left: {1})", sleeptime, max_retries - retries, remote_path) - sleep(sleeptime) - sleeptime = sleeptime + 5 - retries = retries + 1 + if not wrapper_failed: + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 + else: + retries = 9999 except BaseException as e: # Unrecoverable error Log.printlog("File does not exist, logs {0} {1}".format(self.get_files_path(),src),6001) file_exist = False # won't exist diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 39bf0b9a7..bb8cb0cf8 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -207,7 +207,7 @@ class ParamikoPlatform(Platform): raise AutosubmitError('Send file failed. 
Connection seems to no be active',6004) # Gets .err and .out - def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False): + def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False,wrapper_failed=False): """ Copies a file from the current platform to experiment's tmp folder diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index e42884a15..10c2a7b3f 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -151,7 +151,7 @@ class Platform(object): """ raise NotImplementedError - def get_file(self, filename, must_exist=True, relative_path='', ignore_log=False): + def get_file(self, filename, must_exist=True, relative_path='', ignore_log=False,wrapper_failed=False): """ Copies a file from the current platform to experiment's tmp folder @@ -206,7 +206,7 @@ class Platform(object): (job_out_filename, job_err_filename) = remote_logs self.get_files([job_out_filename, job_err_filename], False, 'LOG_{0}'.format(exp_id)) - def get_completed_files(self, job_name, retries=0, recovery=False): + def get_completed_files(self, job_name, retries=0, recovery=False,wrapper_failed=False): """ Get the COMPLETED file of the given job @@ -223,8 +223,8 @@ class Platform(object): return True else: return False - if self.check_file_exists('{0}_COMPLETED'.format(job_name)): - if self.get_file('{0}_COMPLETED'.format(job_name), False): + if self.check_file_exists('{0}_COMPLETED'.format(job_name),wrapper_failed=wrapper_failed): + if self.get_file('{0}_COMPLETED'.format(job_name), False,wrapper_failed=wrapper_failed): return True else: return False @@ -261,7 +261,7 @@ class Platform(object): return True return False - def check_file_exists(self, src): + def check_file_exists(self, src,wrapper_failed=False): return True def get_stat_file(self, job_name, retries=0): diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index f806578ce..a8b9fa469 100644 
--- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -323,11 +323,11 @@ class SlurmPlatform(ParamikoPlatform): if not hold: self._submit_script_file.write( self._submit_cmd + job_script + "\n") - self._submit_script_file.close() + #self._submit_script_file.close() else: self._submit_script_file.write( self._submit_hold_cmd + job_script + "\n") - self._submit_script_file.close() + #self._submit_script_file.close() def get_checkjob_cmd(self, job_id): return 'sacct -n -X -j {1} -o "State"'.format(self.host, job_id) @@ -399,7 +399,7 @@ class SlurmPlatform(ParamikoPlatform): def allocated_nodes(): return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list")""" - def check_file_exists(self, filename): + def check_file_exists(self, filename,wrapper_failed=False): file_exist = False sleeptime = 5 retries = 0 @@ -413,9 +413,12 @@ class SlurmPlatform(ParamikoPlatform): except IOError: # File doesn't exist, retry in sleeptime Log.debug("{2} File still no exists.. 
waiting {0}s for a new retry ( retries left: {1})", sleeptime, max_retries - retries, os.path.join(self.get_files_path(), filename)) - sleep(sleeptime) - sleeptime = sleeptime + 5 - retries = retries + 1 + if not wrapper_failed: + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 + else: + retries = 9999 except BaseException as e: # Unrecoverable error Log.critical( "Crashed while retrieving remote logs", 6001, e.message) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index daabf854b..f428dd8a9 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -289,9 +289,12 @@ class PythonWrapperBuilder(WrapperBuilder): sequential_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) if os.path.exists(completed_path): print datetime.now(), "The job ", current.template," has been COMPLETED" else: + open(failed_path, 'w').close() print datetime.now(), "The job ", current.template," has FAILED" #{1} """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) @@ -317,20 +320,25 @@ class PythonWrapperBuilder(WrapperBuilder): for i in range(len(pid_list)): pid = pid_list[i] pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) - + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" - completed_filename = {0}[i].replace('.cmd', '_COMPLETED') - completed_path = os.path.join(os.getcwd(), completed_filename) - if os.path.exists(completed_path): - print datetime.now(), "The job ", pid.template," has been COMPLETED" - else: - print datetime.now(), "The 
job ", pid.template," has FAILED" - - #{1} - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - + completed_filename = {0}[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + Failed = False + if os.path.exists(completed_path): + print datetime.now(), "The job ", pid.template," has been COMPLETED" + else: + Failed = True + open(failed_path, 'w').close() + print datetime.now(), "The job ", pid.template," has FAILED" + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + parallel_threads_launcher += self._indent(textwrap.dedent(""" + if Failed: + {0} + """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_parallel_threads_launcher_horizontal(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" @@ -351,19 +359,25 @@ class PythonWrapperBuilder(WrapperBuilder): for i in range(len(pid_list)): pid = pid_list[i] pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) - + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" - completed_filename = {0}[i].replace('.cmd', '_COMPLETED') - completed_path = os.path.join(os.getcwd(), completed_filename) - if os.path.exists(completed_path): - print datetime.now(), "The job ", pid.template," has been COMPLETED" - else: - print datetime.now(), "The job ", pid.template," has FAILED" - #{1} - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - + completed_filename = {0}[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + Failed = False + if 
os.path.exists(completed_path): + print datetime.now(), "The job ", pid.template," has been COMPLETED" + else: + Failed = True + open(failed_path, 'w').close() + print datetime.now(), "The job ", pid.template," has FAILED" + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + parallel_threads_launcher += self._indent(textwrap.dedent(""" + if Failed: + {0} + """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_parallel_threads_launcher_vertical_horizontal(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" @@ -384,19 +398,25 @@ class PythonWrapperBuilder(WrapperBuilder): for i in range(len(pid_list)): pid = pid_list[i] pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) - + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" - completed_filename = {0}[i].replace('.cmd', '_COMPLETED') - completed_path = os.path.join(os.getcwd(), completed_filename) - if os.path.exists(completed_path): - print datetime.now(), "The job ", pid.template," has been COMPLETED" - else: - print datetime.now(), "The job ", pid.template," has FAILED" - #{1} - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - + completed_filename = {0}[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + Failed = False + if os.path.exists(completed_path): + print datetime.now(), "The job ", pid.template," has been COMPLETED" + else: + Failed = True + open(failed_path, 'w').close() + print datetime.now(), "The job ", pid.template," has FAILED" + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + parallel_threads_launcher += self._indent(textwrap.dedent(""" + if Failed: + {0} + 
""").format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher # all should override -> abstract! def build_main(self): @@ -471,22 +491,29 @@ class PythonHorizontalVerticalWrapperBuilder(PythonWrapperBuilder): pid_list.append(current) current.start() - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" - completed_filename = {0}[i].replace('.cmd', '_COMPLETED') - completed_path = os.path.join(os.getcwd(), completed_filename) - if os.path.exists(completed_path): - print datetime.now(), "The job ", pid.template," has been COMPLETED" - else: - print datetime.now(), "The job ", pid.template," has FAILED" - #{1} - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - + completed_filename = {0}[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + Failed = False + if os.path.exists(completed_path): + print datetime.now(), "The job ", pid.template," has been COMPLETED" + else: + Failed = True + open(failed_path, 'w').close() + print datetime.now(), "The job ", pid.template," has FAILED" + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + parallel_threads_launcher += self._indent(textwrap.dedent(""" + if Failed: + {0} + """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_joblist_thread(self): return textwrap.dedent(""" -- GitLab From 715342475382fcb66d5c0662440043b17753f945 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 20 Oct 
2020 08:33:59 +0200 Subject: [PATCH 06/26] More changes, stopped until fix 3.12 --- autosubmit/job/job.py | 39 ++++---- autosubmit/job/job_packager.py | 21 +++-- .../platforms/wrappers/wrapper_builder.py | 90 +++++++++---------- 3 files changed, 76 insertions(+), 74 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 3eb4f867d..09a1ffa9f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -127,6 +127,7 @@ class Job(object): self.packed = False self.hold = False self.distance_weight = 0 + self.level = 0 def __getstate__(self): odict = self.__dict__ if '_platform' in odict: @@ -668,7 +669,7 @@ class Job(object): """ log_name = os.path.join(self._tmp_path,self.name + '_COMPLETED') - if os.path.exists(log_name): + if os.path.exists(log_name): #TODO self.status = Status.COMPLETED else: Log.printlog("Job {0} completion check failed. There is no COMPLETED file".format(self.name),6009) @@ -1186,15 +1187,20 @@ class WrapperJob(Job): if self.status in [Status.FAILED, Status.UNKNOWN]: self.status = Status.FAILED if self.prev_status not in [Status.FAILED, Status.UNKNOWN]: - sleep(1) + sleep(10) + + #if self.prev_status in [Status.SUBMITTED,Status.QUEUING]: + # for job in self.job_list: + # if job.level == 0: + # job.status = Status.RUNNING self._check_running_jobs() if len(self.inner_jobs_running) > 0: - still_running = True + still_running = True else: still_running = False if not still_running: self.cancel_failed_wrapper_job() - self.update_failed_jobs() + #self.update_failed_jobs() def check_inner_jobs_completed(self, jobs): not_completed_jobs = [ @@ -1322,26 +1328,19 @@ done if len(out) > 1: if job not in self.running_jobs_start: start_time = self._check_time(out, 1) - Log.debug("Job {0} started at {1}".format(jobname, str(parse_date(start_time)))) - + Log.info("Job {0} started at {1}".format(jobname, str(parse_date(start_time)))) self.running_jobs_start[job] = start_time - job.new_status = Status.RUNNING - job.update_status( - 
self.as_config.get_copy_remote_logs() == 'true') - + job.status = Status.RUNNING + #job.update_status(self.as_config.get_copy_remote_logs() == 'true') if len(out) == 2: Log.info("Job {0} is RUNNING".format(jobname)) - over_wallclock = self._check_inner_job_wallclock( - job) + over_wallclock = self._check_inner_job_wallclock(job) if over_wallclock: - Log.printlog( - "Job {0} is FAILED".format(jobname),6009) - + Log.printlog("Job {0} is FAILED".format(jobname),6009) elif len(out) == 3: end_time = self._check_time(out, 2) self._check_finished_job(job) - Log.info("Job {0} finished at {1}".format( - jobname, str(parse_date(end_time)))) + Log.info("Job {0} finished at {1}".format(jobname, str(parse_date(end_time)))) if content == '': sleep(wait) retries = retries - 1 @@ -1368,7 +1367,6 @@ done self.running_jobs_start.pop(job, None) def update_failed_jobs(self): - not_finished_jobs = [job for job in self.job_list if job.status not in [Status.FAILED, Status.COMPLETED]] running_jobs = list() for job in not_finished_jobs: @@ -1376,8 +1374,9 @@ done Status.COMPLETED or self.status == Status.COMPLETED] if job.parents is None or len(tmp) == len(job.parents): running_jobs.append(job) - else: - job.status = Status.WAITING + #else: + # job.status = Status.WAITING + # job.packed = False for job in running_jobs: self._check_finished_job(job) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 005e17e78..6b050653a 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -235,10 +235,8 @@ class JobPackager(object): elif self.wrapper_type in ['vertical-horizontal', 'horizontal-vertical']: wrapped = True built_packages_tmp = list() - built_packages_tmp.append(self._build_hybrid_package( - jobs_to_submit_by_section[section], max_wrapped_jobs, section)) + built_packages_tmp.append(self._build_hybrid_package(jobs_to_submit_by_section[section], max_wrapped_jobs, section)) if wrapped: - for p in built_packages_tmp: failed_innerjobs = 
False #Check failed jobs first @@ -443,12 +441,16 @@ class JobPackager(object): max_procs = horizontal_packager.total_processors new_package = horizontal_packager.get_next_packages( section, max_wallclock=self._platform.max_wallclock, horizontal_vertical=True, max_procs=max_procs) + if new_package is not None: current_package += new_package for i in range(len(current_package)): total_wallclock = sum_str_hours(total_wallclock, wallclock) - + if len(current_package) > 1: + for level in range(1,len(current_package)): + for job in current_package[level]: + job.level=level return JobPackageHorizontalVertical(current_package, max_procs, total_wallclock, jobs_resources=jobs_resources, configuration=self._as_config) @@ -464,11 +466,15 @@ class JobPackager(object): job_list = JobPackagerVerticalSimple([job], job.wallclock, self.max_jobs, max_wrapped_jobs, self._platform.max_wallclock).build_vertical_package(job) + current_package.append(job_list) for job in current_package[-1]: total_wallclock = sum_str_hours(total_wallclock, job.wallclock) - + if len(current_package) > 1: + for level in range(1,len(current_package)): + for job in current_package[level]: + job.level=level return JobPackageVerticalHorizontal(current_package, total_processors, total_wallclock, jobs_resources=jobs_resources, method=self.wrapper_method, configuration=self._as_config) @@ -497,7 +503,7 @@ class JobPackagerVertical(object): self.max_wrapped_jobs = max_wrapped_jobs self.max_wallclock = max_wallclock - def build_vertical_package(self, job): + def build_vertical_package(self, job,level=0): """ Goes trough the job and all the related jobs (children, or part of the same date member ordered group), finds those suitable and groups them together into a wrapper. 
@@ -521,9 +527,10 @@ class JobPackagerVertical(object): if self.total_wallclock <= self.max_wallclock: # Marking, this is later tested in the main loop child.packed = True + child.level = level self.jobs_list.append(child) # Recursive call - return self.build_vertical_package(child) + return self.build_vertical_package(child,level=level+1) # Wrapped jobs are accumulated and returned in this list return self.jobs_list diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index f428dd8a9..66aa4e91b 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -279,10 +279,16 @@ class PythonWrapperBuilder(WrapperBuilder): def build_sequential_threads_launcher(self, jobs_list, thread, footer=True): sequential_threads_launcher = textwrap.dedent(""" + failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") for i in range(len({0})): current = {1} current.start() current.join() + if os.path.exists(failed_path): + level_failed = os.path.join(os.getcwd(),"Level_"+str(i)+"_FAILED") + open(level_failed, 'w').close() + os._exit(1) + """).format(jobs_list, thread, '\n'.ljust(13)) if footer: @@ -297,7 +303,7 @@ class PythonWrapperBuilder(WrapperBuilder): open(failed_path, 'w').close() print datetime.now(), "The job ", current.template," has FAILED" #{1} - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) return sequential_threads_launcher @@ -316,28 +322,25 @@ class PythonWrapperBuilder(WrapperBuilder): pid_list.append(current) current.start() - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + """).format(jobs_list, thread, 
self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_filename = {0}[i].replace('.cmd', '_FAILED') - failed_path = os.path.join(os.getcwd(), failed_filename) - Failed = False + failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: - Failed = True open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) parallel_threads_launcher += self._indent(textwrap.dedent(""" - if Failed: - {0} + #if Failed: + #{0} """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_parallel_threads_launcher_horizontal(self, jobs_list, thread, footer=True): @@ -355,28 +358,26 @@ class PythonWrapperBuilder(WrapperBuilder): pid_list.append(current) current.start() - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_filename = {0}[i].replace('.cmd', '_FAILED') - failed_path = os.path.join(os.getcwd(), failed_filename) + failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") Failed = False if os.path.exists(completed_path): print 
datetime.now(), "The job ", pid.template," has been COMPLETED" else: - Failed = True open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) parallel_threads_launcher += self._indent(textwrap.dedent(""" - if Failed: - {0} + #if Failed: + #{0} """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_parallel_threads_launcher_vertical_horizontal(self, jobs_list, thread, footer=True): @@ -394,28 +395,26 @@ class PythonWrapperBuilder(WrapperBuilder): pid_list.append(current) current.start() - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_filename = {0}[i].replace('.cmd', '_FAILED') - failed_path = os.path.join(os.getcwd(), failed_filename) + failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") Failed = False if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: - Failed = True open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) parallel_threads_launcher += self._indent(textwrap.dedent(""" - if Failed: - {0} + #if Failed: + #{0} """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher # 
all should override -> abstract! @@ -491,28 +490,25 @@ class PythonHorizontalVerticalWrapperBuilder(PythonWrapperBuilder): pid_list.append(current) current.start() - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() - """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 0), '\n'.ljust(13)) + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_filename = {0}[i].replace('.cmd', '_FAILED') - failed_path = os.path.join(os.getcwd(), failed_filename) - Failed = False + failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: - Failed = True open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) parallel_threads_launcher += self._indent(textwrap.dedent(""" - if Failed: - {0} + #if Failed: + #{0} """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_joblist_thread(self): -- GitLab From fd862000ef5f51e09dd8ff04b9b381c888a8188e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 20 Oct 2020 11:24:49 +0200 Subject: [PATCH 07/26] fix delete on expid failure --- autosubmit/autosubmit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 81eec00b8..0ce6d3947 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -622,7 +622,7 @@ class Autosubmit: "User or owner does not 
exists", 7012, e.message) @staticmethod - def _delete_expid(expid_delete, force): + def _delete_expid(expid_delete, force=False): """ Removes an experiment from path and database If current user is eadmin and -f has been sent, it deletes regardless -- GitLab From 93101645cb446820f8dc54c78f0e9a2eb2c183d5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Oct 2020 16:01:41 +0200 Subject: [PATCH 08/26] Inner_jobs for horizontal-vertical completed. --- autosubmit/job/job.py | 90 ++++++++++--------- .../platforms/wrappers/wrapper_builder.py | 48 +++++++--- 2 files changed, 87 insertions(+), 51 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 09a1ffa9f..c05dcd4ad 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -656,7 +656,7 @@ class Job(object): def update_children_status(self): children = list(self.children) for child in children: - if child.status in [Status.SUBMITTED, Status.RUNNING, Status.QUEUING, Status.UNKNOWN]: + if child.level == 0 and child.status in [Status.SUBMITTED, Status.RUNNING, Status.QUEUING, Status.UNKNOWN]: child.status = Status.FAILED children += list(child.children) @@ -1136,6 +1136,7 @@ class WrapperJob(Job): def __init__(self, name, job_id, status, priority, job_list, total_wallclock, num_processors, platform, as_config, hold): super(WrapperJob, self).__init__(name, job_id, status, priority) + self.failed = False self.job_list = job_list # divide jobs in dictionary by state? 
self.wallclock = total_wallclock @@ -1187,20 +1188,30 @@ class WrapperJob(Job): if self.status in [Status.FAILED, Status.UNKNOWN]: self.status = Status.FAILED if self.prev_status not in [Status.FAILED, Status.UNKNOWN]: - sleep(10) - - #if self.prev_status in [Status.SUBMITTED,Status.QUEUING]: - # for job in self.job_list: - # if job.level == 0: - # job.status = Status.RUNNING - self._check_running_jobs() + sleep(1) + else: + self.failed = True + self._check_running_jobs() if len(self.inner_jobs_running) > 0: still_running = True + if not self.failed: + if self.platform.check_file_exists('WRAPPER_FAILED', wrapper_failed=True): + for job in self.inner_jobs_running: + if job.platform.check_file_exists('{0}_FAILED'.format(job.name), wrapper_failed=True): + Log.info("Wrapper {0} Failed, checking inner_jobs...".format(self.name)) + self.failed = True + self.platform.delete_file('WRAPPER_FAILED') + break + + if self.failed: + self.update_failed_jobs() + if len(self.inner_jobs_running) <= 0: + still_running = False else: still_running = False if not still_running: self.cancel_failed_wrapper_job() - #self.update_failed_jobs() + def check_inner_jobs_completed(self, jobs): not_completed_jobs = [ @@ -1232,8 +1243,11 @@ class WrapperJob(Job): reason = self._platform.parse_queue_reason(self._platform._ssh_output, self.id) if self._queuing_reason_cancel(reason): Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(self.name,reason),6009) + #while running jobs? 
+ self._check_running_jobs() #todo + self.update_failed_jobs(canceled_wrapper=True) self.cancel_failed_wrapper_job() - self.update_failed_jobs() + return if reason == '(JobHeldUser)': if self.hold is False: @@ -1273,7 +1287,6 @@ class WrapperJob(Job): job.update_status(self.as_config.get_copy_remote_logs() == 'true') return True return False - def _check_running_jobs(self): not_finished_jobs_dict = OrderedDict() self.inner_jobs_running = list() @@ -1344,48 +1357,45 @@ done if content == '': sleep(wait) retries = retries - 1 - + temp_list = self.inner_jobs_running + self.inner_jobs_running = [job for job in temp_list if job.status == Status.RUNNING] if retries == 0 or over_wallclock: self.status = Status.FAILED - def _check_finished_job(self, job): - wait = 2 - retries = 2 - output = '' - while output == '' and retries > 0: - output = self._platform.check_completed_files(job.name) - if output is None or output == '': - sleep(wait) - retries = retries - 1 - if output is not None and output != '' and 'COMPLETED' in output: + def _check_finished_job(self, job , failed_file=False): + if not failed_file: + wait = 2 + retries = 2 + output = '' + while output == '' and retries > 0: + output = self._platform.check_completed_files(job.name) + if output is None or output == '': + sleep(wait) + retries = retries - 1 + if failed_file or (output is not None and output != '' and 'COMPLETED' in output): job.new_status = Status.COMPLETED job.update_status(self.as_config.get_copy_remote_logs() == 'true') else: - Log.info( - "No completed filed found, setting {0} to FAILED...".format(job.name)) + #Log.info("No completed filed found, setting {0} to FAILED...".format(job.name)) job.new_status = Status.FAILED job.update_status(self.as_config.get_copy_remote_logs() == 'true') self.running_jobs_start.pop(job, None) - def update_failed_jobs(self): - not_finished_jobs = [job for job in self.job_list if job.status not in [Status.FAILED, Status.COMPLETED]] - running_jobs = list() - for job in 
not_finished_jobs: - tmp = [parent for parent in job.parents if parent.status == - Status.COMPLETED or self.status == Status.COMPLETED] - if job.parents is None or len(tmp) == len(job.parents): - running_jobs.append(job) - #else: - # job.status = Status.WAITING - # job.packed = False + def update_failed_jobs(self,canceled_wrapper=False): + running_jobs = self.inner_jobs_running + self.inner_jobs_running = list() for job in running_jobs: - self._check_finished_job(job) + if job.platform.check_file_exists('{0}_FAILED'.format(job.name), wrapper_failed=True): + if job.platform.get_file('{0}_FAILED'.format(job.name), False, wrapper_failed=True): + self._check_finished_job(job) + else: + self.inner_jobs_running.append(job) def cancel_failed_wrapper_job(self): Log.printlog("Cancelling job with id {0}".format(self.id),6009) - self._platform.send_command( - self._platform.cancel_cmd + " " + str(self.id)) - # If there are jobs running, let them finish TODO - + self._platform.send_command(self._platform.cancel_cmd + " " + str(self.id)) + for job in self.job_list: + if job.status not in [Status.COMPLETED, Status.FAILED]: + job.status = Status.WAITING def _update_completed_jobs(self): for job in self.job_list: if job.status == Status.RUNNING: diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 66aa4e91b..0308f09f4 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -19,6 +19,8 @@ import textwrap import math +import random +import string class WrapperDirector: """ @@ -119,8 +121,17 @@ class WrapperBuilder(object): class PythonWrapperBuilder(WrapperBuilder): - + def get_random_alphanumeric_string(self,letters_count, digits_count): + sample_str = ''.join((random.choice(string.ascii_letters) for i in range(letters_count))) + sample_str += ''.join((random.choice(string.digits) for i in range(digits_count))) + + # Convert string to list and shuffle it 
to mix letters and digits + sample_list = list(sample_str) + random.shuffle(sample_list) + final_string = ''.join(sample_list) + return final_string+"_FAILED" def build_imports(self): + return textwrap.dedent(""" import os import sys @@ -143,10 +154,10 @@ class PythonWrapperBuilder(WrapperBuilder): return getattr(self.stream, attr) sys.stdout = Unbuffered(sys.stdout) - + wrapper_id = "{1}" # Defining scripts to be run scripts= {0} - """).format(str(self.job_scripts), '\n'.ljust(13)) + """).format(str(self.job_scripts), self.get_random_alphanumeric_string(5,5),'\n'.ljust(13)) def build_job_thread(self): return textwrap.dedent(""" @@ -279,14 +290,15 @@ class PythonWrapperBuilder(WrapperBuilder): def build_sequential_threads_launcher(self, jobs_list, thread, footer=True): sequential_threads_launcher = textwrap.dedent(""" - failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") + failed_wrapper = os.path.join(os.getcwd(),wrapper_id) for i in range(len({0})): current = {1} current.start() current.join() - if os.path.exists(failed_path): - level_failed = os.path.join(os.getcwd(),"Level_"+str(i)+"_FAILED") - open(level_failed, 'w').close() + if os.path.exists(failed_wrapper): + os.remove(os.path.join(os.getcwd(),wrapper_id)) + wrapper_failed = os.path.join(os.getcwd(),"WRAPPER_FAILED") + open(wrapper_failed, 'w').close() os._exit(1) """).format(jobs_list, thread, '\n'.ljust(13)) @@ -297,9 +309,11 @@ class PythonWrapperBuilder(WrapperBuilder): completed_path = os.path.join(os.getcwd(), completed_filename) failed_filename = {0}[i].replace('.cmd', '_FAILED') failed_path = os.path.join(os.getcwd(), failed_filename) + failed_wrapper = os.path.join(os.getcwd(), wrapper_id) if os.path.exists(completed_path): print datetime.now(), "The job ", current.template," has been COMPLETED" else: + open(failed_wrapper,'w').close() open(failed_path, 'w').close() print datetime.now(), "The job ", current.template," has FAILED" #{1} @@ -331,10 +345,13 @@ class 
PythonWrapperBuilder(WrapperBuilder): parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(),failed_filename) + failed_wrapper = os.path.join(os.getcwd(),wrapper_id) if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: + open(failed_wrapper, 'w').close() open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) @@ -367,11 +384,14 @@ class PythonWrapperBuilder(WrapperBuilder): parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(),failed_filename) + failed_wrapper = os.path.join(os.getcwd(),wrapper_id) Failed = False if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: + open(failed_wrapper, 'w').close() open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) @@ -404,11 +424,14 @@ class PythonWrapperBuilder(WrapperBuilder): parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(),failed_filename) + failed_wrapper = 
os.path.join(os.getcwd(),wrapper_id) Failed = False if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: + open(failed_wrapper, 'w').close() open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) @@ -499,10 +522,13 @@ class PythonHorizontalVerticalWrapperBuilder(PythonWrapperBuilder): parallel_threads_launcher += self._indent(textwrap.dedent(""" completed_filename = {0}[i].replace('.cmd', '_COMPLETED') completed_path = os.path.join(os.getcwd(), completed_filename) - failed_path = os.path.join(os.getcwd(),"HORIZONTAL_FAILED") + failed_filename = {0}[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(),failed_filename) + failed_wrapper = os.path.join(os.getcwd(),wrapper_id) if os.path.exists(completed_path): print datetime.now(), "The job ", pid.template," has been COMPLETED" else: + open(failed_wrapper, 'w').close() open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) -- GitLab From 2508aa00eb1f8534eefebc3be31ffaaa4445237a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Oct 2020 10:44:12 +0100 Subject: [PATCH 09/26] Inner_jobs for horizontal --- .../platforms/wrappers/wrapper_builder.py | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 0308f09f4..237a90b72 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -215,7 +215,7 @@ class PythonWrapperBuilder(WrapperBuilder): idx += 1 processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) - processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) 
""").format(self.num_procs, str(self.jobs_resources), '\n'.ljust(13)) def build_machinefiles(self): @@ -355,10 +355,6 @@ class PythonWrapperBuilder(WrapperBuilder): open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - parallel_threads_launcher += self._indent(textwrap.dedent(""" - #if Failed: - #{0} - """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_parallel_threads_launcher_horizontal(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" @@ -395,10 +391,7 @@ class PythonWrapperBuilder(WrapperBuilder): open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - parallel_threads_launcher += self._indent(textwrap.dedent(""" - #if Failed: - #{0} - """).format(self.exit_thread, '\n'.ljust(13)), 4) + return parallel_threads_launcher def build_parallel_threads_launcher_vertical_horizontal(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" @@ -435,10 +428,7 @@ class PythonWrapperBuilder(WrapperBuilder): open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - parallel_threads_launcher += self._indent(textwrap.dedent(""" - #if Failed: - #{0} - """).format(self.exit_thread, '\n'.ljust(13)), 4) + return parallel_threads_launcher # all should override -> abstract! 
def build_main(self): @@ -532,10 +522,6 @@ class PythonHorizontalVerticalWrapperBuilder(PythonWrapperBuilder): open(failed_path, 'w').close() print datetime.now(), "The job ", pid.template," has FAILED" """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) - parallel_threads_launcher += self._indent(textwrap.dedent(""" - #if Failed: - #{0} - """).format(self.exit_thread, '\n'.ljust(13)), 4) return parallel_threads_launcher def build_joblist_thread(self): return textwrap.dedent(""" @@ -681,7 +667,7 @@ class SrunWrapperBuilder(WrapperBuilder): idx += 1 processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) - processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) """).format(self.num_procs, str(self.jobs_resources), '\n'.ljust(13)) def build_machinefiles(self): -- GitLab From b014803d719190c980aabb5a525bad51f7d78f5e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Oct 2020 14:03:32 +0100 Subject: [PATCH 10/26] Working --- .../platforms/wrappers/wrapper_builder.py | 268 ++++++++---------- 1 file changed, 126 insertions(+), 142 deletions(-) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 237a90b72..06b86563b 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -199,23 +199,20 @@ class PythonWrapperBuilder(WrapperBuilder): def build_cores_list(self): return textwrap.dedent(""" - total_cores = {0} - jobs_resources = {1} - +total_cores = {0} +jobs_resources = {1} +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +idx = 0 +all_cores = [] +while total_cores > 0: + if processors_per_node > 0: + processors_per_node -= 1 + total_cores -= 1 + all_cores.append(all_nodes[idx]) + else: + idx += 1 processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) - - idx = 0 - all_cores = [] - while total_cores > 0: - if 
processors_per_node > 0: - processors_per_node -= 1 - total_cores -= 1 - all_cores.append(all_nodes[idx]) - else: - idx += 1 - processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) - - processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) """).format(self.num_procs, str(self.jobs_resources), '\n'.ljust(13)) def build_machinefiles(self): @@ -226,28 +223,26 @@ class PythonWrapperBuilder(WrapperBuilder): def build_machinefiles_standard(self): return textwrap.dedent(""" - machines = str() - - cores = int(jobs_resources[section]['PROCESSORS']) - tasks = int(jobs_resources[section]['TASKS']) - nodes = int(ceil(int(cores)/float(tasks))) - if tasks < processors_per_node: - cores = tasks - - job_cores = cores - while nodes > 0: - while cores > 0: - if len(all_cores) > 0: - node = all_cores.pop(0) - if node: - machines += node +"_NEWLINE_" - cores -= 1 - for rest in range(processors_per_node-tasks): - if len(all_cores) > 0: - all_cores.pop(0) - nodes -= 1 + machines = str() + cores = int(jobs_resources[section]['PROCESSORS']) + tasks = int(jobs_resources[section]['TASKS']) + nodes = int(ceil(int(cores)/float(tasks))) if tasks < processors_per_node: - cores = job_cores + cores = tasks + job_cores = cores + while nodes > 0: + while cores > 0: + if len(all_cores) > 0: + node = all_cores.pop(0) + if node: + machines += node +"_NEWLINE_" + cores -= 1 + for rest in range(processors_per_node-tasks): + if len(all_cores) > 0: + all_cores.pop(0) + nodes -= 1 + if tasks < processors_per_node: + cores = job_cores """).format('\n'.ljust(13)) def _create_components_dict(self): @@ -323,23 +318,20 @@ class PythonWrapperBuilder(WrapperBuilder): def build_parallel_threads_launcher(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" - pid_list = [] - - for i in range(len({0})): - if type({0}[i]) != list: - job = {0}[i] - jobname = job.replace(".cmd", '') - section = 
jobname.split('_')[-1] - - {2} - current = {1}({0}[i], i+self.id_run) - pid_list.append(current) - current.start() - - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() +pid_list = [] +for i in range(len({0})): + if type({0}[i]) != list: + job = {0}[i] + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + {2} + current = {1}({0}[i], i+self.id_run) + pid_list.append(current) + current.start() +# Waiting until all scripts finish +for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" @@ -358,23 +350,22 @@ class PythonWrapperBuilder(WrapperBuilder): return parallel_threads_launcher def build_parallel_threads_launcher_horizontal(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" - pid_list = [] - - for i in range(len({0})): - if type({0}[i]) != list: - job = {0}[i] - jobname = job.replace(".cmd", '') - section = jobname.split('_')[-1] - - {2} - current = {1}({0}[i], i) - pid_list.append(current) - current.start() - - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() +pid_list = [] +for i in range(len({0})): + if type({0}[i]) != list: + job = {0}[i] + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + + {2} + current = {1}({0}[i], i) + pid_list.append(current) + current.start() + +# Waiting until all scripts finish +for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" @@ -395,23 +386,22 @@ class PythonWrapperBuilder(WrapperBuilder): return parallel_threads_launcher def build_parallel_threads_launcher_vertical_horizontal(self, jobs_list, thread, 
footer=True): parallel_threads_launcher = textwrap.dedent(""" - pid_list = [] - - for i in range(len({0})): - if type({0}[i]) != list: - job = {0}[i] - jobname = job.replace(".cmd", '') - section = jobname.split('_')[-1] - - {2} - current = {1}({0}[i], i) - pid_list.append(current) - current.start() - - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() +pid_list = [] +for i in range(len({0})): + if type({0}[i]) != list: + job = {0}[i] + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + + {2} + current = {1}({0}[i], i) + pid_list.append(current) + current.start() + +# Waiting until all scripts finish +for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), '\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" @@ -490,23 +480,22 @@ class PythonVerticalHorizontalWrapperBuilder(PythonWrapperBuilder): class PythonHorizontalVerticalWrapperBuilder(PythonWrapperBuilder): def build_parallel_threads_launcher_horizontal_vertical(self, jobs_list, thread, footer=True): parallel_threads_launcher = textwrap.dedent(""" - pid_list = [] - - for i in range(len({0})): - if type({0}[i]) != list: - job = {0}[i] - jobname = job.replace(".cmd", '') - section = jobname.split('_')[-1] - - {2} - current = {1}({0}[i], i+self.id_run) - pid_list.append(current) - current.start() - - # Waiting until all scripts finish - for i in range(len(pid_list)): - pid = pid_list[i] - pid.join() +pid_list = [] +for i in range(len({0})): + if type({0}[i]) != list: + job = {0}[i] + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + + {2} + current = {1}({0}[i], i+self.id_run) + pid_list.append(current) + current.start() + +# Waiting until all scripts finish +for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() """).format(jobs_list, thread, self._indent(self.build_machinefiles(), 8), 
'\n'.ljust(13)) if footer: parallel_threads_launcher += self._indent(textwrap.dedent(""" @@ -651,23 +640,20 @@ class SrunWrapperBuilder(WrapperBuilder): def build_cores_list(self): return textwrap.dedent(""" - total_cores = {0} - jobs_resources = {1} - +total_cores = {0} +jobs_resources = {1} +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +idx = 0 +all_cores = [] +while total_cores > 0: + if processors_per_node > 0: + processors_per_node -= 1 + total_cores -= 1 + all_cores.append(all_nodes[idx]) + else: + idx += 1 processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) - - idx = 0 - all_cores = [] - while total_cores > 0: - if processors_per_node > 0: - processors_per_node -= 1 - total_cores -= 1 - all_cores.append(all_nodes[idx]) - else: - idx += 1 - processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) - - processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) """).format(self.num_procs, str(self.jobs_resources), '\n'.ljust(13)) def build_machinefiles(self): @@ -678,28 +664,26 @@ class SrunWrapperBuilder(WrapperBuilder): def build_machinefiles_standard(self): return textwrap.dedent(""" - machines = str() - - cores = int(jobs_resources[section]['PROCESSORS']) - tasks = int(jobs_resources[section]['TASKS']) - nodes = int(ceil(int(cores)/float(tasks))) - if tasks < processors_per_node: - cores = tasks - - job_cores = cores - while nodes > 0: - while cores > 0: - if len(all_cores) > 0: - node = all_cores.pop(0) - if node: - machines += node +"_NEWLINE_" - cores -= 1 - for rest in range(processors_per_node-tasks): - if len(all_cores) > 0: - all_cores.pop(0) - nodes -= 1 + machines = str() + cores = int(jobs_resources[section]['PROCESSORS']) + tasks = int(jobs_resources[section]['TASKS']) + nodes = int(ceil(int(cores)/float(tasks))) if tasks < processors_per_node: - cores = job_cores + cores = tasks + job_cores = cores + while nodes > 0: + while cores > 0: 
+ if len(all_cores) > 0: + node = all_cores.pop(0) + if node: + machines += node +"_NEWLINE_" + cores -= 1 + for rest in range(processors_per_node-tasks): + if len(all_cores) > 0: + all_cores.pop(0) + nodes -= 1 + if tasks < processors_per_node: + cores = job_cores """).format('\n'.ljust(13)) def _create_components_dict(self): -- GitLab From 654cfdaa7df96d41dbaa07a1f995780848453d4a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Oct 2020 15:47:59 +0100 Subject: [PATCH 11/26] PipeLine Fix --- test/unit/test_job_package.py | 14 +++++++------- test/unit/test_wrappers.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 42be51493..8e9aaeb07 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -12,10 +12,10 @@ from autosubmit.job.job_common import Status class TestJobPackage(TestCase): def setUp(self): - self.platform = Mock() + self._platform = Mock() self.jobs = [Job('dummy1', 0, Status.READY, 0), Job('dummy2', 0, Status.READY, 0)] - self.jobs[0].platform = self.jobs[1].platform = self.platform + self.jobs[0]._platform = self.jobs[1]._platform = self._platform self.job_package = JobPackageSimple(self.jobs) def test_job_package_default_init(self): @@ -23,14 +23,14 @@ class TestJobPackage(TestCase): JobPackageSimple([]) def test_job_package_different_platforms_init(self): - self.jobs[0].platform = Mock() - self.jobs[1].platform = Mock() + self.jobs[0]._platform = Mock() + self.jobs[1]._platform = Mock() with self.assertRaises(Exception): JobPackageSimple(this.jobs) def test_job_package_none_platforms_init(self): - self.jobs[0].platform = None - self.jobs[1].platform = None + self.jobs[0]._platform = None + self.jobs[1]._platform = None with self.assertRaises(Exception): JobPackageSimple(this.jobs) @@ -41,7 +41,7 @@ class TestJobPackage(TestCase): self.assertEquals(self.jobs, self.job_package.jobs) def 
test_job_package_platform_getter(self): - self.assertEquals(self.platform.serial_platform, self.job_package.platform) + self.assertEquals(self._platform.serial_platform, self.job_package.platform) def test_job_package_submission(self): # arrange diff --git a/test/unit/test_wrappers.py b/test/unit/test_wrappers.py index 54e7027d4..8593bb02c 100644 --- a/test/unit/test_wrappers.py +++ b/test/unit/test_wrappers.py @@ -151,13 +151,13 @@ class TestWrappers(TestCase): def setUp(self): self.experiment_id = 'random-id' self.config = FakeBasicConfig - self.platform = Mock() + self._platform = Mock() self.job_list = JobList(self.experiment_id, self.config, ConfigParserFactory(), JobListPersistenceDb('.', '.')) self.parser_mock = Mock(spec='SafeConfigParser') - self.platform.max_waiting_jobs = 100 - self.platform.total_jobs = 100 + self._platform.max_waiting_jobs = 100 + self._platform.total_jobs = 100 self.config.get_wrapper_type = Mock(return_value='vertical') self.config.get_wrapper_crossdate = Mock(return_value=False) self.config.get_remote_dependencies = Mock(return_value=False) @@ -167,7 +167,7 @@ class TestWrappers(TestCase): self.config.get_wrapper_policy = Mock(return_value='flexible') self.job_packager = JobPackager( - self.config, self.platform, self.job_list) + self.config, self._platform, self.job_list) ### ONE SECTION WRAPPER ### def test_returned_packages(self): @@ -1311,7 +1311,7 @@ class TestWrappers(TestCase): job.packed = False job.hold = False job.wallclock = total_wallclock - job._platform = self.platform + job._platform = self._platform job.date = date job.member = member -- GitLab From 7b4db35dcf93f229a3fd7f9fb1f1ad120cd005f1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Oct 2020 16:22:03 +0100 Subject: [PATCH 12/26] PipeLine Fix --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index c05dcd4ad..e89a356c0 100644 --- a/autosubmit/job/job.py +++ 
b/autosubmit/job/job.py @@ -519,7 +519,7 @@ class Job(object): platforms_to_test = set() if self.platform_name is None: self.platform_name = hpcarch - self.platform = submitter.platforms[self.platform_name.lower()] + #self.platform = submitter.platforms[self.platform_name.lower()] self._platform = submitter.platforms[self.platform_name.lower()] try: -- GitLab From 886c47517ae4f0e9d7a4db317a1a50bb54e20d7e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Oct 2020 16:41:06 +0100 Subject: [PATCH 13/26] PipeLine Fix --- autosubmit/job/job_packages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 61e47c580..442c040e9 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -46,7 +46,7 @@ class JobPackageBase(object): self.hold = False try: self._tmp_path = jobs[0]._tmp_path - self._platform = jobs[0].platform + self._platform = jobs[0]._platform self._custom_directives = set() for job in jobs: if job._platform.name != self._platform.name or job._platform is None: -- GitLab From 4904610cf35a85cb8807f93f4d8c005e3c8dbef1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Oct 2020 17:21:56 +0100 Subject: [PATCH 14/26] PipeLine Fix --- autosubmit/job/job_packages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 442c040e9..33956f603 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -75,7 +75,7 @@ class JobPackageBase(object): :return: platform :rtype: Platform """ - return self._platform + return self.platform def submit(self, configuration, parameters,only_generate=False,hold=False): """ -- GitLab From d8cf1072f27307f60e337828cb115a95e3b08275 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 11:26:21 +0100 Subject: [PATCH 15/26] PipeLine Fix (had to recover a corrupt file) --- autosubmit/job/job_packages.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 33956f603..442c040e9 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -75,7 +75,7 @@ class JobPackageBase(object): :return: platform :rtype: Platform """ - return self.platform + return self._platform def submit(self, configuration, parameters,only_generate=False,hold=False): """ -- GitLab From e526cfdf3ecf61806f535750fcf0b124402a8d1a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 14:00:39 +0100 Subject: [PATCH 16/26] PipeLine Fix (had to recover a corrupt file) --- autosubmit/job/job.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index e89a356c0..e47c10cb0 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -536,11 +536,11 @@ class Job(object): try: while (not out_exist and not err_exist) and i < retries: try: - out_exist = self.platform.check_file_exists(remote_logs[0]) # will do 5 retries + out_exist = self._platform.check_file_exists(remote_logs[0]) # will do 5 retries except IOError as e: out_exist = False try: - err_exist = self.platform.check_file_exists(remote_logs[1]) # will do 5 retries + err_exist = self._platform.check_file_exists(remote_logs[1]) # will do 5 retries except IOError as e: err_exists = False if not out_exist or not err_exist: @@ -553,7 +553,7 @@ class Job(object): if copy_remote_logs: if local_logs != remote_logs: # unifying names for log files - self.synchronize_logs(self.platform, remote_logs, local_logs) + self.synchronize_logs(self._platform, remote_logs, local_logs) remote_logs = local_logs self._platform.get_logs_files(self.expid, remote_logs) # Update the logs with Autosubmit Job Id Brand @@ -1195,12 +1195,12 @@ class WrapperJob(Job): if len(self.inner_jobs_running) > 0: still_running = True if not self.failed: - if self.platform.check_file_exists('WRAPPER_FAILED', 
wrapper_failed=True): + if self._platform.check_file_exists('WRAPPER_FAILED', wrapper_failed=True): for job in self.inner_jobs_running: if job.platform.check_file_exists('{0}_FAILED'.format(job.name), wrapper_failed=True): Log.info("Wrapper {0} Failed, checking inner_jobs...".format(self.name)) self.failed = True - self.platform.delete_file('WRAPPER_FAILED') + self._platform.delete_file('WRAPPER_FAILED') break if self.failed: -- GitLab From c828e746e195279ebae01ae7dc42f4736334ffc7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 14:16:22 +0100 Subject: [PATCH 17/26] PipeLine Fix --- test/unit/test_job_package.py | 4 ++-- test/unit/test_wrappers.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 8e9aaeb07..f57698084 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -12,7 +12,7 @@ from autosubmit.job.job_common import Status class TestJobPackage(TestCase): def setUp(self): - self._platform = Mock() + self.platform = Mock() self.jobs = [Job('dummy1', 0, Status.READY, 0), Job('dummy2', 0, Status.READY, 0)] self.jobs[0]._platform = self.jobs[1]._platform = self._platform @@ -41,7 +41,7 @@ class TestJobPackage(TestCase): self.assertEquals(self.jobs, self.job_package.jobs) def test_job_package_platform_getter(self): - self.assertEquals(self._platform.serial_platform, self.job_package.platform) + self.assertEquals(self.platform.serial_platform, self.job_package.platform) def test_job_package_submission(self): # arrange diff --git a/test/unit/test_wrappers.py b/test/unit/test_wrappers.py index 8593bb02c..54e7027d4 100644 --- a/test/unit/test_wrappers.py +++ b/test/unit/test_wrappers.py @@ -151,13 +151,13 @@ class TestWrappers(TestCase): def setUp(self): self.experiment_id = 'random-id' self.config = FakeBasicConfig - self._platform = Mock() + self.platform = Mock() self.job_list = JobList(self.experiment_id, self.config, 
ConfigParserFactory(), JobListPersistenceDb('.', '.')) self.parser_mock = Mock(spec='SafeConfigParser') - self._platform.max_waiting_jobs = 100 - self._platform.total_jobs = 100 + self.platform.max_waiting_jobs = 100 + self.platform.total_jobs = 100 self.config.get_wrapper_type = Mock(return_value='vertical') self.config.get_wrapper_crossdate = Mock(return_value=False) self.config.get_remote_dependencies = Mock(return_value=False) @@ -167,7 +167,7 @@ class TestWrappers(TestCase): self.config.get_wrapper_policy = Mock(return_value='flexible') self.job_packager = JobPackager( - self.config, self._platform, self.job_list) + self.config, self.platform, self.job_list) ### ONE SECTION WRAPPER ### def test_returned_packages(self): @@ -1311,7 +1311,7 @@ class TestWrappers(TestCase): job.packed = False job.hold = False job.wallclock = total_wallclock - job._platform = self._platform + job._platform = self.platform job.date = date job.member = member -- GitLab From a8b401e405582754753faa1885469ec6d2437aa9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 15:35:59 +0100 Subject: [PATCH 18/26] PipeLine Fix --- autosubmit/autosubmit.py | 44 +++++++++++------------ autosubmit/job/job_list.py | 18 +++++----- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/paramiko_platform.py | 2 +- test/unit/test_job.py | 20 +++++------ test/unit/test_wrappers.py | 10 +++--- 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0ce6d3947..fcb060d50 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1144,10 +1144,10 @@ class Autosubmit: job.platform_name = hpcarch # Assign platform objects to each job # noinspection PyTypeChecker - job._platform = submitter.platforms[job.platform_name.lower()] + job.platform = submitter.platforms[job.platform_name.lower()] # Add object to set # noinspection PyTypeChecker - platforms_to_test.add(job._platform) + platforms_to_test.add(job.platform) # 
case setstatus job_list.check_scripts(as_conf) job_list.update_list(as_conf, False) @@ -1268,10 +1268,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = submitter.platforms[job.platform_name.lower( + job.platform = submitter.platforms[job.platform_name.lower( )] # noinspection PyTypeChecker - platforms_to_test.add(job._platform) + platforms_to_test.add(job.platform) try: job_list.check_scripts(as_conf) except Exception as e: @@ -1508,7 +1508,7 @@ class Autosubmit: for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch - job._platform = submitter.platforms[job.platform_name.lower()] + job.platform = submitter.platforms[job.platform_name.lower()] packages_persistence = JobPackagePersistence(os.path.join( BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid) @@ -1543,10 +1543,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = submitter.platforms[job.platform_name.lower( + job.platform = submitter.platforms[job.platform_name.lower( )] # noinspection PyTypeChecker - platforms_to_test.add(job._platform) + platforms_to_test.add(job.platform) except BaseException: raise AutosubmitCritical( "Autosubmit couldn't recover the platforms", 7050, e.message) @@ -2098,7 +2098,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = platforms[job.platform_name.lower()] + job.platform = platforms[job.platform_name.lower()] # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name.lower()]) # establish the connection to all platforms @@ -2115,9 +2115,9 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = platforms[job.platform_name.lower()] + job.platform = platforms[job.platform_name.lower()] - if 
job._platform.get_completed_files(job.name, 0, True): + if job.platform.get_completed_files(job.name, 0, True): job.status = Status.COMPLETED Log.info( "CHANGED job '{0}' status to COMPLETED".format(job.name)) @@ -2125,7 +2125,7 @@ class Autosubmit: if not no_recover_logs: try: - job._platform.get_logs_files(expid, job.remote_logs) + job.platform.get_logs_files(expid, job.remote_logs) except: pass elif job.status != Status.SUSPENDED: @@ -2443,7 +2443,7 @@ class Autosubmit: for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpc_architecture - job._platform = submitter.platforms[job.platform_name.lower()] + job.platform = submitter.platforms[job.platform_name.lower()] job.update_parameters(as_conf, job_list.parameters) return job_list.check_scripts(as_conf) @@ -3323,15 +3323,15 @@ class Autosubmit: if (job.status == Status.QUEUING or job.status == Status.HELD) and save and (final_status != Status.QUEUING and final_status != Status.HELD and final_status != Status.SUSPENDED): job.hold = False if job.platform_name is not None and job.platform_name.lower() != "local": - job._platform.send_command( - job._platform.cancel_cmd + " " + str(job.id), ignore_log=True) + job.platform.send_command( + job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) elif (job.status == Status.QUEUING or job.status == Status.RUNNING or job.status == Status.SUBMITTED) and save and (final_status == Status.SUSPENDED): if job.platform_name is not None and job.platform_name.lower() != "local": - job._platform.send_command( + job.platform.send_command( "scontrol hold " + "{0}".format(job.id), ignore_log=True) elif (final_status == Status.QUEUING or final_status == Status.RUNNING) and save and (job.status == Status.SUSPENDED): if job.platform_name is not None and job.platform_name.lower() != "local": - job._platform.send_command( + job.platform.send_command( "scontrol release " + "{0}".format(job.id), ignore_log=True) job.status = final_status 
Log.info("CHANGED: job: " + job.name + " status to: " + final) @@ -3433,7 +3433,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = submitter.platforms[job.platform_name.lower( + job.platform = submitter.platforms[job.platform_name.lower( )] platforms_to_test = set() platforms = submitter.platforms @@ -3442,7 +3442,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = platforms[job.platform_name.lower()] + job.platform = platforms[job.platform_name.lower()] # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name.lower()]) # establish the connection to all platforms @@ -4333,7 +4333,7 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = platforms[job.platform_name.lower()] + job.platform = platforms[job.platform_name.lower()] # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name.lower()]) rerun_names = [] @@ -4348,12 +4348,12 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job._platform = platforms[job.platform_name.lower()] + job.platform = platforms[job.platform_name.lower()] - if job._platform.get_completed_files(job.name, 0): + if job.platform.get_completed_files(job.name, 0): job.status = Status.COMPLETED Log.info( "CHANGED job '{0}' status to COMPLETED".format(job.name)) - job._platform.get_logs_files(expid, job.remote_logs) + job.platform.get_logs_files(expid, job.remote_logs) return job_list diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 4c053f91b..695341a56 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -647,7 +647,7 @@ class JobList: :rtype: list """ - completed_jobs = [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and + 
completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and job.status == Status.COMPLETED] if wrapper: return [job for job in completed_jobs if job.packed is False] @@ -829,7 +829,7 @@ class JobList: :return: waiting jobs :rtype: list """ - waiting_jobs = [job for job in self._job_list if (job._platform.type == platform_type and job.status == Status.WAITING)] + waiting_jobs = [job for job in self._job_list if (job.platform.type == platform_type and job.status == Status.WAITING)] return waiting_jobs def get_held_jobs(self, platform=None): @@ -841,7 +841,7 @@ class JobList: :return: jobs in platforms :rtype: list """ - return [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and + return [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and job.status == Status.HELD] def get_unknown(self, platform=None, wrapper=False): @@ -853,7 +853,7 @@ class JobList: :return: unknown state jobs :rtype: list """ - submitted = [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and + submitted = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and job.status == Status.UNKNOWN] if wrapper: return [job for job in submitted if job.packed is False] @@ -869,7 +869,7 @@ class JobList: :return: unknown state jobs :rtype: list """ - suspended = [job for job in self._job_list if (platform is None or job._platform.name == platform.name) and + suspended = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and job.status == Status.SUSPENDED] if wrapper: return [job for job in suspended if job.packed is False] @@ -968,7 +968,7 @@ class JobList: def get_in_ready_grouped_id(self, platform): jobs = [] [jobs.append(job) for job in jobs if ( - platform is None or job._platform.name is platform.name)] + platform is None or job.platform.name is 
platform.name)] jobs_by_id = dict() for job in jobs: @@ -1073,7 +1073,7 @@ class JobList: queue = "no-scheduler" else: queue = job.queue - Log.status("{0:<35}{1:<15}{2:<15}{3:<20}{4:<15}", job.name, job.id, Status().VALUE_TO_KEY[job.status],job._platform.name,queue) + Log.status("{0:<35}{1:<15}{2:<15}{3:<20}{4:<15}", job.name, job.id, Status().VALUE_TO_KEY[job.status],job.platform.name,queue) def update_from_file(self, store_change=True): @@ -1143,8 +1143,8 @@ class JobList: if submitter is not None: job.platform = submitter.platforms[job.platform_name.lower()] job.platform.test_connection() - job._platform = submitter.platforms[job.platform_name.lower()] - job._platform.test_connection() + job.platform = submitter.platforms[job.platform_name.lower()] + job.platform.test_connection() job.id = None diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 442c040e9..06789a359 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -49,7 +49,7 @@ class JobPackageBase(object): self._platform = jobs[0]._platform self._custom_directives = set() for job in jobs: - if job._platform.name != self._platform.name or job._platform is None: + if job._platform.name != self._platform.name or job.platform is None: raise Exception('Only one valid platform per package') except IndexError: raise Exception('No jobs given') diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index bb8cb0cf8..13799e6d8 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -475,7 +475,7 @@ class ParamikoPlatform(Platform): elif reason == '(JobHeldAdmin)': #This shouldn't happen anymore TODO delete Log.debug("Job {0} Failed to be HELD, canceling... 
", job.name) job.new_status = Status.WAITING - job._platform.send_command(job._platform.cancel_cmd + " {0}".format(job.id)) + job.platform.send_command(job.platform.cancel_cmd + " {0}".format(job.id)) else: for job in job_list: diff --git a/test/unit/test_job.py b/test/unit/test_job.py index 7c3a43f41..b733b318e 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -31,10 +31,10 @@ class TestJob(TestCase): platform = Platform(self.experiment_id, 'parallel-platform', FakeBasicConfig) platform.serial_platform = 'serial-platform' - self.job._platform = platform + self.job.platform = platform self.job.processors = 999 - returned_platform = self.job._platform + returned_platform = self.job.platform self.assertEquals(platform, returned_platform) @@ -42,20 +42,20 @@ class TestJob(TestCase): platform = Platform(self.experiment_id, 'parallel-platform', FakeBasicConfig) platform.serial_platform = 'serial-platform' - self.job._platform = platform + self.job.platform = platform self.job.processors = '1' - returned_platform = self.job._platform + returned_platform = self.job.platform self.assertEquals('serial-platform', returned_platform) def test_set_platform(self): dummy_platform = Platform('whatever', 'rand-name', FakeBasicConfig) - self.assertNotEquals(dummy_platform, self.job._platform) + self.assertNotEquals(dummy_platform, self.job.platform) - self.job._platform = dummy_platform + self.job.platform = dummy_platform - self.assertEquals(dummy_platform, self.job._platform) + self.assertEquals(dummy_platform, self.job.platform) def test_when_the_job_has_a_queue_returns_that_queue(self): dummy_queue = 'whatever' @@ -69,7 +69,7 @@ class TestJob(TestCase): dummy_queue = 'whatever-parallel' dummy_platform = Platform('whatever', 'rand-name', FakeBasicConfig) dummy_platform.queue = dummy_queue - self.job._platform = dummy_platform + self.job.platform = dummy_platform self.assertIsNone(self.job._queue) @@ -89,7 +89,7 @@ class TestJob(TestCase): 
dummy_platform.serial_platform = dummy_serial_platform dummy_platform.queue = parallel_queue - self.job._platform = dummy_platform + self.job.platform = dummy_platform self.job.processors = '1' self.assertIsNone(self.job._queue) @@ -265,7 +265,7 @@ class TestJob(TestCase): dummy_platform = Mock() dummy_platform.serial_platform = dummy_serial_platform dummy_platform.custom_directives = '["whatever"]' - self.job._platform = dummy_platform + self.job.platform = dummy_platform # Act parameters = self.job.update_parameters(as_conf, dict()) # Assert diff --git a/test/unit/test_wrappers.py b/test/unit/test_wrappers.py index 54e7027d4..aeb192f9e 100644 --- a/test/unit/test_wrappers.py +++ b/test/unit/test_wrappers.py @@ -151,13 +151,13 @@ class TestWrappers(TestCase): def setUp(self): self.experiment_id = 'random-id' self.config = FakeBasicConfig - self.platform = Mock() + self._platform = Mock() self.job_list = JobList(self.experiment_id, self.config, ConfigParserFactory(), JobListPersistenceDb('.', '.')) self.parser_mock = Mock(spec='SafeConfigParser') - self.platform.max_waiting_jobs = 100 - self.platform.total_jobs = 100 + self._platform.max_waiting_jobs = 100 + self._platform.total_jobs = 100 self.config.get_wrapper_type = Mock(return_value='vertical') self.config.get_wrapper_crossdate = Mock(return_value=False) self.config.get_remote_dependencies = Mock(return_value=False) @@ -167,7 +167,7 @@ class TestWrappers(TestCase): self.config.get_wrapper_policy = Mock(return_value='flexible') self.job_packager = JobPackager( - self.config, self.platform, self.job_list) + self.config, self._platform, self.job_list) ### ONE SECTION WRAPPER ### def test_returned_packages(self): @@ -1311,7 +1311,7 @@ class TestWrappers(TestCase): job.packed = False job.hold = False job.wallclock = total_wallclock - job._platform = self.platform + job.platform = self._platform job.date = date job.member = member -- GitLab From f12bfc4a9ad6aa8e42626d1e9193c7517f555f79 Mon Sep 17 00:00:00 2001 From: 
dbeltran Date: Tue, 27 Oct 2020 16:10:03 +0100 Subject: [PATCH 19/26] PipeLine Fix --- autosubmit/job/job.py | 6 ++---- autosubmit/platforms/slurmplatform.py | 2 +- test/unit/test_job_package.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index e47c10cb0..a62ade881 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -519,16 +519,14 @@ class Job(object): platforms_to_test = set() if self.platform_name is None: self.platform_name = hpcarch - #self.platform = submitter.platforms[self.platform_name.lower()] - - self._platform = submitter.platforms[self.platform_name.lower()] + self._platform = submitter.platforms[self.platform_name.lower()] # serial try: self._platform.restore_connection() except Exception as e: Log.printlog("{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message,self.name), 6001) out_exist = False err_exist = False - retries = 20 + retries = 5 sleeptime = 0 i = 0 sleep(20) diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index a8b9fa469..c607a619f 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -421,7 +421,7 @@ class SlurmPlatform(ParamikoPlatform): retries = 9999 except BaseException as e: # Unrecoverable error Log.critical( - "Crashed while retrieving remote logs", 6001, e.message) + "remote logs {0} couldn't be recovered".format(filename), 6001, e.message) file_exist = False # won't exist retries = 999 # no more retries diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index f57698084..9f1420c9c 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -12,7 +12,7 @@ from autosubmit.job.job_common import Status class TestJobPackage(TestCase): def setUp(self): - self.platform = Mock() + self._platform = Mock() self.jobs = [Job('dummy1', 0, Status.READY, 0), Job('dummy2', 0, 
Status.READY, 0)] self.jobs[0]._platform = self.jobs[1]._platform = self._platform -- GitLab From ae26c9c5842fb044bdec5a1fc54a1deb04217b29 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 16:22:09 +0100 Subject: [PATCH 20/26] PipeLine Fix --- test/unit/test_job_package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 9f1420c9c..8e9aaeb07 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -41,7 +41,7 @@ class TestJobPackage(TestCase): self.assertEquals(self.jobs, self.job_package.jobs) def test_job_package_platform_getter(self): - self.assertEquals(self.platform.serial_platform, self.job_package.platform) + self.assertEquals(self._platform.serial_platform, self.job_package.platform) def test_job_package_submission(self): # arrange -- GitLab From 00bce55218cdd0ba731cae9fac6ba4ac2f70ffcf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 16:45:43 +0100 Subject: [PATCH 21/26] PipeLine Fix --- test/unit/test_job_package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 8e9aaeb07..9f1420c9c 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -41,7 +41,7 @@ class TestJobPackage(TestCase): self.assertEquals(self.jobs, self.job_package.jobs) def test_job_package_platform_getter(self): - self.assertEquals(self._platform.serial_platform, self.job_package.platform) + self.assertEquals(self.platform.serial_platform, self.job_package.platform) def test_job_package_submission(self): # arrange -- GitLab From 46967a143bfaf1d352f8e0a2799c36c8747cd8a2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 17:08:52 +0100 Subject: [PATCH 22/26] PipeLine Fix --- test/unit/test_job.py | 10 +++++----- test/unit/test_job_package.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/unit/test_job.py 
b/test/unit/test_job.py index b733b318e..5248377d4 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -31,7 +31,7 @@ class TestJob(TestCase): platform = Platform(self.experiment_id, 'parallel-platform', FakeBasicConfig) platform.serial_platform = 'serial-platform' - self.job.platform = platform + self.job._platform = platform self.job.processors = 999 returned_platform = self.job.platform @@ -42,8 +42,8 @@ class TestJob(TestCase): platform = Platform(self.experiment_id, 'parallel-platform', FakeBasicConfig) platform.serial_platform = 'serial-platform' - self.job.platform = platform - self.job.processors = '1' + self.job._platform = platform + self.job.processors = 1 returned_platform = self.job.platform @@ -89,7 +89,7 @@ class TestJob(TestCase): dummy_platform.serial_platform = dummy_serial_platform dummy_platform.queue = parallel_queue - self.job.platform = dummy_platform + self.job._platform = dummy_platform self.job.processors = '1' self.assertIsNone(self.job._queue) @@ -265,7 +265,7 @@ class TestJob(TestCase): dummy_platform = Mock() dummy_platform.serial_platform = dummy_serial_platform dummy_platform.custom_directives = '["whatever"]' - self.job.platform = dummy_platform + self.job._platform = dummy_platform # Act parameters = self.job.update_parameters(as_conf, dict()) # Assert diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 9f1420c9c..3d23f5059 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -12,10 +12,10 @@ from autosubmit.job.job_common import Status class TestJobPackage(TestCase): def setUp(self): - self._platform = Mock() + self.platform = Mock() self.jobs = [Job('dummy1', 0, Status.READY, 0), Job('dummy2', 0, Status.READY, 0)] - self.jobs[0]._platform = self.jobs[1]._platform = self._platform + self.jobs[0]._platform = self.jobs[1]._platform = self.platform self.job_package = JobPackageSimple(self.jobs) def test_job_package_default_init(self): -- GitLab From 
e283c39327dfd1f7ca8626329232c60637a592ac Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Oct 2020 17:18:01 +0100 Subject: [PATCH 23/26] PipeLine Fix --- test/unit/test_job_package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 3d23f5059..a695c95bb 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -41,7 +41,7 @@ class TestJobPackage(TestCase): self.assertEquals(self.jobs, self.job_package.jobs) def test_job_package_platform_getter(self): - self.assertEquals(self.platform.serial_platform, self.job_package.platform) + self.assertEquals(self.platform, self.job_package.platform) def test_job_package_submission(self): # arrange -- GitLab From 7336bb4a39fd47130b9480d926c601e743da7e34 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 28 Oct 2020 15:01:34 +0100 Subject: [PATCH 24/26] Fix ready_jobs on local platform, fix a bug with platform_name , fix a bug with on_submission --- autosubmit/autosubmit.py | 3 ++- autosubmit/job/job.py | 2 +- autosubmit/job/job_list.py | 28 ++++++++++++++-------------- autosubmit/job/job_packages.py | 3 ++- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index fcb060d50..aba44ca98 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1627,8 +1627,9 @@ class Autosubmit: save = False for platform in platforms_to_test: if not hold: - Log.debug("\nJobs ready for {1}: {0}", len(job_list.get_ready(platform, hold=hold)), platform.name) + Log.info("\nJobs ready for {1}: {0}", len(job_list.get_ready(platform, hold=hold)), platform.name) ready_jobs = job_list.get_ready(platform, hold=hold) + pass else: Log.debug("\nJobs prepared for {1}: {0}", len( job_list.get_prepared(platform)), platform.name) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index a62ade881..01e0465e4 100644 --- a/autosubmit/job/job.py +++ 
b/autosubmit/job/job.py @@ -830,7 +830,7 @@ class Job(object): template = template_file.read() else: if self.type == Type.BASH: - template = 'sleep 70' + template = 'sleep 5' elif self.type == Type.PYTHON: template = 'time.sleep(5)' elif self.type == Type.R: diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 695341a56..0d05f1d41 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -647,7 +647,7 @@ class JobList: :rtype: list """ - completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.COMPLETED] if wrapper: return [job for job in completed_jobs if job.packed is False] @@ -664,7 +664,7 @@ class JobList: :return: completed jobs :rtype: list """ - uncompleted_jobs = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + uncompleted_jobs = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status != Status.COMPLETED] if wrapper: @@ -683,10 +683,10 @@ class JobList: """ submitted = list() if hold: - submitted = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + submitted = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.SUBMITTED and job.hold == hold] else: - submitted = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + submitted = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.SUBMITTED] if wrapper: return [job for job in submitted if job.packed is False] @@ -702,7 +702,7 @@ class JobList: :return: running jobs :rtype: list """ - running = [job for job in 
self._job_list if (platform is None or job.platform_name == platform.name) and + running = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.RUNNING] if wrapper: return [job for job in running if job.packed is False] @@ -718,7 +718,7 @@ class JobList: :return: queuedjobs :rtype: list """ - queuing = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + queuing = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.QUEUING] if wrapper: return [job for job in queuing if job.packed is False] @@ -734,7 +734,7 @@ class JobList: :return: failed jobs :rtype: list """ - failed = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + failed = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.FAILED] if wrapper: return [job for job in failed if job.packed is False] @@ -750,7 +750,7 @@ class JobList: :return: all jobs :rtype: list """ - unsubmitted = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + unsubmitted = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and (job.status != Status.SUBMITTED and job.status != Status.QUEUING and job.status == Status.RUNNING and job.status == Status.COMPLETED)] if wrapper: @@ -783,7 +783,7 @@ class JobList: :return: ready jobs :rtype: list """ - ready = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + ready = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.READY and job.hold is hold] if wrapper: @@ -800,7 +800,7 @@ class JobList: :return: prepared jobs :rtype: list """ - prepared = 
[job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + prepared = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.PREPARED] return prepared @@ -813,7 +813,7 @@ class JobList: :return: waiting jobs :rtype: list """ - waiting_jobs = [job for job in self._job_list if (platform is None or job.platform_name == platform.name) and + waiting_jobs = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.WAITING] if wrapper: return [job for job in waiting_jobs if job.packed is False] @@ -841,7 +841,7 @@ class JobList: :return: jobs in platforms :rtype: list """ - return [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + return [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.HELD] def get_unknown(self, platform=None, wrapper=False): @@ -853,7 +853,7 @@ class JobList: :return: unknown state jobs :rtype: list """ - submitted = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + submitted = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.UNKNOWN] if wrapper: return [job for job in submitted if job.packed is False] @@ -869,7 +869,7 @@ class JobList: :return: unknown state jobs :rtype: list """ - suspended = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + suspended = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.SUSPENDED] if wrapper: return [job for job in suspended if job.packed is False] diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 06789a359..21bd93e42 
100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -94,7 +94,8 @@ class JobPackageBase(object): exit=True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) + if configuration.get_project_type().lower() != "none": + raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) Log.warning("On submission script has some empty variables") -- GitLab From 1f6e4f2426c620868b070ca5d2c2b4d6916334d7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 28 Oct 2020 17:41:52 +0100 Subject: [PATCH 25/26] Fix total_stats --- autosubmit/autosubmit.py | 4 ++-- autosubmit/job/job.py | 13 ++++++++----- autosubmit/job/job_packager.py | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index aba44ca98..f736d6e55 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1627,9 +1627,8 @@ class Autosubmit: save = False for platform in platforms_to_test: if not hold: - Log.info("\nJobs ready for {1}: {0}", len(job_list.get_ready(platform, hold=hold)), platform.name) + Log.debug("\nJobs ready for {1}: {0}", len(job_list.get_ready(platform, hold=hold)), platform.name) ready_jobs = job_list.get_ready(platform, hold=hold) - pass else: Log.debug("\nJobs prepared for {1}: {0}", len( job_list.get_prepared(platform)), platform.name) @@ -1734,6 +1733,7 @@ class Autosubmit: job.hold = hold job.id = str(jobs_id[i]) job.status = Status.SUBMITTED + job.write_submit_time() if hasattr(package, "name"): job_list.packages_dict[package.name] = package.jobs from job.job import WrapperJob diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py 
index 01e0465e4..501ccf8d0 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -529,7 +529,7 @@ class Job(object): retries = 5 sleeptime = 0 i = 0 - sleep(20) + sleep(10) no_continue = False try: while (not out_exist and not err_exist) and i < retries: @@ -547,7 +547,9 @@ class Job(object): sleep(sleeptime) if i >= retries: if not out_exist or not err_exist: - raise AutosubmitError("Failed to retrieve log files {1} and {2}".format(retries,remote_logs[0],remote_logs[1]), 6001) + Log.printlog("Failed to retrieve log files {1} and {2} e=6001".format(retries,remote_logs[0],remote_logs[1])) + sleep(5) # safe wait before end a thread + return if copy_remote_logs: if local_logs != remote_logs: # unifying names for log files @@ -559,8 +561,9 @@ class Job(object): for local_log in local_logs: self._platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: - raise AutosubmitError("Trace {0} \n Failed to write the {1}".format(e.message,self.name), 6001) - + Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(e.message,self.name)) + sleep(5) # safe wait before end a thread + return except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format(e.message,self.name), 6001) sleep(5) # safe wait before end a thread @@ -1015,7 +1018,7 @@ class Job(object): :param completed: True if job was completed successfully, False otherwise :type completed: bool """ - self._platform.get_stat_file(self.name, retries=0) + self._platform.get_stat_file(self.name, retries=5) end_time = self.check_end_time() path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') f = open(path, 'a') diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 6b050653a..76a4f2c22 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -77,13 +77,13 @@ class JobPackager(object): Log.debug("Number of jobs prepared: {0}", len( 
jobs_list.get_prepared(platform))) if len(jobs_list.get_prepared(platform)) > 0: - Log.info("Jobs ready for {0}: {1}", self._platform.name, len( + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( jobs_list.get_prepared(platform))) else: Log.debug("Number of jobs ready: {0}", len( jobs_list.get_ready(platform, hold=False))) if len(jobs_list.get_ready(platform)) > 0: - Log.info("Jobs ready for {0}: {1}", self._platform.name, len( + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( jobs_list.get_ready(platform))) self._maxTotalProcessors = 0 def compute_weight(self,job_list): -- GitLab From 62416e6097911561772cbc0fa7ace730cba4ebda Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 28 Oct 2020 17:50:58 +0100 Subject: [PATCH 26/26] Fix possible job hang (same as fixed in 3.12) --- autosubmit/platforms/paramiko_platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 13799e6d8..b6d23e19c 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -363,7 +363,7 @@ class ParamikoPlatform(Platform): Log.error('check_job() The job id ({0}) is not an integer neither a string.', job_id) job.new_status = job_status sleep_time=5 - while not (self.send_command(self.get_checkjob_cmd(job_id)) and retries >= 0) or (self.get_ssh_output() == "" and retries >= 0): + while not ( self.send_command(self.get_checkjob_cmd(job_id)) or (self.get_ssh_output() == "") ) and retries > 0: retries = retries - 1 Log.debug('Retrying check job command: {0}', self.get_checkjob_cmd(job_id)) Log.debug('retries left {0}', retries) -- GitLab