From 6a704c97fcb5ec188e2fce5737a2884fea444537 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 18 Nov 2020 18:14:10 +0100 Subject: [PATCH 1/2] vertical fixed --- autosubmit/job/job.py | 12 ++++------ .../platforms/wrappers/wrapper_builder.py | 24 +++++++++++++------ log/log.py | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index e183abbf2..458452814 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1238,7 +1238,6 @@ class WrapperJob(Job): self.failed = True self._platform.delete_file('WRAPPER_FAILED') break - if self.failed: self.update_failed_jobs() if len(self.inner_jobs_running) <= 0: @@ -1318,10 +1317,8 @@ class WrapperJob(Job): start_time = self.running_jobs_start[job] if self._is_over_wallclock(start_time, job.wallclock): # if self.as_config.get_wrapper_type() in ['vertical', 'horizontal']: - Log.printlog("Job {0} inside wrapper {1} is running for longer than it's wallclock! Cancelling...".format( + Log.printlog("Job {0} inside wrapper {1} is running for longer than it's wallclock!".format( job.name, self.name), 6009) - job.new_status = Status.FAILED - job.update_status(self.as_config.get_copy_remote_logs() == 'true') return True return False def _check_running_jobs(self): @@ -1385,8 +1382,9 @@ done job.update_status(self.as_config.get_copy_remote_logs() == 'true') if len(out) == 2: Log.info("Job {0} is RUNNING".format(jobname)) - over_wallclock = self._check_inner_job_wallclock(job) + over_wallclock = self._check_inner_job_wallclock(job) # messaged included if over_wallclock: + job.status = Status.FAILED Log.printlog("Job {0} is FAILED".format(jobname),6009) elif len(out) == 3: end_time = self._check_time(out, 2) @@ -1397,7 +1395,7 @@ done retries = retries - 1 temp_list = self.inner_jobs_running self.inner_jobs_running = [job for job in temp_list if job.status == Status.RUNNING] - if retries == 0 or over_wallclock: + if retries == 0: # or over_wallclock: self.status = Status.FAILED def _check_finished_job(self, job , failed_file=False): if not failed_file: @@ -1414,7 +1412,7 @@ done job.update_status(self.as_config.get_copy_remote_logs() == 'true') else: #Log.info("No completed filed found, setting {0} to FAILED...".format(job.name)) - job.new_status = Status.FAILED + job.status = Status.FAILED job.update_status(self.as_config.get_copy_remote_logs() == 'true') self.running_jobs_start.pop(job, None) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 15096b39e..1b0322b2c 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -290,12 +290,6 @@ processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) current = {1} current.start() current.join() - if os.path.exists(failed_wrapper): - os.remove(os.path.join(os.getcwd(),wrapper_id)) - wrapper_failed = os.path.join(os.getcwd(),"WRAPPER_FAILED") - open(wrapper_failed, 'w').close() - os._exit(1) - """).format(jobs_list, thread, '\n'.ljust(13)) if footer: @@ -312,8 +306,24 @@ processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) open(failed_path, 'w').close() print datetime.now(), "The job ", current.template," has FAILED" #{1} - """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) + sequential_threads_launcher += self._indent(textwrap.dedent(""" + if os.path.exists(failed_wrapper): + os.remove(os.path.join(os.getcwd(),wrapper_id)) + wrapper_failed = os.path.join(os.getcwd(),"WRAPPER_FAILED") + open(wrapper_failed, 'w').close() + os._exit(1) + + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) + else: + sequential_threads_launcher += self._indent(textwrap.dedent(""" + if os.path.exists(failed_wrapper): + os.remove(os.path.join(os.getcwd(),wrapper_id)) + wrapper_failed = os.path.join(os.getcwd(),"WRAPPER_FAILED") + open(wrapper_failed, 'w').close() + os._exit(1) + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 4) return sequential_threads_launcher def build_parallel_threads_launcher(self, jobs_list, thread, footer=True): diff --git a/log/log.py b/log/log.py index 222060b1f..956d4015f 100644 --- a/log/log.py +++ b/log/log.py @@ -163,7 +163,7 @@ class Log: os.mkdir(directory) files = [f for f in os.listdir(directory) if os.path.isfile( os.path.join(directory, f)) and f.endswith(filename)] - if len(files) >= 5: + if len(files) >= 10: files.sort() os.remove(os.path.join(directory, files[0])) file_path = os.path.join( -- GitLab From 7cdfa4628c4e5f33439a88190c2f18de18ffd51f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 18 Nov 2020 18:16:03 +0100 Subject: [PATCH 2/2] vertical fixed --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 458452814..1a6e141d6 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1413,7 +1413,7 @@ done else: #Log.info("No completed filed found, setting {0} to FAILED...".format(job.name)) job.status = Status.FAILED - job.update_status(self.as_config.get_copy_remote_logs() == 'true') + #job.update_status(self.as_config.get_copy_remote_logs() == 'true') self.running_jobs_start.pop(job, None) def update_failed_jobs(self,canceled_wrapper=False): -- GitLab