diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 1d75ec5349debcf75c66323280774adae0f3794f..703d66e3c6cda2f2c2cfbc50379feda50ca2b5ba 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -1424,6 +1424,15 @@ class AutosubmitConfig(object): """ return int(self._conf_parser.get('config', 'RETRIALS')) + def get_delay_retry_time(self): + """ + Returns delay time from autosubmit's config file + + :return: safety sleep time + :rtype: int + """ + return int(self._conf_parser.get_option('config', 'DELAY_RETRY_TIME', "-1")) + def get_notifications(self): """ Returns if the user has enabled the notifications from autosubmit's config file diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 94e1a58380e5e037813621e9545e964b30b32f86..15d997f55d56dfaff0c6f3f1937724af04486d98 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -80,10 +80,13 @@ class Job(object): return "{0} STATUS: {1}".format(self.name, self.status) def __init__(self, name, job_id, status, priority): + self.delay_end = datetime.datetime.now() + self.delay_retrials = 0 self.wrapper_type = "none" self._wrapper_queue = None self._platform = None self._queue = None + self.retry_delay = 0 self.platform_name = None self.section = None self.wallclock = None @@ -883,7 +886,8 @@ class Job(object): parameters['MEMBER'] = self.member if hasattr(self, 'retrials'): parameters['RETRIALS'] = self.retrials - + if hasattr(self, 'delay_retrials'): + parameters['delay_retrials'] = self.delay_retrials if self.date is not None: if self.chunk is None: chunk = 1 diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index f97dcb355e3445f960c20704f2f7c6aa330e92ef..0ec44bbf130300819678fa1cc788f9bcce4bd17c 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -34,12 +34,13 @@ class Status: PREPARED = 7 SKIPPED = 8 FAILED = -1 + DELAYED = 9 UNKNOWN = -2 SUSPENDED = -3 ####### # Note: any change on constants must be applied on the dict below!!! VALUE_TO_KEY = {-3: 'SUSPENDED', -2: 'UNKNOWN', -1: 'FAILED', 0: 'WAITING', 1: 'READY', - 2: 'SUBMITTED', 3: 'QUEUING', 4: 'RUNNING', 5: 'COMPLETED', 6: 'HELD', 7: 'PREPARED', 8: 'SKIPPED'} + 2: 'SUBMITTED', 3: 'QUEUING', 4: 'RUNNING', 5: 'COMPLETED', 6: 'HELD', 7: 'PREPARED', 8: 'SKIPPED', 9: 'DELAYED'} def retval(self, value): return getattr(self, value) @@ -67,9 +68,10 @@ class bcolors: PREPARED = '\033[34;2m' HELD = '\033[34;1m' FAILED = '\033[31m' + DELAYED = '\033[36;1m' SUSPENDED = '\033[31;1m' CODE_TO_COLOR = {-3: SUSPENDED, -2: UNKNOWN, -1: FAILED, 0: WAITING, 1: READY, - 2: SUBMITTED, 3: QUEUING, 4: RUNNING, 5: COMPLETED, 6: HELD, 7: PREPARED, 8: SKIPPED} + 2: SUBMITTED, 3: QUEUING, 4: RUNNING, 5: COMPLETED, 6: HELD, 7: PREPARED, 8: SKIPPED, 9: DELAYED} class Type: diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 69c56a535f86cb04e59cfc431cecc821adca2632..1cf8dc61f35b87cb3dd123db869bc933fa3591a2 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -343,6 +343,7 @@ class DicJobs: job.memory_per_task = self.get_option(section, "MEMORY_PER_TASK", '') job.wallclock = self.get_option(section, "WALLCLOCK", '') job.retrials = int(self.get_option(section, 'RETRIALS', -1)) + job.delay_retrials = str(self.get_option(section, 'DELAY_RETRY_TIME', "-1")) if job.retrials == -1: job.retrials = None job.notify_on = [x.upper() for x in self.get_option(section, "NOTIFY_ON", '').split(' ')] diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index ea27ac6a22a0592ad2fab20d48d26b1ca6754588..9d4c55023d4d5b601f1fd0ed897b50357879ac61 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -34,7 +34,7 @@ from autosubmit.job.job_utils import Dependency from autosubmit.job.job_common import Status, bcolors from bscearth.utils.date import date2str, parse_date import autosubmit.database.db_structure as DbStructure - +import datetime from networkx import DiGraph from autosubmit.job.job_utils import transitive_reduction from log.log import AutosubmitCritical, AutosubmitError, Log @@ -1041,14 +1041,25 @@ class JobList(object): prepared = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and job.status == Status.PREPARED] return prepared + def get_delayed(self, platform=None): + """ + Returns a list of delayed jobs + :param platform: job platform + :type platform: HPCPlatform + :return: delayed jobs + :rtype: list + """ + delayed = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and + job.status == Status.DELAY] + return delayed def get_skipped(self, platform=None): """ - Returns a list of prepared jobs + Returns a list of skipped jobs :param platform: job platform :type platform: HPCPlatform - :return: prepared jobs + :return: skipped jobs :rtype: list """ skipped = [job for job in self._job_list if (platform is None or job.platform.name.lower() == platform.name.lower()) and @@ -1186,9 +1197,9 @@ class JobList(object): :rtype: list """ active = self.get_in_queue(platform) + self.get_ready( - platform=platform, hold=True) + self.get_ready(platform=platform, hold=False) - tmp = [job for job in active if job.hold and not job.status == - Status.SUBMITTED and not job.status == Status.READY] + platform=platform, hold=True) + self.get_ready(platform=platform, hold=False) + self.get_active(platform=platform) + tmp = [job for job in active if job.hold and not (job.status == + Status.SUBMITTED or job.status == Status.READY or job.status == Status.DELAYED) ] if len(tmp) == len(active): # IF only held jobs left without dependencies satisfied if len(tmp) != 0 and len(active) != 0: raise AutosubmitCritical( @@ -1429,12 +1440,31 @@ class JobList(object): tmp = [ parent for parent in job.parents if parent.status == Status.COMPLETED] if len(tmp) == len(job.parents): - job.status = Status.READY + if not hasattr(job, 'DELAY_RETRY_TIME') or job.retrials is None: + delay_retry_time = as_conf.get_delay_retry_time() + else: + delay_retry_time = job.retry_delay + if "+" in job.retry_delay: + retry_delay = job.fail_count * int(delay_retry_time[:-1]) + int(delay_retry_time[:-1]) + elif "*" in job.delay_retrials: + retry_delay = int(delay_retry_time[1:]) + for retrial_amount in range(0,job.fail_count): + retry_delay += retry_delay * 10 + else: + retry_delay = int(delay_retry_time) + if retry_delay > 0: + job.status = Status.DELAYED + job.delay_end = datetime.datetime.now() + datetime.timedelta(seconds=retry_delay) + Log.debug( + "Resetting job: {0} status to: DELAYED for retrial...".format(job.name)) + else: + job.status = Status.READY + Log.debug( + "Resetting job: {0} status to: READY for retrial...".format(job.name)) job.id = None job.packed = False save = True - Log.debug( - "Resetting job: {0} status to: READY for retrial...".format(job.name)) + else: job.status = Status.WAITING save = True @@ -1459,6 +1489,9 @@ class JobList(object): Log.debug('Updating WAITING jobs') if not fromSetStatus: all_parents_completed = [] + for job in self.get_delayed(): + if datetime.datetime.now() >= job.delay_end: + job.status = Status.READY for job in self.get_waiting(): tmp = [parent for parent in job.parents if parent.status == Status.COMPLETED or parent.status == Status.SKIPPED] @@ -1478,7 +1511,7 @@ class JobList(object): job.packed = False save = True Log.debug( - "Resetting job: {0} status to: READY for retrial...".format(job.name)) + "Resetting job: {0} status to: READY".format(job.name)) if len(tmp) == len(job.parents): job.status = Status.READY job.packed = False diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 14e72c4f96a66eaabe71864cbd4821b5d5835c9d..7ff33b0a667bf8ab6d6261d1d6bfa1804c986703 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -44,7 +44,7 @@ class Monitor: _table = dict([(Status.UNKNOWN, 'white'), (Status.WAITING, 'gray'), (Status.READY, 'lightblue'), (Status.PREPARED, 'skyblue'), (Status.SUBMITTED, 'cyan'), (Status.HELD, 'salmon'), (Status.QUEUING, 'pink'), (Status.RUNNING, 'green'), - (Status.COMPLETED, 'yellow'), (Status.FAILED, 'red'), (Status.SUSPENDED, 'orange'), (Status.SKIPPED, 'lightyellow')]) + (Status.COMPLETED, 'yellow'), (Status.FAILED, 'red'), (Status.DELAYED,'lightcyan') ,(Status.SUSPENDED, 'orange'), (Status.SKIPPED, 'lightyellow')]) @staticmethod def color_status(status): @@ -78,6 +78,8 @@ class Monitor: return Monitor._table[Status.FAILED] elif status == Status.SUSPENDED: return Monitor._table[Status.SUSPENDED] + elif status == Status.DELAYED: + return Monitor._table[Status.SUSPENDED] else: return Monitor._table[Status.UNKNOWN] @@ -100,8 +102,11 @@ class Monitor: graph_name='Legend', label='Legend', rank="source") legend.add_node(pydotplus.Node(name='UNKNOWN', shape='box', style="", fillcolor=self._table[Status.UNKNOWN])) + legend.add_node(pydotplus.Node(name='WAITING', shape='box', style="filled", fillcolor=self._table[Status.WAITING])) + legend.add_node(pydotplus.Node(name='DELAYED', shape='box', style="filled", + fillcolor=self._table[Status.DELAYED])) legend.add_node(pydotplus.Node(name='READY', shape='box', style="filled", fillcolor=self._table[Status.READY])) legend.add_node(pydotplus.Node(name='PREPARED', shape='box', style="filled", @@ -121,6 +126,7 @@ class Monitor: fillcolor=self._table[Status.COMPLETED])) legend.add_node(pydotplus.Node(name='FAILED', shape='box', style="filled", fillcolor=self._table[Status.FAILED])) + legend.add_node(pydotplus.Node(name='SUSPENDED', shape='box', style="filled", fillcolor=self._table[Status.SUSPENDED])) diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 39284bb3823fbb35989eaf7cdcf6a12b762a3b85..73ec3f1fc1207f09bccd87d418ef63707bdfdcc6 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -199,6 +199,10 @@ Examples: # MEMORY = 4096 ## Number of retrials if a job fails. If not specified, defaults to the value given on experiment's autosubmit.conf # RETRIALS = 4 + ## Allows to put a delay between retries, of retrials if a job fails. If not specified, it will be static + # DELAY_RETRY_TIME = 11 + # DELAY_RETRY_TIME = +11 # will wait 11,22,33,44... + # DELAY_RETRY_TIME = *11 # will wait 11,110,1110,11110... ## Some jobs can not be checked before running previous jobs. Set this option to false if that is the case # CHECK = False ## Select the interpreter that will run the job. Options: bash, python, r Default: bash @@ -331,6 +335,10 @@ Examples: # Number of retrials if a job fails. Can ve override at job level # Default = 0 RETRIALS = 0 + ## Allows to put a delay between retries, of retrials if a job fails. If not specified, it will be static + # DELAY_RETRY_TIME = 11 + # DELAY_RETRY_TIME = +11 # will wait 11,22,33,44... + # DELAY_RETRY_TIME = *11 # will wait 11,110,1110,11110... # Default output type for CREATE, MONITOR, SET STATUS, RECOVERY. Available options: pdf, svg, png, ps, txt # Default = pdf OUTPUT = pdf diff --git a/docs/source/usage/advanced_features/custom_header.rst b/docs/source/usage/advanced_features/custom_header.rst index c5b4d100ff8f0937b599aebb2214010aeea0a0d5..ad6eff0e061ee177c00e0da62c4a8e638a471314 100644 --- a/docs/source/usage/advanced_features/custom_header.rst +++ b/docs/source/usage/advanced_features/custom_header.rst @@ -56,6 +56,10 @@ In the file: # MEMORY = 4096 ## Number of retrials if a job fails. If not specified, defaults to the value given on experiment's autosubmit.conf # RETRIALS = 4 + ## Allows to put a delay between retries, of retrials if a job fails. If not specified, it will be static + # DELAY_RETRY_TIME = 11 + # DELAY_RETRY_TIME = +11 # will wait 11,22,33,44... + # DELAY_RETRY_TIME = *11 # will wait 11,110,1110,11110... ## Some jobs can not be checked before running previous jobs. Set this option to false if that is the case # CHECK = False ## Select the interpreter that will run the job. Options: bash, python, r Default: bash diff --git a/docs/source/usage/configuration/new_job.rst b/docs/source/usage/configuration/new_job.rst index 17bc96e005b74daf96f245b25fe53aa1e93d262b..d94a0aaa33b5a88a74395848790c8f58e6736f2f 100644 --- a/docs/source/usage/configuration/new_job.rst +++ b/docs/source/usage/configuration/new_job.rst @@ -40,6 +40,9 @@ To do this use: * QUEUE: queue to add the job to. If not specificied, uses PLATFORM default. +* RETRIALS: Number of retrials if job fails + +* DELAY_RETRY_TIME: Allows to add a timer between retries There are also other, less used features that you can use: