From a0c5bea6843a9897fe56c17482ce3e6fb05a47f3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 16 May 2023 12:21:02 +0200 Subject: [PATCH 01/20] logs rework logs Working some changes and trying to use process instead of thread Fixed a perfomance issue Fixed an issue with running: once adding more edges logs fix run Rework of Retrieval of logs parcially working Now remote works but local not Rework of Retrieval of logs parcially working log_recovery rework wip logs change Clean the code a bit working now needs some cleaning test fix bug fix bugs, reworked a bit fix bug updated test updated test fixed some bugs, added some docs more fixes test fix pipeline fix pipeline math readded datetime readded fix issue with delay retrial fix issue with -1 Fixes !https://earth.bsc.es/gitlab/es/autosubmit/-/issues/1209 fix grouping test test fix test fix some changes for Bruno comments moved if inside gen Workflow optimizations added ( mega squashed commit ) Fixes #1158 added zipp dependency ( rocrate in bscearth000) re-added additional files Database is locked error in historic db ( I think it is an issue in my computer as happened in master aswell) QOL when splits is introduced with "" ( testing francesc experiment ) Ran regression test, noticed issue with experiment a005 and fixed converse job to list changed == for in to dont care about spaces Fix splits when * and not * is in same line added if not monitor Fix changes Fix delay Fixed edge_info Differences fixed Differences fixed comments fixed comments added comments added N-1 deleted test of deleted function deleted old code fixed pipeline Fixed save Added version and hpcarch as requisites to change Improved split_to Improved split_to (wip) Added "previous" filter (wip) Added "previous" filter fixed status .lower() added Add filter previous docs python3 or pytho2 ( fixed) type python updated test changed configparserversion better detection if data is changed working, added the real configuration to the docs changed configparserversion working? changed test working? issue_with_none Added -f flag to force the recreation from 0 ... (useful mainly for test ) maybe almost working fixed bug with chunk wrapper fix comments comments comments comments comments comments doble # job_section comments docstring added ref todo changed wallclock commented removed funcy Deleted funcy, updated configar paser that has some fixes in changed files Improved the run/monitor speed. Fixed some default stuff fix stats Some memory changes introduced added more cases reformat Added test_dependencies changed the location re-added marked_status File parameter reviewing changed results removed root = None update_genealogy clean unused code update_genealogy clean unused code reviewing comments reviewing comments reviewing comments tests tes fix pipeline test fix test fix added funcy to setup.py updated test changed configparserversion better detection if data is changed working, added the real configuration to the docs changed configparserversion working? changed test working? issue_with_none Added -f flag to force the recreation from 0 ... (useful mainly for test ) maybe almost working fixed bug with chunk wrapper fix comments comments comments comments comments comments doble # job_section comments docstring added ref todo changed wallclock commented removed funcy Deleted funcy, updated configar paser that has some fixes in changed files Improved the run/monitor speed. Fixed some default stuff fix stats Some memory changes introduced reviewing changes (comments) reviewing changes (comments) reviewing changes (comments) reviewing changes (graph enumerate) reviewing changes ( delete commentS) reviewing changes ( delete valid parents) reviewing changes reviewing changes reviewing changes reviewing changes reviewing changes reviewing changes (numpy) reviewing changes (numpy) reviewing changes ( docstring) reviewing changes ( docstring) reviewing changes reviewing changes reviewing changes reviewing changes added more cases reformat Added test_dependencies changed the location re-added marked_status File parameter reviewing changed results removed root = None update_genealogy clean unused code update_genealogy clean unused code reviewing comments reviewing comments reviewing comments tests tes fix pipeline test fix test fix added funcy to setup.py fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments Merge lastest changes Fixed ext header to work under this version Fixed default type [rocrate] Add RO-Crate support to Autosubmit. This commit includes work from several other commits, squashed. It started around February 2023, and by July 2023 it was validated by the RO-Crate community, thanks especially to Simone Leo. Unit tests and documentation were added as well. It add support to the following three RO-Crate profiles in Autosubmit: - Process Run Crate - Workflow Run Crate - Workflow RO-Crate profile 1.0 This is available through the Autosubmit commands archive and unarchive. revise the changes update version bug fix an issue with additional_files and \\ variables added retrial key Move temp folder to the outside of for loops to reduce file creation. Rewrite the assertion part Add dani's check so that it doesnt complain with file not found when proj type is none add extended header and tailer documentation test if the file does not exist, it throws an exception test all the routes from extended tailer and header except fetching the file change the check of hashbang to the first two characters Handle if user sets value with empty key Add R, Bash, and python extended scripts Fix an issue with retrials ( present in 4.0) found while testing a full run with templates and wrapper Added platform_name to the variables to load before the rest, ( mainly when building the dict ) Fixed -cw in create, like in inspect Re-adapted some test-cases to match new code workflows fixed fixing all workflows fixing all workflows fixing all workflows # If parent and childs has the same amount of splits \\ doesn't make sense so it is disabled Remove cycles ( job depends on itself) detail is now a function Added a local test to compare workflows from 4.0 to 4.1 using -d option fix default values fix split fix split fixed parent.split == child.split when 1//2 improved test added get_jobs_filtered test Improved job_list test Improved job_list test pipeline not working pipeline not working removed __eq__ due being incompatible with grand part of the code, changed the test instead added job_list generate tests Added __eq__ fixed an issue with dependencies None Changed DB for PKL in tests Added more tests Added more tests fix wrapper dic added run_member test added test_build_job_with_existent_job_list_status test added compare_section test added update_parameters test added update_parameters test added update_parameters test added add_child test added _repr test Old tests working Only 19 remains, have to doble check grouping fix job_list half fix job_list half fix job_list fix test_job.py fix checkpoint and doc tests Fix member_from more changes numpy deleted from environment.yml pep warning fix added test fix doc docs for the new autosubmit_rc env variable docs for the new autosubmit_rc env variable fix doc added another suppress added comment changed try: except for suppress - commented the debug line Changed version Changes to th efunction, fix a bug with the connection, added a close for ._transport of ssh more fixes added a debugfunction Added a notify for push force portalocker to <= 2.7 removed inputtimeout from requeriments requeriments 2fa notification change Fix applied to 2fa, local platform may were asking for a password Fix applied to 2fa indent in docs dependencies docs docs added method parameter 2fa: instead of 2fa rollback few things 2fa threads timeout timeout test 2fa added docs CHANGED input for getpass to hide typing ( it may not work) 2fa 2fa fix additional files for ecmwf Fixed more issues, now edgeless nodes are correctly deleted and dependencies parameter is correctly set , fixed other issues when loading previous job_list and when the node doesnt have the job fixed few workflow inconsistencies fixed dependency fixed ready jobs more fix Working but have an issue with the initial status added apply_filter_1_to_1 more test test more fixes bsic monitor working working on fixing merges working on fixing merges Pickle working, Futher performance improves in the manage_dependencies part working with pickle up to 1000000, afterwards it give segfualt in saving.. looking for alternatives MUCH faster, is probabily bugged for some cases (wip) version update Added a delete function for nodes that are no longer part of the workflow ( with a xor) TODO: Delete old nodes Reloading only the neccesary, added two methods for asconfparser Fix reload in create pkl changes working faster, no memory issues but thinking more solutions corrected prents testing fast test Fixed some bugs with refactor More memory optimization and call optimizations, deleted uneccesary attr when generating the job becasue they will be added later with update_parameters method, code for generate jobs run very fast, inspect working has to check other commands Reduced uneccesary operations, Reduced memory usage Using igraph for perform the transitive reduction added split filter added split filter setstatus refactoring rebased pkl changes working faster, no memory issues but thinking more solutions corrected prents testing fast test Fixed some bugs with refactor More memory optimization and call optimizations, deleted uneccesary attr when generating the job becasue they will be added later with update_parameters method, code for generate jobs run very fast, inspect working has to check other commands Reduced uneccesary operations, Reduced memory usage Using igraph for perform the transitive reduction added split filter added split filter setstatus refactoring --- autosubmit/autosubmit.py | 30 +- autosubmit/job/job.py | 346 +++++++++------------- autosubmit/job/job_list.py | 62 +++- autosubmit/monitor/monitor.py | 2 + autosubmit/platforms/ecplatform.py | 5 + autosubmit/platforms/locplatform.py | 26 +- autosubmit/platforms/lsfplatform.py | 25 +- autosubmit/platforms/paramiko_platform.py | 45 +-- autosubmit/platforms/pbsplatform.py | 25 +- autosubmit/platforms/pjmplatform.py | 11 +- autosubmit/platforms/platform.py | 60 +++- autosubmit/platforms/sgeplatform.py | 9 +- autosubmit/platforms/slurmplatform.py | 11 +- 13 files changed, 337 insertions(+), 320 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index ac2a775d3..f60bd131a 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1636,6 +1636,8 @@ class Autosubmit: Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) job_list.update_list(as_conf, False) + for job in job_list.get_job_list(): + job.status = Status.WAITING @staticmethod @@ -2012,7 +2014,6 @@ class Autosubmit: # establish the connection to all platforms # Restore is a missleading, it is actually a "connect" function when the recover flag is not set. Autosubmit.restore_platforms(platforms_to_test) - return job_list, submitter , exp_history, host , as_conf, platforms_to_test, packages_persistence, False else: return job_list, submitter , None, None, as_conf , platforms_to_test, packages_persistence, True @@ -2193,7 +2194,6 @@ class Autosubmit: Log.printlog("Error trying to store failed job count", Log.WARNING) Log.result("Storing failed job count...done") while not recovery and (recovery_retrials < max_recovery_retrials or max_recovery_retrials <= 0 ): - delay = min(15 * consecutive_retrials, 120) recovery_retrials += 1 sleep(delay) @@ -2273,21 +2273,17 @@ class Autosubmit: except Exception as e: pass # Wait for all remaining threads of I/O, close remaining connections - timeout = 0 - active_threads = True - all_threads = threading.enumerate() - while active_threads and timeout <= 180: - active_threads = False - for thread in all_threads: - if "JOB_" in thread.name: - if thread.is_alive(): - active_threads = True - Log.info("{0} is still retrieving outputs, time remaining is {1} seconds.".format( - thread.name, 180 - timeout)) - break - if active_threads: - sleep(10) - timeout += 10 + timeout = 180 + Log.info("Waiting for all logs to be updated") + while len(job_list.get_completed_without_logs()) > 0 and timeout > 0: + for job in job_list.get_completed_without_logs(): + job_list.update_log_status(job) + sleep(1) + timeout = timeout - 1 + if timeout % 10 == 0: + Log.info(f"Timeout: {timeout}") + + for platform in platforms_to_test: platform.closeConnection() if len(job_list.get_failed()) > 0: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index bed0521b3..9cc20c38e 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -242,6 +242,8 @@ class Job(object): self.delete_when_edgeless = False # hetjobs self.het = None + self.updated_log = True + self.ready_start_date = None def _init_runtime_parameters(self): # hetjobs @@ -255,6 +257,7 @@ class Job(object): self._processors = '1' self._memory = '' self._memory_per_task = '' + self.log_retrieved = False @property @autosubmit_parameter(name='tasktype') @@ -1003,220 +1006,138 @@ class Job(object): retrials_list.insert(0, retrial_dates) return retrials_list - def retrieve_logfiles_unthreaded(self, copy_remote_logs, local_logs): - remote_logs = (self.script_name + ".out."+str(self.fail_count), self.script_name + ".err."+str(self.fail_count)) - out_exist = False - err_exist = False - retries = 3 - sleeptime = 0 - i = 0 - no_continue = False - try: - while (not out_exist and not err_exist) and i < retries: - try: - out_exist = self._platform.check_file_exists( - remote_logs[0], True) - except IOError as e: - out_exist = False - try: - err_exist = self._platform.check_file_exists( - remote_logs[1], True) - except IOError as e: - err_exists = False - if not out_exist or not err_exist: - sleeptime = sleeptime + 5 - i = i + 1 - sleep(sleeptime) - if i >= retries: - if not out_exist or not err_exist: - Log.printlog("Failed to retrieve log files {1} and {2} e=6001".format( - retries, remote_logs[0], remote_logs[1])) - return - if str(copy_remote_logs).lower() == "true": - # unifying names for log files - if remote_logs != local_logs: - self.synchronize_logs( - self._platform, remote_logs, local_logs) - remote_logs = copy.deepcopy(local_logs) - self._platform.get_logs_files(self.expid, remote_logs) - # Update the logs with Autosubmit Job ID Brand - try: - for local_log in local_logs: - self._platform.write_jobid(self.id, os.path.join( - self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - str(e), self.name)) - except AutosubmitError as e: - Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - str(e), self.name), 6001) - except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error - Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - str(e), self.name), 6001) - return - - @threaded - def retrieve_logfiles(self, copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = 0,job_id="",auth_password=None, local_auth_password = None): - as_conf = AutosubmitConfig(expid, BasicConfig, YAMLParserFactory()) - as_conf.reload(force_load=True) - max_retrials = self.retrials - max_logs = 0 - last_log = 0 - stat_file = self.script_name[:-4] + "_STAT_" - lang = locale.getlocale()[1] - if lang is None: - lang = locale.getdefaultlocale()[1] - if lang is None: - lang = 'UTF-8' - retries = 2 - count = 0 - success = False - error_message = "" - platform = None - while (count < retries) and not success: - try: - as_conf = AutosubmitConfig(expid, BasicConfig, YAMLParserFactory()) - as_conf.reload(force_load=True) - max_retrials = self.retrials - max_logs = int(max_retrials) - fail_count - last_log = int(max_retrials) - fail_count - submitter = self._get_submitter(as_conf) - submitter.load_platforms(as_conf, auth_password=auth_password, local_auth_password=local_auth_password) - platform = submitter.platforms[platform_name] - platform.test_connection() - success = True - except BaseException as e: - error_message = str(e) - sleep(5) - pass - count = count + 1 - if not success: - raise AutosubmitError( - "Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format( - error_message), 6006) + def get_new_remotelog(self, platform, max_logs, last_log, stat_file): + """ + Checks if stat file exists on remote host + if it exists, remote_log variable is updated + """ try: - if self.wrapper_type is not None and self.wrapper_type == "vertical": - found = False - retrials = 0 - while retrials < 3 and not found: - if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): - found = True - retrials = retrials + 1 + if self.wrapper_type and self.wrapper_type == "vertical": + platform.check_stat_file_by_retrials(stat_file + str(max_logs), retries=1) for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i)): + if platform.check_stat_file_by_retrials(stat_file + str(i), retries=1, first=False): last_log = i else: break - remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) - + remote_logs = (f"{self.script_name}.out.{last_log}", f"{self.script_name}.err.{last_log}") else: - remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) + remote_logs = (f"{self.script_name}.out.{self._fail_count}", f"{self.script_name}.err.{self._fail_count}") except BaseException as e: - Log.printlog( - "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) - out_exist = False - err_exist = False - retries = 3 - i = 0 + remote_logs = "" + Log.printlog(f"Trace {e} \n Failed to retrieve stat file for job {self.name}", 6000) + return remote_logs + + def check_remote_log_exists(self, platform): try: - while (not out_exist and not err_exist) and i < retries: - try: - out_exist = platform.check_file_exists( - remote_logs[0], False, sleeptime=0, max_retries=1) - except IOError as e: - out_exist = False - try: - err_exist = platform.check_file_exists( - remote_logs[1], False, sleeptime=0, max_retries=1) - except IOError as e: - err_exist = False - if not out_exist or not err_exist: - i = i + 1 - sleep(5) + out_exist = platform.check_file_exists(self.remote_logs[0], False, sleeptime=0, max_retries=1) + except IOError: + out_exist = False + try: + err_exist = platform.check_file_exists(self.remote_logs[1], False, sleeptime=0, max_retries=1) + except IOError: + err_exist = False + if out_exist or err_exist: + return True + else: + return False + def retrieve_vertical_wrapper_logs(self, last_log, max_logs, platform, stat_file, max_retrials, fail_count): + """ + Retrieves log files from remote host meant to be used inside a daemon thread. + :param last_log: + :param max_logs: + :param platform: + :param stat_file: + :param max_retrials: + :param fail_count: + :return: + """ + lang = locale.getlocale()[1] + if not lang: + lang = locale.getdefaultlocale()[1] + if not lang: + lang = 'UTF-8' + log_start = last_log + exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, self.name[:4]) + tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) + time_stamp = "1970" + at_least_one_recovered = False + while log_start <= max_logs: + try: + if platform.get_stat_file_by_retrials(stat_file + str(max_logs)): + with open(os.path.join(tmp_path, stat_file + str(max_logs)), 'r+') as f: + total_stats = [f.readline()[:-1], f.readline()[:-1], f.readline()[:-1]] try: - platform.restore_connection() - except BaseException as e: - Log.printlog("{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format( - str(e), self.name), 6001) - if i >= retries: - if not out_exist or not err_exist: - Log.printlog("Failed to retrieve log files {1} and {2} e=6001".format( - retries, remote_logs[0], remote_logs[1])) - return - if copy_remote_logs: - l_log = copy.deepcopy(local_logs) - # unifying names for log files - if remote_logs != local_logs: - if self.wrapper_type == "vertical": # internal_Retrial mechanism - log_start = last_log - exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) - tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) - time_stamp = "1970" - total_stats = ["", "","FAILED"] - while log_start <= max_logs: - try: - if platform.get_stat_file_by_retrials(stat_file+str(max_logs)): - with open(os.path.join(tmp_path,stat_file+str(max_logs)), 'r+') as f: - total_stats = [f.readline()[:-1],f.readline()[:-1],f.readline()[:-1]] - try: - total_stats[0] = float(total_stats[0]) - total_stats[1] = float(total_stats[1]) - except Exception as e: - total_stats[0] = int(str(total_stats[0]).split('.')[0]) - total_stats[1] = int(str(total_stats[1]).split('.')[0]) - if max_logs != ( int(max_retrials) - fail_count ): - time_stamp = date2str(datetime.datetime.fromtimestamp(total_stats[0]), 'S') - else: - with open(os.path.join(self._tmp_path, self.name + '_TOTAL_STATS_TMP'), 'rb+') as f2: - for line in f2.readlines(): - if len(line) > 0: - line = line.decode(lang) - time_stamp = line.split(" ")[0] - - self.write_total_stat_by_retries(total_stats,max_logs == ( int(max_retrials) - fail_count )) - platform.remove_stat_file_by_retrials(stat_file+str(max_logs)) - l_log = (self.script_name[:-4] + "." + time_stamp + ".out",self.script_name[:-4] + "." + time_stamp + ".err") - r_log = ( remote_logs[0][:-1]+str(max_logs) , remote_logs[1][:-1]+str(max_logs) ) - self.synchronize_logs(platform, r_log, l_log,last = False) - platform.get_logs_files(self.expid, l_log) - try: - for local_log in l_log: - platform.write_jobid(job_id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - pass - max_logs = max_logs - 1 - else: - max_logs = -1 # exit, no more logs - except BaseException as e: - max_logs = -1 # exit - local_logs = copy.deepcopy(l_log) - remote_logs = copy.deepcopy(local_logs) - if self.wrapper_type != "vertical": - self.synchronize_logs(platform, remote_logs, local_logs) - remote_logs = copy.deepcopy(local_logs) + total_stats[0] = float(total_stats[0]) + total_stats[1] = float(total_stats[1]) + except Exception as e: + total_stats[0] = int(str(total_stats[0]).split('.')[0]) + total_stats[1] = int(str(total_stats[1]).split('.')[0]) + if max_logs != (int(max_retrials) - fail_count): + time_stamp = date2str(datetime.datetime.fromtimestamp(total_stats[0]), 'S') + else: + with open(os.path.join(self._tmp_path, self.name + '_TOTAL_STATS_TMP'), 'rb+') as f2: + for line in f2.readlines(): + if len(line) > 0: + line = line.decode(lang) + time_stamp = line.split(" ")[0] + + self.write_total_stat_by_retries(total_stats, max_logs == (int(max_retrials) - fail_count)) + platform.remove_stat_file_by_retrials(stat_file + str(max_logs)) + l_log = (self.script_name[:-4] + "." + time_stamp + ".out", + self.script_name[:-4] + "." + time_stamp + ".err") + r_log = (self.remote_logs[0][:-1] + str(max_logs), self.remote_logs[1][:-1] + str(max_logs)) + self.synchronize_logs(platform, r_log, l_log, last=False) + platform.get_logs_files(self.expid, l_log) + with suppress(BaseException): + for local_log in l_log: + platform.write_jobid(self.id,os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) + max_logs = max_logs - 1 + at_least_one_recovered = True + else: + max_logs = -1 # exit, no more logs + except Exception: + return False + return at_least_one_recovered + + def retrieve_logfiles(self, platform): + """ + Retrieves log files from remote host meant to be used inside a process. + :param platform: platform that is calling the function, already connected. + :return: + """ + log_retrieved = False + max_retrials = self.retrials + max_logs = int(max_retrials) - self._fail_count + last_log = int(max_retrials) - self._fail_count + stat_file = self.script_name[:-4] + "_STAT_" + self.remote_logs = self.get_new_remotelog(platform, max_logs, last_log, stat_file) + if not self.remote_logs: + self.log_retrieved = False + else: + if self.check_remote_log_exists(platform): + # retrieve logs and stat files + if self.wrapper_type is not None and self.wrapper_type == "vertical": + if self.retrieve_vertical_wrapper_logs(last_log, max_logs, platform, stat_file, max_retrials, self._fail_count): + log_retrieved = True + else: + try: + self.synchronize_logs(platform, self.remote_logs, self.local_logs) + remote_logs = copy.deepcopy(self.local_logs) platform.get_logs_files(self.expid, remote_logs) - # Update the logs with Autosubmit Job ID Brand - try: - for local_log in local_logs: - platform.write_jobid(job_id, os.path.join( - self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - str(e), self.name)) - with suppress(Exception): - platform.closeConnection() - except AutosubmitError as e: - Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - e.message, self.name), 6001) - with suppress(Exception): - platform.closeConnection() - except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error - Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - e.message, self.name), 6001) - with suppress(Exception): - platform.closeConnection() - return + log_retrieved = True + except: + log_retrieved = False + # Update the logs with Autosubmit Job ID Brand + try: + for local_log in self.local_logs: + platform.write_jobid(self.id, os.path.join( + self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except BaseException as e: + Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) + self.log_retrieved = log_retrieved + if not self.log_retrieved: + Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6001) def parse_time(self,wallclock): regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') @@ -1272,6 +1193,7 @@ class Job(object): :param failed_file: boolean, if True, checks if the job failed :return: """ + self.log_avaliable = False copy_remote_logs = as_conf.get_copy_remote_logs() previous_status = self.status self.prev_status = previous_status @@ -1327,21 +1249,20 @@ class Job(object): self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: - # New thread, check if file exist - expid = copy.deepcopy(self.expid) - platform_name = copy.deepcopy(self.platform_name) - local_logs = copy.deepcopy(self.local_logs) - remote_logs = copy.deepcopy(self.remote_logs) - if as_conf.get_disable_recovery_threads(self.platform.name) == "true": - self.retrieve_logfiles_unthreaded(copy_remote_logs, local_logs) - else: - self.retrieve_logfiles(copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = copy.copy(self.fail_count),job_id=self.id,auth_password=self._platform.pw, local_auth_password=self._platform.pw) + import time + start = time.time() + self.platform.add_job_to_log_recover(self) + Log.debug(f"Time to retrieve logs for job {self.name} {time.time() - start}") if self.wrapper_type == "vertical": max_logs = int(self.retrials) for i in range(0,max_logs): self.inc_fail_count() else: self.write_end_time(self.status == Status.COMPLETED) + + if self.status in [Status.COMPLETED, Status.FAILED]: + self.updated_log = False + return self.status @staticmethod @@ -2593,6 +2514,7 @@ class WrapperJob(Job): if job.name in completed_files: completed_jobs.append(job) job.new_status = Status.COMPLETED + job.updated_log = False job.update_status(self.as_config) for job in completed_jobs: self.running_jobs_start.pop(job, None) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index f23ca4e73..be418a202 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -25,6 +25,7 @@ from contextlib import suppress from shutil import move from threading import Thread from typing import List, Dict +from pathlib import Path import math import networkx as nx @@ -32,6 +33,12 @@ from bscearth.utils.date import date2str, parse_date from networkx import DiGraph from time import localtime, strftime, mktime +import math +import networkx as nx +from bscearth.utils.date import date2str, parse_date +from networkx import DiGraph +from time import localtime, strftime, mktime, time + import autosubmit.database.db_structure as DbStructure from autosubmit.helpers.data_transfer import JobRow from autosubmit.job.job import Job @@ -46,8 +53,6 @@ from autosubmitconfigparser.config.configcommon import AutosubmitConfig from log.log import AutosubmitCritical, AutosubmitError, Log -# Log.get_logger("Log.Autosubmit") - def threaded(fn): def wrapper(*args, **kwargs): @@ -97,6 +102,7 @@ class JobList(object): self.graph = DiGraph() self.depends_on_previous_chunk = dict() self.depends_on_previous_split = dict() + self.path_to_logs = Path(BasicConfig.LOCAL_ROOT_DIR, self.expid, BasicConfig.LOCAL_TMP_DIR,f'LOG_{self.expid}') @property def expid(self): @@ -1671,6 +1677,21 @@ class JobList(object): else: return completed_jobs + def get_completed_without_logs(self, platform=None): + """ + Returns a list of completed jobs wihtout updated logs + + :param platform: job platform + :type platform: HPCPlatform + :return: completed jobs + :rtype: list + """ + + completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + job.status == Status.COMPLETED and job.updated_log is False ] + + return completed_jobs + def get_uncompleted(self, platform=None, wrapper=False): """ Returns a list of completed jobs @@ -2499,6 +2520,30 @@ class JobList(object): return jobs_to_check + def update_log_status(self, job): + """ + Updates the log err and log out. + """ + if not hasattr(job, + "updated_log") or not job.updated_log: # hasattr for backward compatibility (job.updated_logs is only for newer jobs, as the loaded ones may not have this set yet) + # order path_to_logs by name and get the two last element + err = "" + out = "" + log_file = None + for log_file in sorted(self.path_to_logs.glob(f"{job.name}.*"))[-3:]: # cmd, err, out + if "err" in log_file.suffix: + err = log_file.name + elif "out" in log_file.suffix: + out = log_file.name + job.local_logs = (out, err) + job.remote_logs = (out, err) + if log_file: + if not hasattr(job, "ready_start_date") or not job.ready_start_date or log_file.name.split(".")[ + -2] >= job.ready_start_date: # hasattr for backward compatibility + job.updated_log = True + if not job.updated_log: + job.platform.add_job_to_log_recover(job) + def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): # type: (AutosubmitConfig, bool, bool, object, bool) -> bool """ @@ -2576,16 +2621,22 @@ class JobList(object): job.packed = False save = True # Check checkpoint jobs, the status can be Any - for job in self.check_special_status(): + for job in ( job for job in self.check_special_status() ): job.status = Status.READY + # Run start time in format (YYYYMMDDHH:MM:SS) from current time + job.ready_start_date = strftime("%Y%m%d%H%M%S") job.id = None job.packed = False job.wrapper_type = None save = True Log.debug(f"Special condition fullfilled for job {job.name}") # if waiting jobs has all parents completed change its State to READY - for job in self.get_completed(): + for job in ( job for job in self.get_completed() ): job.packed = False + # Log name has this format: + # a02o_20000101_fc0_2_SIM.20240212115021.err + # $jobname.$(YYYYMMDDHHMMSS).err or .out + self.update_log_status(job) if job.synchronize is not None and len(str(job.synchronize)) > 0: tmp = [parent for parent in job.parents if parent.status == Status.COMPLETED] if len(tmp) != len(job.parents): @@ -2673,6 +2724,9 @@ class JobList(object): if len(tmp2) == len(job.parents) and len(tmp3) != len(job.parents): job.status = Status.READY job.packed = False + # Run start time in format (YYYYMMDDHH:MM:SS) from current time + job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.packed = False job.hold = False save = True Log.debug( diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index e1b9bb3b2..de1f0282d 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -453,6 +453,8 @@ class Monitor: log_out = "" log_err = "" if job.status in [Status.FAILED, Status.COMPLETED]: + if not job.local_logs[0]: + job.local_logs = ("","") log_out = path + "/" + job.local_logs[0] log_err = path + "/" + job.local_logs[1] diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index fb880e694..c41986ee9 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -170,6 +170,10 @@ class EcPlatform(ParamikoPlatform): self.connected = False except: self.connected = False + if not self.log_retrieval_process_active: + self.log_retrieval_process_active = True + self.recover_job_logs() + def restore_connection(self): """ In this case, it does nothing because connection is established for each command @@ -187,6 +191,7 @@ class EcPlatform(ParamikoPlatform): self.connected = False except: self.connected = False + def test_connection(self): """ In this case, it does nothing because connection is established for each command diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 7f41060eb..a01088dac 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -28,7 +28,7 @@ from autosubmit.platforms.headers.local_header import LocalHeader from autosubmitconfigparser.config.basicconfig import BasicConfig from time import sleep from log.log import Log, AutosubmitError, AutosubmitCritical - +import threading class LocalPlatform(ParamikoPlatform): """ Class to manage jobs to localhost @@ -113,15 +113,24 @@ class LocalPlatform(ParamikoPlatform): def connect(self, reconnect=False): self.connected = True + if not self.log_retrieval_process_active: + self.log_retrieval_process_active = True + self.recover_job_logs() + + def test_connection(self): - self.connected = True + if not self.connected: + self.connect() + + def restore_connection(self): self.connected = True def check_Alljobs(self, job_list, as_conf, retries=5): for job,prev_job_status in job_list: self.check_job(job) - def send_command(self, command,ignore_log=False, x11 = False): + + def send_command(self, command, ignore_log=False, x11 = False): lang = locale.getlocale()[1] if lang is None: lang = locale.getdefaultlocale()[1] @@ -175,7 +184,7 @@ class LocalPlatform(ParamikoPlatform): return True # Moves .err .out - def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3): + def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): """ Moves a file on the platform :param src: source name @@ -187,12 +196,17 @@ class LocalPlatform(ParamikoPlatform): file_exist = False remote_path = os.path.join(self.get_files_path(), src) retries = 0 + # Not first is meant for vertical_wrappers. There you have to download STAT_{MAX_LOGS} then STAT_{MAX_LOGS-1} and so on + if not first: + max_retries = 1 + sleeptime = 0 while not file_exist and retries < max_retries: try: file_exist = os.path.isfile(os.path.join(self.get_files_path(),src)) if not file_exist: # File doesn't exist, retry in sleep-time - Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, - max_retries - retries, remote_path) + if first: + Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, + max_retries - retries, remote_path) if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 diff --git a/autosubmit/platforms/lsfplatform.py b/autosubmit/platforms/lsfplatform.py index a03ec5dee..ed65c772d 100644 --- a/autosubmit/platforms/lsfplatform.py +++ b/autosubmit/platforms/lsfplatform.py @@ -138,27 +138,4 @@ class LsfPlatform(ParamikoPlatform): ############################################################################### """.format(filename, queue, project, wallclock, num_procs, dependency, '\n'.ljust(13).join(str(s) for s in directives)) - # def connect(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True - # def restore_connection(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True - # def test_connection(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True + diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 8bb6ef2cc..2e03b218b 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1,5 +1,6 @@ +import copy + import locale -from binascii import hexlify from contextlib import suppress from time import sleep import sys @@ -7,7 +8,6 @@ import socket import os import paramiko import datetime -import time import select import re from datetime import timedelta @@ -15,17 +15,17 @@ import random from autosubmit.job.job_common import Status from autosubmit.job.job_common import Type from autosubmit.platforms.platform import Platform -from bscearth.utils.date import date2str from log.log import AutosubmitError, AutosubmitCritical, Log from paramiko.ssh_exception import (SSHException) import Xlib.support.connect as xlib_connect from threading import Thread +import threading import getpass from paramiko.agent import Agent def threaded(fn): def wrapper(*args, **kwargs): - thread = Thread(target=fn, args=args, kwargs=kwargs) + thread = Thread(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_X11") thread.start() return thread @@ -134,6 +134,7 @@ class ParamikoPlatform(Platform): except: message = "Timeout connection" return message + except EOFError as e: self.connected = False raise AutosubmitError("[{0}] not alive. Host: {1}".format( @@ -162,7 +163,7 @@ class ParamikoPlatform(Platform): "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: - self.connect(True) + self.connect(True) except Exception as e: pass retry += 1 @@ -266,7 +267,7 @@ class ParamikoPlatform(Platform): except Exception as e: self._ssh.connect(self._host_config['hostname'], port, username=self.user, key_filename=self._host_config_id, sock=self._proxy, timeout=60, - banner_timeout=60,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) + banner_timeout=60, disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) else: try: self._ssh.connect(self._host_config['hostname'], port, username=self.user, @@ -300,7 +301,10 @@ class ParamikoPlatform(Platform): self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) self._ftpChannel.get_channel().settimeout(120) self.connected = True - except SSHException as e: + if not self.log_retrieval_process_active: + self.log_retrieval_process_active = True + self.recover_job_logs() + except SSHException: raise except IOError as e: if "refused" in str(e.strerror).lower(): @@ -644,6 +648,9 @@ class ParamikoPlatform(Platform): job_status = Status.UNKNOWN Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) + + if job_status in [Status.FAILED, Status.COMPLETED]: + job.updated_log = False if submit_hold_check: return job_status else: @@ -775,7 +782,6 @@ class ParamikoPlatform(Platform): elif retries == 0: job_status = Status.COMPLETED job.update_status(as_conf) - else: job_status = Status.UNKNOWN Log.error( @@ -887,6 +893,7 @@ class ParamikoPlatform(Platform): sys.stdout.write(session.recv(4096)) while session.recv_stderr_ready(): sys.stderr.write(session.recv_stderr(4096)) + @threaded def x11_status_checker(self, session, session_fileno): self.transport.accept() @@ -1325,16 +1332,16 @@ class ParamikoPlatform(Platform): if self.transport: self.transport.close() self.transport.stop_thread() - with suppress(Exception): - del self._ssh._agent # May not be in all runs - with suppress(Exception): - del self._ssh._transport - with suppress(Exception): - del self._ftpChannel - with suppress(Exception): - del self.transport - with suppress(Exception): - del self._ssh + # with suppress(Exception): + # del self._ssh._agent # May not be in all runs + # with suppress(Exception): + # del self._ssh._transport + # with suppress(Exception): + # del self._ftpChannel + # with suppress(Exception): + # del self.transport + # with suppress(Exception): + # del self._ssh def check_tmp_exists(self): try: @@ -1366,8 +1373,6 @@ class ParamikoPlatform(Platform): """ Creates log dir on remote host """ - - try: if self.send_command(self.get_mkdir_cmd()): Log.debug('{0} has been created on {1} .', diff --git a/autosubmit/platforms/pbsplatform.py b/autosubmit/platforms/pbsplatform.py index 132b8715c..1a1ef89b5 100644 --- a/autosubmit/platforms/pbsplatform.py +++ b/autosubmit/platforms/pbsplatform.py @@ -129,27 +129,4 @@ class PBSPlatform(ParamikoPlatform): return self._checkjob_cmd + str(job_id) else: return "ssh " + self.host + " " + self.get_qstatjob(job_id) - # def connect(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True - # def restore_connection(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True - # def test_connection(self): - # """ - # In this case, it does nothing because connection is established for each command - # - # :return: True - # :rtype: bool - # """ - # self.connected = True + diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 9014cd6a5..9e182c5c0 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -463,9 +463,13 @@ class PJMPlatform(ParamikoPlatform): def allocated_nodes(): return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(node_id))""" - def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3): + def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): file_exist = False retries = 0 + # Not first is meant for vertical_wrappers. There you have to download STAT_{MAX_LOGS} then STAT_{MAX_LOGS-1} and so on + if not first: + max_retries = 1 + sleeptime = 0 while not file_exist and retries < max_retries: try: # This return IOError if path doesn't exist @@ -473,8 +477,9 @@ class PJMPlatform(ParamikoPlatform): self.get_files_path(), filename)) file_exist = True except IOError as e: # File doesn't exist, retry in sleeptime - Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, - max_retries - retries, os.path.join(self.get_files_path(), filename)) + if first: + Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, + max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 05340a526..52012ad1c 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -1,6 +1,11 @@ +import copy + +import queue + +import time + import locale import os -from pathlib import Path import traceback from autosubmit.job.job_common import Status @@ -8,7 +13,16 @@ from typing import List, Union from autosubmit.helpers.parameters import autosubmit_parameter from log.log import AutosubmitCritical, AutosubmitError, Log -import getpass +from multiprocessing import Process, Queue + + +def processed(fn): + def wrapper(*args, **kwargs): + process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") + process.start() + return process + + return wrapper class Platform(object): """ Class to manage the connections to the different platforms. @@ -78,6 +92,8 @@ class Platform(object): self.pw = auth_password else: self.pw = None + self.recovery_queue = Queue() + self.log_retrieval_process_active = False @property @@ -272,6 +288,7 @@ class Platform(object): for innerJob in package._jobs: # Setting status to COMPLETED, so it does not get stuck in the loop that calls this function innerJob.status = Status.COMPLETED + innerJob.updated_log = False # If called from RUN or inspect command if not only_wrappers: @@ -624,7 +641,7 @@ class Platform(object): if self.check_file_exists(filename): self.delete_file(filename) - def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3): + def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): return True def get_stat_file(self, job_name, retries=0): @@ -650,19 +667,19 @@ class Platform(object): Log.debug('{0}_STAT file not found', job_name) return False - def check_stat_file_by_retrials(self, job_name, retries=0): + def check_stat_file_by_retrials(self, job_name, retries=3, first=True): """ check *STAT* file :param retries: number of intents to get the completed files - :type retries: int + :type first: int :param job_name: name of job to check :type job_name: str :return: True if successful, False otherwise :rtype: bool """ filename = job_name - if self.check_file_exists(filename): + if self.check_file_exists(filename,first=first,max_retries = retries): return True else: return False @@ -821,3 +838,34 @@ class Platform(object): """ raise NotImplementedError + def add_job_to_log_recover(self, job): + self.recovery_queue.put((job,job.children)) + + def connect(self, reconnect=False): + raise NotImplementedError + + def restore_connection(self): + raise NotImplementedError + + @processed + def recover_job_logs(self): + job_names_processed = set() + self.connected = False + self.restore_connection() + while True: + try: + job,children = self.recovery_queue.get() + if job.name in job_names_processed: + continue + job.children = children + job.platform = self + job.retrieve_logfiles(self) + job_names_processed.add(job.name) + except queue.Empty: + pass + except Exception as e: + self.restore_connection() + time.sleep(1) + + + diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 58671cd98..1816379e2 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -61,6 +61,7 @@ class SgePlatform(ParamikoPlatform): 'ds', 'dS', 'dT', 'dRs', 'dRS', 'dRT'] self._pathdir = "\$HOME/LOG_" + self.expid self.update_cmds() + self.log_retrieval_process_active = False def submit_Script(self, hold=False): pass @@ -114,7 +115,7 @@ class SgePlatform(ParamikoPlatform): def get_checkjob_cmd(self, job_id): return self.get_qstatjob(job_id) - def connect(self,reconnect=False): + def connect(self, reconnect=False): """ In this case, it does nothing because connection is established for each command @@ -122,6 +123,9 @@ class SgePlatform(ParamikoPlatform): :rtype: bool """ self.connected = True + if not self.log_retrieval_process_active: + self.log_retrieval_process_active = True + self.recover_job_logs() def restore_connection(self): """ In this case, it does nothing because connection is established for each command @@ -130,6 +134,7 @@ class SgePlatform(ParamikoPlatform): :rtype: bool """ self.connected = True + def test_connection(self): """ In this case, it does nothing because connection is established for each command @@ -138,3 +143,5 @@ class SgePlatform(ParamikoPlatform): :rtype: bool """ self.connected = True + self.connected(True) + diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index e741239db..8b3daae85 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -629,9 +629,13 @@ class SlurmPlatform(ParamikoPlatform): def allocated_nodes(): return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(node_id))""" - def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3): + def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): file_exist = False retries = 0 + # Not first is meant for vertical_wrappers. There you have to download STAT_{MAX_LOGS} then STAT_{MAX_LOGS-1} and so on + if not first: + max_retries = 1 + sleeptime = 0 while not file_exist and retries < max_retries: try: # This return IOError if path doesn't exist @@ -639,8 +643,9 @@ class SlurmPlatform(ParamikoPlatform): self.get_files_path(), filename)) file_exist = True except IOError as e: # File doesn't exist, retry in sleeptime - Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, - max_retries - retries, os.path.join(self.get_files_path(), filename)) + if first: + Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, + max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 -- GitLab From 8dee23d3e4406d0be33b6f304b53adb83d12fbfa Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 11:00:15 +0100 Subject: [PATCH 02/20] fix feedback typo and uncommented --- autosubmit/job/job_list.py | 2 +- autosubmit/platforms/paramiko_platform.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index be418a202..6f3781dc8 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -1679,7 +1679,7 @@ class JobList(object): def get_completed_without_logs(self, platform=None): """ - Returns a list of completed jobs wihtout updated logs + Returns a list of completed jobs without updated logs :param platform: job platform :type platform: HPCPlatform diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 2e03b218b..4083378c8 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1332,16 +1332,6 @@ class ParamikoPlatform(Platform): if self.transport: self.transport.close() self.transport.stop_thread() - # with suppress(Exception): - # del self._ssh._agent # May not be in all runs - # with suppress(Exception): - # del self._ssh._transport - # with suppress(Exception): - # del self._ftpChannel - # with suppress(Exception): - # del self.transport - # with suppress(Exception): - # del self._ssh def check_tmp_exists(self): try: -- GitLab From b0566ffc0c0f79dbcc8d4b0634fbf0a223ec5048 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 13 Mar 2024 13:15:20 +0100 Subject: [PATCH 03/20] revised some of the feedback --- autosubmit/autosubmit.py | 7 +++---- autosubmit/job/job.py | 8 +++----- autosubmit/job/job_list.py | 4 ++-- autosubmit/monitor/monitor.py | 2 +- autosubmit/platforms/sgeplatform.py | 1 - 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index f60bd131a..664ba433e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2273,17 +2273,16 @@ class Autosubmit: except Exception as e: pass # Wait for all remaining threads of I/O, close remaining connections - timeout = 180 Log.info("Waiting for all logs to be updated") - while len(job_list.get_completed_without_logs()) > 0 and timeout > 0: + for timeout in range(180, 0, -1): + if len(job_list.get_completed_without_logs()) == 0: + break for job in job_list.get_completed_without_logs(): job_list.update_log_status(job) sleep(1) - timeout = timeout - 1 if timeout % 10 == 0: Log.info(f"Timeout: {timeout}") - for platform in platforms_to_test: platform.closeConnection() if len(job_list.get_failed()) > 0: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9cc20c38e..7a9e89436 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1012,7 +1012,7 @@ class Job(object): if it exists, remote_log variable is updated """ try: - if self.wrapper_type and self.wrapper_type == "vertical": + if self.wrapper_type == "vertical": platform.check_stat_file_by_retrials(stat_file + str(max_logs), retries=1) for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i), retries=1, first=False): @@ -1037,10 +1037,8 @@ class Job(object): err_exist = platform.check_file_exists(self.remote_logs[1], False, sleeptime=0, max_retries=1) except IOError: err_exist = False - if out_exist or err_exist: - return True - else: - return False + return out_exist or err_exist + def retrieve_vertical_wrapper_logs(self, last_log, max_logs, platform, stat_file, max_retrials, fail_count): """ Retrieves log files from remote host meant to be used inside a daemon thread. diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 6f3781dc8..39e9bde62 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2621,7 +2621,7 @@ class JobList(object): job.packed = False save = True # Check checkpoint jobs, the status can be Any - for job in ( job for job in self.check_special_status() ): + for job in self.check_special_status(): job.status = Status.READY # Run start time in format (YYYYMMDDHH:MM:SS) from current time job.ready_start_date = strftime("%Y%m%d%H%M%S") @@ -2631,7 +2631,7 @@ class JobList(object): save = True Log.debug(f"Special condition fullfilled for job {job.name}") # if waiting jobs has all parents completed change its State to READY - for job in ( job for job in self.get_completed() ): + for job in self.get_completed(): job.packed = False # Log name has this format: # a02o_20000101_fc0_2_SIM.20240212115021.err diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index de1f0282d..4b0afea1f 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -453,7 +453,7 @@ class Monitor: log_out = "" log_err = "" if job.status in [Status.FAILED, Status.COMPLETED]: - if not job.local_logs[0]: + if type(job.local_logs) is not tuple: job.local_logs = ("","") log_out = path + "/" + job.local_logs[0] log_err = path + "/" + job.local_logs[1] diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 1816379e2..f5ca3abbe 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -61,7 +61,6 @@ class SgePlatform(ParamikoPlatform): 'ds', 'dS', 'dT', 'dRs', 'dRS', 'dRT'] self._pathdir = "\$HOME/LOG_" + self.expid self.update_cmds() - self.log_retrieval_process_active = False def submit_Script(self, hold=False): pass -- GitLab From a96a9c1a55f08b2d1fc9596412db36e36d3e07b4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 14 Mar 2024 09:32:14 +0100 Subject: [PATCH 04/20] revised some of the feedback --- autosubmit/job/job.py | 23 +++++++++++------------ autosubmit/platforms/platform.py | 2 ++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 7a9e89436..11d047d3c 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -244,6 +244,7 @@ class Job(object): self.het = None self.updated_log = True self.ready_start_date = None + self.log_retrieved = False def _init_runtime_parameters(self): # hetjobs @@ -1032,10 +1033,12 @@ class Job(object): try: out_exist = platform.check_file_exists(self.remote_logs[0], False, sleeptime=0, max_retries=1) except IOError: + Log.debug(f'Output log {self.remote_logs[0]} still does not exist') out_exist = False try: err_exist = platform.check_file_exists(self.remote_logs[1], False, sleeptime=0, max_retries=1) except IOError: + Log.debug(f'Error log {self.remote_logs[1]} still does not exist') err_exist = False return out_exist or err_exist @@ -1050,17 +1053,12 @@ class Job(object): :param fail_count: :return: """ - lang = locale.getlocale()[1] - if not lang: - lang = locale.getdefaultlocale()[1] - if not lang: - lang = 'UTF-8' - log_start = last_log + lang = locale.getlocale()[1] or locale.getdefaultlocale()[1] or 'UTF-8' exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, self.name[:4]) tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) time_stamp = "1970" at_least_one_recovered = False - while log_start <= max_logs: + for _ in range(max_logs, last_log - 1, -1): try: if platform.get_stat_file_by_retrials(stat_file + str(max_logs)): with open(os.path.join(tmp_path, stat_file + str(max_logs)), 'r+') as f: @@ -1069,6 +1067,7 @@ class Job(object): total_stats[0] = float(total_stats[0]) total_stats[1] = float(total_stats[1]) except Exception as e: + Log.debug(f"Trace {e} \n Failed to convert total stats to float, falling back to int") total_stats[0] = int(str(total_stats[0]).split('.')[0]) total_stats[1] = int(str(total_stats[1]).split('.')[0]) if max_logs != (int(max_retrials) - fail_count): @@ -1087,14 +1086,14 @@ class Job(object): r_log = (self.remote_logs[0][:-1] + str(max_logs), self.remote_logs[1][:-1] + str(max_logs)) self.synchronize_logs(platform, r_log, l_log, last=False) platform.get_logs_files(self.expid, l_log) - with suppress(BaseException): + try: for local_log in l_log: platform.write_jobid(self.id,os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) - max_logs = max_logs - 1 + except BaseException as e: + Log.debug(f"Trace {e} \n Failed to write the {self.id} inside {l_log}") at_least_one_recovered = True - else: - max_logs = -1 # exit, no more logs - except Exception: + except Exception as e: + Log.debug(f"Trace {e} \n Failed to retrieve log files for job {self.name}") return False return at_least_one_recovered diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 52012ad1c..5b095ba0e 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -672,6 +672,8 @@ class Platform(object): check *STAT* file :param retries: number of intents to get the completed files + :type retries: int + :param first: First time calling the function :type first: int :param job_name: name of job to check :type job_name: str -- GitLab From f195471c2d8e8e7c477e3942c8781c8a667a0fde Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 14 Mar 2024 09:51:06 +0100 Subject: [PATCH 05/20] revised some of the feedback --- autosubmit/job/job.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 11d047d3c..9c47c2968 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1191,7 +1191,6 @@ class Job(object): :return: """ self.log_avaliable = False - copy_remote_logs = as_conf.get_copy_remote_logs() previous_status = self.status self.prev_status = previous_status new_status = self.new_status @@ -1246,10 +1245,7 @@ class Job(object): self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: - import time - start = time.time() self.platform.add_job_to_log_recover(self) - Log.debug(f"Time to retrieve logs for job {self.name} {time.time() - start}") if self.wrapper_type == "vertical": max_logs = int(self.retrials) for i in range(0,max_logs): -- GitLab From d13a6315dc7b7799437ee036abf2ff0e103b0566 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 14 Mar 2024 10:07:42 +0100 Subject: [PATCH 06/20] Added a fix to count the retrials --- autosubmit/platforms/platform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 5b095ba0e..bed8b0284 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -857,12 +857,12 @@ class Platform(object): while True: try: job,children = self.recovery_queue.get() - if job.name in job_names_processed: + if (f'{job.name}_{job.fail_count}') in job_names_processed: continue job.children = children job.platform = self job.retrieve_logfiles(self) - job_names_processed.add(job.name) + job_names_processed.add(f'{job.name}_{job.fail_count}') except queue.Empty: pass except Exception as e: -- GitLab From 012d14a28cd73ac2ad2cf9fdf9b41c28614a719b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 16 May 2023 12:21:02 +0200 Subject: [PATCH 07/20] logs rework logs Working some changes and trying to use process instead of thread Fixed a perfomance issue Fixed an issue with running: once adding more edges logs fix run Rework of Retrieval of logs parcially working Now remote works but local not Rework of Retrieval of logs parcially working log_recovery rework wip logs change Clean the code a bit working now needs some cleaning test fix bug fix bugs, reworked a bit fix bug updated test updated test fixed some bugs, added some docs more fixes test fix pipeline fix pipeline math readded datetime readded fix issue with delay retrial fix issue with -1 Fixes !https://earth.bsc.es/gitlab/es/autosubmit/-/issues/1209 fix grouping test test fix test fix some changes for Bruno comments moved if inside gen Workflow optimizations added ( mega squashed commit ) Fixes #1158 added zipp dependency ( rocrate in bscearth000) re-added additional files Database is locked error in historic db ( I think it is an issue in my computer as happened in master aswell) QOL when splits is introduced with "" ( testing francesc experiment ) Ran regression test, noticed issue with experiment a005 and fixed converse job to list changed == for in to dont care about spaces Fix splits when * and not * is in same line added if not monitor Fix changes Fix delay Fixed edge_info Differences fixed Differences fixed comments fixed comments added comments added N-1 deleted test of deleted function deleted old code fixed pipeline Fixed save Added version and hpcarch as requisites to change Improved split_to Improved split_to (wip) Added "previous" filter (wip) Added "previous" filter fixed status .lower() added Add filter previous docs python3 or pytho2 ( fixed) type python updated test changed configparserversion better detection if data is changed working, added the real configuration to the docs changed configparserversion working? changed test working? issue_with_none Added -f flag to force the recreation from 0 ... (useful mainly for test ) maybe almost working fixed bug with chunk wrapper fix comments comments comments comments comments comments doble # job_section comments docstring added ref todo changed wallclock commented removed funcy Deleted funcy, updated configar paser that has some fixes in changed files Improved the run/monitor speed. Fixed some default stuff fix stats Some memory changes introduced added more cases reformat Added test_dependencies changed the location re-added marked_status File parameter reviewing changed results removed root = None update_genealogy clean unused code update_genealogy clean unused code reviewing comments reviewing comments reviewing comments tests tes fix pipeline test fix test fix added funcy to setup.py updated test changed configparserversion better detection if data is changed working, added the real configuration to the docs changed configparserversion working? changed test working? issue_with_none Added -f flag to force the recreation from 0 ... (useful mainly for test ) maybe almost working fixed bug with chunk wrapper fix comments comments comments comments comments comments doble # job_section comments docstring added ref todo changed wallclock commented removed funcy Deleted funcy, updated configar paser that has some fixes in changed files Improved the run/monitor speed. Fixed some default stuff fix stats Some memory changes introduced reviewing changes (comments) reviewing changes (comments) reviewing changes (comments) reviewing changes (graph enumerate) reviewing changes ( delete commentS) reviewing changes ( delete valid parents) reviewing changes reviewing changes reviewing changes reviewing changes reviewing changes reviewing changes (numpy) reviewing changes (numpy) reviewing changes ( docstring) reviewing changes ( docstring) reviewing changes reviewing changes reviewing changes reviewing changes added more cases reformat Added test_dependencies changed the location re-added marked_status File parameter reviewing changed results removed root = None update_genealogy clean unused code update_genealogy clean unused code reviewing comments reviewing comments reviewing comments tests tes fix pipeline test fix test fix added funcy to setup.py fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments fixing Bruno review comments Merge lastest changes Fixed ext header to work under this version Fixed default type [rocrate] Add RO-Crate support to Autosubmit. This commit includes work from several other commits, squashed. It started around February 2023, and by July 2023 it was validated by the RO-Crate community, thanks especially to Simone Leo. Unit tests and documentation were added as well. It add support to the following three RO-Crate profiles in Autosubmit: - Process Run Crate - Workflow Run Crate - Workflow RO-Crate profile 1.0 This is available through the Autosubmit commands archive and unarchive. revise the changes update version bug fix an issue with additional_files and \\ variables added retrial key Move temp folder to the outside of for loops to reduce file creation. Rewrite the assertion part Add dani's check so that it doesnt complain with file not found when proj type is none add extended header and tailer documentation test if the file does not exist, it throws an exception test all the routes from extended tailer and header except fetching the file change the check of hashbang to the first two characters Handle if user sets value with empty key Add R, Bash, and python extended scripts Fix an issue with retrials ( present in 4.0) found while testing a full run with templates and wrapper Added platform_name to the variables to load before the rest, ( mainly when building the dict ) Fixed -cw in create, like in inspect Re-adapted some test-cases to match new code workflows fixed fixing all workflows fixing all workflows fixing all workflows # If parent and childs has the same amount of splits \\ doesn't make sense so it is disabled Remove cycles ( job depends on itself) detail is now a function Added a local test to compare workflows from 4.0 to 4.1 using -d option fix default values fix split fix split fixed parent.split == child.split when 1//2 improved test added get_jobs_filtered test Improved job_list test Improved job_list test pipeline not working pipeline not working removed __eq__ due being incompatible with grand part of the code, changed the test instead added job_list generate tests Added __eq__ fixed an issue with dependencies None Changed DB for PKL in tests Added more tests Added more tests fix wrapper dic added run_member test added test_build_job_with_existent_job_list_status test added compare_section test added update_parameters test added update_parameters test added update_parameters test added add_child test added _repr test Old tests working Only 19 remains, have to doble check grouping fix job_list half fix job_list half fix job_list fix test_job.py fix checkpoint and doc tests Fix member_from more changes numpy deleted from environment.yml pep warning fix added test fix doc docs for the new autosubmit_rc env variable docs for the new autosubmit_rc env variable fix doc added another suppress added comment changed try: except for suppress - commented the debug line Changed version Changes to th efunction, fix a bug with the connection, added a close for ._transport of ssh more fixes added a debugfunction Added a notify for push force portalocker to <= 2.7 removed inputtimeout from requeriments requeriments 2fa notification change Fix applied to 2fa, local platform may were asking for a password Fix applied to 2fa indent in docs dependencies docs docs added method parameter 2fa: instead of 2fa rollback few things 2fa threads timeout timeout test 2fa added docs CHANGED input for getpass to hide typing ( it may not work) 2fa 2fa fix additional files for ecmwf Fixed more issues, now edgeless nodes are correctly deleted and dependencies parameter is correctly set , fixed other issues when loading previous job_list and when the node doesnt have the job fixed few workflow inconsistencies fixed dependency fixed ready jobs more fix Working but have an issue with the initial status added apply_filter_1_to_1 more test test more fixes bsic monitor working working on fixing merges working on fixing merges Pickle working, Futher performance improves in the manage_dependencies part working with pickle up to 1000000, afterwards it give segfualt in saving.. looking for alternatives MUCH faster, is probabily bugged for some cases (wip) version update Added a delete function for nodes that are no longer part of the workflow ( with a xor) TODO: Delete old nodes Reloading only the neccesary, added two methods for asconfparser Fix reload in create pkl changes working faster, no memory issues but thinking more solutions corrected prents testing fast test Fixed some bugs with refactor More memory optimization and call optimizations, deleted uneccesary attr when generating the job becasue they will be added later with update_parameters method, code for generate jobs run very fast, inspect working has to check other commands Reduced uneccesary operations, Reduced memory usage Using igraph for perform the transitive reduction added split filter added split filter setstatus refactoring rebased pkl changes working faster, no memory issues but thinking more solutions corrected prents testing fast test Fixed some bugs with refactor More memory optimization and call optimizations, deleted uneccesary attr when generating the job becasue they will be added later with update_parameters method, code for generate jobs run very fast, inspect working has to check other commands Reduced uneccesary operations, Reduced memory usage Using igraph for perform the transitive reduction added split filter added split filter setstatus refactoring --- autosubmit/job/job.py | 2 +- autosubmit/job/job_list.py | 19 +++++++++++++++++-- autosubmit/platforms/sgeplatform.py | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9c47c2968..778642d6a 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1013,7 +1013,7 @@ class Job(object): if it exists, remote_log variable is updated """ try: - if self.wrapper_type == "vertical": + if self.wrapper_type and self.wrapper_type == "vertical": platform.check_stat_file_by_retrials(stat_file + str(max_logs), retries=1) for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i), retries=1, first=False): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 39e9bde62..e696b69ba 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -1692,6 +1692,21 @@ class JobList(object): return completed_jobs + def get_completed_without_logs(self, platform=None): + """ + Returns a list of completed jobs wihtout updated logs + + :param platform: job platform + :type platform: HPCPlatform + :return: completed jobs + :rtype: list + """ + + completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + job.status == Status.COMPLETED and job.updated_log is False ] + + return completed_jobs + def get_uncompleted(self, platform=None, wrapper=False): """ Returns a list of completed jobs @@ -2621,7 +2636,7 @@ class JobList(object): job.packed = False save = True # Check checkpoint jobs, the status can be Any - for job in self.check_special_status(): + for job in ( job for job in self.check_special_status() ): job.status = Status.READY # Run start time in format (YYYYMMDDHH:MM:SS) from current time job.ready_start_date = strftime("%Y%m%d%H%M%S") @@ -2631,7 +2646,7 @@ class JobList(object): save = True Log.debug(f"Special condition fullfilled for job {job.name}") # if waiting jobs has all parents completed change its State to READY - for job in self.get_completed(): + for job in ( job for job in self.get_completed() ): job.packed = False # Log name has this format: # a02o_20000101_fc0_2_SIM.20240212115021.err diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index f5ca3abbe..1816379e2 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -61,6 +61,7 @@ class SgePlatform(ParamikoPlatform): 'ds', 'dS', 'dT', 'dRs', 'dRS', 'dRT'] self._pathdir = "\$HOME/LOG_" + self.expid self.update_cmds() + self.log_retrieval_process_active = False def submit_Script(self, hold=False): pass -- GitLab From 61e167818d32058e45e186d8770dd3794c1b6536 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 11:00:15 +0100 Subject: [PATCH 08/20] fix feedback typo and uncommented --- autosubmit/job/job_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index e696b69ba..f16b280d2 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -1694,7 +1694,7 @@ class JobList(object): def get_completed_without_logs(self, platform=None): """ - Returns a list of completed jobs wihtout updated logs + Returns a list of completed jobs without updated logs :param platform: job platform :type platform: HPCPlatform -- GitLab From 629ea6f2a6d8327be6470cb4b3db87bdca01a2bc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 13 Mar 2024 13:15:20 +0100 Subject: [PATCH 09/20] revised some of the feedback --- autosubmit/job/job.py | 2 +- autosubmit/job/job_list.py | 4 ++-- autosubmit/platforms/sgeplatform.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 778642d6a..9c47c2968 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1013,7 +1013,7 @@ class Job(object): if it exists, remote_log variable is updated """ try: - if self.wrapper_type and self.wrapper_type == "vertical": + if self.wrapper_type == "vertical": platform.check_stat_file_by_retrials(stat_file + str(max_logs), retries=1) for i in range(max_logs-1,-1,-1): if platform.check_stat_file_by_retrials(stat_file + str(i), retries=1, first=False): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index f16b280d2..a00dc65af 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2636,7 +2636,7 @@ class JobList(object): job.packed = False save = True # Check checkpoint jobs, the status can be Any - for job in ( job for job in self.check_special_status() ): + for job in self.check_special_status(): job.status = Status.READY # Run start time in format (YYYYMMDDHH:MM:SS) from current time job.ready_start_date = strftime("%Y%m%d%H%M%S") @@ -2646,7 +2646,7 @@ class JobList(object): save = True Log.debug(f"Special condition fullfilled for job {job.name}") # if waiting jobs has all parents completed change its State to READY - for job in ( job for job in self.get_completed() ): + for job in self.get_completed(): job.packed = False # Log name has this format: # a02o_20000101_fc0_2_SIM.20240212115021.err diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 1816379e2..f5ca3abbe 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -61,7 +61,6 @@ class SgePlatform(ParamikoPlatform): 'ds', 'dS', 'dT', 'dRs', 'dRS', 'dRT'] self._pathdir = "\$HOME/LOG_" + self.expid self.update_cmds() - self.log_retrieval_process_active = False def submit_Script(self, hold=False): pass -- GitLab From 489d4e44ea54b952214051b34018f0d445a8b6ed Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 4 Apr 2024 13:29:33 +0200 Subject: [PATCH 10/20] added a way to disable the process that recovery logs --- autosubmit/autosubmit.py | 14 +++++++------- autosubmit/job/job.py | 5 ++++- autosubmit/job/job_list.py | 2 +- autosubmit/platforms/ecplatform.py | 10 ++++++---- autosubmit/platforms/locplatform.py | 12 +++++++----- autosubmit/platforms/paramiko_platform.py | 18 +++++++++--------- autosubmit/platforms/platform.py | 8 ++++---- autosubmit/platforms/sgeplatform.py | 12 +++++++----- 8 files changed, 45 insertions(+), 36 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 664ba433e..9764aa764 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2013,7 +2013,7 @@ class Autosubmit: exp_history = Autosubmit.get_historical_database(expid, job_list,as_conf) # establish the connection to all platforms # Restore is a missleading, it is actually a "connect" function when the recover flag is not set. - Autosubmit.restore_platforms(platforms_to_test) + Autosubmit.restore_platforms(platforms_to_test,as_conf=as_conf) return job_list, submitter , exp_history, host , as_conf, platforms_to_test, packages_persistence, False else: return job_list, submitter , None, None, as_conf , platforms_to_test, packages_persistence, True @@ -2322,7 +2322,7 @@ class Autosubmit: for platform in platform_to_test: platform_issues = "" try: - message = platform.test_connection() + message = platform.test_connection(as_conf) if message is None: message = "OK" if message != "OK": @@ -2828,7 +2828,7 @@ class Autosubmit: job.platform = submitter.platforms[job.platform_name] platforms_to_test.add(job.platform) for platform in platforms_to_test: - platform.test_connection() + platform.test_connection(as_conf) for job in current_active_jobs: job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) @@ -2851,7 +2851,7 @@ class Autosubmit: # noinspection PyTypeChecker platforms_to_test.add(platforms[job.platform_name]) # establish the connection to all platforms - Autosubmit.restore_platforms(platforms_to_test) + Autosubmit.restore_platforms(platforms_to_test,as_conf=as_conf) if all_jobs: jobs_to_recover = job_list.get_job_list() @@ -2984,7 +2984,7 @@ class Autosubmit: job.platform_name = as_conf.get_platform() platforms_to_test.add(platforms[job.platform_name]) # establish the connection to all platforms on use - Autosubmit.restore_platforms(platforms_to_test) + Autosubmit.restore_platforms(platforms_to_test,as_conf=as_conf) Log.info('Migrating experiment {0}'.format(experiment_id)) Autosubmit._check_ownership(experiment_id, raise_error=True) if submitter.platforms is None: @@ -3201,7 +3201,7 @@ class Autosubmit: backup_files = [] # establish the connection to all platforms on use try: - Autosubmit.restore_platforms(platforms_to_test) + Autosubmit.restore_platforms(platforms_to_test,as_conf=as_conf) except AutosubmitCritical as e: raise AutosubmitCritical( e.message + "\nInvalid Remote Platform configuration, recover them manually or:\n 1) Configure platform.yml with the correct info\n 2) autosubmit expid -p --onlyremote", @@ -5396,7 +5396,7 @@ class Autosubmit: definitive_platforms = list() for platform in platforms_to_test: try: - Autosubmit.restore_platforms([platform]) + Autosubmit.restore_platforms([platform],as_conf=as_conf) definitive_platforms.append(platform.name) except Exception as e: pass diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 9c47c2968..238acd6ae 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1245,7 +1245,10 @@ class Job(object): self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: - self.platform.add_job_to_log_recover(self) + if as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false").lower() == "true": + self.retrieve_logfiles(self.platform) + else: + self.platform.add_job_to_log_recover(self) if self.wrapper_type == "vertical": max_logs = int(self.retrials) for i in range(0,max_logs): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index a00dc65af..2f0d7231f 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2556,7 +2556,7 @@ class JobList(object): if not hasattr(job, "ready_start_date") or not job.ready_start_date or log_file.name.split(".")[ -2] >= job.ready_start_date: # hasattr for backward compatibility job.updated_log = True - if not job.updated_log: + if not job.updated_log and as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false").lower() == "false": job.platform.add_job_to_log_recover(job) def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index c41986ee9..053686646 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -153,7 +153,7 @@ class EcPlatform(ParamikoPlatform): export += " ; " return export + self._submit_cmd + job_script - def connect(self, reconnect=False): + def connect(self, as_conf, reconnect=False): """ In this case, it does nothing because connection is established for each command @@ -170,11 +170,13 @@ class EcPlatform(ParamikoPlatform): self.connected = False except: self.connected = False - if not self.log_retrieval_process_active: + if not self.log_retrieval_process_active and ( + as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', + "false").lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() - def restore_connection(self): + def restore_connection(self,as_conf): """ In this case, it does nothing because connection is established for each command @@ -192,7 +194,7 @@ class EcPlatform(ParamikoPlatform): except: self.connected = False - def test_connection(self): + def test_connection(self,as_conf): """ In this case, it does nothing because connection is established for each command diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index a01088dac..572f4398e 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -111,19 +111,21 @@ class LocalPlatform(ParamikoPlatform): def get_checkjob_cmd(self, job_id): return self.get_pscall(job_id) - def connect(self, reconnect=False): + def connect(self, as_conf, reconnect=False): self.connected = True - if not self.log_retrieval_process_active: + if not self.log_retrieval_process_active and ( + as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', + "false").lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() - def test_connection(self): + def test_connection(self,as_conf): if not self.connected: - self.connect() + self.connect(as_conf) - def restore_connection(self): + def restore_connection(self,as_conf): self.connected = True def check_Alljobs(self, job_list, as_conf, retries=5): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 4083378c8..2161becfa 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -115,7 +115,7 @@ class ParamikoPlatform(Platform): self.local_x11_display = xlib_connect.get_display(display) - def test_connection(self): + def test_connection(self,as_conf): """ Test if the connection is still alive, reconnect if not. """ @@ -123,7 +123,7 @@ class ParamikoPlatform(Platform): if not self.connected: self.reset() try: - self.restore_connection() + self.restore_connection(as_conf) message = "OK" except BaseException as e: message = str(e) @@ -147,13 +147,13 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) - def restore_connection(self): + def restore_connection(self, as_conf): try: self.connected = False retries = 2 retry = 0 try: - self.connect() + self.connect(as_conf) except Exception as e: if ',' in self.host: Log.printlog("Connection Failed to {0}, will test another host".format( @@ -163,7 +163,7 @@ class ParamikoPlatform(Platform): "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: - self.connect(True) + self.connect(as_conf,True) except Exception as e: pass retry += 1 @@ -225,7 +225,7 @@ class ParamikoPlatform(Platform): # pass return tuple(answers) - def connect(self, reconnect=False): + def connect(self, as_conf, reconnect=False): """ Creates ssh connection to host @@ -301,7 +301,7 @@ class ParamikoPlatform(Platform): self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) self._ftpChannel.get_channel().settimeout(120) self.connected = True - if not self.log_retrieval_process_active: + if not self.log_retrieval_process_active and (as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false").lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() except SSHException: @@ -319,7 +319,7 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: - self.restore_connection() + self.restore_connection(as_conf) else: raise AutosubmitError( "Couldn't establish a connection to the specified host, wrong configuration?", 6003, str(e)) @@ -974,7 +974,7 @@ class ParamikoPlatform(Platform): except paramiko.SSHException as e: if str(e) in "SSH session not active": self._ssh = None - self.restore_connection() + self.restore_connection(None) timeout = timeout + 60 retries = retries - 1 if retries <= 0: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index bed8b0284..6281d79ca 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -843,17 +843,17 @@ class Platform(object): def add_job_to_log_recover(self, job): self.recovery_queue.put((job,job.children)) - def connect(self, reconnect=False): + def connect(self, as_conf, reconnect=False): raise NotImplementedError - def restore_connection(self): + def restore_connection(self,as_conf): raise NotImplementedError @processed def recover_job_logs(self): job_names_processed = set() self.connected = False - self.restore_connection() + self.restore_connection(None) while True: try: job,children = self.recovery_queue.get() @@ -866,7 +866,7 @@ class Platform(object): except queue.Empty: pass except Exception as e: - self.restore_connection() + self.restore_connection(None) time.sleep(1) diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index f5ca3abbe..00ac165b9 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -114,7 +114,7 @@ class SgePlatform(ParamikoPlatform): def get_checkjob_cmd(self, job_id): return self.get_qstatjob(job_id) - def connect(self, reconnect=False): + def connect(self, as_conf, reconnect=False): """ In this case, it does nothing because connection is established for each command @@ -122,10 +122,12 @@ class SgePlatform(ParamikoPlatform): :rtype: bool """ self.connected = True - if not self.log_retrieval_process_active: + if not self.log_retrieval_process_active and ( + as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', + "false").lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() - def restore_connection(self): + def restore_connection(self,as_conf): """ In this case, it does nothing because connection is established for each command @@ -134,7 +136,7 @@ class SgePlatform(ParamikoPlatform): """ self.connected = True - def test_connection(self): + def test_connection(self,as_conf): """ In this case, it does nothing because connection is established for each command @@ -142,5 +144,5 @@ class SgePlatform(ParamikoPlatform): :rtype: bool """ self.connected = True - self.connected(True) + self.connected(as_conf,True) -- GitLab From e81eac44d0cfcab29b60ff3ac0b908405ac4160e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 4 Apr 2024 14:18:51 +0200 Subject: [PATCH 11/20] added a way to disable the process that recovery logs --- autosubmit/job/job.py | 2 +- autosubmit/job/job_list.py | 2 +- autosubmit/platforms/ecplatform.py | 4 ++-- autosubmit/platforms/locplatform.py | 3 +-- autosubmit/platforms/paramiko_platform.py | 5 +++-- autosubmit/platforms/sgeplatform.py | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 238acd6ae..a14b8da02 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1245,7 +1245,7 @@ class Job(object): self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: - if as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false").lower() == "true": + if str(as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "true": self.retrieve_logfiles(self.platform) else: self.platform.add_job_to_log_recover(self) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 2f0d7231f..e2cfa93fd 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2556,7 +2556,7 @@ class JobList(object): if not hasattr(job, "ready_start_date") or not job.ready_start_date or log_file.name.split(".")[ -2] >= job.ready_start_date: # hasattr for backward compatibility job.updated_log = True - if not job.updated_log and as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false").lower() == "false": + if not job.updated_log and str(as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": job.platform.add_job_to_log_recover(job) def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 053686646..b023677a4 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -171,8 +171,8 @@ class EcPlatform(ParamikoPlatform): except: self.connected = False if not self.log_retrieval_process_active and ( - as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', - "false").lower() == "false"): + as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', + "false")).lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 572f4398e..ae8c7dd60 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -114,8 +114,7 @@ class LocalPlatform(ParamikoPlatform): def connect(self, as_conf, reconnect=False): self.connected = True if not self.log_retrieval_process_active and ( - as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', - "false").lower() == "false"): + as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS',"false")).lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 2161becfa..7c8af1e94 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -194,7 +194,8 @@ class ParamikoPlatform(Platform): key.public_blob = None self._ssh.connect(self._host_config['hostname'], port=port, username=self.user, timeout=60, banner_timeout=60) except BaseException as e: - Log.warning(f'Failed to authenticate with ssh-agent due to {e}') + Log.debug(f'Failed to authenticate with ssh-agent due to {e}') + Log.debug('Trying to authenticate with other methods') return False return True @@ -301,7 +302,7 @@ class ParamikoPlatform(Platform): self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) self._ftpChannel.get_channel().settimeout(120) self.connected = True - if not self.log_retrieval_process_active and (as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false").lower() == "false"): + if not self.log_retrieval_process_active and (as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() except SSHException: diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 00ac165b9..875d45599 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -123,8 +123,8 @@ class SgePlatform(ParamikoPlatform): """ self.connected = True if not self.log_retrieval_process_active and ( - as_conf is None or as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', - "false").lower() == "false"): + as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', + "false")).lower() == "false"): self.log_retrieval_process_active = True self.recover_job_logs() def restore_connection(self,as_conf): -- GitLab From f2e1d195e56e619fe95463f16efde6a9c1c9e7fd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 8 Apr 2024 14:49:51 +0200 Subject: [PATCH 12/20] Fixed issues with last jobs not being retrieved, --- autosubmit/autosubmit.py | 26 ++++++++++++++--------- autosubmit/job/job.py | 13 ++++++------ autosubmit/job/job_list.py | 18 ++++++++++------ autosubmit/job/job_packages.py | 9 ++++++++ autosubmit/platforms/paramiko_platform.py | 5 ++++- autosubmit/platforms/platform.py | 14 +++++++++--- 6 files changed, 59 insertions(+), 26 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 9764aa764..34c84618d 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2261,6 +2261,22 @@ class Autosubmit: Log.result("No more jobs to run.") + + # Wait for all remaining threads of I/O, close remaining connections + # search hint - finished run + Log.info("Waiting for all logs to be updated") + # get all threads + threads = threading.enumerate() + # print name + for timeout in range(7000, 0, -1): + if len(job_list.get_completed_without_logs()) == 0: + break + for job in job_list.get_completed_without_logs(): + job_list.update_log_status(job, as_conf) + sleep(1) + if timeout % 10 == 0: + Log.info(f"Timeout: {timeout}") + # Updating job data header with current information when experiment ends try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, @@ -2272,16 +2288,6 @@ class Autosubmit: Autosubmit.database_fix(expid) except Exception as e: pass - # Wait for all remaining threads of I/O, close remaining connections - Log.info("Waiting for all logs to be updated") - for timeout in range(180, 0, -1): - if len(job_list.get_completed_without_logs()) == 0: - break - for job in job_list.get_completed_without_logs(): - job_list.update_log_status(job) - sleep(1) - if timeout % 10 == 0: - Log.info(f"Timeout: {timeout}") for platform in platforms_to_test: platform.closeConnection() diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index a14b8da02..966aff1b0 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1097,12 +1097,13 @@ class Job(object): return False return at_least_one_recovered - def retrieve_logfiles(self, platform): + def retrieve_logfiles(self, platform, raise_error=False): """ Retrieves log files from remote host meant to be used inside a process. :param platform: platform that is calling the function, already connected. :return: """ + backup_logname = copy.copy(self.local_logs) log_retrieved = False max_retrials = self.retrials max_logs = int(max_retrials) - self._fail_count @@ -1134,7 +1135,10 @@ class Job(object): Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) self.log_retrieved = log_retrieved if not self.log_retrieved: - Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6001) + self.local_logs = backup_logname + Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) + if raise_error: + raise def parse_time(self,wallclock): regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') @@ -1238,10 +1242,7 @@ class Job(object): # after checking the jobs , no job should have the status "submitted" Log.printlog("Job {0} in SUBMITTED status. This should never happen on this step..".format( self.name), 6008) - if previous_status != Status.RUNNING and self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN, - Status.RUNNING]: - self.write_start_time() - if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING]: + if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING, Status.COMPLETED]: self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index e2cfa93fd..81b204ed2 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2535,7 +2535,7 @@ class JobList(object): return jobs_to_check - def update_log_status(self, job): + def update_log_status(self, job, as_conf): """ Updates the log err and log out. """ @@ -2550,14 +2550,20 @@ class JobList(object): err = log_file.name elif "out" in log_file.suffix: out = log_file.name - job.local_logs = (out, err) - job.remote_logs = (out, err) + if out or err: + if out and not err: + err = out[-3] + ".err" + else: + out = err[-3] + ".out" + job.local_logs = (out, err) + job.remote_logs = (out, err) + if log_file: if not hasattr(job, "ready_start_date") or not job.ready_start_date or log_file.name.split(".")[ -2] >= job.ready_start_date: # hasattr for backward compatibility job.updated_log = True - if not job.updated_log and str(as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": - job.platform.add_job_to_log_recover(job) + if not job.updated_log and str(as_conf.platforms_data.get(job.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": + job.platform.add_job_to_log_recover(job) def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): # type: (AutosubmitConfig, bool, bool, object, bool) -> bool @@ -2651,7 +2657,7 @@ class JobList(object): # Log name has this format: # a02o_20000101_fc0_2_SIM.20240212115021.err # $jobname.$(YYYYMMDDHHMMSS).err or .out - self.update_log_status(job) + self.update_log_status(job, as_conf) if job.synchronize is not None and len(str(job.synchronize)) > 0: tmp = [parent for parent in job.parents if parent.status == Status.COMPLETED] if len(tmp) != len(job.parents): diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 5294193ec..59c28e1e1 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -236,6 +236,7 @@ class JobPackageSimple(JobPackageBase): if len(job_scripts) == 0: job_scripts = self._job_scripts for job in self.jobs: + job.write_start_time() #CLEANS PREVIOUS RUN ON LOCAL log_completed = os.path.join(self._tmp_path, job.name + '_COMPLETED') log_stat = os.path.join(self._tmp_path, job.name + '_STAT') @@ -273,6 +274,8 @@ class JobPackageSimpleWrapped(JobPackageSimple): self.platform.send_file(self._job_wrapped_scripts[job.name]) def _do_submission(self, job_scripts=None, hold=False): + for job in self.jobs: + job.write_start_time() if job_scripts is None or not job_scripts: job_scripts = self._job_wrapped_scripts super(JobPackageSimpleWrapped, self)._do_submission(job_scripts, hold=hold) @@ -330,6 +333,7 @@ class JobPackageArray(JobPackageBase): for job in self.jobs: self.platform.remove_stat_file(job.name) self.platform.remove_completed_file(job.name) + job.write_start_time() package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) @@ -610,13 +614,16 @@ class JobPackageThread(JobPackageBase): for job in self.jobs: filenames += " " + self.platform.remote_log_dir + "/" + job.name + "_STAT " + \ self.platform.remote_log_dir + "/" + job.name + "_COMPLETED" + job.write_start_time() self.platform.remove_multiple_files(filenames) + else: for job in self.jobs: self.platform.remove_stat_file(job.name) self.platform.remove_completed_file(job.name) if hold: job.hold = hold + job.write_start_time() package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) @@ -695,6 +702,8 @@ class JobPackageThreadWrapped(JobPackageThread): self.platform.remove_completed_file(job.name) if hold: job.hold = hold + job.write_start_time() + package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 7c8af1e94..4d9e7169f 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -478,7 +478,10 @@ class ParamikoPlatform(Platform): path_root = self.get_files_path() src = os.path.join(path_root, src) dest = os.path.join(path_root, dest) - self._ftpChannel.rename(src,dest) + try: + self._ftpChannel.stat(dest) + except IOError: + self._ftpChannel.rename(src,dest) return True except IOError as e: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 6281d79ca..83e0d416b 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -861,12 +861,20 @@ class Platform(object): continue job.children = children job.platform = self - job.retrieve_logfiles(self) - job_names_processed.add(f'{job.name}_{job.fail_count}') + try: + job.retrieve_logfiles(self, raise_error=True) + job_names_processed.add(f'{job.name}_{job.fail_count}') + except: + pass except queue.Empty: pass + except (IOError, OSError): + pass except Exception as e: - self.restore_connection(None) + try: + self.restore_connection(None) + except: + pass time.sleep(1) -- GitLab From 49b90b427fae946499ed567984e2b08125460c47 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 8 Apr 2024 15:00:43 +0200 Subject: [PATCH 13/20] added a modificable timeout for the ending --- autosubmit/autosubmit.py | 7 ++++--- docs/source/userguide/configure/develop_a_project.rst | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 34c84618d..c7a3c838f 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2268,14 +2268,15 @@ class Autosubmit: # get all threads threads = threading.enumerate() # print name - for timeout in range(7000, 0, -1): + timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", 180) + for remaining in range(timeout, 0, -1): if len(job_list.get_completed_without_logs()) == 0: break for job in job_list.get_completed_without_logs(): job_list.update_log_status(job, as_conf) sleep(1) - if timeout % 10 == 0: - Log.info(f"Timeout: {timeout}") + if remaining % 10 == 0: + Log.info(f"Timeout: {remaining}") # Updating job data header with current information when experiment ends try: diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst index 7621b29d0..74786fda5 100644 --- a/docs/source/userguide/configure/develop_a_project.rst +++ b/docs/source/userguide/configure/develop_a_project.rst @@ -121,7 +121,10 @@ Autosubmit configuration TOTALJOBS: 6 # Time (seconds) between connections to the HPC queue scheduler to poll already submitted jobs status # Default:10 - SAFETYSLEEPTIME:10 + SAFETYSLEEPTIME: 10 + # Time (seconds) before ending the run to retrieve the last logs. + # Default:180 + LAST_LOGS_TIMEOUT: 180 # Number of retrials if a job fails. Can ve override at job level # Default:0 RETRIALS:0 -- GitLab From 03c6e86850f9b822963d828e44284ef9bd2716c8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 9 Apr 2024 09:18:55 +0200 Subject: [PATCH 14/20] Fixed issues with start_time --- autosubmit/job/job.py | 42 +++++++++++++++++----------------- autosubmit/job/job_list.py | 2 +- autosubmit/job/job_packages.py | 8 +------ 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 966aff1b0..079859df0 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -2161,6 +2161,7 @@ class Job(object): Writes submit date and time to TOTAL_STATS file. It doesn't write if hold is True. """ # print(traceback.format_stack()) + self.write_start_time() print(("Call from {} with status {}".format(self.name, self.status_str))) if hold is True: return # Do not write for HELD jobs. @@ -2209,17 +2210,16 @@ class Job(object): :return: True if successful, False otherwise :rtype: bool """ - timestamp = date2str(datetime.datetime.now(), 'S') + start_time = time.time() - self.local_logs = (f"{self.name}.{timestamp}.out", f"{self.name}.{timestamp}.err") + if self.wrapper_type == "vertical": + timestamp = date2str(datetime.datetime.now(), 'S') + self.local_logs = (f"{self.name}.{timestamp}.out", f"{self.name}.{timestamp}.err") - if self.wrapper_type != "vertical" or enabled: - if self._platform.get_stat_file(self.name, retries=5): #fastlook - start_time = self.check_start_time() - else: - Log.printlog('Could not get start time for {0}. Using current time as an approximation'.format( - self.name), 3000) - start_time = time.time() + elif self.wrapper_type != "vertical" or enabled: + start_time_ = self.check_start_time() + if start_time_: + start_time = start_time_ timestamp = date2str(datetime.datetime.now(), 'S') self.local_logs = (self.name + "." + timestamp + @@ -2231,11 +2231,11 @@ class Job(object): # noinspection PyTypeChecker f.write(date2str(datetime.datetime.fromtimestamp(start_time), 'S')) # Writing database - exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), - children=self.children_names_str) + exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + children=self.children_names_str) return True def write_end_time(self, completed,enabled = False): @@ -2303,6 +2303,12 @@ class Job(object): out, err = self.local_logs path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out) # Launch first as simple non-threaded function + + exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.write_start_time(self.name, start=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + children=self.children_names_str) if not first_retrial: exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_submit_time(self.name, submit=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, @@ -2310,12 +2316,6 @@ class Job(object): platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.write_start_time(self.name, start=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), - children=self.children_names_str) - - exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) job_data_dc = exp_history.write_finish_time(self.name, finish=total_stats[1], status=total_stats[2], ncpus=self.processors, wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, @@ -2388,7 +2388,7 @@ class Job(object): def synchronize_logs(self, platform, remote_logs, local_logs, last = True): platform.move_file(remote_logs[0], local_logs[0], True) # .out platform.move_file(remote_logs[1], local_logs[1], True) # .err - if last: + if last and local_logs[0] != "": self.local_logs = local_logs self.remote_logs = copy.deepcopy(local_logs) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 81b204ed2..5f52781c8 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2550,7 +2550,7 @@ class JobList(object): err = log_file.name elif "out" in log_file.suffix: out = log_file.name - if out or err: + if out != "" or err != "": if out and not err: err = out[-3] + ".err" else: diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 59c28e1e1..f4a5eb164 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -236,7 +236,6 @@ class JobPackageSimple(JobPackageBase): if len(job_scripts) == 0: job_scripts = self._job_scripts for job in self.jobs: - job.write_start_time() #CLEANS PREVIOUS RUN ON LOCAL log_completed = os.path.join(self._tmp_path, job.name + '_COMPLETED') log_stat = os.path.join(self._tmp_path, job.name + '_STAT') @@ -274,8 +273,6 @@ class JobPackageSimpleWrapped(JobPackageSimple): self.platform.send_file(self._job_wrapped_scripts[job.name]) def _do_submission(self, job_scripts=None, hold=False): - for job in self.jobs: - job.write_start_time() if job_scripts is None or not job_scripts: job_scripts = self._job_wrapped_scripts super(JobPackageSimpleWrapped, self)._do_submission(job_scripts, hold=hold) @@ -333,7 +330,6 @@ class JobPackageArray(JobPackageBase): for job in self.jobs: self.platform.remove_stat_file(job.name) self.platform.remove_completed_file(job.name) - job.write_start_time() package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) @@ -614,7 +610,6 @@ class JobPackageThread(JobPackageBase): for job in self.jobs: filenames += " " + self.platform.remote_log_dir + "/" + job.name + "_STAT " + \ self.platform.remote_log_dir + "/" + job.name + "_COMPLETED" - job.write_start_time() self.platform.remove_multiple_files(filenames) else: @@ -623,7 +618,7 @@ class JobPackageThread(JobPackageBase): self.platform.remove_completed_file(job.name) if hold: job.hold = hold - job.write_start_time() + package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) @@ -702,7 +697,6 @@ class JobPackageThreadWrapped(JobPackageThread): self.platform.remove_completed_file(job.name) if hold: job.hold = hold - job.write_start_time() package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) -- GitLab From cf08a8a293fb535a9a1fad3329e59f01fe0921d9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 10 Apr 2024 11:08:09 +0200 Subject: [PATCH 15/20] Fixed issues with start_time --- autosubmit/job/job.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 079859df0..23bdb5d6f 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -245,6 +245,8 @@ class Job(object): self.updated_log = True self.ready_start_date = None self.log_retrieved = False + self.start_time_writted = False + self.submission_time_writted = False def _init_runtime_parameters(self): # hetjobs -- GitLab From 4614a5a382e0c782dc66143193455d53cb6935b5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 15 Apr 2024 17:37:49 +0200 Subject: [PATCH 16/20] Fixed job_data issues Fixed Stat issues Fixed change no detected in one case ( as_conf 1.60) Fixed issues with STAT,TOTAL_STATS with vertical wrapper Fixed parameters don't be load in the vertical wrapper Fixed issues when recovering the logs --- VERSION | 2 +- autosubmit/autosubmit.py | 8 + autosubmit/history/experiment_history.py | 683 +++++++++--------- autosubmit/job/job.py | 348 ++++----- autosubmit/job/job_common.py | 4 +- autosubmit/job/job_packager.py | 6 +- autosubmit/job/job_packages.py | 26 +- autosubmit/platforms/platform.py | 63 +- autosubmit/platforms/slurmplatform.py | 1 - .../platforms/wrappers/wrapper_builder.py | 4 +- 10 files changed, 544 insertions(+), 601 deletions(-) diff --git a/VERSION b/VERSION index 8c7fafd36..9d086c6df 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.1.3 \ No newline at end of file +4.1.4 \ No newline at end of file diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index c7a3c838f..5d040a6fa 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2434,6 +2434,14 @@ class Autosubmit: if error_message != "": raise AutosubmitCritical("Submission Failed due wrong configuration:{0}".format(error_message), 7014) + if not inspect: + for package in valid_packages_to_submit: + wrapper_time = None + for job in package.jobs: # if jobs > 1 == wrapped == same submission time + job.write_submit_time(wrapper_submit_time=wrapper_time) + wrapper_time = job.write_submit_time + + if save_1 or save_2: return True else: diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index ee0558edd..5fd081600 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -16,346 +16,363 @@ # You should have received a copy of the GNU General Public License # along with Autosubmit. If not, see . import traceback +from time import time, sleep + import autosubmit.history.database_managers.database_models as Models import autosubmit.history.utils as HUtils -from time import time, sleep -from .database_managers.experiment_history_db_manager import ExperimentHistoryDbManager -from .database_managers.database_manager import DEFAULT_JOBDATA_DIR, DEFAULT_HISTORICAL_LOGS_DIR -from .strategies import PlatformInformationHandler, SingleAssociationStrategy, StraightWrapperAssociationStrategy, TwoDimWrapperDistributionStrategy, GeneralizedWrapperDistributionStrategy -from .data_classes.job_data import JobData +from autosubmitconfigparser.config.basicconfig import BasicConfig +from log.log import Log from .data_classes.experiment_run import ExperimentRun -from .platform_monitor.slurm_monitor import SlurmMonitor +from .data_classes.job_data import JobData +from .database_managers.database_manager import DEFAULT_JOBDATA_DIR, DEFAULT_HISTORICAL_LOGS_DIR +from .database_managers.experiment_history_db_manager import ExperimentHistoryDbManager from .internal_logging import Logging -from log.log import Log -from autosubmitconfigparser.config.basicconfig import BasicConfig +from .platform_monitor.slurm_monitor import SlurmMonitor +from .strategies import PlatformInformationHandler, SingleAssociationStrategy, StraightWrapperAssociationStrategy, \ + TwoDimWrapperDistributionStrategy, GeneralizedWrapperDistributionStrategy SECONDS_WAIT_PLATFORM = 60 + class ExperimentHistory: - def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR, historiclog_dir_path=DEFAULT_HISTORICAL_LOGS_DIR): - self.expid = expid - BasicConfig.read() - self._log = Logging(expid, BasicConfig.HISTORICAL_LOG_DIR) - self._job_data_dir_path = BasicConfig.JOBDATA_DIR - self._historiclog_dir_path = BasicConfig.HISTORICAL_LOG_DIR - try: - self.manager = ExperimentHistoryDbManager(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - self.manager = None - - def initialize_database(self): - try: - self.manager.initialize() - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - self.manager = None - - def is_header_ready(self): - if self.manager: - return self.manager.is_header_ready_db_version() - return False - - - def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", - member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): - try: - next_counter = self._get_next_counter_by_job_name(job_name) - current_experiment_run = self.manager.get_experiment_run_dc_with_max_id() - job_data_dc = JobData(_id=0, - counter=next_counter, - job_name=job_name, - submit=submit, - status=status, - rowtype=self._get_defined_rowtype(wrapper_code), - ncpus=ncpus, - wallclock=wallclock, - qos=self._get_defined_queue_name(wrapper_queue, wrapper_code, qos), - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - children=children, - run_id=current_experiment_run.run_id) - return self.manager.register_submitted_job_data_dc(job_data_dc) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - return None - - def write_start_time(self, job_name, start=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", - member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): - try: - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, - status=status, - ncpus=ncpus, - wallclock=wallclock, - qos=qos, - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - wrapper_queue=wrapper_queue, - wrapper_code=wrapper_code) - self._log.log("write_start_time {0} start not found.".format(job_name)) - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - if not job_data_dc_last: - raise Exception("Job {0} has not been found in the database.".format(job_name)) - job_data_dc_last.start = start - job_data_dc_last.qos = self._get_defined_queue_name(wrapper_queue, wrapper_code, qos) - job_data_dc_last.status = status - job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) - job_data_dc_last.job_id = job_id - job_data_dc_last.children = children - return self.manager.update_job_data_dc_by_id(job_data_dc_last) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - - def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", - member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, - wrapper_queue=None, wrapper_code=None, children=""): - try: - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, - status=status, - ncpus=ncpus, - wallclock=wallclock, - qos=qos, - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - wrapper_queue=wrapper_queue, - wrapper_code=wrapper_code, - children=children) - self._log.log("write_finish_time {0} submit not found.".format(job_name)) - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - if not job_data_dc_last: - raise Exception("Job {0} has not been found in the database.".format(job_name)) - job_data_dc_last.finish = finish if finish > 0 else int(time()) - job_data_dc_last.status = status - job_data_dc_last.job_id = job_id - job_data_dc_last.rowstatus = Models.RowStatus.PENDING_PROCESS - job_data_dc_last.out = out_file if out_file else "" - job_data_dc_last.err = err_file if err_file else "" - return self.manager.update_job_data_dc_by_id(job_data_dc_last) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - - def write_platform_data_after_finish(self, job_data_dc, platform_obj): - """ - Call it in a thread. - """ - try: - sleep(SECONDS_WAIT_PLATFORM) - ssh_output = platform_obj.check_job_energy(job_data_dc.job_id) - slurm_monitor = SlurmMonitor(ssh_output) - self._verify_slurm_monitor(slurm_monitor, job_data_dc) - job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code) - job_data_dcs_in_wrapper = sorted([job for job in job_data_dcs_in_wrapper if job.status == "COMPLETED"], key=lambda x: x._id) - job_data_dcs_to_update = [] - if len(job_data_dcs_in_wrapper) > 0: - info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy(self._historiclog_dir_path)) - job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) - if len(job_data_dcs_to_update) == 0: - info_handler.strategy = TwoDimWrapperDistributionStrategy(self._historiclog_dir_path) - job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) - if len(job_data_dcs_to_update) == 0: - info_handler.strategy = GeneralizedWrapperDistributionStrategy(self._historiclog_dir_path) - job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) - else: - info_handler = PlatformInformationHandler(SingleAssociationStrategy(self._historiclog_dir_path)) - job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) - return self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_to_update) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - - def _verify_slurm_monitor(self, slurm_monitor, job_data_dc): - try: - if slurm_monitor.header.status not in ["COMPLETED", "FAILED"]: - self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input), - "Slurm status {0} is not COMPLETED nor FAILED for ID {1}.\n".format(slurm_monitor.header.status, slurm_monitor.header.name)) - Log.debug(f'Historical Database error: Slurm status {slurm_monitor.header.status} is not COMPLETED nor FAILED for ID {slurm_monitor.header.name}.') - if not slurm_monitor.steps_plus_extern_approximate_header_energy(): - self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input), - "Steps + extern != total energy for ID {0}. Number of steps {1}.\n".format(slurm_monitor.header.name, slurm_monitor.step_count)) - Log.debug(f'Historical Database error: Steps + extern != total energy for ID {slurm_monitor.header.name}. Number of steps {slurm_monitor.step_count}.') - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): - """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. """ - try: + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR, historiclog_dir_path=DEFAULT_HISTORICAL_LOGS_DIR): + self.expid = expid + BasicConfig.read() + self._log = Logging(expid, BasicConfig.HISTORICAL_LOG_DIR) + self._job_data_dir_path = BasicConfig.JOBDATA_DIR + self._historiclog_dir_path = BasicConfig.HISTORICAL_LOG_DIR + try: + self.manager = ExperimentHistoryDbManager(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + self.manager = None + + def initialize_database(self): + try: + self.manager.initialize() + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + self.manager = None + + def is_header_ready(self): + if self.manager: + return self.manager.is_header_ready_db_version() + return False + + def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, + wrapper_code=None, children=""): + try: + next_counter = self._get_next_counter_by_job_name(job_name) + current_experiment_run = self.manager.get_experiment_run_dc_with_max_id() + job_data_dc = JobData(_id=0, + counter=next_counter, + job_name=job_name, + submit=submit, + status=status, + rowtype=self._get_defined_rowtype(wrapper_code), + ncpus=ncpus, + wallclock=wallclock, + qos=self._get_defined_queue_name(wrapper_queue, wrapper_code, qos), + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + children=children, + run_id=current_experiment_run.run_id) + return self.manager.register_submitted_job_data_dc(job_data_dc) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + return None + + def write_start_time(self, job_name, start=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, + children=""): + try: + job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + if not job_data_dc_last: + job_data_dc_last = self.write_submit_time(job_name=job_name, + status=status, + ncpus=ncpus, + wallclock=wallclock, + qos=qos, + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + wrapper_queue=wrapper_queue, + wrapper_code=wrapper_code) + self._log.log("write_start_time {0} start not found.".format(job_name)) + job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + if not job_data_dc_last: + raise Exception("Job {0} has not been found in the database.".format(job_name)) + job_data_dc_last.start = start + job_data_dc_last.qos = self._get_defined_queue_name(wrapper_queue, wrapper_code, qos) + job_data_dc_last.status = status + job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) + job_data_dc_last.job_id = job_id + job_data_dc_last.children = children + return self.manager.update_job_data_dc_by_id(job_data_dc_last) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, + wrapper_queue=None, wrapper_code=None, children=""): + try: + job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + if not job_data_dc_last: + job_data_dc_last = self.write_submit_time(job_name=job_name, + status=status, + ncpus=ncpus, + wallclock=wallclock, + qos=qos, + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + wrapper_queue=wrapper_queue, + wrapper_code=wrapper_code, + children=children) + self._log.log("write_finish_time {0} submit not found.".format(job_name)) + job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + if not job_data_dc_last: + raise Exception("Job {0} has not been found in the database.".format(job_name)) + job_data_dc_last.finish = finish if finish > 0 else int(time()) + job_data_dc_last.status = status + job_data_dc_last.job_id = job_id + job_data_dc_last.rowstatus = Models.RowStatus.PENDING_PROCESS + job_data_dc_last.out = out_file if out_file else "" + job_data_dc_last.err = err_file if err_file else "" + return self.manager.update_job_data_dc_by_id(job_data_dc_last) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def write_platform_data_after_finish(self, job_data_dc, platform_obj): + """ + Call it in a thread. + """ + try: + sleep(SECONDS_WAIT_PLATFORM) + ssh_output = platform_obj.check_job_energy(job_data_dc.job_id) + slurm_monitor = SlurmMonitor(ssh_output) + self._verify_slurm_monitor(slurm_monitor, job_data_dc) + job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code) + job_data_dcs_in_wrapper = sorted([job for job in job_data_dcs_in_wrapper if job.status == "COMPLETED"], + key=lambda x: x._id) + job_data_dcs_to_update = [] + if len(job_data_dcs_in_wrapper) > 0: + info_handler = PlatformInformationHandler( + StraightWrapperAssociationStrategy(self._historiclog_dir_path)) + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, + slurm_monitor) + if len(job_data_dcs_to_update) == 0: + info_handler.strategy = TwoDimWrapperDistributionStrategy(self._historiclog_dir_path) + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, + slurm_monitor) + if len(job_data_dcs_to_update) == 0: + info_handler.strategy = GeneralizedWrapperDistributionStrategy(self._historiclog_dir_path) + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, + slurm_monitor) + else: + info_handler = PlatformInformationHandler(SingleAssociationStrategy(self._historiclog_dir_path)) + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, + slurm_monitor) + return self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_to_update) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def _verify_slurm_monitor(self, slurm_monitor, job_data_dc): + try: + if slurm_monitor.header.status not in ["COMPLETED", "FAILED"]: + self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, + slurm_monitor.original_input), + "Slurm status {0} is not COMPLETED nor FAILED for ID {1}.\n".format( + slurm_monitor.header.status, slurm_monitor.header.name)) + Log.debug( + f'Historical Database error: Slurm status {slurm_monitor.header.status} is not COMPLETED nor FAILED for ID {slurm_monitor.header.name}.') + if not slurm_monitor.steps_plus_extern_approximate_header_energy(): + self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, + slurm_monitor.original_input), + "Steps + extern != total energy for ID {0}. Number of steps {1}.\n".format( + slurm_monitor.header.name, slurm_monitor.step_count)) + Log.debug( + f'Historical Database error: Steps + extern != total energy for ID {slurm_monitor.header.name}. Number of steps {slurm_monitor.step_count}.') + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="", create=False): + """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. """ + try: + try: + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + update_these_changes = self._get_built_list_of_changes(job_list) + except: + current_experiment_run_dc = 0 + update_these_changes = [] + # ("no runs") + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), + current_experiment_run_dc, chunk_unit, chunk_size, + create) + if len(update_these_changes) > 0 and should_create_new_run == False: + self.manager.update_many_job_data_change_status(update_these_changes) + if should_create_new_run: + return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) + return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def _get_built_list_of_changes(self, job_list): + """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ + job_data_dcs = self.detect_changes_in_job_list(job_list) + return [(HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] + + def process_job_list_changes_to_experiment_totals(self, job_list=None): + """ Updates current experiment_run row with totals calculated from job_list. """ + try: + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, + new_chunk_size, create=False): + if create: + return True + elif not create and self.expid[0].lower() != "t": + if len(job_list) != current_experiment_run_dc.total: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)): + return True + return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) + + def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): + if not current_exp_run_dc: + return True + if current_exp_run_dc.chunk_unit != new_chunk_unit or current_exp_run_dc.chunk_size != new_chunk_size: + return True + return False + + def update_counts_on_experiment_run_dc(self, experiment_run_dc, job_list=None): + """ Return updated row as Models.ExperimentRun. """ + status_counts = self.get_status_counts_from_job_list(job_list) + experiment_run_dc.completed = status_counts[HUtils.SupportedStatus.COMPLETED] + experiment_run_dc.failed = status_counts[HUtils.SupportedStatus.FAILED] + experiment_run_dc.queuing = status_counts[HUtils.SupportedStatus.QUEUING] + experiment_run_dc.submitted = status_counts[HUtils.SupportedStatus.SUBMITTED] + experiment_run_dc.running = status_counts[HUtils.SupportedStatus.RUNNING] + experiment_run_dc.suspended = status_counts[HUtils.SupportedStatus.SUSPENDED] + experiment_run_dc.total = status_counts["TOTAL"] + return self.manager.update_experiment_run_dc_by_id(experiment_run_dc) + + def finish_current_experiment_run(self): + if self.manager.is_there_a_last_experiment_run(): current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() - update_these_changes = self._get_built_list_of_changes(job_list) - except: - current_experiment_run_dc = 0 - update_these_changes = [] - #("no runs") - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) - if len(update_these_changes) > 0 and should_create_new_run == False: - self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: - return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) - return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - - def _get_built_list_of_changes(self, job_list): - """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ - job_data_dcs = self.detect_changes_in_job_list(job_list) - return [(HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] - - def process_job_list_changes_to_experiment_totals(self, job_list=None): - """ Updates current experiment_run row with totals calculated from job_list. """ - try: - current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() - return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) - except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') - - - def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): - if create: - return True - elif not create and self.expid[0].lower() != "t": - if len(job_list) != current_experiment_run_dc.total: - return True - if changes_count > int(self._get_date_member_completed_count(job_list)): - return True - return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) - - def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): - if not current_exp_run_dc: - return True - if current_exp_run_dc.chunk_unit != new_chunk_unit or current_exp_run_dc.chunk_size != new_chunk_size: - return True - return False - - def update_counts_on_experiment_run_dc(self, experiment_run_dc, job_list=None): - """ Return updated row as Models.ExperimentRun. """ - status_counts = self.get_status_counts_from_job_list(job_list) - experiment_run_dc.completed = status_counts[HUtils.SupportedStatus.COMPLETED] - experiment_run_dc.failed = status_counts[HUtils.SupportedStatus.FAILED] - experiment_run_dc.queuing = status_counts[HUtils.SupportedStatus.QUEUING] - experiment_run_dc.submitted = status_counts[HUtils.SupportedStatus.SUBMITTED] - experiment_run_dc.running = status_counts[HUtils.SupportedStatus.RUNNING] - experiment_run_dc.suspended = status_counts[HUtils.SupportedStatus.SUSPENDED] - experiment_run_dc.total = status_counts["TOTAL"] - return self.manager.update_experiment_run_dc_by_id(experiment_run_dc) - - def finish_current_experiment_run(self): - if self.manager.is_there_a_last_experiment_run(): - current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() - current_experiment_run_dc.finish = int(time()) - return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc) - return None - - def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config="", job_list=None): - """ Also writes the finish timestamp of the previous run. """ - self.finish_current_experiment_run() - return self._create_new_experiment_run_dc_with_counts(chunk_unit=chunk_unit, chunk_size=chunk_size, current_config=current_config, job_list=job_list) - - def _create_new_experiment_run_dc_with_counts(self, chunk_unit, chunk_size, current_config="", job_list=None): - """ Create new experiment_run row and return the new Models.ExperimentRun data class from database. """ - status_counts = self.get_status_counts_from_job_list(job_list) - experiment_run_dc = ExperimentRun(0, - chunk_unit=chunk_unit, - chunk_size=chunk_size, - metadata=current_config, - start=int(time()), - completed=status_counts[HUtils.SupportedStatus.COMPLETED], - total=status_counts["TOTAL"], - failed=status_counts[HUtils.SupportedStatus.FAILED], - queuing=status_counts[HUtils.SupportedStatus.QUEUING], - running=status_counts[HUtils.SupportedStatus.RUNNING], - submitted=status_counts[HUtils.SupportedStatus.SUBMITTED], - suspended=status_counts[HUtils.SupportedStatus.SUSPENDED]) - return self.manager.register_experiment_run_dc(experiment_run_dc) - - def detect_changes_in_job_list(self, job_list): - """ Detect changes in job_list compared to the current contents of job_data table. Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {str(job.name): job for job in job_list} - current_job_data_dcs = self.manager.get_all_last_job_data_dcs() - differences = [] - for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job: - if job_dc.status != job_name_to_job[job_dc.job_name].status_str: - if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): - # If the job is not changing from a finalized status to a starting status - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) - return differences - - def _get_defined_rowtype(self, code): - if code: - return code - else: - return Models.RowType.NORMAL - - def _get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): - if wrapper_code and wrapper_code > 2 and wrapper_queue is not None and len(str(wrapper_queue)) > 0: - return wrapper_queue - return qos - - def _get_next_counter_by_job_name(self, job_name): - """ Return the counter attribute from the latest job data row by job_name. """ - job_data_dc = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - max_counter = self.manager.get_job_data_max_counter() - if job_data_dc: - return max(max_counter, job_data_dc.counter + 1) - else: - return max_counter - - def _get_date_member_completed_count(self, job_list): - """ Each item in the job_list must have attributes: date, member, status_str. """ - job_list = job_list if job_list else [] - return sum(1 for job in job_list if job.date is not None and job.member is not None and job.status_str == HUtils.SupportedStatus.COMPLETED) - - def get_status_counts_from_job_list(self, job_list): - """ - Return dict with keys COMPLETED, FAILED, QUEUING, SUBMITTED, RUNNING, SUSPENDED, TOTAL. - """ - result = { - HUtils.SupportedStatus.COMPLETED: 0, - HUtils.SupportedStatus.FAILED: 0, - HUtils.SupportedStatus.QUEUING: 0, - HUtils.SupportedStatus.SUBMITTED: 0, - HUtils.SupportedStatus.RUNNING: 0, - HUtils.SupportedStatus.SUSPENDED: 0, - "TOTAL": 0 - } - - if not job_list: - job_list = [] - - for job in job_list: - if job.status_str in result: - result[job.status_str] += 1 - result["TOTAL"] = len(job_list) - return result + current_experiment_run_dc.finish = int(time()) + return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc) + return None + + def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config="", job_list=None): + """ Also writes the finish timestamp of the previous run. """ + self.finish_current_experiment_run() + return self._create_new_experiment_run_dc_with_counts(chunk_unit=chunk_unit, chunk_size=chunk_size, + current_config=current_config, job_list=job_list) + + def _create_new_experiment_run_dc_with_counts(self, chunk_unit, chunk_size, current_config="", job_list=None): + """ Create new experiment_run row and return the new Models.ExperimentRun data class from database. """ + status_counts = self.get_status_counts_from_job_list(job_list) + experiment_run_dc = ExperimentRun(0, + chunk_unit=chunk_unit, + chunk_size=chunk_size, + metadata=current_config, + start=int(time()), + completed=status_counts[HUtils.SupportedStatus.COMPLETED], + total=status_counts["TOTAL"], + failed=status_counts[HUtils.SupportedStatus.FAILED], + queuing=status_counts[HUtils.SupportedStatus.QUEUING], + running=status_counts[HUtils.SupportedStatus.RUNNING], + submitted=status_counts[HUtils.SupportedStatus.SUBMITTED], + suspended=status_counts[HUtils.SupportedStatus.SUSPENDED]) + return self.manager.register_experiment_run_dc(experiment_run_dc) + + def detect_changes_in_job_list(self, job_list): + """ Detect changes in job_list compared to the current contents of job_data table. Returns a list of JobData data classes where the status of each item is the new status.""" + job_name_to_job = {str(job.name): job for job in job_list} + current_job_data_dcs = self.manager.get_all_last_job_data_dcs() + differences = [] + for job_dc in current_job_data_dcs: + if job_dc.job_name in job_name_to_job: + if job_dc.status != job_name_to_job[job_dc.job_name].status_str: + if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[ + job_dc.job_name].status_str in ["WAITING", "READY"]): + # If the job is not changing from a finalized status to a starting status + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) + return differences + + def _get_defined_rowtype(self, code): + if code: + return code + else: + return Models.RowType.NORMAL + + def _get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): + if wrapper_code and wrapper_code > 2 and wrapper_queue is not None and len(str(wrapper_queue)) > 0: + return wrapper_queue + return qos + + def _get_next_counter_by_job_name(self, job_name): + """ Return the counter attribute from the latest job data row by job_name. """ + job_data_dc = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + max_counter = self.manager.get_job_data_max_counter() + if job_data_dc: + return max(max_counter, job_data_dc.counter + 1) + else: + return max_counter + + def _get_date_member_completed_count(self, job_list): + """ Each item in the job_list must have attributes: date, member, status_str. """ + job_list = job_list if job_list else [] + return sum(1 for job in job_list if + job.date is not None and job.member is not None and job.status_str == HUtils.SupportedStatus.COMPLETED) + + def get_status_counts_from_job_list(self, job_list): + """ + Return dict with keys COMPLETED, FAILED, QUEUING, SUBMITTED, RUNNING, SUSPENDED, TOTAL. + """ + result = { + HUtils.SupportedStatus.COMPLETED: 0, + HUtils.SupportedStatus.FAILED: 0, + HUtils.SupportedStatus.QUEUING: 0, + HUtils.SupportedStatus.SUBMITTED: 0, + HUtils.SupportedStatus.RUNNING: 0, + HUtils.SupportedStatus.SUSPENDED: 0, + "TOTAL": 0 + } + + if not job_list: + job_list = [] + + for job in job_list: + if job.status_str in result: + result[job.status_str] += 1 + result["TOTAL"] = len(job_list) + return result diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 23bdb5d6f..81dbed820 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -245,9 +245,9 @@ class Job(object): self.updated_log = True self.ready_start_date = None self.log_retrieved = False - self.start_time_writted = False - self.submission_time_writted = False - + self.start_time_written = False + self.submit_time_timestamp = None # for wrappers, all jobs inside a wrapper are submitted at the same time + self.finish_time_timestamp = None # for wrappers, with inner_retrials, the submission time should be the last finish_time of the previous retrial def _init_runtime_parameters(self): # hetjobs self.het = {'HETSIZE': 0} @@ -261,6 +261,7 @@ class Job(object): self._memory = '' self._memory_per_task = '' self.log_retrieved = False + self.start_time_placeholder = "" @property @autosubmit_parameter(name='tasktype') @@ -908,7 +909,7 @@ class Job(object): """ return self.parents.__len__() - def _get_from_stat(self, index): + def _get_from_stat(self, index, fail_count =-1): """ Returns value from given row index position in STAT file associated to job @@ -917,7 +918,11 @@ class Job(object): :return: value in index position :rtype: int """ - logname = os.path.join(self._tmp_path, self.name + '_STAT') + if fail_count == -1: + logname = os.path.join(self._tmp_path, self.name + '_STAT') + else: + fail_count = str(fail_count) + logname = os.path.join(self._tmp_path, self.name + '_STAT_' + fail_count) if os.path.exists(logname): lines = open(logname).readlines() if len(lines) >= index + 1: @@ -947,23 +952,23 @@ class Job(object): lst.append(parse_date(fields[index])) return lst - def check_end_time(self): + def check_end_time(self, fail_count=-1): """ Returns end time from stat file :return: date and time :rtype: str """ - return self._get_from_stat(1) + return self._get_from_stat(1, fail_count) - def check_start_time(self): + def check_start_time(self, fail_count=-1): """ Returns job's start time :return: start time :rtype: str """ - return self._get_from_stat(0) + return self._get_from_stat(0,fail_count) def check_retrials_end_time(self): """ @@ -1009,26 +1014,17 @@ class Job(object): retrials_list.insert(0, retrial_dates) return retrials_list - def get_new_remotelog(self, platform, max_logs, last_log, stat_file): + def get_new_remotelog_name(self): """ - Checks if stat file exists on remote host + Checks if remote log file exists on remote host if it exists, remote_log variable is updated + :param """ try: - if self.wrapper_type == "vertical": - platform.check_stat_file_by_retrials(stat_file + str(max_logs), retries=1) - for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i), retries=1, first=False): - last_log = i - else: - break - remote_logs = (f"{self.script_name}.out.{last_log}", f"{self.script_name}.err.{last_log}") - else: - remote_logs = (f"{self.script_name}.out.{self._fail_count}", f"{self.script_name}.err.{self._fail_count}") - + remote_logs = (f"{self.script_name}.out.{self._fail_count}", f"{self.script_name}.err.{self._fail_count}") except BaseException as e: remote_logs = "" - Log.printlog(f"Trace {e} \n Failed to retrieve stat file for job {self.name}", 6000) + Log.printlog(f"Trace {e} \n Failed to retrieve log file for job {self.name}", 6000) return remote_logs def check_remote_log_exists(self, platform): @@ -1044,103 +1040,56 @@ class Job(object): err_exist = False return out_exist or err_exist - def retrieve_vertical_wrapper_logs(self, last_log, max_logs, platform, stat_file, max_retrials, fail_count): - """ - Retrieves log files from remote host meant to be used inside a daemon thread. - :param last_log: - :param max_logs: - :param platform: - :param stat_file: - :param max_retrials: - :param fail_count: - :return: - """ - lang = locale.getlocale()[1] or locale.getdefaultlocale()[1] or 'UTF-8' - exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, self.name[:4]) - tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) - time_stamp = "1970" - at_least_one_recovered = False - for _ in range(max_logs, last_log - 1, -1): - try: - if platform.get_stat_file_by_retrials(stat_file + str(max_logs)): - with open(os.path.join(tmp_path, stat_file + str(max_logs)), 'r+') as f: - total_stats = [f.readline()[:-1], f.readline()[:-1], f.readline()[:-1]] - try: - total_stats[0] = float(total_stats[0]) - total_stats[1] = float(total_stats[1]) - except Exception as e: - Log.debug(f"Trace {e} \n Failed to convert total stats to float, falling back to int") - total_stats[0] = int(str(total_stats[0]).split('.')[0]) - total_stats[1] = int(str(total_stats[1]).split('.')[0]) - if max_logs != (int(max_retrials) - fail_count): - time_stamp = date2str(datetime.datetime.fromtimestamp(total_stats[0]), 'S') - else: - with open(os.path.join(self._tmp_path, self.name + '_TOTAL_STATS_TMP'), 'rb+') as f2: - for line in f2.readlines(): - if len(line) > 0: - line = line.decode(lang) - time_stamp = line.split(" ")[0] - - self.write_total_stat_by_retries(total_stats, max_logs == (int(max_retrials) - fail_count)) - platform.remove_stat_file_by_retrials(stat_file + str(max_logs)) - l_log = (self.script_name[:-4] + "." + time_stamp + ".out", - self.script_name[:-4] + "." + time_stamp + ".err") - r_log = (self.remote_logs[0][:-1] + str(max_logs), self.remote_logs[1][:-1] + str(max_logs)) - self.synchronize_logs(platform, r_log, l_log, last=False) - platform.get_logs_files(self.expid, l_log) - try: - for local_log in l_log: - platform.write_jobid(self.id,os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.debug(f"Trace {e} \n Failed to write the {self.id} inside {l_log}") - at_least_one_recovered = True - except Exception as e: - Log.debug(f"Trace {e} \n Failed to retrieve log files for job {self.name}") - return False - return at_least_one_recovered - def retrieve_logfiles(self, platform, raise_error=False): """ Retrieves log files from remote host meant to be used inside a process. :param platform: platform that is calling the function, already connected. + :param raise_error: boolean to raise an error if the logs are not retrieved :return: """ backup_logname = copy.copy(self.local_logs) log_retrieved = False max_retrials = self.retrials - max_logs = int(max_retrials) - self._fail_count - last_log = int(max_retrials) - self._fail_count - stat_file = self.script_name[:-4] + "_STAT_" - self.remote_logs = self.get_new_remotelog(platform, max_logs, last_log, stat_file) + if self.wrapper_type == "vertical": + stat_file = self.script_name[:-4] + "_STAT_" + else: + stat_file = self.script_name[:-4] + "_STAT" + self.remote_logs = self.get_new_remotelog_name() if not self.remote_logs: self.log_retrieved = False else: if self.check_remote_log_exists(platform): - # retrieve logs and stat files - if self.wrapper_type is not None and self.wrapper_type == "vertical": - if self.retrieve_vertical_wrapper_logs(last_log, max_logs, platform, stat_file, max_retrials, self._fail_count): - log_retrieved = True - else: - try: - self.synchronize_logs(platform, self.remote_logs, self.local_logs) - remote_logs = copy.deepcopy(self.local_logs) - platform.get_logs_files(self.expid, remote_logs) - log_retrieved = True - except: - log_retrieved = False - # Update the logs with Autosubmit Job ID Brand - try: - for local_log in self.local_logs: - platform.write_jobid(self.id, os.path.join( - self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) + try: + self.synchronize_logs(platform, self.remote_logs, self.local_logs) + remote_logs = copy.deepcopy(self.local_logs) + platform.get_logs_files(self.expid, remote_logs) + log_retrieved = True + except BaseException: + log_retrieved = False self.log_retrieved = log_retrieved if not self.log_retrieved: self.local_logs = backup_logname Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) if raise_error: raise + else: + # Update the logs with Autosubmit Job ID Brand + try: + for local_log in self.local_logs: + platform.write_jobid(self.id, os.path.join( + self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except BaseException as e: + Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) + # write stats + if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones + for i in range(0,int(self.retrials)): + self.platform.get_stat_file(self.name, stat_file, count=i) + self.write_vertical_time() + self.inc_fail_count() + else: + self.platform.get_stat_file(self.name, stat_file) + self.write_start_time(from_stat_file=True) + self.write_end_time(self.status == Status.COMPLETED) def parse_time(self,wallclock): regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') @@ -1244,24 +1193,22 @@ class Job(object): # after checking the jobs , no job should have the status "submitted" Log.printlog("Job {0} in SUBMITTED status. This should never happen on this step..".format( self.name), 6008) - if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING, Status.COMPLETED]: - self.write_submit_time() + # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: if str(as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "true": self.retrieve_logfiles(self.platform) else: self.platform.add_job_to_log_recover(self) - if self.wrapper_type == "vertical": - max_logs = int(self.retrials) - for i in range(0,max_logs): - self.inc_fail_count() - else: - self.write_end_time(self.status == Status.COMPLETED) + if self.status in [Status.COMPLETED, Status.FAILED]: self.updated_log = False + # Write start_time() if not already written and job is running, completed or failed + if self.status in [Status.RUNNING, Status.COMPLETED, Status.FAILED] and not self.start_time_written: + self.write_start_time() + return self.status @staticmethod @@ -1655,6 +1602,14 @@ class Job(object): def update_dict_parameters(self,as_conf): self.retrials = as_conf.jobs_data.get(self.section,{}).get("RETRIALS", as_conf.experiment_data.get("CONFIG",{}).get("RETRIALS", 0)) + for wrapper_data in ( wrapper for wrapper in as_conf.experiment_data.get("WRAPPERS",{}).values() if type(wrapper) is dict): + jobs_in_wrapper = wrapper_data.get("JOBS_IN_WRAPPER", "").upper() + if "," in jobs_in_wrapper: + jobs_in_wrapper = jobs_in_wrapper.split(",") + else: + jobs_in_wrapper = jobs_in_wrapper.split(" ") + if self.section.upper() in jobs_in_wrapper: + self.retrials = wrapper_data.get("RETRIALS", self.retrials) self.splits = as_conf.jobs_data.get(self.section,{}).get("SPLITS", None) self.delete_when_edgeless = as_conf.jobs_data.get(self.section,{}).get("DELETE_WHEN_EDGELESS", True) self.dependencies = str(as_conf.jobs_data.get(self.section,{}).get("DEPENDENCIES","")) @@ -2157,135 +2112,126 @@ class Job(object): str(set(parameters) - set(variables))), 5013) return out - def write_submit_time(self, enabled=False, hold=False): + def write_submit_time(self, hold=False, enable_vertical_write=False, wrapper_submit_time=None): # type: (bool, bool) -> None """ Writes submit date and time to TOTAL_STATS file. It doesn't write if hold is True. """ - # print(traceback.format_stack()) - self.write_start_time() + + self.start_time_written = False + if wrapper_submit_time: + self.submit_time_timestamp = wrapper_submit_time + else: + self.submit_time_timestamp = date2str(datetime.datetime.now(), 'S') + if self.wrapper_type == "vertical" and self.fail_count > 0: + self.submit_time_timestamp = self.finish_time_timestamp + self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", f"{self.name}.{self.submit_time_timestamp}.err") # for wrappers with inner retrials + + if not enable_vertical_write and self.wrapper_type == "vertical": + return print(("Call from {} with status {}".format(self.name, self.status_str))) if hold is True: return # Do not write for HELD jobs. - data_time = ["",time.time()] - if self.wrapper_type != "vertical" or enabled: - path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') - else: - path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS_TMP') + + data_time = ["",int(datetime.datetime.strptime(self.submit_time_timestamp, "%Y%m%d%H%M%S").timestamp())] + path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') if os.path.exists(path): f = open(path, 'a') f.write('\n') else: f = open(path, 'w') - if not enabled: - f.write(date2str(datetime.datetime.now(), 'S')) - if self.wrapper_type == "vertical": - f.write(" "+str(time.time())) - else: - path2 = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS_TMP') - f2 = open(path2, 'r') - for line in f2.readlines(): - if len(line) > 0: - data_time = line.split(" ") - try: - data_time[1] = float(data_time[1]) - except Exception as e: - data_time[1] = int(data_time[1]) - f.write(data_time[0]) - f2.close() - try: - os.remove(path2) - except Exception as e: - pass - # Get + f.write(self.submit_time_timestamp) + # Writing database - if self.wrapper_type != "vertical" or enabled: - exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.write_submit_time(self.name, submit=data_time[1], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), - children=self.children_names_str) + exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.write_submit_time(self.name, submit=data_time[1], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + children=self.children_names_str) - def write_start_time(self, enabled = False): + def write_start_time(self, enable_vertical_write=False, from_stat_file=False, count=-1): """ Writes start date and time to TOTAL_STATS file :return: True if successful, False otherwise :rtype: bool """ - start_time = time.time() - if self.wrapper_type == "vertical": - timestamp = date2str(datetime.datetime.now(), 'S') - self.local_logs = (f"{self.name}.{timestamp}.out", f"{self.name}.{timestamp}.err") + if not enable_vertical_write and self.wrapper_type == "vertical": + return - elif self.wrapper_type != "vertical" or enabled: - start_time_ = self.check_start_time() + self.start_time_written = True + if not from_stat_file: # last known start time from AS + self.start_time_placeholder = time.time() + elif from_stat_file: + start_time_ = self.check_start_time(count) # last known start time from the .cmd file if start_time_: start_time = start_time_ - timestamp = date2str(datetime.datetime.now(), 'S') - - self.local_logs = (self.name + "." + timestamp + - ".out", self.name + "." + timestamp + ".err") - + else: + start_time = self.start_time_placeholder if self.start_time_placeholder else time.time() path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') f = open(path, 'a') f.write(' ') # noinspection PyTypeChecker f.write(date2str(datetime.datetime.fromtimestamp(start_time), 'S')) # Writing database - exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), - children=self.children_names_str) + exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + children=self.children_names_str) return True - def write_end_time(self, completed,enabled = False): + def write_vertical_time(self): + self.write_submit_time(enable_vertical_write=True) + self.write_start_time(enable_vertical_write=True, from_stat_file=True) + self.write_end_time(self.status == Status.COMPLETED, enable_vertical_write=True) + def write_end_time(self, completed, enable_vertical_write=False): """ Writes ends date and time to TOTAL_STATS file - :param enabled: :param completed: True if job was completed successfully, False otherwise :type completed: bool """ - if self.wrapper_type != "vertical" or enabled: - self._platform.get_stat_file(self.name, retries=5) - end_time = self.check_end_time() - path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') - f = open(path, 'a') - f.write(' ') - finish_time = None - final_status = None - if len(str(end_time)) > 0: - # noinspection PyTypeChecker - f.write(date2str(datetime.datetime.fromtimestamp(float(end_time)), 'S')) - # date2str(datetime.datetime.fromtimestamp(end_time), 'S') - finish_time = end_time - else: - f.write(date2str(datetime.datetime.now(), 'S')) - finish_time = time.time() # date2str(datetime.datetime.now(), 'S') - f.write(' ') - if completed: - final_status = "COMPLETED" - f.write('COMPLETED') - else: - final_status = "FAILED" - f.write('FAILED') - out, err = self.local_logs - path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out) - # Launch first as simple non-threaded function - exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - job_data_dc = exp_history.write_finish_time(self.name, finish=finish_time, status=final_status, ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, - wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) + if not enable_vertical_write and self.wrapper_type == "vertical": + return + end_time = self.check_end_time() + path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') + f = open(path, 'a') + f.write(' ') + finish_time = None + final_status = None + if end_time > 0: + # noinspection PyTypeChecker + f.write(date2str(datetime.datetime.fromtimestamp(float(end_time)), 'S')) + self.finish_time_timestamp = date2str(datetime.datetime.fromtimestamp(end_time),'S') + # date2str(datetime.datetime.fromtimestamp(end_time), 'S') + finish_time = end_time + else: + f.write(date2str(datetime.datetime.now(), 'S')) + self.finish_time_timestamp = date2str(datetime.datetime.now(), 'S') + finish_time = time.time() # date2str(datetime.datetime.now(), 'S') + f.write(' ') + if completed: + final_status = "COMPLETED" + f.write('COMPLETED') + else: + final_status = "FAILED" + f.write('FAILED') + out, err = self.local_logs + path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out) + # Launch first as simple non-threaded function + exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + job_data_dc = exp_history.write_finish_time(self.name, finish=finish_time, status=final_status, ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, + wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) - # Launch second as threaded function only for slurm - if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": - thread_write_finish = Thread(target=ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR).write_platform_data_after_finish, args=(job_data_dc, self.platform)) - thread_write_finish.name = "JOB_data_{}".format(self.name) - thread_write_finish.start() + # Launch second as threaded function only for slurm + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + thread_write_finish = Thread(target=ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR).write_platform_data_after_finish, args=(job_data_dc, self.platform)) + thread_write_finish.name = "JOB_data_{}".format(self.name) + thread_write_finish.start() - def write_total_stat_by_retries(self,total_stats, first_retrial = False): + def write_total_stat_by_retries(self, total_stats, first_retrial = False): """ Writes all data to TOTAL_STATS file :param total_stats: data gathered by the wrapper @@ -2294,8 +2240,6 @@ class Job(object): :type first_retrial: bool """ - if first_retrial: - self.write_submit_time(enabled=True) path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') f = open(path, 'a') if first_retrial: diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index 69d541352..3999a03b0 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -199,7 +199,7 @@ class StatisticsSnippetPython: locale.setlocale(locale.LC_ALL, 'C') job_name_ptrn = '%CURRENT_LOGDIR%/%JOBNAME%' stat_file = open(job_name_ptrn + '_STAT', 'w') - stat_file.write('{0:.0f}\\n'.format(time.time())) + stat_file.write('int({0:.0f})\\n'.format(time.time())) stat_file.close() ################### # Autosubmit Checkpoint @@ -228,7 +228,7 @@ class StatisticsSnippetPython: ################### stat_file = open(job_name_ptrn + '_STAT', 'a') - stat_file.write('{0:.0f}\\n'.format(time.time())) + stat_file.write('int({0:.0f})\\n'.format(time.time())) stat_file.close() open(job_name_ptrn + '_COMPLETED', 'a').close() exit(0) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 9fc95fbf6..103fa2f37 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -484,6 +484,8 @@ class JobPackager(object): built_packages_tmp = list() for param in self.wrapper_info: current_info.append(param[self.current_wrapper_section]) + current_info.append(self._as_config) + if self.wrapper_type[self.current_wrapper_section] == 'vertical': built_packages_tmp = self._build_vertical_packages(jobs, wrapper_limits,wrapper_info=current_info) elif self.wrapper_type[self.current_wrapper_section] == 'horizontal': @@ -595,9 +597,8 @@ class JobPackager(object): if job.packed is False: job.packed = True dict_jobs = self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) - job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock) + job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock,wrapper_info=wrapper_info) jobs_list = job_vertical_packager.build_vertical_package(job) - packages.append(JobPackageVertical(jobs_list, configuration=self._as_config,wrapper_section=self.current_wrapper_section,wrapper_info=wrapper_info)) else: @@ -718,6 +719,7 @@ class JobPackagerVertical(object): child = self.get_wrappable_child(job) # If not None, it is wrappable if child is not None and len(str(child)) > 0: + child.update_parameters(self.wrapper_info[-1],{}) # Calculate total wallclock per possible wrapper self.total_wallclock = sum_str_hours( self.total_wallclock, child.wallclock) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index f4a5eb164..7af882111 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -333,14 +333,15 @@ class JobPackageArray(JobPackageBase): package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) - if package_id is None or not package_id: + if package_id is None or not package_id: # platforms with a submit.cmd return - - for i in range(0, len(self.jobs)): + wrapper_time = None + for i in range(0, len(self.jobs)): # platforms without a submit.cmd Log.info("{0} submitted", self.jobs[i].name) self.jobs[i].id = str(package_id) + '[{0}]'.format(i) self.jobs[i].status = Status.SUBMITTED - self.jobs[i].write_submit_time(hold=hold) + self.jobs[i].write_submit_time(hold=hold,wrapper_submit_time=wrapper_time) + wrapper_time = self.jobs[i].write_submit_time class JobPackageThread(JobPackageBase): @@ -624,12 +625,13 @@ class JobPackageThread(JobPackageBase): if package_id is None or not package_id: return - - for i in range(0, len(self.jobs) ): + wrapper_time = None + for i in range(0, len(self.jobs)): Log.info("{0} submitted", self.jobs[i].name) self.jobs[i].id = str(package_id) - self.jobs[i].status = Status.SUBMITTED - self.jobs[i].write_submit_time(hold=hold) + self.jobs[i].status = Status.SUBMITTED + self.jobs[i].write_submit_time(hold=hold,wrapper_submit_time=wrapper_time) + wrapper_time = self.jobs[i].write_submit_time def _common_script_content(self): pass @@ -703,12 +705,14 @@ class JobPackageThreadWrapped(JobPackageThread): if package_id is None or not package_id: raise Exception('Submission failed') - + wrapper_time = None for i in range(0, len(self.jobs)): Log.info("{0} submitted", self.jobs[i].name) self.jobs[i].id = str(package_id) - self.jobs[i].status = Status.SUBMITTED - self.jobs[i].write_submit_time(hold=hold) + self.jobs[i].status = Status.SUBMITTED + self.jobs[i].write_submit_time(hold=hold,wrapper_submit_time=wrapper_time) + wrapper_time = self.jobs[i].write_submit_time + class JobPackageVertical(JobPackageThread): """ Class to manage a vertical thread-based package of jobs to be submitted by autosubmit diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 83e0d416b..ac3d09eab 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -335,6 +335,7 @@ class Platform(object): raise except Exception as e: raise + return save, failed_packages, error_message, valid_packages_to_submit @property @@ -644,7 +645,7 @@ class Platform(object): def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): return True - def get_stat_file(self, job_name, retries=0): + def get_stat_file(self, job_name, retries=0, count = -1): """ Copies *STAT* files from remote to local @@ -655,7 +656,10 @@ class Platform(object): :return: True if successful, False otherwise :rtype: bool """ - filename = job_name + '_STAT' + if count == -1: # No internal retrials + filename = job_name + '_STAT' + else: + filename = job_name + '_STAT_{0}'.format(str(count)) stat_local_path = os.path.join( self.config.get("LOCAL_ROOT_DIR"), self.expid, self.config.get("LOCAL_TMP_DIR"), filename) if os.path.exists(stat_local_path): @@ -667,48 +671,6 @@ class Platform(object): Log.debug('{0}_STAT file not found', job_name) return False - def check_stat_file_by_retrials(self, job_name, retries=3, first=True): - """ - check *STAT* file - - :param retries: number of intents to get the completed files - :type retries: int - :param first: First time calling the function - :type first: int - :param job_name: name of job to check - :type job_name: str - :return: True if successful, False otherwise - :rtype: bool - """ - filename = job_name - if self.check_file_exists(filename,first=first,max_retries = retries): - return True - else: - return False - - def get_stat_file_by_retrials(self, job_name, retries=0): - """ - Copies *STAT* files from remote to local - - :param retries: number of intents to get the completed files - :type retries: int - :param job_name: name of job to check - :type job_name: str - :return: True if successful, False otherwise - :rtype: bool - """ - filename = job_name - stat_local_path = os.path.join( - self.config.get("LOCAL_ROOT_DIR"), self.expid, self.config.get("LOCAL_TMP_DIR"), filename) - if os.path.exists(stat_local_path): - os.remove(stat_local_path) - if self.check_file_exists(filename): - if self.get_file(filename, True): - return True - else: - return False - else: - return False @autosubmit_parameter(name='current_logdir') def get_files_path(self): @@ -857,13 +819,20 @@ class Platform(object): while True: try: job,children = self.recovery_queue.get() - if (f'{job.name}_{job.fail_count}') in job_names_processed: - continue + if job.wrapper_type != "vertical": + if f'{job.name}_{job.fail_count}' in job_names_processed: + continue + else: + if f'{job.name}' in job_names_processed: + continue job.children = children job.platform = self try: job.retrieve_logfiles(self, raise_error=True) - job_names_processed.add(f'{job.name}_{job.fail_count}') + if job.wrapper_type != "vertical": + job_names_processed.add(f'{job.name}_{job.fail_count}') + else: + job_names_processed.add(f'{job.name}') except: pass except queue.Empty: diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 8b3daae85..c52a6c0e1 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -170,7 +170,6 @@ class SlurmPlatform(ParamikoPlatform): job.hold = hold job.id = str(jobs_id[i]) job.status = Status.SUBMITTED - job.write_submit_time(hold=hold) # Check if there are duplicated jobnames if not duplicated_jobs_already_checked: job_name = package.name if hasattr(package, "name") else package.jobs[0].name diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index df348f6dd..6ab487251 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -454,7 +454,7 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): while job_retrials >= 0 and not completed: current = {1} current.start() - os.system("echo "+str(time.time())+" > "+scripts[i][:-4]+"_STAT_"+str(job_retrials)) #Start/submit running + os.system("echo "+str(int(time.time()))+" > "+scripts[i][:-4]+"_STAT_"+str(job_retrials)) #Start/submit running current.join({3}) job_retrials = job_retrials - 1 total_steps = total_steps + 1 @@ -467,7 +467,7 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): failed_filename = {0}[i].replace('.cmd', '_FAILED') failed_path = os.path.join(os.getcwd(), failed_filename) failed_wrapper = os.path.join(os.getcwd(), wrapper_id) - os.system("echo "+str(time.time())+" >> "+scripts[i][:-4]+"_STAT_"+str(job_retrials+1)) #Completed + os.system("echo "+str(int(time.time()))+" >> "+scripts[i][:-4]+"_STAT_"+str(job_retrials+1)) #Completed if os.path.exists(completed_path): completed = True print(datetime.now(), "The job ", current.template," has been COMPLETED") -- GitLab From 71d0c39b9d82f38a5dac16fd69e09d55cddffef1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 15 Apr 2024 18:21:03 +0200 Subject: [PATCH 17/20] Fix horizontal-vertical Fix vertical-horizontal --- autosubmit/job/job_packager.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 103fa2f37..11d4d8270 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -606,6 +606,7 @@ class JobPackager(object): return packages def _build_hybrid_package(self, jobs_list, wrapper_limits, section,wrapper_info={}): + self.wrapper_info = wrapper_info jobs_resources = dict() jobs_resources['MACHINEFILES'] = self._as_config.get_wrapper_machinefiles() @@ -621,12 +622,12 @@ class JobPackager(object): def _build_horizontal_vertical_package(self, horizontal_packager, section, jobs_resources): total_wallclock = '00:00' - horizontal_package = horizontal_packager.build_horizontal_package() + horizontal_package = horizontal_packager.build_horizontal_package(wrapper_info=self.wrapper_info) horizontal_packager.create_sections_order(section) horizontal_packager.add_sectioncombo_processors( horizontal_packager.total_processors) horizontal_package.sort( - key=lambda job: horizontal_packager.sort_by_expression(job.name)) + key=lambda job: horizontal_packager.sort_by_expression(job.section)) job = max(horizontal_package, key=attrgetter('total_wallclock')) wallclock = job.wallclock current_package = [horizontal_package] @@ -664,7 +665,7 @@ class JobPackager(object): dict_jobs = self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) job_list = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, - self._platform.max_wallclock).build_vertical_package(job) + self._platform.max_wallclock,wrapper_info=self.wrapper_info).build_vertical_package(job) current_package.append(list(set(job_list))) for job in current_package[-1]: @@ -858,14 +859,16 @@ class JobPackagerHorizontal(object): self._maxTotalProcessors = 0 self._sectionList = list() self._package_sections = dict() - - def build_horizontal_package(self, horizontal_vertical=False): + self.wrapper_info = {} + def build_horizontal_package(self, horizontal_vertical=False,wrapper_info={}): + self.wrapper_info = wrapper_info current_package = [] current_package_by_section = {} if horizontal_vertical: self._current_processors = 0 jobs_by_section = dict() for job in self.job_list: + job.update_parameters(self.wrapper_info[-1],{}) if job.section not in jobs_by_section: jobs_by_section[job.section] = list() jobs_by_section[job.section].append(job) @@ -920,9 +923,8 @@ class JobPackagerHorizontal(object): max(self._package_sections.values()), self._maxTotalProcessors) return True - def sort_by_expression(self, jobname): - jobname = jobname.split('_')[-1] - return self._sort_order_dict[jobname] + def sort_by_expression(self, section): + return self._sort_order_dict[section] def get_next_packages(self, jobs_sections, max_wallclock=None, potential_dependency=None, packages_remote_dependencies=list(), horizontal_vertical=False, max_procs=0): packages = [] @@ -941,12 +943,13 @@ class JobPackagerHorizontal(object): if other_parent.status != Status.COMPLETED and other_parent not in self.job_list: wrappable = False if wrappable and child not in next_section_list: + child.update_parameters(self.wrapper_info[-1],{}) next_section_list.append(child) next_section_list.sort( - key=lambda job: self.sort_by_expression(job.name)) + key=lambda job: self.sort_by_expression(job.section)) self.job_list = next_section_list - package_jobs = self.build_horizontal_package(horizontal_vertical) + package_jobs = self.build_horizontal_package(horizontal_vertical,wrapper_info=self.wrapper_info) if package_jobs: sections_aux = set() -- GitLab From 3b29882c4ae3567dc1b04317c7354dd96359d3f2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 16 Apr 2024 12:42:49 +0200 Subject: [PATCH 18/20] fix tests --- autosubmit/job/job_packager.py | 2 +- autosubmit/job/job_packages.py | 2 +- test/unit/test_wrappers.py | 44 +++++++++++++++++++--------------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 11d4d8270..d3eda6a82 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -859,7 +859,7 @@ class JobPackagerHorizontal(object): self._maxTotalProcessors = 0 self._sectionList = list() self._package_sections = dict() - self.wrapper_info = {} + self.wrapper_info = [] def build_horizontal_package(self, horizontal_vertical=False,wrapper_info={}): self.wrapper_info = wrapper_info current_package = [] diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 7af882111..581738da4 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -720,7 +720,7 @@ class JobPackageVertical(JobPackageThread): :type jobs: :param: dependency: """ - def __init__(self, jobs, dependency=None,configuration=None,wrapper_section="WRAPPERS", wrapper_info = {}): + def __init__(self, jobs, dependency=None,configuration=None,wrapper_section="WRAPPERS", wrapper_info = []): self._num_processors = 0 for job in jobs: if int(job.processors) >= int(self._num_processors): diff --git a/test/unit/test_wrappers.py b/test/unit/test_wrappers.py index c005020b8..62ff9bc8d 100644 --- a/test/unit/test_wrappers.py +++ b/test/unit/test_wrappers.py @@ -172,6 +172,7 @@ class TestWrappers(TestCase): self.temp_directory = tempfile.mkdtemp() self.job_list = JobList(self.experiment_id, self.config, YAMLParserFactory(), JobListPersistenceDb(self.temp_directory, 'db'),self.as_conf) + self.parser_mock = MagicMock(spec='SafeConfigParser') self._platform.max_waiting_jobs = 100 @@ -200,6 +201,8 @@ class TestWrappers(TestCase): self.job_packager = JobPackager( self.as_conf, self._platform, self.job_list) self.job_list._ordered_jobs_by_date_member["WRAPPERS"] = dict() + self.wrapper_info = ['vertical', 'flexible', 'asthread', ['SIM'], 0,self.as_conf] + def tearDown(self) -> None: shutil.rmtree(self.temp_directory) @@ -272,8 +275,10 @@ class TestWrappers(TestCase): wrapper_limits["min_v"] = 2 wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section + + returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits, self.wrapper_info) package_m1_s2 = [d1_m1_1_s2, d1_m1_2_s2, d1_m1_3_s2, d1_m1_4_s2, d1_m1_5_s2, d1_m1_6_s2, d1_m1_7_s2, d1_m1_8_s2, d1_m1_9_s2, d1_m1_10_s2] @@ -354,7 +359,7 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits, wrapper_info=self.wrapper_info) package_m1_s2 = [d1_m1_1_s2, d1_m1_2_s2, d1_m1_3_s2, d1_m1_4_s2, d1_m1_5_s2, d1_m1_6_s2, d1_m1_7_s2, d1_m1_8_s2, d1_m1_9_s2, d1_m1_10_s2] @@ -362,7 +367,7 @@ class TestWrappers(TestCase): d1_m2_9_s2, d1_m2_10_s2] packages = [JobPackageVertical( - package_m1_s2,configuration=self.as_conf), JobPackageVertical(package_m2_s2,configuration=self.as_conf)] + package_m1_s2,configuration=self.as_conf, wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2,configuration=self.as_conf, wrapper_info=self.wrapper_info)] for i in range(0, len(returned_packages)): self.assertListEqual(returned_packages[i]._jobs, packages[i]._jobs) @@ -424,7 +429,7 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits, self.wrapper_info) package_m1_s2 = [d1_m1_1_s2, d1_m1_2_s2, d1_m1_3_s2, d1_m1_4_s2, d1_m1_5_s2] @@ -432,7 +437,7 @@ class TestWrappers(TestCase): d1_m2_3_s2, d1_m2_4_s2, d1_m2_5_s2] packages = [JobPackageVertical( - package_m1_s2,configuration=self.as_conf), JobPackageVertical(package_m2_s2,configuration=self.as_conf)] + package_m1_s2,configuration=self.as_conf,wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2,configuration=self.as_conf,wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -495,7 +500,7 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits, self.wrapper_info) package_m1_s2 = [d1_m1_1_s2, d1_m1_2_s2, d1_m1_3_s2, d1_m1_4_s2, d1_m1_5_s2] @@ -503,7 +508,7 @@ class TestWrappers(TestCase): d1_m2_3_s2, d1_m2_4_s2, d1_m2_5_s2] packages = [JobPackageVertical( - package_m1_s2,configuration=self.as_conf), JobPackageVertical(package_m2_s2,configuration=self.as_conf)] + package_m1_s2,configuration=self.as_conf, wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2,configuration=self.as_conf, wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -646,7 +651,7 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits,wrapper_info=self.wrapper_info) package_m1_s2_s3 = [d1_m1_1_s2, d1_m1_1_s3, d1_m1_2_s2, d1_m1_2_s3, d1_m1_3_s2, d1_m1_3_s3, d1_m1_4_s2, d1_m1_4_s3] @@ -654,7 +659,7 @@ class TestWrappers(TestCase): d1_m2_4_s3] packages = [JobPackageVertical( - package_m1_s2_s3,configuration=self.as_conf), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf)] + package_m1_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -726,12 +731,12 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapper_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits,wrapper_info=self.wrapper_info) package_m1_s2_s3 = [d1_m1_1_s2, d1_m1_1_s3, d1_m1_2_s2, d1_m1_2_s3, d1_m1_3_s2, d1_m1_3_s3, d1_m1_4_s2, d1_m1_4_s3] - packages = [JobPackageVertical(package_m1_s2_s3,configuration=self.as_conf)] + packages = [JobPackageVertical(package_m1_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -805,7 +810,7 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits, wrapper_info=self.wrapper_info) package_m1_s2_s3 = [d1_m1_1_s2, d1_m1_1_s3, d1_m1_2_s2, d1_m1_2_s3, d1_m1_3_s2, d1_m1_3_s3, d1_m1_4_s2, d1_m1_4_s3] @@ -813,7 +818,7 @@ class TestWrappers(TestCase): d1_m2_4_s3] packages = [JobPackageVertical( - package_m1_s2_s3,configuration=self.as_conf), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf)] + package_m1_s2_s3,configuration=self.as_conf, wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf, wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] # print("test_returned_packages_max_jobs_mixed_wrapper") @@ -895,7 +900,7 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits,wrapper_info=self.wrapper_info) package_m1_s2_s3 = [d1_m1_1_s2, d1_m1_1_s3, d1_m1_2_s2, d1_m1_2_s3, d1_m1_3_s2] @@ -903,7 +908,7 @@ class TestWrappers(TestCase): d1_m2_2_s2, d1_m2_2_s3, d1_m2_3_s2] packages = [JobPackageVertical( - package_m1_s2_s3,configuration=self.as_conf), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf)] + package_m1_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -977,13 +982,13 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits,wrapper_info=self.wrapper_info) package_m1_s2_s3 = [d1_m1_1_s2, d1_m1_1_s3, d1_m1_2_s2, d1_m1_2_s3] package_m2_s2_s3 = [d1_m2_1_s2, d1_m2_1_s3, d1_m2_2_s2, d1_m2_2_s3] packages = [JobPackageVertical( - package_m1_s2_s3,configuration=self.as_conf), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf)] + package_m1_s2_s3,configuration=self.as_conf, wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf, wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -1075,13 +1080,13 @@ class TestWrappers(TestCase): wrapper_limits["min_h"] = 2 wrapper_limits["max_by_section"] = max_wrapped_job_by_section returned_packages = self.job_packager._build_vertical_packages( - section_list, wrapper_limits) + section_list, wrapper_limits, wrapper_info=self.wrapper_info) package_m1_s2_s3 = [d1_m1_2_s3, d1_m1_3_s3, d1_m1_4_s2, d1_m1_4_s3] package_m2_s2_s3 = [d1_m2_3_s2, d1_m2_3_s3, d1_m2_4_s2, d1_m2_4_s3] packages = [JobPackageVertical( - package_m1_s2_s3,configuration=self.as_conf), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf)] + package_m1_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info), JobPackageVertical(package_m2_s2_s3,configuration=self.as_conf,wrapper_info=self.wrapper_info)] #returned_packages = returned_packages[0] for i in range(0, len(returned_packages)): @@ -1879,6 +1884,7 @@ class TestWrappers(TestCase): self._manage_dependencies(sections_dict) for job in self.job_list.get_job_list(): job._init_runtime_parameters() + job.update_parameters = MagicMock() def _manage_dependencies(self, sections_dict): for job in self.job_list.get_job_list(): -- GitLab From 47bd0f7e88d994babdf193e36ddc83b1eba0ee54 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 16 Apr 2024 13:36:46 +0200 Subject: [PATCH 19/20] Fix last retrial STAT --- autosubmit/job/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 81dbed820..34526c420 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1082,7 +1082,7 @@ class Job(object): Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) # write stats if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones - for i in range(0,int(self.retrials)): + for i in range(0,int(self.retrials+1)): self.platform.get_stat_file(self.name, stat_file, count=i) self.write_vertical_time() self.inc_fail_count() -- GitLab From 0a7b785155faac389152f9a39fe05fd534c7e28e Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 17 Apr 2024 16:03:21 +0200 Subject: [PATCH 20/20] Last commit of logs --- autosubmit/autosubmit.py | 2 +- autosubmit/job/job.py | 105 ++++++++++++------ autosubmit/job/job_list.py | 33 +++--- .../platforms/wrappers/wrapper_builder.py | 24 ++-- 4 files changed, 96 insertions(+), 68 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 5d040a6fa..b1a3968b9 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2439,7 +2439,7 @@ class Autosubmit: wrapper_time = None for job in package.jobs: # if jobs > 1 == wrapped == same submission time job.write_submit_time(wrapper_submit_time=wrapper_time) - wrapper_time = job.write_submit_time + wrapper_time = job.submit_time_timestamp if save_1 or save_2: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 34526c420..bb6e3244b 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -242,7 +242,7 @@ class Job(object): self.delete_when_edgeless = False # hetjobs self.het = None - self.updated_log = True + self.updated_log = False self.ready_start_date = None self.log_retrieved = False self.start_time_written = False @@ -1014,14 +1014,16 @@ class Job(object): retrials_list.insert(0, retrial_dates) return retrials_list - def get_new_remotelog_name(self): + def get_new_remotelog_name(self, count = -1): """ Checks if remote log file exists on remote host if it exists, remote_log variable is updated :param """ + if count == -1: + count = self._fail_count try: - remote_logs = (f"{self.script_name}.out.{self._fail_count}", f"{self.script_name}.err.{self._fail_count}") + remote_logs = (f"{self.script_name}.out.{count}", f"{self.script_name}.err.{count}") except BaseException as e: remote_logs = "" Log.printlog(f"Trace {e} \n Failed to retrieve log file for job {self.name}", 6000) @@ -1040,20 +1042,8 @@ class Job(object): err_exist = False return out_exist or err_exist - def retrieve_logfiles(self, platform, raise_error=False): - """ - Retrieves log files from remote host meant to be used inside a process. - :param platform: platform that is calling the function, already connected. - :param raise_error: boolean to raise an error if the logs are not retrieved - :return: - """ - backup_logname = copy.copy(self.local_logs) + def retrieve_external_retrials_logfiles(self, platform): log_retrieved = False - max_retrials = self.retrials - if self.wrapper_type == "vertical": - stat_file = self.script_name[:-4] + "_STAT_" - else: - stat_file = self.script_name[:-4] + "_STAT" self.remote_logs = self.get_new_remotelog_name() if not self.remote_logs: self.log_retrieved = False @@ -1067,6 +1057,42 @@ class Job(object): except BaseException: log_retrieved = False self.log_retrieved = log_retrieved + + def retrieve_internal_retrials_logfiles(self, platform): + log_retrieved = False + original = copy.deepcopy(self.local_logs) + for i in range(0, int(self.retrials + 1)): + if i > 0: + self.local_logs = (original[0][:-4] + "_{0}".format(i) + ".out", original[1][:-4] + "_{0}".format(i) + ".err") + self.remote_logs = self.get_new_remotelog_name(i) + if not self.remote_logs: + self.log_retrieved = False + else: + if self.check_remote_log_exists(platform): + try: + self.synchronize_logs(platform, self.remote_logs, self.local_logs) + remote_logs = copy.deepcopy(self.local_logs) + platform.get_logs_files(self.expid, remote_logs) + log_retrieved = True + except BaseException: + log_retrieved = False + self.log_retrieved = log_retrieved + def retrieve_logfiles(self, platform, raise_error=False): + """ + Retrieves log files from remote host meant to be used inside a process. + :param platform: platform that is calling the function, already connected. + :param raise_error: boolean to raise an error if the logs are not retrieved + :return: + """ + backup_logname = copy.copy(self.local_logs) + + if self.wrapper_type == "vertical": + stat_file = self.script_name[:-4] + "_STAT_" + self.retrieve_internal_retrials_logfiles(platform) + else: + stat_file = self.script_name[:-4] + "_STAT" + self.retrieve_external_retrials_logfiles(platform) + if not self.log_retrieved: self.local_logs = backup_logname Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) @@ -1083,9 +1109,9 @@ class Job(object): # write stats if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones for i in range(0,int(self.retrials+1)): - self.platform.get_stat_file(self.name, stat_file, count=i) - self.write_vertical_time() - self.inc_fail_count() + if self.platform.get_stat_file(self.name, stat_file, count=i): + self.write_vertical_time(i) + self.inc_fail_count() else: self.platform.get_stat_file(self.name, stat_file) self.write_start_time(from_stat_file=True) @@ -1193,6 +1219,12 @@ class Job(object): # after checking the jobs , no job should have the status "submitted" Log.printlog("Job {0} in SUBMITTED status. This should never happen on this step..".format( self.name), 6008) + if self.status in [Status.COMPLETED, Status.FAILED]: + self.updated_log = False + + # # Write start_time() if not already written and job is running, completed or failed + # if self.status in [Status.RUNNING, Status.COMPLETED, Status.FAILED] and not self.start_time_written: + # self.write_start_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: @@ -1202,12 +1234,7 @@ class Job(object): self.platform.add_job_to_log_recover(self) - if self.status in [Status.COMPLETED, Status.FAILED]: - self.updated_log = False - # Write start_time() if not already written and job is running, completed or failed - if self.status in [Status.RUNNING, Status.COMPLETED, Status.FAILED] and not self.start_time_written: - self.write_start_time() return self.status @@ -2119,16 +2146,19 @@ class Job(object): """ self.start_time_written = False - if wrapper_submit_time: - self.submit_time_timestamp = wrapper_submit_time - else: - self.submit_time_timestamp = date2str(datetime.datetime.now(), 'S') + if not enable_vertical_write: + if wrapper_submit_time: + self.submit_time_timestamp = wrapper_submit_time + else: + self.submit_time_timestamp = date2str(datetime.datetime.now(), 'S') + if self.wrapper_type != "vertical": + self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", f"{self.name}.{self.submit_time_timestamp}.err") # for wrappers with inner retrials + else: + self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", + f"{self.name}.{self.submit_time_timestamp}.err") # for wrappers with inner retrials + return if self.wrapper_type == "vertical" and self.fail_count > 0: self.submit_time_timestamp = self.finish_time_timestamp - self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", f"{self.name}.{self.submit_time_timestamp}.err") # for wrappers with inner retrials - - if not enable_vertical_write and self.wrapper_type == "vertical": - return print(("Call from {} with status {}".format(self.name, self.status_str))) if hold is True: return # Do not write for HELD jobs. @@ -2181,11 +2211,12 @@ class Job(object): children=self.children_names_str) return True - def write_vertical_time(self): + def write_vertical_time(self, count=-1): self.write_submit_time(enable_vertical_write=True) - self.write_start_time(enable_vertical_write=True, from_stat_file=True) - self.write_end_time(self.status == Status.COMPLETED, enable_vertical_write=True) - def write_end_time(self, completed, enable_vertical_write=False): + self.write_start_time(enable_vertical_write=True, from_stat_file=True, count=count) + self.write_end_time(self.status == Status.COMPLETED, enable_vertical_write=True, count=count) + + def write_end_time(self, completed, enable_vertical_write=False, count = -1): """ Writes ends date and time to TOTAL_STATS file :param completed: True if job was completed successfully, False otherwise @@ -2193,7 +2224,7 @@ class Job(object): """ if not enable_vertical_write and self.wrapper_type == "vertical": return - end_time = self.check_end_time() + end_time = self.check_end_time(count) path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') f = open(path, 'a') f.write(' ') diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 5f52781c8..0cd615b78 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2539,28 +2539,23 @@ class JobList(object): """ Updates the log err and log out. """ - if not hasattr(job, - "updated_log") or not job.updated_log: # hasattr for backward compatibility (job.updated_logs is only for newer jobs, as the loaded ones may not have this set yet) + if not hasattr(job,"updated_log") or not job.updated_log: # hasattr for backward compatibility (job.updated_logs is only for newer jobs, as the loaded ones may not have this set yet) # order path_to_logs by name and get the two last element - err = "" - out = "" - log_file = None - for log_file in sorted(self.path_to_logs.glob(f"{job.name}.*"))[-3:]: # cmd, err, out - if "err" in log_file.suffix: - err = log_file.name - elif "out" in log_file.suffix: - out = log_file.name - if out != "" or err != "": - if out and not err: - err = out[-3] + ".err" - else: - out = err[-3] + ".out" - job.local_logs = (out, err) - job.remote_logs = (out, err) + log_file = False + if job.wrapper_type == "vertical" and job.fail_count > 0: + for log_recovered in self.path_to_logs.glob(f"{job.name}.*._{job.fail_count}.out"): + if job.local_logs[0][-4] in log_recovered.name: + log_file = True + break + else: + for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): + if job.local_logs[0] == log_recovered.name: + log_file = True + break if log_file: - if not hasattr(job, "ready_start_date") or not job.ready_start_date or log_file.name.split(".")[ - -2] >= job.ready_start_date: # hasattr for backward compatibility + if not hasattr(job, "ready_start_date") or not job.ready_start_date or job.local_logs[0] >= job.ready_start_date: # hasattr for backward compatibility + job.local_logs = (log_recovered.name, log_recovered.name[:-4] + ".err") job.updated_log = True if not job.updated_log and str(as_conf.platforms_data.get(job.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": job.platform.add_job_to_log_recover(job) diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 6ab487251..d40a985d1 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -451,12 +451,12 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): for i in range(len({0})): job_retrials = retrials completed = False - while job_retrials >= 0 and not completed: + fail_count = 0 + while fail_count <= job_retrials and not completed: current = {1} current.start() - os.system("echo "+str(int(time.time()))+" > "+scripts[i][:-4]+"_STAT_"+str(job_retrials)) #Start/submit running + os.system("echo "+str(int(time.time()))+" > "+scripts[i][:-4]+"_STAT_"+str(fail_count)) #Start/submit running current.join({3}) - job_retrials = job_retrials - 1 total_steps = total_steps + 1 """).format(jobs_list, thread,self.retrials,str(self.wallclock_by_level),'\n'.ljust(13)) @@ -467,15 +467,17 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): failed_filename = {0}[i].replace('.cmd', '_FAILED') failed_path = os.path.join(os.getcwd(), failed_filename) failed_wrapper = os.path.join(os.getcwd(), wrapper_id) - os.system("echo "+str(int(time.time()))+" >> "+scripts[i][:-4]+"_STAT_"+str(job_retrials+1)) #Completed + os.system("echo "+str(int(time.time()))+" >> "+scripts[i][:-4]+"_STAT_"+str(fail_count)) #Completed if os.path.exists(completed_path): completed = True print(datetime.now(), "The job ", current.template," has been COMPLETED") - os.system("echo COMPLETED >> " + scripts[i][:-4]+"_STAT_"+str(job_retrials+1)) + os.system("echo COMPLETED >> " + scripts[i][:-4]+"_STAT_"+str(fail_count)) else: print(datetime.now(), "The job ", current.template," has FAILED") - os.system("echo FAILED >> " + scripts[i][:-4]+"_STAT_"+str(job_retrials+1)) + os.system("echo FAILED >> " + scripts[i][:-4]+"_STAT_"+str(fail_count)) #{1} + fail_count = fail_count + 1 + """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) sequential_threads_launcher += self._indent(textwrap.dedent(""" if not os.path.exists(completed_path): @@ -493,17 +495,17 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): def build_job_thread(self): # fastlook return textwrap.dedent(""" class JobThread(Thread): - def __init__ (self, template, id_run, retrials): + def __init__ (self, template, id_run, retrials, fail_count): Thread.__init__(self) self.template = template self.id_run = id_run self.retrials = retrials + self.fail_count = fail_count def run(self): jobname = self.template.replace('.cmd', '') - #os.system("echo $(date +%s) > "+jobname+"_STAT") - out = str(self.template) + ".out." + str(self.retrials) - err = str(self.template) + ".err." + str(self.retrials) + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." + str(self.fail_count) print((out+"\\n")) command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() print((command+"\\n")) @@ -515,7 +517,7 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): """).format(str(self.wallclock_by_level),'\n'.ljust(13)) def build_main(self): self.exit_thread = "os._exit(1)" - return self.build_sequential_threads_launcher("scripts", "JobThread(scripts[i], i, job_retrials)") + return self.build_sequential_threads_launcher("scripts", "JobThread(scripts[i], i, retrials, fail_count)") class PythonHorizontalWrapperBuilder(PythonWrapperBuilder): def build_main(self): -- GitLab