From b2a35f5d3ba6432dbdd69feee9df59cc501d8653 Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Tue, 15 Dec 2020 10:35:40 +0100 Subject: [PATCH 1/2] Fixing #634. Modified exception calls of possible not defined attributes. --- autosubmit/autosubmit.py | 35 ++- autosubmit/config/config_common.py | 4 +- autosubmit/database/db_jobdata.py | 4 +- autosubmit/job/job.py | 2 +- autosubmit/job/job_list.py | 2 +- autosubmit/platforms/paramiko_platform.py | 279 ++++++++++++++-------- autosubmit/platforms/slurmplatform.py | 2 +- bin/autosubmit | 9 +- 8 files changed, 205 insertions(+), 132 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 05d0f1dd7..59a7bb4fd 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -759,7 +759,7 @@ class Autosubmit: # Avoid calling Log at this point since it is possible that tmp folder is already deleted. # print(traceback.format_exc()) raise AutosubmitCritical( - "Couldn't delete the experiment:", 7012, e.message) + "Couldn't delete the experiment:", 7012, str(e)) @staticmethod def expid(hpc, description, copy_id='', dummy=False, test=False, operational=False, root_folder=''): @@ -1392,7 +1392,7 @@ class Autosubmit: job_list.check_scripts(as_conf) except Exception as e: raise AutosubmitCritical( - "Error while checking job templates", 7015, e.message) + "Error while checking job templates", 7015, str(e)) Log.debug("Loading job packages") try: packages_persistence = JobPackagePersistence(os.path.join( @@ -1437,7 +1437,7 @@ class Autosubmit: ExperimentStatus(expid).update_running_status() except Exception as e: raise AutosubmitCritical( - "Error while processing job_data_structure", 7067, e.message) + "Error while processing job_data_structure", 7067, str(e)) if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. 
job_list.run_members = allowed_members @@ -1447,7 +1447,7 @@ class Autosubmit: raise AutosubmitCritical(e.message, 7067, e.trace) except Exception as e: raise AutosubmitCritical( - "Error in run initialization", 7067, e.message) + "Error in run initialization", 7067, str(e)) ######################### # AUTOSUBMIT - MAIN LOOP @@ -1833,7 +1833,7 @@ class Autosubmit: e.job_name), 7014, e.message) except Exception as e: raise AutosubmitError("{0} submission failed".format( - platform.name), 6015, e.message + "\n" + e.trace) + platform.name), 6015, str(e)) except WrongTemplateException as e: raise AutosubmitCritical( "Invalid parameter substitution in {0} template".format(e.job_name), 7014) @@ -1926,7 +1926,7 @@ class Autosubmit: raise except Exception as e: raise AutosubmitError("{0} submission failed".format( - platform.name), 6015, e.message) + platform.name), 6015, str(e)) try: for package in valid_packages_to_submit: if package.jobs[0].id not in failed_packages: @@ -1944,7 +1944,7 @@ class Autosubmit: package.name, package.jobs, package._expid, inspect) except Exception as e: raise AutosubmitError("{0} submission failed".format( - platform.name), 6015, e.message) + platform.name), 6015, str(e)) return save @staticmethod @@ -2185,7 +2185,7 @@ class Autosubmit: Log.result("Stats plot ready") except Exception as e: raise AutosubmitCritical( - "Stats couldn't be shown", 7061, e.message) + "Stats couldn't be shown", 7061, str(e)) else: Log.info("There are no {0} jobs in the period from {1} to {2}...".format( ft, period_ini, period_fi)) @@ -2524,7 +2524,7 @@ class Autosubmit: break except Exception as e: Log.printlog("Trace: {2}\nThe files/dirs on {0} cannot be moved to {1}.".format( - p.root_dir, os.path.join(p.temp_dir, experiment_id), e.message), 6012) + p.root_dir, os.path.join(p.temp_dir, experiment_id), str(e)), 6012) error = True break backup_files.append(platform) @@ -2573,7 +2573,7 @@ class Autosubmit: if platform[2] is not None: 
as_conf.set_new_project(platform[0], platform[2]) raise AutosubmitCritical( - "The experiment cannot be offered, changes are reverted", 7014, e.message) + "The experiment cannot be offered, changes are reverted", 7014, str(e)) elif pickup: Log.info('Migrating experiment {0}'.format(experiment_id)) Log.info("Moving local files/dirs") @@ -2620,7 +2620,7 @@ class Autosubmit: Autosubmit.restore_platforms(platforms_to_test) except Exception as e: raise AutosubmitCritical( - "Invalid Remote Platform configuration, recover them manually or:\n 1) Configure platform.conf with the correct info\n 2) autosubmit expid -p --onlyremote", 7014, e.message) + "Invalid Remote Platform configuration, recover them manually or:\n 1) Configure platform.conf with the correct info\n 2) autosubmit expid -p --onlyremote", 7014, str(e)) error = True if not error: for platform in platforms: @@ -2782,7 +2782,7 @@ class Autosubmit: exp_parameters = Autosubmit.capitalize_keys(exp_parameters) except Exception as e: raise AutosubmitCritical( - "Couldn't gather the experiment parameters", 7012, e.message) + "Couldn't gather the experiment parameters", 7012, str(e)) if show_all_parameters: Log.info("Gathering all parameters (all keys are on upper_case)") @@ -3378,7 +3378,7 @@ class Autosubmit: tar.close() os.chmod(os.path.join(year_path, output_filepath), 0o755) except Exception as e: - raise AutosubmitCritical("Can not write tar file", 7012, e.message) + raise AutosubmitCritical("Can not write tar file", 7012, str(e)) Log.info("Tar file created!") @@ -3386,7 +3386,7 @@ class Autosubmit: shutil.rmtree(exp_folder) except Exception as e: Log.warning( - "Can not fully remove experiments folder: {0}".format(e)) + "Can not fully remove experiments folder: {0}".format(str(e))) if os.stat(exp_folder): try: tmp_folder = os.path.join( @@ -3396,10 +3396,9 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - 
Autosubmit.unarchive(expid, uncompress=False) raise AutosubmitCritical( - "Can not remove or rename experiments folder", 7012, e.message) + "Can not remove or rename experiments folder", 7012, str(e)) Log.result("Experiment archived successfully") return True @@ -3447,7 +3446,7 @@ class Autosubmit: tar.close() except Exception as e: shutil.rmtree(exp_folder, ignore_errors=True) - Log.printlog("Can not extract tar file: {0}".format(e), 6012) + Log.printlog("Can not extract tar file: {0}".format(str(e)), 6012) return False Log.info("Unpacking finished") @@ -3456,7 +3455,7 @@ class Autosubmit: os.remove(archive_path) except Exception as e: Log.printlog( - "Can not remove archived file folder: {0}".format(e), 7012) + "Can not remove archived file folder: {0}".format(str(e)), 7012) return False Log.result("Experiment {0} unarchived successfully", experiment_id) diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index f2b6c4eba..b66b146cd 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -438,7 +438,7 @@ class AutosubmitConfig(object): raise AutosubmitCritical(e.message, e.code, e.trace) except Exception as e: raise AutosubmitCritical( - "There was an error while showing the config log messages", 7014, e.message) + "There was an error while showing the config log messages", 7014, str(e)) def check_autosubmit_conf(self): """ @@ -759,7 +759,7 @@ class AutosubmitConfig(object): self.parser_factory, self._exp_parser_file) except Exception as e: raise AutosubmitCritical( - "{0} \n Repeated parameter, check if you have any uncommented value that should be commented".format(e.message), 7014) + "{0} \n Repeated parameter, check if you have any uncommented value that should be commented".format(str(e)), 7014) if self._proj_parser_file == '': self._proj_parser = None else: diff --git a/autosubmit/database/db_jobdata.py b/autosubmit/database/db_jobdata.py index 042e91d71..81a81db07 100644 --- 
a/autosubmit/database/db_jobdata.py +++ b/autosubmit/database/db_jobdata.py @@ -472,7 +472,7 @@ class ExperimentStatus(MainDataBase): self.current_row = next( (exp for exp in self.current_table if exp.expid == self.expid), None) if len(self.current_table) > 0 else None except Exception as exp: - Log.debug(exp) + Log.debug(str(exp)) pass def print_current_table(self): @@ -913,7 +913,7 @@ class JobDataStructure(MainDataBase): self._update_experiment_run(current_run) self.current_run_id = current_run.run_id except Exception as exp: - Log.debug(exp) + Log.debug(str(exp)) pass def get_job_package_code(self, current_job_name): diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index af0df3c7e..89d34d312 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -528,7 +528,7 @@ class Job(object): self._platform.restore_connection() except Exception as e: Log.printlog( - "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. ".format(e.message, self.name), 6001) + "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. 
".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 5 diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 277fb8f42..bcf9675a1 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -1398,7 +1398,7 @@ class JobList(object): DbStructure.save_structure( self.graph, self.expid, self._config.STRUCTURES_DIR) except Exception as exp: - Log.warning(exp) + Log.warning(str(exp)) pass for job in self._job_list: diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 7fb048ef5..f7915db32 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -9,7 +9,8 @@ from autosubmit.job.job_common import Status from autosubmit.job.job_common import Type from autosubmit.platforms.platform import Platform from bscearth.utils.date import date2str -from log.log import AutosubmitError,AutosubmitCritical,Log +from log.log import AutosubmitError, AutosubmitCritical, Log + class ParamikoPlatform(Platform): """ @@ -37,6 +38,7 @@ class ParamikoPlatform(Platform): self.submit_cmd = "" self._ftpChannel = None self.transport = None + @property def header(self): """ @@ -56,6 +58,7 @@ class ParamikoPlatform(Platform): :rtype: object """ return self._wrapper + def reset(self): self.connected = False self._ssh = None @@ -77,9 +80,12 @@ class ParamikoPlatform(Platform): transport = self._ssh.get_transport() transport.send_ignore() except EOFError as e: - raise AutosubmitError("[{0}] not alive. Host: {1}".format(self.name,self.host), 6002,e.message) + raise AutosubmitError("[{0}] not alive. 
Host: {1}".format( + self.name, self.host), 6002, str(e)) except Exception as e: - raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name,self.host), 6002, e.message) + raise AutosubmitError("[{0}] connection failed for host: {1}".format( + self.name, self.host), 6002, str(e)) + def restore_connection(self): try: self.connected = False @@ -89,9 +95,11 @@ class ParamikoPlatform(Platform): self.connect() except Exception as e: if ',' in self.host: - Log.printlog("Connection Failed to {0}, will test another host".format(self.host.split(',')[0]), 6002) + Log.printlog("Connection Failed to {0}, will test another host".format( + self.host.split(',')[0]), 6002) else: - raise AutosubmitCritical("First connection to {0} is failed, check host configuration or try another login node ".format(self.host),7050) + raise AutosubmitCritical( + "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050) while self.connected is False and retry < retries: try: self.connect(True) @@ -99,13 +107,16 @@ class ParamikoPlatform(Platform): pass retry += 1 if not self.connected: - trace='Can not create ssh or sftp connection to {0}: Connection could not be established to platform {1}\n Please, check your expid platform.conf to see if there are mistakes in the configuration\n Also Ensure that the login node listed on HOST parameter is available(try to connect via ssh on a terminal)\n Also you can put more than one host using a comma as separator'.format(self.host, self.name) - raise AutosubmitCritical('Experiment cant no continue without unexpected behaviour, Stopping Autosubmit',7050,trace) + trace = 'Can not create ssh or sftp connection to {0}: Connection could not be established to platform {1}\n Please, check your expid platform.conf to see if there are mistakes in the configuration\n Also Ensure that the login node listed on HOST parameter is available(try to connect via ssh on a terminal)\n Also you can put 
more than one host using a comma as separator'.format( + self.host, self.name) + raise AutosubmitCritical( + 'Experiment cant no continue without unexpected behaviour, Stopping Autosubmit', 7050, trace) except AutosubmitCritical: raise except Exception as e: - raise AutosubmitCritical('Cant connect to this platform due an unknown error',7050,e.message) + raise AutosubmitCritical( + 'Cant connect to this platform due an unknown error', 7050, str(e)) def connect(self, reconnect=False): """ @@ -127,31 +138,37 @@ class ParamikoPlatform(Platform): self._host_config = self._ssh_config.lookup(self.host) if "," in self._host_config['hostname']: if reconnect: - self._host_config['hostname'] = random.choice(self._host_config['hostname'].split(',')[1:]) + self._host_config['hostname'] = random.choice( + self._host_config['hostname'].split(',')[1:]) else: - self._host_config['hostname'] = self._host_config['hostname'].split(',')[0] + self._host_config['hostname'] = self._host_config['hostname'].split(',')[ + 0] if 'identityfile' in self._host_config: self._host_config_id = self._host_config['identityfile'] if 'proxycommand' in self._host_config: - self._proxy = paramiko.ProxyCommand(self._host_config['proxycommand']) + self._proxy = paramiko.ProxyCommand( + self._host_config['proxycommand']) self._ssh.connect(self._host_config['hostname'], 22, username=self.user, key_filename=self._host_config_id, sock=self._proxy) else: self._ssh.connect(self._host_config['hostname'], 22, username=self.user, key_filename=self._host_config_id) - self.transport = paramiko.Transport((self._host_config['hostname'], 22)) + self.transport = paramiko.Transport( + (self._host_config['hostname'], 22)) self.transport.connect(username=self.user) self._ftpChannel = self._ssh.open_sftp() self.connected = True except BaseException as e: self.connected = False if "Authentication failed." 
in e.message: - raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format(self._host_config['hostname']),7050,e.message) + raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( + self._host_config['hostname']), 7050, e.message) if not reconnect and "," in self._host_config['hostname']: self.restore_connection(reconnect=True) else: - raise AutosubmitError("Couldn't establish a connection to the specified host, wrong configuration?",6003,e.message) + raise AutosubmitError( + "Couldn't establish a connection to the specified host, wrong configuration?", 6003, e.message) def check_completed_files(self, sections=None): if self.host == 'localhost': @@ -160,12 +177,12 @@ class ParamikoPlatform(Platform): if sections: for i, section in enumerate(sections.split()): command += " -name *%s_COMPLETED" % section - if i < len(sections.split())-1: + if i < len(sections.split()) - 1: command += " -o " else: command += " -name *_COMPLETED" - if self.send_command(command,True): + if self.send_command(command, True): return self._ssh_output else: return None @@ -173,12 +190,14 @@ class ParamikoPlatform(Platform): def remove_multiple_files(self, filenames): #command = "rm" log_dir = os.path.join(self.tmp_path, 'LOG_{0}'.format(self.expid)) - multiple_delete_previous_run = os.path.join(log_dir,"multiple_delete_previous_run.sh") + multiple_delete_previous_run = os.path.join( + log_dir, "multiple_delete_previous_run.sh") if os.path.exists(log_dir): - open(multiple_delete_previous_run, 'w+').write("rm -f"+filenames) + open(multiple_delete_previous_run, 'w+').write("rm -f" + filenames) os.chmod(multiple_delete_previous_run, 0o770) self.send_file(multiple_delete_previous_run, False) - command = os.path.join(self.get_files_path(),"multiple_delete_previous_run.sh") + command = os.path.join(self.get_files_path(), + "multiple_delete_previous_run.sh") if self.send_command(command, ignore_log=True): return 
self._ssh_output else: @@ -197,17 +216,20 @@ class ParamikoPlatform(Platform): self.delete_file(filename) try: local_path = os.path.join(os.path.join(self.tmp_path, filename)) - remote_path = os.path.join(self.get_files_path(), os.path.basename(filename)) + remote_path = os.path.join( + self.get_files_path(), os.path.basename(filename)) self._ftpChannel.put(local_path, remote_path) - self._ftpChannel.chmod(remote_path,os.stat(local_path).st_mode) + self._ftpChannel.chmod(remote_path, os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( + self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) except BaseException as e: - raise AutosubmitError('Send file failed. Connection seems to no be active',6004) + raise AutosubmitError( + 'Send file failed. 
Connection seems to no be active', 6004) # Gets .err and .out - def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False,wrapper_failed=False): + def get_file(self, filename, must_exist=True, relative_path='', ignore_log=False, wrapper_failed=False): """ Copies a file from the current platform to experiment's tmp folder @@ -236,17 +258,19 @@ class ParamikoPlatform(Platform): if str(e) in "Garbage": #raise AutosubmitError("Files couldn't be retrieved, session not active".format(filename),6004,e.message) if not ignore_log: - Log.printlog("File {0} seems to no exists (skipping)".format(filename),5004) + Log.printlog( + "File {0} seems to no exists (skipping)".format(filename), 5004) if must_exist: if not ignore_log: - Log.printlog("File {0} does not exists".format(filename),6004) + Log.printlog( + "File {0} does not exists".format(filename), 6004) return False else: if not ignore_log: - Log.printlog("Log file couldn't be retrieved: {0}".format(filename),5000) + Log.printlog( + "Log file couldn't be retrieved: {0}".format(filename), 5000) return False - def delete_file(self, filename): """ Deletes a file from this platform @@ -258,20 +282,20 @@ class ParamikoPlatform(Platform): """ try: - self._ftpChannel.remove(os.path.join(self.get_files_path(), filename)) + self._ftpChannel.remove(os.path.join( + self.get_files_path(), filename)) return True except IOError as e: #Log.printlog("{0} couldn't be retrieved, session not active".format(os.path.join(self.get_files_path(), filename)),6004) return False except BaseException as e: - Log.error('Could not remove file {0} due a wrong configuration'.format(os.path.join(self.get_files_path(), filename))) + Log.error('Could not remove file {0} due a wrong configuration'.format( + os.path.join(self.get_files_path(), filename))) if e.message.lower().find("garbage") != -1: - raise AutosubmitCritical("Wrong User or invalid .ssh/config. 
Or invalid user in platform.conf or public key not set ",7051,e.message) - - + raise AutosubmitCritical( + "Wrong User or invalid .ssh/config. Or invalid user in platform.conf or public key not set ", 7051, e.message) - - def move_file(self, src, dest,must_exist=False): + def move_file(self, src, dest, must_exist=False): """ Moves a file on the platform (includes .err and .out) :param src: source name @@ -288,20 +312,26 @@ class ParamikoPlatform(Platform): except IOError as e: if str(e) in "Garbage": - raise AutosubmitError('File {0} does not exists, something went wrong with the platform'.format(path_root),6004,e.message) + raise AutosubmitError('File {0} does not exists, something went wrong with the platform'.format( + path_root), 6004, e.message) if must_exist: - raise AutosubmitError("A critical file couldn't be retrieved, File {0} does not exists".format(path_root),6004,e.message) + raise AutosubmitError("A critical file couldn't be retrieved, File {0} does not exists".format( + path_root), 6004, e.message) else: Log.debug("File {0} doesn't exists ".format(path_root)) return False except Exception as e: if str(e) in "Garbage": - raise AutosubmitError('File {0} does not exists'.format(os.path.join(self.get_files_path(), src)),6004,e.message) + raise AutosubmitError('File {0} does not exists'.format( + os.path.join(self.get_files_path(), src)), 6004, str(e)) if must_exist: - raise AutosubmitError("A critical file couldn't be retrieved, File {0} does not exists".format(os.path.join(self.get_files_path(), src)),6004,e.message) + raise AutosubmitError("A critical file couldn't be retrieved, File {0} does not exists".format( + os.path.join(self.get_files_path(), src)), 6004, str(e)) else: - Log.printlog("Log file couldn't be moved: {0}".format(os.path.join(self.get_files_path(), src)),5001) + Log.printlog("Log file couldn't be moved: {0}".format( + os.path.join(self.get_files_path(), src)), 5001) return False + def submit_job(self, job, script_name, hold=False): 
""" Submit a job from a given job object. @@ -339,7 +369,7 @@ class ParamikoPlatform(Platform): check_energy_cmd = self.get_job_energy_cmd(job_id) self.send_command(check_energy_cmd) return self.parse_job_finish_data( - self.get_ssh_output(), packed) + self.get_ssh_output(), packed) def submit_Script(self, hold=False): """ @@ -352,7 +382,7 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def check_job(self, job, default_status=Status.COMPLETED, retries=5,submit_hold_check=False): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False): """ Checks job running status @@ -367,22 +397,25 @@ class ParamikoPlatform(Platform): job_id = job.id job_status = Status.UNKNOWN if type(job_id) is not int and type(job_id) is not str: - Log.error('check_job() The job id ({0}) is not an integer neither a string.', job_id) + Log.error( + 'check_job() The job id ({0}) is not an integer neither a string.', job_id) job.new_status = job_status - sleep_time=5 + sleep_time = 5 sleep(2) self.send_command(self.get_checkjob_cmd(job_id)) while self.get_ssh_output().strip(" ") == "" and retries > 0: retries = retries - 1 - Log.debug('Retrying check job command: {0}', self.get_checkjob_cmd(job_id)) + Log.debug( + 'Retrying check job command: {0}', self.get_checkjob_cmd(job_id)) Log.debug('retries left {0}', retries) Log.debug('Will be retrying in {0} seconds', sleep_time) sleep(sleep_time) - sleep_time = sleep_time+5 + sleep_time = sleep_time + 5 self.send_command(self.get_checkjob_cmd(job_id)) if retries >= 0: #Log.debug('Successful check job command: {0}', self.get_checkjob_cmd(job_id)) - job_status = self.parse_job_output(self.get_ssh_output()).strip("\n") + job_status = self.parse_job_output( + self.get_ssh_output()).strip("\n") # URi: define status list in HPC Queue Class if job_status in self.job_status['COMPLETED'] or retries == 0: job_status = Status.COMPLETED @@ -397,20 +430,23 @@ class ParamikoPlatform(Platform): else: 
job_status = Status.UNKNOWN else: - Log.error(" check_job(), job is not on the queue system. Output was: {0}", self.get_checkjob_cmd(job_id)) + Log.error( + " check_job(), job is not on the queue system. Output was: {0}", self.get_checkjob_cmd(job_id)) job_status = Status.UNKNOWN - Log.error('check_job() The job id ({0}) status is {1}.', job_id, job_status) + Log.error( + 'check_job() The job id ({0}) status is {1}.', job_id, job_status) if submit_hold_check: return job_status else: job.new_status = job_status - def _check_jobid_in_queue(self,ssh_output,job_list_cmd): + def _check_jobid_in_queue(self, ssh_output, job_list_cmd): for job in job_list_cmd[:-1].split(','): if job not in ssh_output: return False return True - def check_Alljobs(self, job_list,job_list_cmd,remote_logs, retries=5): + + def check_Alljobs(self, job_list, job_list_cmd, remote_logs, retries=5): """ Checks jobs running status @@ -424,15 +460,15 @@ class ParamikoPlatform(Platform): """ cmd = self.get_checkAlljobs_cmd(job_list_cmd) - sleep_time=5 + sleep_time = 5 - while not (self.send_command(cmd) and retries >= 0) or ( not self._check_jobid_in_queue(self.get_ssh_output(),job_list_cmd) and retries >= 0): + while not (self.send_command(cmd) and retries >= 0) or (not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries >= 0): retries -= 1 Log.debug('Retrying check job command: {0}', cmd) Log.debug('retries left {0}', retries) Log.debug('Will be retrying in {0} seconds', sleep_time) sleep(sleep_time) - sleep_time=sleep_time+5 + sleep_time = sleep_time + 5 job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') @@ -440,7 +476,7 @@ class ParamikoPlatform(Platform): list_queue_jobid = "" for job in job_list: job_id = job.id - job_status = self.parse_Alljobs_output(job_list_status,job_id) + job_status = self.parse_Alljobs_output(job_list_status, job_id) # URi: define status list in HPC Queue Class if job_status in 
self.job_status['COMPLETED']: job_status = Status.COMPLETED @@ -448,7 +484,7 @@ class ParamikoPlatform(Platform): job_status = Status.RUNNING elif job_status in self.job_status['QUEUING']: if job.hold: - job_status = Status.HELD # release? + job_status = Status.HELD # release? else: job_status = Status.QUEUING list_queue_jobid += str(job.id) + ',' @@ -461,39 +497,47 @@ class ParamikoPlatform(Platform): else: job_status = Status.UNKNOWN - Log.error('check_job() The job id ({0}) status is {1}.', job_id, job_status) - job.new_status=job_status + Log.error( + 'check_job() The job id ({0}) status is {1}.', job_id, job_status) + job.new_status = job_status reason = str() if self.type == 'slurm' and len(in_queue_jobs) > 0: - cmd=self.get_queue_status_cmd(list_queue_jobid) + cmd = self.get_queue_status_cmd(list_queue_jobid) self.send_command(cmd) - queue_status=self._ssh_output + queue_status = self._ssh_output for job in in_queue_jobs: - reason = self.parse_queue_reason(queue_status,job.id) + reason = self.parse_queue_reason(queue_status, job.id) if job.queuing_reason_cancel(reason): - Log.error("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) - self.send_command(self.platform.cancel_cmd + " {0}".format(job.id)) + Log.error( + "Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) + self.send_command( + self.platform.cancel_cmd + " {0}".format(job.id)) job.new_status = Status.FAILED job.update_status(remote_logs) return elif reason == '(JobHeldUser)': - job.new_status=Status.HELD + job.new_status = Status.HELD if not job.hold: - self.send_command("scontrol release {0}".format(job.id)) # SHOULD BE MORE CLASS (GET_scontrol realease but not sure if this can be implemented on others PLATFORMS + # SHOULD BE MORE CLASS (GET_scontrol realease but not sure if this can be implemented on others PLATFORMS + self.send_command( + "scontrol release {0}".format(job.id)) else: pass - elif reason == 
'(JobHeldAdmin)': #This shouldn't happen anymore TODO delete - Log.debug("Job {0} Failed to be HELD, canceling... ", job.name) + # This shouldn't happen anymore TODO delete + elif reason == '(JobHeldAdmin)': + Log.debug( + "Job {0} Failed to be HELD, canceling... ", job.name) job.new_status = Status.WAITING - job.platform.send_command(job.platform.cancel_cmd + " {0}".format(job.id)) + job.platform.send_command( + job.platform.cancel_cmd + " {0}".format(job.id)) else: for job in job_list: job_status = Status.UNKNOWN - Log.warning('check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) - raise AutosubmitError("Some Jobs are in Unknown status",6008) - #job.new_status=job_status - + Log.warning( + 'check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) + raise AutosubmitError("Some Jobs are in Unknown status", 6008) + # job.new_status=job_status def get_checkjob_cmd(self, job_id): """ @@ -516,6 +560,7 @@ class ParamikoPlatform(Platform): :rtype: str """ raise NotImplementedError + def send_command(self, command, ignore_log=False): """ Sends given command to HPC @@ -531,7 +576,7 @@ class ParamikoPlatform(Platform): elif "rm" in command: timeout = 60 else: - timeout = 60*2 + timeout = 60 * 2 stderr_readlines = [] stdout_chunks = [] try: @@ -541,8 +586,8 @@ class ParamikoPlatform(Platform): stdin.close() channel.shutdown_write() - stdout_chunks.append(stdout.channel.recv(len(stdout.channel.in_buffer))) - + stdout_chunks.append(stdout.channel.recv( + len(stdout.channel.in_buffer))) while not channel.closed or channel.recv_ready() or channel.recv_stderr_ready(): # stop if channel was closed prematurely, and there is no data in the buffers. 
@@ -550,12 +595,14 @@ class ParamikoPlatform(Platform): readq, _, _ = select.select([stdout.channel], [], [], 2) for c in readq: if c.recv_ready(): - stdout_chunks.append(stdout.channel.recv(len(c.in_buffer))) + stdout_chunks.append( + stdout.channel.recv(len(c.in_buffer))) #stdout_chunks.append(" ") got_chunk = True if c.recv_stderr_ready(): # make sure to read stderr to prevent stall - stderr_readlines.append(stderr.channel.recv_stderr(len(c.in_stderr_buffer))) + stderr_readlines.append( + stderr.channel.recv_stderr(len(c.in_stderr_buffer))) #stdout_chunks.append(" ") got_chunk = True if not got_chunk and stdout.channel.exit_status_ready() and not stderr.channel.recv_stderr_ready() and not stdout.channel.recv_ready(): @@ -576,17 +623,20 @@ class ParamikoPlatform(Platform): self._ssh_output_err += errorLineCase errorLine = errorLineCase.lower() if "not active" in errorLine: - raise AutosubmitError('SSH Session not active, will restart the platforms', 6005) + raise AutosubmitError( + 'SSH Session not active, will restart the platforms', 6005) if errorLine.find("submission failed") != -1 or errorLine.find("git clone") != -1 or errorLine.find("sbatch: error: ") != -1 or errorLine.find("not submitted") != -1: - if (self._submit_command_name == "sbatch" and errorLine.find("policy") != -1 ) or (self._submit_command_name == "sbatch" and errorLine.find("argument") != -1 ) or ( self._submit_command_name == "bsub" and errorLine.find("job not submitted") != -1 ) or self._submit_command_name == "ecaccess-job-submit" or self._submit_command_name == "qsub ": - raise AutosubmitError("bad parameters",7014,stderr_readlines) - - raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines),6005)) + if (self._submit_command_name == "sbatch" and errorLine.find("policy") != -1) or (self._submit_command_name == "sbatch" and errorLine.find("argument") != -1) or (self._submit_command_name == "bsub" and errorLine.find("job not submitted") 
!= -1) or self._submit_command_name == "ecaccess-job-submit" or self._submit_command_name == "qsub ": + raise AutosubmitError( + "bad parameters", 7014, stderr_readlines) + raise AutosubmitError('Command {0} in {1} warning: {2}'.format( + command, self.host, '\n'.join(stderr_readlines), 6005)) if not ignore_log: if len(stderr_readlines) > 0: - Log.printlog('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines)),6006) + Log.printlog('Command {0} in {1} warning: {2}'.format( + command, self.host, '\n'.join(stderr_readlines)), 6006) else: pass #Log.debug('Command {0} in {1} successful with out message: {2}', command, self.host, self._ssh_output) @@ -599,7 +649,8 @@ class ParamikoPlatform(Platform): except AutosubmitError as e: raise except BaseException as e: - raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines)),6005,e.message) + raise AutosubmitError('Command {0} in {1} warning: {2}'.format( + command, self.host, '\n'.join(stderr_readlines)), 6005, e.message) def parse_job_output(self, output): """ @@ -611,7 +662,8 @@ class ParamikoPlatform(Platform): :rtype: str """ raise NotImplementedError - def parse_Alljobs_output(self, output,job_id): + + def parse_Alljobs_output(self, output, job_id): """ Parses check jobs command output so it can be interpreted by autosubmit :param output: output to parse @@ -628,8 +680,7 @@ class ParamikoPlatform(Platform): def get_submit_script(self): pass - - def get_submit_cmd(self, job_script, job_type,hold=False): + def get_submit_cmd(self, job_script, job_type, hold=False): """ Get command to add job to scheduler @@ -664,8 +715,10 @@ class ParamikoPlatform(Platform): """ #Log.debug('Output {0}', self._ssh_output) return self._ssh_output + def get_ssh_output_err(self): return self._ssh_output_err + def get_call(self, job_script, job): """ Gets execution command for given job @@ -734,25 +787,35 @@ class ParamikoPlatform(Platform): header = 
header.replace('%ERR_LOG_DIRECTIVE%', err_filename) if hasattr(self.header, 'get_queue_directive'): - header = header.replace('%QUEUE_DIRECTIVE%', self.header.get_queue_directive(job)) + header = header.replace( + '%QUEUE_DIRECTIVE%', self.header.get_queue_directive(job)) if hasattr(self.header, 'get_tasks_per_node'): - header = header.replace('%TASKS_PER_NODE_DIRECTIVE%', self.header.get_tasks_per_node(job)) + header = header.replace( + '%TASKS_PER_NODE_DIRECTIVE%', self.header.get_tasks_per_node(job)) if hasattr(self.header, 'get_threads_per_task'): - header = header.replace('%THREADS%', self.header.get_threads_per_task(job)) + header = header.replace( + '%THREADS%', self.header.get_threads_per_task(job)) if hasattr(self.header, 'get_scratch_free_space'): - header = header.replace('%SCRATCH_FREE_SPACE_DIRECTIVE%', self.header.get_scratch_free_space(job)) + header = header.replace( + '%SCRATCH_FREE_SPACE_DIRECTIVE%', self.header.get_scratch_free_space(job)) if hasattr(self.header, 'get_custom_directives'): - header = header.replace('%CUSTOM_DIRECTIVES%', self.header.get_custom_directives(job)) + header = header.replace( + '%CUSTOM_DIRECTIVES%', self.header.get_custom_directives(job)) if hasattr(self.header, 'get_exclusivity'): - header = header.replace('%EXCLUSIVITY_DIRECTIVE%', self.header.get_exclusivity(job)) + header = header.replace( + '%EXCLUSIVITY_DIRECTIVE%', self.header.get_exclusivity(job)) if hasattr(self.header, 'get_account_directive'): - header = header.replace('%ACCOUNT_DIRECTIVE%', self.header.get_account_directive(job)) + header = header.replace( + '%ACCOUNT_DIRECTIVE%', self.header.get_account_directive(job)) if hasattr(self.header, 'get_memory_directive'): - header = header.replace('%MEMORY_DIRECTIVE%', self.header.get_memory_directive(job)) + header = header.replace( + '%MEMORY_DIRECTIVE%', self.header.get_memory_directive(job)) if hasattr(self.header, 'get_memory_per_task_directive'): - header = header.replace('%MEMORY_PER_TASK_DIRECTIVE%', 
self.header.get_memory_per_task_directive(job)) + header = header.replace( + '%MEMORY_PER_TASK_DIRECTIVE%', self.header.get_memory_per_task_directive(job)) if hasattr(self.header, 'get_hyperthreading_directive'): - header = header.replace('%HYPERTHREADING_DIRECTIVE%', self.header.get_hyperthreading_directive(job)) + header = header.replace( + '%HYPERTHREADING_DIRECTIVE%', self.header.get_hyperthreading_directive(job)) return header def closeConnection(self): @@ -767,7 +830,6 @@ class ParamikoPlatform(Platform): except: pass - def check_tmp_exists(self): try: if self.send_command("ls {0}".format(self.temp_dir)): @@ -787,23 +849,30 @@ class ParamikoPlatform(Platform): if self.type == "slurm": try: - self._ftpChannel.chdir(self.remote_log_dir) # Test if remote_path exists + # Test if remote_path exists + self._ftpChannel.chdir(self.remote_log_dir) except IOError: try: if self.send_command(self.get_mkdir_cmd()): - Log.debug('{0} has been created on {1} .', self.remote_log_dir, self.host) + Log.debug('{0} has been created on {1} .', + self.remote_log_dir, self.host) else: - raise AutosubmitError("SFTP session not active ", 6007,"Could not create the DIR {0} on HPC {1}'.format(self.remote_log_dir, self.host)".format(self.remote_log_dir, self.host)) + raise AutosubmitError("SFTP session not active ", 6007, "Could not create the DIR {0} on HPC {1}'.format(self.remote_log_dir, self.host)".format( + self.remote_log_dir, self.host)) except BaseException as e: - raise AutosubmitError("SFTP session not active ", 6007,e.message) + raise AutosubmitError( + "SFTP session not active ", 6007, e.message) else: try: if self.send_command(self.get_mkdir_cmd()): - Log.debug('{0} has been created on {1} .', self.remote_log_dir, self.host) + Log.debug('{0} has been created on {1} .', + self.remote_log_dir, self.host) else: - Log.debug('Could not create the DIR {0} to HPC {1}'.format(self.remote_log_dir, self.host)) + Log.debug('Could not create the DIR {0} to HPC {1}'.format( + 
self.remote_log_dir, self.host)) except BaseException as e: - raise AutosubmitError("Couldn't send the file {0} to HPC {1}".format(self.remote_log_dir,self.host), 6004, e.message) + raise AutosubmitError("Couldn't send the file {0} to HPC {1}".format( + self.remote_log_dir, self.host), 6004, e.message) class ParamikoPlatformException(Exception): diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 8699e4ee2..b2c5e7ef2 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -99,7 +99,7 @@ class SlurmPlatform(ParamikoPlatform): except AutosubmitCritical as e: raise except Exception as e: - raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, e.message) + raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, str(e)) def update_cmds(self): """ diff --git a/bin/autosubmit b/bin/autosubmit index a5ca82b5c..00dc40d9b 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -20,6 +20,7 @@ """Script for handling experiment monitoring""" import os import sys +import traceback from log.log import Log, AutosubmitCritical @@ -37,14 +38,18 @@ def main(): Autosubmit.parse_args() os._exit(0) except AutosubmitCritical as e: + # If an exception reaches this point, print tracebak on debug + Log.debug(traceback.format_exc()) if e.trace is not None: Log.error("Trace: {0}", e.trace) Log.critical("{1} [eCode={0}]", e.code, e.message) Log.info("More info at https://autosubmit.readthedocs.io/en/latest/faq.html") os._exit(1) except Exception as e: - Log.error("Trace: {0}", e.message) - if "temporarily unavailable" in str(e.message): + # If an exception reaches this point, print tracebak on debug + Log.debug(traceback.format_exc()) + Log.error("Trace: {0}", str(e)) + if "temporarily unavailable" in str(e): Log.critical( "Another instance of autosubmit is running on this experiment. 
If this is not the case, delete autosubmit.lock", 7000) else: -- GitLab From 1eba81168457a10402f4790eed65f0f7a30613ce Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Tue, 15 Dec 2020 12:29:34 +0100 Subject: [PATCH 2/2] Re-added FAQ section on documentation --- docs/source/faq-original.rst | 6 ++++++ docs/source/faq.rst | 2 ++ docs/source/index.rst | 1 + 3 files changed, 9 insertions(+) create mode 100644 docs/source/faq-original.rst diff --git a/docs/source/faq-original.rst b/docs/source/faq-original.rst new file mode 100644 index 000000000..4db1c2b00 --- /dev/null +++ b/docs/source/faq-original.rst @@ -0,0 +1,6 @@ +############################## +Frequent Questions and Answers +############################## + +The latest version of **Autosubmit** implements a code system that guides you through the process of fixing some of the common problems you might find. Consequently, the **FAQ** section has been replaced by :ref:`faqnew`, where you will find the list of error codes, their descriptions, and solutions. + diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 3f1207ab6..1b4eff2f3 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -1,3 +1,5 @@ +.. _faqnew: + ################################## Error codes and solutions ################################## diff --git a/docs/source/index.rst b/docs/source/index.rst index 54bbff960..397feeb6b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,6 +16,7 @@ Welcome to autosubmit's documentation! installation usage workflows + faq-original troubleshoot faq project -- GitLab