diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 70226dc74aa53880f55d636cd6a9bc939d8e3c6f..09e8ec28c0a7de4f2bcab242e58109a0c16fb87f 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -101,7 +101,7 @@ def signal_handler_create(signal_received, frame): :param signal_received: :param frame: """ - raise AutosubmitCritical('Autosubmit has been closed in an unexpected way. Killed or control + c.',7000) + raise AutosubmitCritical('Autosubmit has been closed in an unexpected way. Killed or control + c.',7010) class Autosubmit: """ @@ -135,8 +135,8 @@ class Autosubmit: BasicConfig.read() parser = argparse.ArgumentParser( description='Main executable for autosubmit. ') - parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version, - help="returns autosubmit's version number and exit") + + parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) parser.add_argument('-lf', '--logfile', choices=('NO_LOG','INFO','WARNING', 'DEBUG'), default='WARNING', type=str, help="sets file's log level.") @@ -145,7 +145,6 @@ class Autosubmit: help="sets console's log level") subparsers = parser.add_subparsers(dest='command') - # Run subparser = subparsers.add_parser( 'run', description="runs specified experiment") @@ -491,8 +490,13 @@ class Autosubmit: # Changelog subparsers.add_parser('changelog', description='show changelog') args = parser.parse_args() - except BaseException as e: - raise AutosubmitCritical("Incorrect arguments for this command",7000) + + except Exception as e: + if type(e) is SystemExit: + if e.message == 0: # Version keyword force an exception in parse arg due and os_exit(0) but the program is succesfully finished + print(Autosubmit.autosubmit_version) + os._exit(0) + raise AutosubmitCritical("Incorrect arguments for this command",7011) expid = "None" @@ -580,7 +584,7 @@ class Autosubmit: tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) 
aslogs_path = os.path.join(tmp_path, BasicConfig.LOCAL_ASLOG_DIR) if not os.path.exists(exp_path) and "create" not in command: - raise AutosubmitCritical("Experiment does not exist", 7000) + raise AutosubmitCritical("Experiment does not exist", 7012) if not os.path.exists(tmp_path): os.mkdir(tmp_path) if not os.path.exists(aslogs_path): @@ -600,9 +604,9 @@ class Autosubmit: current_owner_id = pwd.getpwuid(os.stat(os.path.join( BasicConfig.LOCAL_ROOT_DIR, expid)).st_uid).pw_name if current_user_id != current_owner_id: - raise AutosubmitCritical("You don't own the experiment {0}.".format(expid),7000) + raise AutosubmitCritical("You don't own the experiment {0}.".format(expid),7012) except BaseException as e: - raise AutosubmitCritical("User or owner does not exists",7000,e.message) + raise AutosubmitCritical("User or owner does not exists",7012,e.message) @staticmethod @@ -625,7 +629,7 @@ class Autosubmit: id_eadmin = os.popen('id -u eadmin').read().strip() if expid_delete == '' or expid_delete is None and not os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): - Log.info("Experiment directory does not exist.") + Log.result("Experiment directory does not exist.") else: ret = False # Handling possible failure of retrieval of current owner data @@ -651,16 +655,16 @@ class Autosubmit: shutil.rmtree(os.path.join( BasicConfig.LOCAL_ROOT_DIR, expid_delete)) except OSError as e: - raise AutosubmitCritical('Can not delete experiment folder: ',7000,e.message) + raise AutosubmitCritical('Can not delete experiment folder: ',7012,e.message) Log.info("Deleting experiment from database...") ret = delete_experiment(expid_delete) if ret: Log.result("Experiment {0} deleted".format(expid_delete)) else: if currentOwner_id == 0: - raise AutosubmitCritical('Detected Eadmin user however, -f flag is not found. {0} can not be deleted!'.format(expid_delete), 7000) + raise AutosubmitCritical('Detected Eadmin user however, -f flag is not found. 
{0} can not be deleted!'.format(expid_delete), 7012) else: - raise AutosubmitCritical('Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7000) + raise AutosubmitCritical('Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7012) @staticmethod def expid(hpc, description, copy_id='', dummy=False, test=False, operational=False, root_folder=''): @@ -683,12 +687,12 @@ class Autosubmit: """ exp_id = None if description is None or hpc is None: - raise AutosubmitCritical("Check that the parameters are defined (-d and -H) ",7000) + raise AutosubmitCritical("Check that the parameters are defined (-d and -H) ",7011) if not copy_id: exp_id = new_experiment( description, Autosubmit.autosubmit_version, test, operational) if exp_id == '': - raise AutosubmitCritical("Couldn't create a new experiment",7000) + raise AutosubmitCritical("Couldn't create a new experiment",7011) try: os.mkdir(os.path.join(BasicConfig.LOCAL_ROOT_DIR, exp_id)) os.mkdir(os.path.join( @@ -726,7 +730,7 @@ class Autosubmit: exp_id, hpc, Autosubmit.autosubmit_version, dummy) except (OSError, IOError) as e: Autosubmit._delete_expid(exp_id) - raise AutosubmitCritical("Couldn't create a new experiment, permissions?", 7000, e.message) + raise AutosubmitCritical("Couldn't create a new experiment, permissions?", 7012, e.message) else: try: if root_folder == '' or root_folder is None: @@ -809,19 +813,19 @@ class Autosubmit: ##### autosubmit_config = AutosubmitConfig( exp_id, BasicConfig, ConfigParserFactory()) - autosubmit_config.check_conf_files() + autosubmit_config.check_conf_files(False) project_type = autosubmit_config.get_project_type() if project_type == "git": autosubmit_git = AutosubmitGit(copy_id[0]) Log.info("checking model version...") if not autosubmit_git.check_commit(autosubmit_config): - raise AutosubmitCritical("Uncommitted changes",7000) + raise AutosubmitCritical("Uncommitted changes",7013) else: - raise 
AutosubmitCritical("The experiment directory doesn't exist",7000) + raise AutosubmitCritical("The experiment directory doesn't exist",7012) except (OSError, IOError) as e: Autosubmit._delete_expid(exp_id, True) - raise AutosubmitCritical("Can not create experiment", 7000,e.message) + raise AutosubmitCritical("Can not create experiment", 7012,e.message) Log.debug("Creating temporal directory...") exp_id_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, exp_id) @@ -882,9 +886,9 @@ class Autosubmit: Log.debug('Enter Autosubmit._delete_expid {0}', expid) return Autosubmit._delete_expid(expid, force) else: - raise AutosubmitCritical("Insufficient permissions",7000) + raise AutosubmitCritical("Insufficient permissions",7012) else: - raise AutosubmitCritical("Experiment does not exist", 7000) + raise AutosubmitCritical("Experiment does not exist", 7012) @staticmethod def _load_parameters(as_conf, job_list, platforms): @@ -909,7 +913,7 @@ class Autosubmit: # Platform = from DEFAULT.HPCARCH, e.g. marenostrum4 if as_conf.get_platform().lower() not in platforms.keys(): raise AutosubmitCritical("Specified platform in expdef_.conf " + str(as_conf.get_platform( - ).lower()) + " is not a valid platform defined in platforms_.conf.",7000) + ).lower()) + " is not a valid platform defined in platforms_.conf.",7014) platform = platforms[as_conf.get_platform().lower()] platform.add_parameters(parameters, True) # Attach paramenters to JobList @@ -936,11 +940,8 @@ class Autosubmit: os.system('clear') signal.signal(signal.SIGINT, signal_handler) as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(True) project_type = as_conf.get_project_type() - if project_type != "none": - # Check proj configuration - as_conf.check_proj() safetysleeptime = as_conf.get_safetysleeptime() Log.debug("The Experiment name is: {0}", expid) Log.debug("Sleep: {0}", safetysleeptime) @@ -1137,7 +1138,7 @@ class Autosubmit: as_conf = 
AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(True) Log.info("Autosubmit is running with {0}", Autosubmit.autosubmit_version) if update_version: if as_conf.get_version() != Autosubmit.autosubmit_version: @@ -1147,7 +1148,7 @@ class Autosubmit: else: if as_conf.get_version() != '' and as_conf.get_version() != Autosubmit.autosubmit_version: raise AutosubmitCritical("Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" - "Or with the -v parameter: autosubmit run {2} -v ".format(as_conf.get_version(), Autosubmit.autosubmit_version, expid),7000 ) + "Or with the -v parameter: autosubmit run {2} -v ".format(as_conf.get_version(), Autosubmit.autosubmit_version, expid),7 ) # checking if there is a lock file to avoid multiple running on the same expid try: with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): @@ -1155,9 +1156,6 @@ class Autosubmit: os.system('clear') signal.signal(signal.SIGINT, signal_handler) - as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() - hpcarch = as_conf.get_platform() safetysleeptime = as_conf.get_safetysleeptime() retrials = as_conf.get_retrials() @@ -1172,7 +1170,7 @@ class Autosubmit: try: job_list = Autosubmit.load_job_list(expid, as_conf, notransitive=notransitive) except BaseException as e: - raise AutosubmitCritical("Corrupted job_list, backup couldn''t be restored",7000,e.message) + raise AutosubmitCritical("Corrupted job_list, backup couldn''t be restored",7040,e.message) Log.debug("Starting from job list restored from {0} files", pkl_dir) @@ -1193,7 +1191,7 @@ class Autosubmit: try: packages_persistence = JobPackagePersistence(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"),"job_packages_" + expid) except 
BaseException as e: - raise AutosubmitCritical("Corrupted job_packages, python 2.7 and sqlite doesn''t allow to restore these packages",7000,e.message) + raise AutosubmitCritical("Corrupted job_packages, python 2.7 and sqlite doesn''t allow to restore these packages",7040,e.message) if as_conf.get_wrapper_type() != 'none': os.chmod(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl", "job_packages_" + expid+".db"), 0644) @@ -1202,7 +1200,7 @@ class Autosubmit: except BaseException as e: raise AutosubmitCritical( "Corrupted job_packages, python 2.7 and sqlite doesn''t allow to restore these packages", - 7000, e.message) + 7040, e.message) for (exp_id, package_name, job_name) in packages: if package_name not in job_list.packages_dict: @@ -1227,9 +1225,7 @@ class Autosubmit: Autosubmit.restore_platforms(platforms_to_test) # establish the connection to all platforms save = True while job_list.get_active(): - try: - if Autosubmit.exit: return 0 # reload parameters changes @@ -1275,7 +1271,7 @@ class Autosubmit: if wrapper_job.status != wrapper_job.new_status: Log.info('Wrapper job ' + wrapper_job.name + ' changed from ' + str(Status.VALUE_TO_KEY[wrapper_job.status]) + ' to status ' + str(Status.VALUE_TO_KEY[wrapper_job.new_status])) except: - raise AutosubmitCritical("Wrapper is in Unknown Status couldn't get wrapper parameters",7000) + raise AutosubmitCritical("Wrapper is in Unknown Status couldn't get wrapper parameters",7050) # New status will be saved and inner_jobs will be checked. 
wrapper_job.check_status(wrapper_job.new_status) @@ -1358,7 +1354,8 @@ class Autosubmit: job_list.save() time.sleep(safetysleeptime) except AutosubmitError as e: #If an error is detected, restore all connections and job_list, keep trying for 5 more retries - Log.error("{1} [eCode={0}]",e.code, e.message) + Log.error("Trace: {0}", e.trace) + Log.error("{1} [eCode={0}]", e.code, e.message) #Save job_list if not is a failed submitted job if "submitted" not in e.message: try: @@ -1369,19 +1366,22 @@ class Autosubmit: try: job_list = Autosubmit.load_job_list(expid, as_conf, notransitive=notransitive) except BaseException as e: - raise AutosubmitCritical("Corrupted job_list, backup couldn''t be restored", 7000, + raise AutosubmitCritical("Corrupted job_list, backup couldn''t be restored", 7040, e.message) else: # Restore from files try: job_list = Autosubmit.load_job_list(expid, as_conf, notransitive=notransitive) except BaseException as e: - raise AutosubmitCritical("Corrupted job_list, backup couldn''t be restored", 7000, + raise AutosubmitCritical("Corrupted job_list, backup couldn't be restored", 7040, e.message) if main_loop_retrials > 0: # Restore platforms and try again, to avoid endless loop with failed configuration, a hard limit is set. - Autosubmit.restore_platforms(platforms_to_test) main_loop_retrials = main_loop_retrials - 1 + try: + Autosubmit.restore_platforms(platforms_to_test) + except BaseException: + raise AutosubmitCritical("Autosubmit couldn't recover the platforms",7050, e.message) else: - raise AutosubmitCritical("Autosubmit Encounter too much errors during running time",7000,e.message) + raise AutosubmitCritical("Autosubmit Encounter too much errors during running time",7051,e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. 
Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) except portalocker.AlreadyLocked: @@ -1483,6 +1483,8 @@ class Autosubmit: valid_packages_to_submit.append(package) except (IOError, OSError): continue + except AutosubmitError as e: + raise if hasattr(package, "name"): job_list.packages_dict[package.name] = package.jobs from job.job import WrapperJob @@ -1497,9 +1499,11 @@ class Autosubmit: packages_persistence.save( package.name, package.jobs, package._expid, inspect) except WrongTemplateException as e: - raise AutosubmitCritical("Invalid parameter substitution in {0} template".format(e.job_name),7000) + raise AutosubmitCritical("Invalid parameter substitution in {0} template".format(e.job_name),7014) except AutosubmitCritical as e: raise AutosubmitCritical(e.message,e.code,e.trace) + except AutosubmitError as e: + raise except Exception as e: raise @@ -1567,7 +1571,7 @@ class Autosubmit: exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) Log.info("Getting job list...") as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) # Getting output type from configuration output_type = as_conf.get_output_type() pkl_dir = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, 'pkl') @@ -1741,7 +1745,7 @@ class Autosubmit: exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) Log.info("Loading jobs...") as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) pkl_dir = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, 'pkl') @@ -1778,9 +1782,8 @@ class Autosubmit: monitor_exp.generate_output_stats( expid, job_list, file_format, period_ini, period_fi, not hide) Log.result("Stats plot ready") - except Exception as ex: - Log.critical(str(ex)) - return False + except Exception as e: + raise AutosubmitCritical("Stats couldn't be shown",7061,e.message) else: Log.info("There are no 
{0} jobs in the period from {1} to {2}...".format( ft, period_ini, period_fi)) @@ -1808,11 +1811,10 @@ class Autosubmit: if project: autosubmit_config = AutosubmitConfig( expid, BasicConfig, ConfigParserFactory()) - autosubmit_config.check_conf_files() + autosubmit_config.check_conf_files(False) project_type = autosubmit_config.get_project_type() if project_type == "git": - autosubmit_config.check_proj() Log.info("Registering commit SHA...") autosubmit_config.set_git_project_commit(autosubmit_config) autosubmit_git = AutosubmitGit(expid[0]) @@ -1852,7 +1854,7 @@ class Autosubmit: exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) Log.info('Recovering experiment {0}'.format(expid)) pkl_dir = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, 'pkl') @@ -1860,7 +1862,7 @@ class Autosubmit: expid, as_conf, notransitive=notransitive, monitor=True) Log.debug("Job list restored from {0} files", pkl_dir) - as_conf.check_conf_files() + as_conf.check_conf_files(False) # Getting output type provided by the user in config, 'pdf' as default output_type = as_conf.get_output_type() @@ -1978,7 +1980,7 @@ class Autosubmit: Log.info('Migrating experiment {0}'.format(experiment_id)) as_conf = AutosubmitConfig( experiment_id, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) if submitter.platforms is None: @@ -1994,8 +1996,8 @@ class Autosubmit: Log.info( "Checking [{0}] from platforms configuration...", platform) if not as_conf.get_migrate_user_to(platform): - Log.critical( - "Missing directive USER_TO in [{0}]", platform) + Log.printlog( + "Missing directive USER_TO in [{0}]".format( platform),7014) error = True break if as_conf.get_migrate_project_to(platform): @@ -2067,8 +2069,8 @@ class Autosubmit: Log.debug( "The 
platform {0} does not contain absolute symlinks", platform) except BaseException: - Log.warning( - "Absolute symlinks failed to convert, check user in platform.conf") + Log.printlog( + "Absolute symlinks failed to convert, check user in platform.conf",3000) error = True break @@ -2077,13 +2079,14 @@ class Autosubmit: "Moving remote files/dirs on {0}", platform) p.send_command("chmod 777 -R " + p.root_dir) if not p.move_file(p.root_dir, os.path.join(p.temp_dir, experiment_id), True): - Log.critical("The files/dirs on {0} cannot be moved to {1}.", p.root_dir, - os.path.join(p.temp_dir, experiment_id)) + Log.printlog( + "The files/dirs on {0} cannot be moved to {1}.".format(p.root_dir, + os.path.join(p.temp_dir, experiment_id), 6012)) error = True break - except (IOError, BaseException): - Log.critical("The files/dirs on {0} cannot be moved to {1}.", p.root_dir, - os.path.join(p.temp_dir, experiment_id)) + except (IOError, BaseException) as e: + Log.printlog("The files/dirs on {0} cannot be moved to {1}.".format(p.root_dir, + os.path.join(p.temp_dir, experiment_id)),6012) error = True break @@ -2093,11 +2096,11 @@ class Autosubmit: Log.result("[{0}] from platforms configuration OK", platform) if error: - Log.critical( - "The experiment cannot be offered, reverting changes") + Log.printlog( + "The experiment cannot be offered, reverting changes",7012) as_conf = AutosubmitConfig( experiment_id, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) for platform in backup_files: p = submitter.platforms[platform] p.move_file(os.path.join( @@ -2112,8 +2115,8 @@ class Autosubmit: return False else: if not Autosubmit.archive(experiment_id, False, False): - Log.critical( - "The experiment cannot be offered,reverting changes.") + Log.printlog( + "The experiment cannot be offered, reverting changes", 7012) for platform in backup_files: p = submitter.platforms[platform] p.move_file(os.path.join( @@ -2132,12 +2135,11 @@ class 
Autosubmit: Log.info('Migrating experiment {0}'.format(experiment_id)) Log.info("Moving local files/dirs") if not Autosubmit.unarchive(experiment_id, False): - Log.critical("The experiment cannot be picked up") - return False + raise AutosubmitCritical("The experiment cannot be picked up",7012) Log.info("Local files/dirs have been successfully picked up") as_conf = AutosubmitConfig( experiment_id, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) Log.info("Checking remote platforms") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) @@ -2163,8 +2165,7 @@ class Autosubmit: "Files/dirs on {0} have been successfully picked up", platform) except (IOError, BaseException): error = True - Log.critical("The files/dirs on {0} cannot be copied to {1}.", - os.path.join(p.temp_dir, experiment_id), p.root_dir) + Log.printlog("The files/dirs on {0} cannot be copied to {1}.".format(os.path.join(p.temp_dir, experiment_id), p.root_dir),6012) break backup_files.append(platform) else: @@ -2172,8 +2173,8 @@ class Autosubmit: "Files/dirs on {0} have been successfully picked up", platform) if error: Autosubmit.archive(experiment_id, False, False) - Log.critical( - "The experiment cannot be picked,reverting changes.") + Log.printlog( + "The experiment cannot be picked,reverting changes.",7012) for platform in backup_files: p = submitter.platforms[platform] p.send_command("rm -R " + p.root_dir) @@ -2196,22 +2197,13 @@ class Autosubmit: :type experiment_id: str """ exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, experiment_id) - if not os.path.exists(exp_path): - Log.critical( - "The directory {0} is needed and does not exist.", exp_path) - Log.warning("Does an experiment with the given id exist?") - return False - as_conf = AutosubmitConfig( experiment_id, BasicConfig, ConfigParserFactory()) - if not as_conf.check_conf_files(): - return False + as_conf.check_conf_files(False) + project_type = 
as_conf.get_project_type() - if project_type != "none": - if not as_conf.check_proj(): - return False submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) @@ -2246,16 +2238,11 @@ class Autosubmit: Log.info("Describing {0}", experiment_id) exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, experiment_id) - if not os.path.exists(exp_path): - Log.critical( - "The directory {0} is needed and does not exist.", exp_path) - Log.warning("Does an experiment with the given id exist?") - return False as_conf = AutosubmitConfig( experiment_id, BasicConfig, ConfigParserFactory()) - if not as_conf.check_conf_files(): - return False + as_conf.check_conf_files(False) + user = os.stat(as_conf.experiment_file).st_uid try: user = pwd.getpwuid(user).pw_name @@ -2267,9 +2254,6 @@ class Autosubmit: os.path.getmtime(as_conf.experiment_file)) project_type = as_conf.get_project_type() - if project_type != "none": - if not as_conf.check_proj(): - return False if (as_conf.get_svn_project_url()): model = as_conf.get_svn_project_url() branch = as_conf.get_svn_project_url() @@ -2385,8 +2369,7 @@ class Autosubmit: config_file.close() Log.result("Configuration file written successfully") except (IOError, OSError) as e: - Log.critical("Can not write config file: {0}".format(e.message)) - return False + raise AutosubmitCritical("Can not write config file: {0}",7012,e.message) return True @staticmethod @@ -2406,12 +2389,9 @@ class Autosubmit: d = dialog.Dialog( dialog="dialog", autowidgetsize=True, screen_color='GREEN') except dialog.DialogError: - Log.critical(not_enough_screen_size_msg) - return False + raise AutosubmitCritical("Graphical visualization failed, not enough screen size",7060) except Exception: - Log.critical("Missing package 'dialog', please install it with: 'apt-get install dialog'" - "or provide configure arguments") - return False + raise AutosubmitCritical("Dialog libs aren't found in your Operational system",7060) d.set_background_title("Autosubmit 
configure utility") if os.geteuid() == 0: @@ -2433,8 +2413,7 @@ class Autosubmit: os.system('clear') return False except dialog.DialogError: - Log.critical(not_enough_screen_size_msg) - return False + raise AutosubmitCritical("Graphical visualization failed, not enough screen size",7060) filename = '.autosubmitrc' if level == 'All': @@ -2471,19 +2450,16 @@ class Autosubmit: jobs_conf_path = parser.get('conf', 'jobs') except (IOError, OSError) as e: - Log.critical("Can not read config file: {0}".format(e.message)) - return False + raise AutosubmitCritical("Can not read config file",7014,e.message) while True: try: code, database_path = d.dselect(database_path, width=80, height=20, title='\Zb\Z1Select path to database\Zn', colors='enable') except dialog.DialogError: - Log.critical(not_enough_screen_size_msg) - return False - + raise AutosubmitCritical("Graphical visualization failed, not enough screen size", 7060) if Autosubmit._requested_exit(code, d): - return False + raise AutosubmitCritical("Graphical visualization failed, requested exit", 7060) elif code == dialog.Dialog.OK: database_path = database_path.replace('~', home_path) if not os.path.exists(database_path): @@ -2498,11 +2474,11 @@ class Autosubmit: title='\Zb\Z1Select path to experiments repository\Zn', colors='enable') except dialog.DialogError: - Log.critical(not_enough_screen_size_msg) - return False + raise AutosubmitCritical("Graphical visualization failed, not enough screen size",7060) + if Autosubmit._requested_exit(code, d): - return False + raise AutosubmitCritical("Graphical visualization failed,requested exit",7060) elif code == dialog.Dialog.OK: database_path = database_path.replace('~', home_path) if not os.path.exists(database_path): @@ -2523,11 +2499,10 @@ class Autosubmit: form_height=10, title='\Zb\Z1Just a few more options:\Zn', colors='enable') except dialog.DialogError: - Log.critical(not_enough_screen_size_msg) - return False + raise AutosubmitCritical("Graphical visualization 
failed, not enough screen size",7060) if Autosubmit._requested_exit(code, d): - return False + raise AutosubmitCritical("Graphical visualization failed, _requested_exit", 7060) elif code == dialog.Dialog.OK: database_filename = tag[0] platforms_conf_path = tag[1] @@ -2558,11 +2533,10 @@ class Autosubmit: form_height=10, title='\Zb\Z1Mail notifications configuration:\Zn', colors='enable') except dialog.DialogError: - Log.critical(not_enough_screen_size_msg) - return False + raise AutosubmitCritical("Graphical visualization failed, not enough screen size", 7060) if Autosubmit._requested_exit(code, d): - return False + raise AutosubmitCritical("Graphical visualization failed, requested exit", 7060) elif code == dialog.Dialog.OK: smtp_hostname = tag[0] mail_from = tag[1] @@ -2594,9 +2568,7 @@ class Autosubmit: width=50, height=5) os.system('clear') except (IOError, OSError) as e: - Log.critical("Can not write config file: {0}".format(e.message)) - os.system('clear') - return False + raise AutosubmitCritical("Can not write config file", 7012,e.message) return True @staticmethod @@ -2619,12 +2591,10 @@ class Autosubmit: Log.info("Creating autosubmit database...") qry = resource_string('autosubmit.database', 'data/autosubmit.sql') if not create_db(qry): - Log.critical("Can not write database file") - return False + raise AutosubmitCritical("Can not write database file", 7004) Log.result("Autosubmit database created successfully") else: - Log.error("Database already exists.") - return False + raise AutosubmitCritical("Database already exists.", 7004) return True @staticmethod @@ -2642,10 +2612,9 @@ class Autosubmit: Autosubmit._check_ownership(expid) as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() - if not as_conf.check_expdef_conf(): - Log.critical('Can not refresh with invalid configuration') - return False - + as_conf.check_conf_files() + if "Expdef" in as_conf.wrong_config: + as_conf.show_messages() project_type = 
as_conf.get_project_type() if Autosubmit._copy_code(as_conf, expid, project_type, True): Log.result("Project folder updated") @@ -2664,9 +2633,8 @@ class Autosubmit: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) as_conf.reload() - if not as_conf.check_expdef_conf(): - Log.critical('Can not refresh with invalid configuration') - return False + as_conf.check_expdef_conf(False) + Log.info("Changing {0} experiment version from {1} to {2}", expid, as_conf.get_version(), Autosubmit.autosubmit_version) as_conf.set_version(Autosubmit.autosubmit_version) @@ -2685,11 +2653,6 @@ class Autosubmit: """ exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) - if not os.path.exists(exp_path): - Log.critical( - "The directory %s is needed and does not exist." % exp_path) - Log.warning("Does an experiment with the given id exist?") - return 1 exp_folder = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) @@ -2697,8 +2660,7 @@ class Autosubmit: # Cleaning to reduce file size. version = get_autosubmit_version(expid) if version is not None and version.startswith('3') and not Autosubmit.clean(expid, True, True, True, False): - Log.critical("Can not archive project. Clean not successful") - return False + raise AutosubmitCritical("Can not archive project. Clean not successful", 7012) # Getting year of last completed. 
If not, year of expid folder year = None @@ -2733,8 +2695,8 @@ class Autosubmit: tar.close() os.chmod(os.path.join(year_path, output_filepath), 0o755) except Exception as e: - Log.critical("Can not write tar file: {0}".format(e)) - return False + raise AutosubmitCritical("Can not write tar file", 7012,e.message) + Log.info("Tar file created!") @@ -2752,10 +2714,9 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder+"_to_delete ")) except Exception as e: - Log.critical( - "Can not remove or rename experiments folder: {0}".format(e)) + Autosubmit.unarchive(expid, compress, True) - return False + raise AutosubmitCritical("Can not remove or rename experiments folder",7012,e.message) Log.result("Experiment archived successfully") return True @@ -2803,7 +2764,7 @@ class Autosubmit: tar.close() except Exception as e: shutil.rmtree(exp_folder, ignore_errors=True) - Log.critical("Can not extract tar file: {0}".format(e)) + Log.printlog("Can not extract tar file: {0}".format(e),6012) return False Log.info("Unpacking finished") @@ -2811,7 +2772,7 @@ class Autosubmit: try: os.remove(archive_path) except Exception as e: - Log.error("Can not remove archived file folder: {0}".format(e)) + Log.printlog("Can not remove archived file folder: {0}".format(e),7012) return False Log.result("Experiment {0} unarchived successfully", experiment_id) @@ -2868,17 +2829,6 @@ class Autosubmit: Autosubmit._check_ownership(expid) exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) - aslogs_path = os.path.join(tmp_path, BasicConfig.LOCAL_ASLOG_DIR) - if not os.path.exists(aslogs_path): - os.mkdir(aslogs_path) - os.chmod(aslogs_path, 0o775) - else: - os.chmod(aslogs_path, 0o775) - if not os.path.exists(exp_path): - Log.critical( - "The directory %s is needed and does not exist." 
% exp_path) - Log.warning("Does an experiment with the given id exist?") - return 1 # checking if there is a lock file to avoid multiple running on the same expid try: @@ -2888,7 +2838,7 @@ class Autosubmit: Log.info("Preparing .lock file to avoid multiple instances with same expid.") as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.check_conf_files() + as_conf.check_conf_files(False) project_type = as_conf.get_project_type() # Getting output type provided by the user in config, 'pdf' as default output_type = as_conf.get_output_type() @@ -2900,9 +2850,6 @@ class Autosubmit: Autosubmit._create_project_associated_conf( as_conf, False, update_job) - if project_type != "none": - # Check project configuration - as_conf.check_proj() # Load parameters Log.info("Loading parameters...") @@ -2910,14 +2857,12 @@ class Autosubmit: date_list = as_conf.get_date_list() if len(date_list) != len(set(date_list)): - Log.error('There are repeated start dates!') - return False + raise AutosubmitCritical('There are repeated start dates!',7014) num_chunks = as_conf.get_num_chunks() chunk_ini = as_conf.get_chunk_ini() member_list = as_conf.get_member_list() if len(member_list) != len(set(member_list)): - Log.error('There are repeated member names!') - return False + raise AutosubmitCritical("There are repeated member names!") rerun = as_conf.get_rerun() Log.info("\nCreating the jobs list...") @@ -3021,10 +2966,12 @@ class Autosubmit: signal.signal(signal.SIGINT, signal_handler_create) fh.flush() os.fsync(fh.fileno()) - raise AutosubmitCritical("Stopped by user input", 7000) + raise AutosubmitCritical("Stopped by user input", 7010) except portalocker.AlreadyLocked: message = "We have detected that there is another Autosubmit instance using the experiment\n. 
Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message,7000) + except AutosubmitCritical as e: + raise AutosubmitCritical(e.message,e.code) @staticmethod def _copy_code(as_conf, expid, project_type, force): @@ -3045,7 +2992,10 @@ class Autosubmit: if project_type == "git": submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) - hpcarch = submitter.platforms[as_conf.get_platform().lower()] + try: + hpcarch = submitter.platforms[as_conf.get_platform()] + except: + raise AutosubmitCritical("Can't set main platform",7014) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": svn_project_url = as_conf.get_svn_project_url() @@ -3068,10 +3018,10 @@ class Autosubmit: svn_project_revision + " " + svn_project_url + " " + project_destination, shell=True) except subprocess.CalledProcessError: - Log.error("Can not check out revision {0} into {1}", svn_project_revision + " " + svn_project_url, - project_path) + shutil.rmtree(project_path, ignore_errors=True) - return False + raise AutosubmitCritical("Can not check out revision {0} into {1}".format(svn_project_revision + " " + svn_project_url, + project_path),7062) Log.debug("{0}", output) elif project_type == "local": @@ -3089,19 +3039,17 @@ class Autosubmit: local_project_path+"/* "+local_destination] subprocess.call(cmd, shell=True) except subprocess.CalledProcessError: - Log.error( - "Can not synchronize {0} into {1}. Exiting...", local_project_path, project_path) - return False + raise AutosubmitCritical("Can not rsync {0} into {1}. Exiting...".format( + local_project_path, project_path), 7063) else: os.mkdir(local_destination) try: output = subprocess.check_output( "cp -R " + local_project_path + "/* " + local_destination, shell=True) except subprocess.CalledProcessError: - Log.error( - "Can not copy {0} into {1}. 
Exiting...", local_project_path, project_path) shutil.rmtree(project_path) - return False + raise AutosubmitCritical("Can not copy {0} into {1}. Exiting...".format( + local_project_path, project_path), 7063) else: os.mkdir(project_path) os.mkdir(local_destination) @@ -3113,10 +3061,9 @@ class Autosubmit: output = subprocess.check_output( "cp -R " + local_project_path + "/* " + local_destination, shell=True) except subprocess.CalledProcessError: - Log.error( - "Can not copy {0} into {1}. Exiting...", local_project_path, project_path) shutil.rmtree(project_path) - return False + raise AutosubmitCritical( + "Can not copy {0} into {1}. Exiting...".format( local_project_path, project_path), 7063) Log.debug("{0}", output) return True @@ -3173,12 +3120,6 @@ class Autosubmit: Autosubmit._check_ownership(expid) exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) - if not os.path.exists(exp_path): - Log.critical( - "The directory %s is needed and does not exist." % exp_path) - Log.warning("Does an experiment with the given id exist?") - return 1 - # checking if there is a lock file to avoid multiple running on the same expid try: with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): @@ -3195,9 +3136,8 @@ class Autosubmit: wrongExpid = 0 as_conf = AutosubmitConfig( expid, BasicConfig, ConfigParserFactory()) - if not as_conf.check_conf_files(): - Log.critical('Can not run with invalid configuration') - return False + as_conf.check_conf_files(False) + # Getting output type from configuration output_type = as_conf.get_output_type() @@ -3236,10 +3176,7 @@ class Autosubmit: ".\n\tProcess stopped. Review the format of the provided input. Comparison is case sensitive." + \ "\n\tRemember that this option expects section names separated by a blank space as input." 
- Log.info(section_validation_message) - Log.critical("Error in the supplied input for -ft.") - return False - + raise AutosubmitCritical("Error in the supplied input for -ft.",7011,section_validation_message) job_list = Autosubmit.load_job_list( expid, as_conf, notransitive=notransitive) submitter = Autosubmit._get_submitter(as_conf) @@ -3284,9 +3221,7 @@ class Autosubmit: job_validation_message += "\n\tSpecified job(s) : [" + str(job_not_foundList) + "] not found in the experiment " + \ str(expid) + ". \n\tProcess stopped. Review the format of the provided input. Comparison is case sensitive." + \ "\n\tRemember that this option expects job names separated by a blank space as input." - Log.info(job_validation_message) - Log.critical("Error in the supplied input for -fl.") - return False + raise AutosubmitCritical("Error in the supplied input for -ft.",7011,section_validation_message) # Validating fc if filter_chunks -fc has been set: if filter_chunks is not None: @@ -3350,10 +3285,7 @@ class Autosubmit: # Ending validation if fc_filter_is_correct == False: - Log.info(fc_validation_message) - Log.critical("Error in the supplied input for -fc.") - return False - + raise AutosubmitCritical("Error in the supplied input for -fc.",7011,section_validation_message) # Validating status, if filter_status -fs has been set: # At this point we already have job_list from where we are getting the allows STATUS if filter_status is not None: @@ -3385,9 +3317,8 @@ class Autosubmit: status_validation_message += "\n\t There are no jobs with status " + \ status + " in this experiment." 
if status_validation_error == True: - Log.info(status_validation_message) - Log.critical("Error in the supplied input for -fs.") - return False + raise AutosubmitCritical("Error in the supplied input for -fs.",7011,section_validation_message) + jobs_filtered = [] final_status = Autosubmit._get_status(final) if filter_section or filter_chunks: @@ -3472,9 +3403,7 @@ class Autosubmit: # Ending validation if filter_is_correct == False: - Log.info(validation_message) - Log.critical("Error in the supplied input for -ftc.") - return False + raise AutosubmitCritical("Error in the supplied input for -ftc.", 7011, section_validation_message) # If input is valid, continue. record = dict() @@ -3572,8 +3501,6 @@ class Autosubmit: "-d option: Experiment has too many jobs to be printed in the terminal. Maximum job quantity is 1000, your experiment has " + str(current_length) + " jobs.") else: Log.info(job_list.print_with_status(statusChange = performed_changes)) - Log.status(job_list.print_with_status(statusChange = performed_changes)) - else: Log.warning("No changes were performed.") # End of New Feature @@ -3654,12 +3581,8 @@ class Autosubmit: if save and wrongExpid == 0: job_list.save() else: - Log.warning( - "Changes NOT saved to the JobList!!!!: use -s option to save") - if wrongExpid > 0: - - Log.error( - "Save disabled due invalid expid, please check or/and jobs expid name") + Log.printlog( + "Changes NOT saved to the JobList!!!!: use -s option to save",3000) if as_conf.get_wrapper_type() != 'none' and check_wrapper: packages_persistence = JobPackagePersistence(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), @@ -3868,11 +3791,8 @@ class Autosubmit: communications_library = as_conf.get_communications_library() if communications_library == 'paramiko': return ParamikoSubmitter() - - # communications library not known - Log.error( - 'You have defined a not valid communications library on the configuration file') - raise Exception('Communications library not known') + 
else: + return ParamikoSubmitter()# only paramiko is avaliable right now so.. @staticmethod def _get_job_list_persistence(expid, as_conf): @@ -3888,7 +3808,7 @@ class Autosubmit: elif storage_type == 'db': return JobListPersistenceDb(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_list_" + expid) - raise AutosubmitCritical('Storage type not known',7000) + raise AutosubmitCritical('Storage type not known',7014) @staticmethod def _create_json(text): @@ -3996,8 +3916,7 @@ class Autosubmit: exp_parser = as_conf.get_parser( ConfigParserFactory(), as_conf.experiment_file) if exp_parser.get_bool_option('rerun', "RERUN", True): - Log.error('Can not test a RERUN experiment') - return False + raise AutosubmitCritical('Can not test a RERUN experiment',7014) content = open(as_conf.experiment_file).read() if random_select: @@ -4009,8 +3928,8 @@ class Autosubmit: if platforms_parser.get_option(section, 'TEST_SUITE', 'false').lower() == 'true': test_platforms.append(section) if len(test_platforms) == 0: - Log.critical('No test HPC defined') - return False + raise AutosubmitCritical("Missing hpcarch setting in expdef",7014) + hpc = random.choice(test_platforms) if member is None: member = random.choice(exp_parser.get( @@ -4105,9 +4024,12 @@ class Autosubmit: hpcarch = as_conf.get_platform() submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) - if submitter.platforms is None: - raise AutosubmitCritical("platforms couldn't be loaded",7000) + try: + submitter.load_platforms(as_conf) + if submitter.platforms is None: + raise AutosubmitCritical("platforms couldn't be loaded",7014) + except: + raise AutosubmitCritical("platforms couldn't be loaded", 7014) platforms = submitter.platforms platforms_to_test = set() @@ -4134,18 +4056,7 @@ class Autosubmit: if job.platform.get_completed_files(job.name, 0): job.status = Status.COMPLETED - Log.info("CHANGED job '{0}' status to COMPLETED".format(job.name)) - Log.status("CHANGED job '{0}' status to 
COMPLETED".format(job.name)) - - #elif job.status != Status.SUSPENDED: - # job.status = Status.WAITING - # job.fail_count = 0 - # Log.info("CHANGED job '{0}' status to WAITING".format(job.name)) job.platform.get_logs_files(expid, job.remote_logs) - - #end = datetime.datetime.now() - #Log.info("Time spent: '{0}'".format(end - start)) - #Log.info("Updating the jobs list") return job_list diff --git a/autosubmit/config/basicConfig.py b/autosubmit/config/basicConfig.py index cd031d101264ff85feff7d5ce57ede4268478e6c..0cecd90585a8df41d798c2959c6dc9e91337d81f 100755 --- a/autosubmit/config/basicConfig.py +++ b/autosubmit/config/basicConfig.py @@ -29,7 +29,7 @@ from log.log import Log, AutosubmitError,AutosubmitCritical class BasicConfig: """ - Class to manage configuration for autosubmit path, database and default values for new experiments + Class to manage configuration for Autosubmit path, database and default values for new experiments """ def __init__(self): diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 363341e556df2670023ec4f6cfa5f71cb1b43edb..e44f11e67a555fbd3407cd2b36538a11e68b010c 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -60,7 +60,6 @@ class AutosubmitConfig(object): self._proj_parser = None self._proj_parser_file = os.path.join(self.basic_config.LOCAL_ROOT_DIR, expid, "conf", "proj_" + expid + ".conf") - #self.check_proj_file() self.ignore_file_path = False self.wrong_config = defaultdict(list) self.warn_config = defaultdict(list) @@ -352,8 +351,32 @@ class AutosubmitConfig(object): :rtype: str """ return str(self._jobs_parser.get_option(section, 'CUSTOM_DIRECTIVES', '')) + def show_messages(self): - def check_conf_files(self): + if len(self.warn_config.keys()) == 0 and len(self.wrong_config.keys()) == 0: + Log.result("Configuration files OK\n") + elif len(self.warn_config.keys()) > 0 and len(self.wrong_config.keys()) == 0: + Log.result("Configuration files contain 
some issues ignored") + if len(self.warn_config.keys()) > 0: + message = "In Configuration files:\n" + for section in self.warn_config: + message += "Issues in [{0}] config file:".format(section) + for parameter in self.warn_config[section]: + message += "\n[{0}] {1} ".format(parameter[0],parameter[1]) + message += "\n" + Log.printlog(message,6013) + + if len(self.wrong_config.keys()) > 0: + message = "On Configuration files:\n" + for section in self.wrong_config: + message += "Critical Issues on [{0}] config file:".format(section) + for parameter in self.wrong_config[section]: + message += "\n[{0}] {1}".format(parameter[0], parameter[1]) + message += "\n" + raise AutosubmitCritical(message,7014) + else: + return True + def check_conf_files(self,check_file=False): """ Checks configuration files (autosubmit, experiment jobs and platforms), looking for invalid values, missing required options. Prints results in log @@ -362,41 +385,29 @@ class AutosubmitConfig(object): :rtype: bool """ Log.info('\nChecking configuration files...') + self.ignore_file_path = check_file self.reload() + #Annotates all errors found in the configuration files in dictionaries self.warn_config and self.wrong_config. 
self.check_expdef_conf() self.check_platforms_conf() self.check_jobs_conf() self.check_autosubmit_conf() - try: if self.get_project_type() != "none": # Check proj configuration self.check_proj() except: - pass # test doesn't check proj - if len(self.warn_config.keys()) == 0 and len(self.wrong_config.keys()) == 0: - Log.result("Configuration files OK\n") - elif len(self.warn_config.keys()) > 0 and len(self.wrong_config.keys()) == 0: - Log.result("Configuration files contains some issues ignored") - if len(self.warn_config.keys()) > 0: - message = "On Configuration files:\n" - for section in self.warn_config: - message += "Issues on [{0}] config file:".format(section) - for parameter in self.warn_config[section]: - message += "\n[{0}] {1} ".format(parameter[0],parameter[1]) - message += "\n" - Log.printlog(message,6000) + pass # This exception is in case that the experiment doesn't contains any file ( usefull for test the workflow with None Option) + # End of checkers. - if len(self.wrong_config.keys()) > 0: - message = "On Configuration files:\n" - for section in self.wrong_config: - message += "Critical Issues on [{0}] config file:".format(section) - for parameter in self.wrong_config[section]: - message += "\n[{0}] {1}".format(parameter[0], parameter[1]) - message += "\n" - raise AutosubmitCritical(message,7000) - else: - return True + # This Try/Except is in charge of print all the info gathered by all the checkers and stop the program if any critical error is found. + try: + result = self.show_messages() + return result + except AutosubmitCritical as e: + raise AutosubmitCritical(e.message,e.code,e.trace) # In case that there are critical errors in the configuration, Autosubmit won't continue. 
+ except Exception as e: + raise AutosubmitCritical("There was an error while showing the config log messages",7014,e.message) def check_autosubmit_conf(self): """ @@ -446,8 +457,10 @@ class AutosubmitConfig(object): if len(self._platforms_parser.sections()) != len(set(self._platforms_parser.sections())): self.wrong_config["Platform"]+=[["Global", "Platforms found multiple times"]] - + main_platform_found = False for section in self._platforms_parser.sections(): + if section in self.hpcarch: + main_platform_found= True if not self._platforms_parser.check_exists(section, 'TYPE'): self.wrong_config["Platform"]+=[[section, "Mandatory TYPE parameter not found"]] platform_type = self._platforms_parser.get_option(section, 'TYPE', '').lower() @@ -468,6 +481,8 @@ class AutosubmitConfig(object): self.wrong_config["Platform"]+=[[ section, "Mandatory MAX_WAITING_JOBS parameter not found or non-integer"]] if not self._platforms_parser.check_is_int(section, 'TOTAL_JOBS', False): self.wrong_config["Platform"]+=[[ section, "Mandatory TOTAL_JOBS parameter not found or non-integer"]] + if not main_platform_found: + self.wrong_config["Expdef"] += [["Default", "Main platform is not defined! 
check if [HPCARCH = {0}] has any typo".format(self.hpcarch)]] if "Platform" not in self.wrong_config: Log.result('{0} OK'.format(os.path.basename(self._platforms_parser_file))) return True @@ -493,13 +508,13 @@ class AutosubmitConfig(object): else: section_file_path = parser.get_option(section,'FILE') try: - if not self.ignore_file_path: - if not os.path.exists(section_file_path): + if self.ignore_file_path: + if not os.path.exists(os.path.join(self.get_project_dir(),section_file_path)): if parser.check_exists(section, 'CHECK'): if not parser.get_option(section, 'CHECK') in "on_submission": - self.wrong_config["Jobs"] += [[section, "FILE path doesn't exists, check parameter is found however is not in on_submission value"]] + self.wrong_config["Jobs"] += [[section, "FILE {0} doesn't exist and check parameter is not set on_submission value".format(section_file_path)]] else: - self.wrong_config["Jobs"] += [[section, "FILE path doesn't exists"]] + self.wrong_config["Jobs"] += [[section, "FILE {0} doesn't exist".format(os.path.join(self.get_project_dir(),section_file_path))]] except BaseException: pass # tests conflict quick-patch if not parser.check_is_boolean(section, 'RERUN_ONLY', False): @@ -550,7 +565,11 @@ class AutosubmitConfig(object): if not parser.check_exists('DEFAULT', 'HPCARCH'): self.wrong_config["Expdef"]+=[['DEFAULT', "Mandatory HPCARCH parameter is invalid"]] - + else: + try: + self.hpcarch = self.get_platform() + except: + self.wrong_config["Expdef"] += [['Default', "HPCARCH value is not a valid platform (check typo)"]] if not parser.check_exists('experiment', 'DATELIST'): self.wrong_config["Expdef"]+=[['DEFAULT', "Mandatory DATELIST parameter is invalid"]] if not parser.check_exists('experiment', 'MEMBERS'): @@ -588,7 +607,7 @@ class AutosubmitConfig(object): if not parser.check_exists('local', 'PROJECT_PATH'): self.wrong_config["Expdef"]+=[['local', "PROJECT_PATH parameter is invalid"]] elif project_type == 'none': #debug propouses - 
self.ignore_file_path = True + self.ignore_file_path = False if project_type != 'none': if not parser.check_exists('project_files', 'FILE_PROJECT_CONF'): @@ -596,6 +615,7 @@ class AutosubmitConfig(object): else: self.wrong_config["Expdef"]+=[['project', "Mandatory project choice is invalid"]] + if "Expdef" not in self.wrong_config: Log.result('{0} OK'.format(os.path.basename(self._exp_parser_file))) return True @@ -814,14 +834,14 @@ class AutosubmitConfig(object): output = subprocess.check_output("cd {0}; git rev-parse --abbrev-ref HEAD".format(full_project_path), shell=True) except subprocess.CalledProcessError as e: - raise AutosubmitCritical("Failed to retrieve project branch...",7000,e.message) + raise AutosubmitCritical("Failed to retrieve project branch...",7014,e.message) project_branch = output Log.debug("Project branch is: " + project_branch) try: output = subprocess.check_output("cd {0}; git rev-parse HEAD".format(full_project_path), shell=True) except subprocess.CalledProcessError as e: - raise AutosubmitCritical("Failed to retrieve project commit SHA...", 7000,e.message) + raise AutosubmitCritical("Failed to retrieve project commit SHA...", 7014,e.message) Log.critical("Failed to retrieve project commit SHA...") project_sha = output Log.debug("Project commit SHA is: " + project_sha) diff --git a/autosubmit/database/db_common.py b/autosubmit/database/db_common.py index c9d9ded05f450698a3e639e3981f28a235a9c4d0..cfc1c7bedaa98a552660f64556d531e36d7ffa20 100644 --- a/autosubmit/database/db_common.py +++ b/autosubmit/database/db_common.py @@ -40,14 +40,14 @@ def create_db(qry): try: (conn, cursor) = open_conn(False) except DbException as e: - raise AutosubmitCritical('Connection to database could not be established',7000,e.message) + raise AutosubmitCritical("Could not establish a connection to database",7001,e.message) try: cursor.executescript(qry) except sqlite3.Error as e: close_conn(conn, cursor) - raise AutosubmitCritical('Database can not be 
created',7000,e.message) + raise AutosubmitCritical('Database can not be created',7004,e.message) conn.commit() close_conn(conn, cursor) @@ -62,7 +62,7 @@ def check_db(): """ if not os.path.exists(BasicConfig.DB_PATH): - raise AutosubmitCritical('DB path does not exists: {0}'.format(BasicConfig.DB_PATH),7000) + raise AutosubmitCritical('DB path does not exists: {0}'.format(BasicConfig.DB_PATH),7003) return True @@ -100,12 +100,12 @@ def open_conn(check_version=True): # If database version is not the expected, update database.... if version < CURRENT_DATABASE_VERSION: if not _update_database(version, cursor): - raise AutosubmitCritical('Database version doesn''t match', 7000) + raise AutosubmitCritical('Database version doesn''t match', 7001) # ... or ask for autosubmit upgrade elif version > CURRENT_DATABASE_VERSION: raise AutosubmitCritical('Database version is not compatible with this autosubmit version. Please execute pip install ' - 'autosubmit --upgrade', 7000) + 'autosubmit --upgrade', 7002) return conn, cursor @@ -140,14 +140,14 @@ def save_experiment(name, description, version): try: (conn, cursor) = open_conn() except DbException as e: - raise AutosubmitCritical('Connection to database could not be established',7000,e.message) + raise AutosubmitCritical("Could not establish a connection to database",7001,e.message) try: cursor.execute('INSERT INTO experiment (name, description, autosubmit_version) VALUES (:name, :description, ' ':version)', {'name': name, 'description': description, 'version': version}) except sqlite3.IntegrityError as e: close_conn(conn, cursor) - raise AutosubmitCritical('Couldn''t register experiment',7000,e.message) + raise AutosubmitCritical('Couldn''t register experiment',7005,e.message) conn.commit() @@ -171,7 +171,7 @@ def check_experiment_exists(name, error_on_inexistence=True): try: (conn, cursor) = open_conn() except DbException as e: - raise AutosubmitCritical('Connection to database could not be established',7000,e.message) + 
raise AutosubmitCritical("Could not establish a connection to database",7001,e.message) conn.isolation_level = None # SQLite always return a unicode object, but we can change this @@ -182,7 +182,7 @@ def check_experiment_exists(name, error_on_inexistence=True): close_conn(conn, cursor) if row is None: if error_on_inexistence: - raise AutosubmitCritical('The experiment name "{0}" does not exist yet!!!', 7000) + raise AutosubmitCritical('The experiment name "{0}" does not exist yet!!!', 7005) return False return True @@ -202,7 +202,7 @@ def get_autosubmit_version(expid): try: (conn, cursor) = open_conn() except DbException as e: - raise AutosubmitCritical('Connection to database could not be established',7000,e.message) + raise AutosubmitCritical("Could not establish a connection to database",7001,e.message) conn.isolation_level = None # SQLite always return a unicode object, but we can change this @@ -212,7 +212,7 @@ def get_autosubmit_version(expid): row = cursor.fetchone() close_conn(conn, cursor) if row is None: - raise AutosubmitCritical('The experiment "{0}" does not exist'.format(expid),7000) + raise AutosubmitCritical('The experiment "{0}" does not exist'.format(expid),7005) return row[0] @@ -232,7 +232,7 @@ def last_name_used(test=False, operational=False): try: (conn, cursor) = open_conn() except DbException as e: - raise AutosubmitCritical('Connection to database could not be established',7000,e.message) + raise AutosubmitCritical("Could not establish a connection to database",7001,e.message) conn.text_factory = str if test: cursor.execute('SELECT name ' @@ -281,7 +281,7 @@ def delete_experiment(experiment_id): try: (conn, cursor) = open_conn() except DbException as e: - raise AutosubmitCritical('Connection to database could not be established',7000,e.message) + raise AutosubmitCritical("Could not establish a connection to database",7001,e.message) return False cursor.execute('DELETE FROM experiment ' 'WHERE name=:name', {'name': experiment_id}) @@ -317,7 
+317,7 @@ def _update_database(version, cursor): 'WHERE autosubmit_version NOT NULL;') cursor.execute('UPDATE db_version SET version={0};'.format(CURRENT_DATABASE_VERSION)) except sqlite3.Error as e: - raise AutosubmitCritical('unable to update database version', 7000,e.message) + raise AutosubmitCritical('unable to update database version', 7001,e.message) Log.info("Update completed") return True diff --git a/autosubmit/git/autosubmit_git.py b/autosubmit/git/autosubmit_git.py index d2ef046dd877aed63ecb160ec6e5894cb0eb6b62..a4fd238470f14f716ae2b6f7aab91d5fb3a9cc23 100644 --- a/autosubmit/git/autosubmit_git.py +++ b/autosubmit/git/autosubmit_git.py @@ -56,16 +56,16 @@ class AutosubmitGit: output = subprocess.check_output("cd {0}; git diff-index HEAD --".format(dirname_path), shell=True) except subprocess.CalledProcessError as e: - raise AutosubmitCritical("Failed to retrieve git info ...",7000,e.message) + raise AutosubmitCritical("Failed to retrieve git info ...",7064,e.message) if output: Log.info("Changes not committed detected... SKIPPING!") - raise AutosubmitCritical("Commit needed!",7000) + raise AutosubmitCritical("Commit needed!",7013) else: output = subprocess.check_output("cd {0}; git log --branches --not --remotes".format(dirname_path), shell=True) if output: Log.info("Changes not pushed detected... 
SKIPPING!") - raise AutosubmitCritical("Synchronization needed!", 7000) + raise AutosubmitCritical("Synchronization needed!", 7064) else: if not as_conf.set_git_project_commit(as_conf): return False @@ -124,7 +124,7 @@ class AutosubmitGit: :return: True if clone was successful, False otherwise """ if not as_conf.is_valid_git_repository(): - raise AutosubmitCritical("Incorrect Git Configuration, check origin,commit and branch settings of expdef file", 7000) + raise AutosubmitCritical("Incorrect git Configuration, check origin,commit and branch settings of expdef file", 7064) git_project_origin = as_conf.get_git_project_origin() git_project_branch = as_conf.get_git_project_branch() git_remote_project_path = as_conf.get_git_remote_project_root() @@ -219,5 +219,5 @@ class AutosubmitGit: except subprocess.CalledProcessError as e: shutil.rmtree(project_path) - raise AutosubmitCritical("Can not clone {0} into {1}".format(git_project_branch + " " + git_project_origin, project_path), 7000,e.message) + raise AutosubmitCritical("Can not clone {0} into {1}".format(git_project_branch + " " + git_project_origin, project_path), 7065,e.message) return True diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index fff538682e5a6fdd9149e2eb76ef0a7b743a6df6..2a4f8bfaa88f8e4cec4d0ba8b8fe31dba5acbca7 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -18,7 +18,7 @@ # along with Autosubmit. If not, see . """ -Main module for autosubmit. Only contains an interface class to all functionality implemented on autosubmit +Main module for Autosubmit. 
Only contains an interface class to all functionality implemented on Autosubmit """ import os @@ -101,7 +101,7 @@ class Job(object): self.scratch_free_space = None self.custom_directives = [] self.undefined_variables = None - + self.log_retries = 5 self.id = job_id self.file = None self._local_logs = ('', '') @@ -518,28 +518,41 @@ class Job(object): retries = 3 sleeptime = 5 i = 0 - while (not out_exist or not err_exist) and i < retries: - out_exist = platform.check_file_exists( - remote_logs[0]) # will do 5 retries - err_exist = platform.check_file_exists( - remote_logs[1]) # will do 5 retries - if not out_exist or not err_exist: - sleeptime = sleeptime + 5 - i = i + 1 - sleep(sleeptime) - if out_exist and err_exist: - if copy_remote_logs: - if local_logs != remote_logs: - # unifying names for log files - self.synchronize_logs(platform, remote_logs, local_logs) - remote_logs = local_logs - platform.get_logs_files(self.expid, remote_logs) - # Update the logs with Autosubmit Job Id Brand - for local_log in local_logs: - platform.write_jobid(self.id, os.path.join( - self._tmp_path, 'LOG_' + str(self.expid), local_log)) - platform.closeConnection() - sleep(2) + sleep(10) + try: + while (not out_exist or not err_exist) and i < retries: + try: + out_exist = platform.check_file_exists(remote_logs[0]) # will do 5 retries + err_exist = platform.check_file_exists(remote_logs[1]) # will do 5 retries + except AutosubmitError as e: + out_exist = False + err_exist = False + pass + if not out_exist or not err_exist: + sleeptime = sleeptime + 5 + i = i + 1 + sleep(sleeptime) + if i >= retries: + raise AutosubmitError("Failed to retrieve log files",6001) + if out_exist and err_exist: + if copy_remote_logs: + if local_logs != remote_logs: + # unifying names for log files + self.synchronize_logs(platform, remote_logs, local_logs) + remote_logs = local_logs + platform.get_logs_files(self.expid, remote_logs) + # Update the logs with Autosubmit Job Id Brand + for local_log in 
local_logs: + platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except AutosubmitError as e: + Log.printlog("Failed to retrieve log file for job {0}".format(self.name), 6001) + except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error + Log.printlog("Failed to retrieve log file for job {0}".format(self.name), 6001) + try: + platform.closeConnection() + except: + pass + sleep(5) # safe wait before end a thread return def update_status(self, copy_remote_logs=False): @@ -584,12 +597,12 @@ class Job(object): self.platform.get_completed_files(self.name) self.check_completion(Status.UNKNOWN) if self.status == Status.UNKNOWN: - Log.printlog("Job {0} is UNKNOWN. Checking completed files to confirm the failure...".format(self.name),6000) + Log.printlog("Job {0} is UNKNOWN. Checking completed files to confirm the failure...".format(self.name),6009) elif self.status == Status.COMPLETED: Log.result("Job {0} is COMPLETED", self.name) elif self.status == Status.SUBMITTED: # after checking the jobs , no job should have the status "submitted" - Log.printlog("Job {0} in SUBMITTED. This should never happen on this step..".format(self.name),6000) + Log.printlog("Job {0} in SUBMITTED status. 
This should never happen on this step..".format(self.name),6008) if previous_status != Status.RUNNING and self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN, Status.RUNNING]: @@ -610,7 +623,7 @@ class Job(object): @staticmethod def _get_submitter(as_conf): """ - Returns the submitter corresponding to the communication defined on autosubmit's config file + Returns the submitter corresponding to the communication defined on Autosubmit's config file :return: submitter :rtype: Submitter @@ -619,7 +632,7 @@ class Job(object): if communications_library == 'paramiko': return ParamikoSubmitter() # communications library not known - raise AutosubmitCritical( 'You have defined a not valid communications library on the configuration file', 7000) + raise AutosubmitCritical( 'You have defined a not valid communications library on the configuration file', 7014) def update_children_status(self): children = list(self.children) @@ -640,7 +653,7 @@ class Job(object): if os.path.exists(log_name): self.status = Status.COMPLETED else: - Log.printlog("Job {0} completion check failed. There is no COMPLETED file".format(self.name),6000) + Log.printlog("Job {0} completion check failed. There is no COMPLETED file".format(self.name),6009) self.status = default_status def update_parameters(self, as_conf, parameters, @@ -835,7 +848,7 @@ class Job(object): if communications_library == 'paramiko': return self._get_paramiko_template(snippet, template) else: - raise AutosubmitCritical("Job {0} does not have an correct template// template not found".format(self.name),7000) + raise AutosubmitCritical("Job {0} does not have an correct template// template not found".format(self.name),7014) def _get_paramiko_template(self, snippet, template): current_platform = self.platform @@ -1092,6 +1105,7 @@ class WrapperJob(Job): self.job_list = job_list # divide jobs in dictionary by state? 
self.wallclock = total_wallclock + self.num_processors = num_processors self.running_jobs_start = OrderedDict() self.platform = platform @@ -1170,7 +1184,7 @@ class WrapperJob(Job): reason = self.platform.parse_queue_reason( self.platform._ssh_output, self.id) if self._queuing_reason_cancel(reason): - Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(self.name,reason),6000) + Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(self.name,reason),6009) self.cancel_failed_wrapper_job() self.update_failed_jobs() return @@ -1207,7 +1221,7 @@ class WrapperJob(Job): start_time = self.running_jobs_start[job] if self._is_over_wallclock(start_time, job.wallclock): # if self.as_config.get_wrapper_type() in ['vertical', 'horizontal']: - Log.printlog("Job {0} inside wrapper {1} is running for longer than it's wallclock! Cancelling...".format(job.name,self.name),6000) + Log.printlog("Job {0} inside wrapper {1} is running for longer than it's wallclock! 
Cancelling...".format(job.name,self.name),6009) job.new_status = Status.FAILED job.update_status(self.as_config.get_copy_remote_logs() == 'true') return True @@ -1278,7 +1292,7 @@ done job) if over_wallclock: Log.printlog( - "Job {0} is FAILED".format(jobname),6000) + "Job {0} is FAILED".format(jobname),6009) elif len(out) == 3: end_time = self._check_time(out, 2) @@ -1318,8 +1332,7 @@ done self._check_finished_job(job) def cancel_failed_wrapper_job(self): - Log.error("Cancelling job with id {0}".format(self.id)) - Log.printlog("Cancelling job with id {0}".format(self.id),6000) + Log.printlog("Cancelling job with id {0}".format(self.id),6009) self.platform.send_command( self.platform.cancel_cmd + " " + str(self.id)) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index f11e702e8aa44f90b6dab51e4396a3dace5a2950..724432d8b524d9661397cd2d1405d8d7b4126d8a 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -940,7 +940,7 @@ class JobList: Status.SUBMITTED and not job.status == Status.READY] if len(tmp) == len(active): # IF only held jobs left without dependencies satisfied if len(tmp) != 0 and len(active) != 0: - raise AutosubmitCritical("Only Held Jobs active,Exiting Autosubmit (TIP: This can happen if suspended or/and Failed jobs are found on the workflow)",7000) + raise AutosubmitCritical("Only Held Jobs active. 
Exiting Autosubmit (TIP: This can happen if suspended or/and Failed jobs are found on the workflow)",7066) active = [] return active @@ -1031,7 +1031,7 @@ class JobList: else: return list() except IOError: - Log.printlog("Autosubmit will use a backup for recover the job_list",6000) + Log.printlog("Autosubmit will use a backup for recover the job_list",6010) return list() def load(self): diff --git a/autosubmit/job/job_list_persistence.py b/autosubmit/job/job_list_persistence.py index 2a100e9d45b945deca80baff96f68e5d71d82b95..13eabf88d0f994c71b1791b8fa81b694a39030ae 100644 --- a/autosubmit/job/job_list_persistence.py +++ b/autosubmit/job/job_list_persistence.py @@ -72,7 +72,7 @@ class JobListPersistencePkl(JobListPersistence): fd = open(path, 'r') return pickle.load(fd) else: - Log.printlog('File {0} does not exist'.format(path),7000) + Log.printlog('File {0} does not exist'.format(path),7040) return list() def save(self, persistence_path, persistence_file, job_list): diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 13942821c3a3a4b48f046f16af54acd685dda06b..00c4cbada645b409fe86badf59b5b06ae80c2afe 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -93,7 +93,7 @@ class JobPackageBase(object): exit=True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - raise AutosubmitCritical("check=on_submission parameter didn't generate the template {0}".format(job.name),7000) + raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) Log.warning("On submission script has some empty variables") diff --git a/autosubmit/monitor/diagram.py b/autosubmit/monitor/diagram.py index 11ab04cb642d0e1281b8bfba8af3e588c0448e3c..b2497347d5d2b778b22e4032bd3761662bde7cc6 100644 --- 
a/autosubmit/monitor/diagram.py +++ b/autosubmit/monitor/diagram.py @@ -41,9 +41,9 @@ def create_bar_diagram(experiment_id, jobs_list, general_stats, output_file, per # Creating stats figure + sanity check if num_plots > MAX_NUM_PLOTS: message = "The results are too large to be shown, try narrowing your query. \n Use a filter like -ft where you supply a list of job types, e.g. INI, SIM; \ -or -fp where you supply an integer that represents the number of hours into the past that should be queried, \ +or -fp where you supply an integer that represents the number of hours into the past that should be queried: \ suppose it is noon, if you supply -fp 5 the query will consider changes starting from 7:00 am. If you really wish to query the whole experiment, refer to Autosubmit GUI." - raise AutosubmitCritical("Stats query our of bounds",7000,message) + raise AutosubmitCritical("Stats query out of bounds",7061,message) fig = plt.figure(figsize=(RATIO * 4, 3 * RATIO * num_plots)) diff --git a/autosubmit/notifications/mail_notifier.py b/autosubmit/notifications/mail_notifier.py index 2428102486ce34499426ba88ef9cd2e8aec38be2..396b60478f9dd0e8957633fb98a196d97ff3006a 100644 --- a/autosubmit/notifications/mail_notifier.py +++ b/autosubmit/notifications/mail_notifier.py @@ -36,7 +36,7 @@ class MailNotifier: try: self._send_mail(self.config.MAIL_FROM, mail, message) except BaseException as e: - Log.printlog('An error occurred while sending a mail for the job {0}', job_name,6000) + Log.printlog('An error occurred while sending a mail for the job {0}', job_name,6011) def _send_mail(self, mail_from, mail_to, message): server = smtplib.SMTP_SSL(self.config.SMTP_SERVER) diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 65ae5873d12143e3eb7a5b18723a44e7af67b2e3..d473d230d316ebc2917e5cefe83ac648677de533 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -20,14 +20,13 @@ import os import subprocess from 
autosubmit.platforms.paramiko_platform import ParamikoPlatform, ParamikoPlatformException -from log.log import Log +from log.log import Log,AutosubmitCritical,AutosubmitError from autosubmit.platforms.headers.ec_header import EcHeader from autosubmit.platforms.headers.ec_cca_header import EcCcaHeader from autosubmit.platforms.headers.slurm_header import SlurmHeader from autosubmit.platforms.wrappers.wrapper_factory import EcWrapperFactory from time import sleep - class EcPlatform(ParamikoPlatform): """ Class to manage queues with ecaccess @@ -115,7 +114,7 @@ class EcPlatform(ParamikoPlatform): def connect(self): """ - In this case, it does nothing because connection is established foe each command + In this case, it does nothing because connection is established for each command :return: True :rtype: bool @@ -123,7 +122,7 @@ class EcPlatform(ParamikoPlatform): self.connected = True def restore_connection(self): """ - In this case, it does nothing because connection is established foe each command + In this case, it does nothing because connection is established for each command :return: True :rtype: bool @@ -131,7 +130,7 @@ class EcPlatform(ParamikoPlatform): self.connected = True def test_connection(self): """ - In this case, it does nothing because connection is established foe each command + In this case, it does nothing because connection is established for each command :return: True :rtype: bool @@ -142,7 +141,7 @@ class EcPlatform(ParamikoPlatform): output = subprocess.check_output(command, shell=True) except subprocess.CalledProcessError as e: if not ignore_log: - Log.error('Could not execute command {0} on {1}'.format(e.cmd, self.host)) + raise AutosubmitError('Could not execute command {0} on {1}'.format(e.cmd, self.host),7500,e.message) return False self._ssh_output = output return True diff --git a/autosubmit/platforms/lsfplatform.py b/autosubmit/platforms/lsfplatform.py index 
caaed7a1ac333a107b13e2eecdcd8c7fb45ced9b..7c8c769d32201b7824e22db19544a2e1a1bdc6ff 100644 --- a/autosubmit/platforms/lsfplatform.py +++ b/autosubmit/platforms/lsfplatform.py @@ -110,27 +110,27 @@ class LsfPlatform(ParamikoPlatform): ############################################################################### """.format(filename, queue, project, wallclock, num_procs, dependency, '\n'.ljust(13).join(str(s) for s in directives)) - def connect(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True - def restore_connection(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True - def test_connection(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True \ No newline at end of file + # def connect(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True + # def restore_connection(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True + # def test_connection(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True \ No newline at end of file diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 0bffc52769ce1946c75083b94783d5d52f494b35..75d0dbed7a3f5ec88ad8f5a152f4d446c1c59831 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -73,6 +73,7 @@ class ParamikoPlatform(Platform): try: transport = self._ssh.get_transport() 
transport.send_ignore() + pass except BaseException as e: try: self.reset() @@ -80,21 +81,24 @@ class ParamikoPlatform(Platform): transport = self._ssh.get_transport() transport.send_ignore() except EOFError as e: - raise AutosubmitError("After a reconnection procedure, the platform is still not alive.",6000) + raise AutosubmitError("After a reconnection procedure, the platform is still not alive.",6002) def restore_connection(self): - self.connected = True - if self._ssh is None: - retries = 2 - retry = 0 + try: self.connected = False - while self.connected is False and retry < retries: - if self.connect(True): - self.connected = True - retry += 1 - if not self.connected: - trace='Can not create ssh or sftp connection to {0}: Connection could not be established to platform {1}\n Please, check your expid platform.conf to see if there are mistakes in the configuration\n Also Ensure that the login node listed on HOST parameter is available(try to connect via ssh on a terminal)\n Also you can put more than one host using a comma as separator'.format(self.host, self.name) - raise AutosubmitCritical('Experiment cant no continue without unexpected behaviour, Stopping Autosubmit',7000,trace) + if self._ssh is None: + retries = 2 + retry = 0 + while self.connected is False and retry < retries: + self.connect(True) + retry += 1 + if not self.connected: + trace='Can not create ssh or sftp connection to {0}: Connection could not be established to platform {1}\n Please, check your expid platform.conf to see if there are mistakes in the configuration\n Also Ensure that the login node listed on HOST parameter is available(try to connect via ssh on a terminal)\n Also you can put more than one host using a comma as separator'.format(self.host, self.name) + raise AutosubmitCritical('Experiment cant no continue without unexpected behaviour, Stopping Autosubmit',7050,trace) + except AutosubmitCritical: + raise + except: + raise AutosubmitCritical('Cant connect to this platform due an 
unknown error',7050) def connect(self, reconnect=False): """ @@ -134,10 +138,12 @@ class ParamikoPlatform(Platform): self._ftpChannel = self._ssh.open_sftp() self.connected = True except BaseException as e: + if "Authentication failed." in e.message: + raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format(self._host_config['hostname']),7050,e.message) if not reconnect and "," in self._host_config['hostname']: self.restore_connection(reconnect=True) else: - raise AutosubmitError("Couldn't establish a connection to the specified host, wrong configuration?",6000,e.message) + raise AutosubmitError("Couldn't establish a connection to the specified host, wrong configuration?",6003,e.message) def check_completed_files(self, sections=None): if self.host == 'localhost': @@ -188,9 +194,9 @@ class ParamikoPlatform(Platform): self._ftpChannel.chmod(remote_path,os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6000, e.message) + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) except BaseException as e: - raise AutosubmitError('Send file failed. Connection seems to no be active',6000) + raise AutosubmitError('Send file failed. 
Connection seems to no be active',6004) # Gets .err and .out def get_file(self, filename, must_exist=True, relative_path=''): @@ -220,9 +226,9 @@ class ParamikoPlatform(Platform): return True except Exception as e: if str(e) in "Garbage": - raise AutosubmitError('Files couldn''t be retrieved, session not active'.format(filename),6000,e.message) + raise AutosubmitError("Files couldn't be retrieved, session not active".format(filename),6004,e.message) if must_exist: - raise AutosubmitError('A critical file couldn''t be retrieved, File {0} does not exists'.format(filename),6000,e.message) + raise AutosubmitError("A critical file couldn't be retrieved, File {0} does not exists".format(filename),6004,e.message) else: Log.printlog("Log file couldn't be retrieved: {0}".format(filename),5000) return False @@ -241,13 +247,12 @@ class ParamikoPlatform(Platform): self._ftpChannel.remove(os.path.join(self.get_files_path(), filename)) return True except IOError as e: - Log.printlog('{0} couldn''t be retrieved, session not active'.format(os.path.join(self.get_files_path(), filename)),5000) + Log.printlog("{0} couldn't be retrieved, session not active".format(os.path.join(self.get_files_path(), filename)),6004) return False - #raise AutosubmitError('Files couldn''t be retrieved, session not active'.format(filename), 6000, e.message) except BaseException as e: Log.error('Could not remove file {0} due a wrong configuration'.format(os.path.join(self.get_files_path(), filename))) if e.lower().contains("garbage"): - raise AutosubmitCritical("Wrong User or invalid .ssh/config. Or invalid user in platform.conf or public key not set ",7000,e.message) + raise AutosubmitCritical("Wrong User or invalid .ssh/config. 
Or invalid user in platform.conf or public key not set ",7051,e.message) @@ -269,11 +274,11 @@ class ParamikoPlatform(Platform): except (Exception,IOError) as e: if str(e) in "Garbage": - raise AutosubmitError('File {0} does not exists'.format(os.path.join(self.get_files_path(), src)),6000,e.message) + raise AutosubmitError('File {0} does not exists'.format(os.path.join(self.get_files_path(), src)),6004,e.message) if must_exist: - raise AutosubmitError('A critical file couldn''t be retrieved, File {0} does not exists'.format(os.path.join(self.get_files_path(), src)),6000,e.message) + raise AutosubmitError("A critical file couldn't be retrieved, File {0} does not exists".format(os.path.join(self.get_files_path(), src)),6004,e.message) else: - Log.printlog("Log file couldn't be moved: {0}".format(os.path.join(self.get_files_path(), src)),5000) + Log.printlog("Log file couldn't be moved: {0}".format(os.path.join(self.get_files_path(), src)),5001) return False def submit_job(self, job, script_name, hold=False): @@ -443,7 +448,7 @@ class ParamikoPlatform(Platform): for job in job_list: job_status = Status.UNKNOWN Log.warning('check_job() The job id ({0}) from platform {1} has an status of {2}.', job.id, self.name, job_status) - raise AutosubmitError("Some Jobs are in Unknown status",6000) + raise AutosubmitError("Some Jobs are in Unknown status",6008) #job.new_status=job_status @@ -484,15 +489,17 @@ class ParamikoPlatform(Platform): timeout = 60/2 else: timeout = 60*2 + stderr_readlines = [] + stdout_chunks = [] try: stdin, stdout, stderr = self._ssh.exec_command(command) channel = stdout.channel channel.settimeout(timeout) stdin.close() channel.shutdown_write() - stdout_chunks = [] + stdout_chunks.append(stdout.channel.recv(len(stdout.channel.in_buffer))) - stderr_readlines = [] + while not channel.closed or channel.recv_ready() or channel.recv_stderr_ready(): # stop if channel was closed prematurely, and there is no data in the buffers. 
@@ -523,15 +530,18 @@ class ParamikoPlatform(Platform): self._ssh_output += s for errorLine in stderr_readlines: if errorLine.find("submission failed") != -1 or errorLine.find("git clone") != -1: - raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines),6000)) + raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines),6005)) if not ignore_log: if len(stderr_readlines) > 0: - Log.printlog('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines)),6000) + Log.printlog('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines)),6006) else: Log.debug('Command {0} in {1} successful with out message: {2}', command, self.host, self._ssh_output) return True + except AttributeError as e: + raise AutosubmitError( + 'Session not active: {0}'.format(e.message), 6005) except BaseException as e: - raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines)),6000,e.message) + raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host, '\n'.join(stderr_readlines)),6005,e.message) def parse_job_output(self, output): """ @@ -712,17 +722,17 @@ class ParamikoPlatform(Platform): if self.send_command(self.get_mkdir_cmd()): Log.debug('{0} has been created on {1} .', self.remote_log_dir, self.host) else: - raise AutosubmitError("SFTP session not active ", 6000,"Could not create the DIR {0} on HPC {1}'.format(self.remote_log_dir, self.host)".format(self.remote_log_dir, self.host)) + raise AutosubmitError("SFTP session not active ", 6007,"Could not create the DIR {0} on HPC {1}'.format(self.remote_log_dir, self.host)".format(self.remote_log_dir, self.host)) except BaseException as e: - raise AutosubmitError("SFTP session not active ", 6000,e.message) + raise AutosubmitError("SFTP session not active ", 6007,e.message) else: try: if 
self.send_command(self.get_mkdir_cmd()): Log.debug('{0} has been created on {1} .', self.remote_log_dir, self.host) else: - Log.error('Could not create the DIR {0} on HPC {1}'.format(self.remote_log_dir, self.host)) + Log.error('Could not create the DIR {0} to HPC {1}'.format(self.remote_log_dir, self.host)) except BaseException as e: - raise AutosubmitError("Couldn''t send the file", 6000, e.message) + raise AutosubmitError("Couldn't send the file {0} to HPC {1}".format(self.remote_log_dir,self.host), 6004, e.message) class ParamikoPlatformException(Exception): diff --git a/autosubmit/platforms/pbsplatform.py b/autosubmit/platforms/pbsplatform.py index 33f7d9820c27e82699d432fb0b7c50171cda4aa8..7003323c4cd9c464f48f43d6faf2ade20c221300 100644 --- a/autosubmit/platforms/pbsplatform.py +++ b/autosubmit/platforms/pbsplatform.py @@ -101,27 +101,27 @@ class PBSPlatform(ParamikoPlatform): return self._checkjob_cmd + str(job_id) else: return "ssh " + self.host + " " + self.get_qstatjob(job_id) - def connect(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True - def restore_connection(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True - def test_connection(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True \ No newline at end of file + # def connect(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True + # def restore_connection(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True + # def test_connection(self): + # """ + # 
In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True \ No newline at end of file diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index 163611f05121b66df31477997a4b682b7a77fd30..e8981eec8e135c0309e2ae838d5146d941243230 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ -79,27 +79,27 @@ class PsPlatform(ParamikoPlatform): def get_checkjob_cmd(self, job_id): return self.get_pscall(job_id) - def connect(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True - def restore_connection(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True - def test_connection(self): - """ - In this case, it does nothing because connection is established foe each command - - :return: True - :rtype: bool - """ - self.connected = True \ No newline at end of file + # def connect(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True + # def restore_connection(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True + # def test_connection(self): + # """ + # In this case, it does nothing because connection is established for each command + # + # :return: True + # :rtype: bool + # """ + # self.connected = True \ No newline at end of file diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 40e959c22f05e99650d2f9fe3a1703a3c361ec52..5957cc225775e5806f3a94ca72ee1304ce14207d 100644 --- a/autosubmit/platforms/sgeplatform.py +++ 
b/autosubmit/platforms/sgeplatform.py @@ -87,7 +87,7 @@ class SgePlatform(ParamikoPlatform): def connect(self): """ - In this case, it does nothing because connection is established foe each command + In this case, it does nothing because connection is established for each command :return: True :rtype: bool @@ -95,7 +95,7 @@ class SgePlatform(ParamikoPlatform): self.connected = True def restore_connection(self): """ - In this case, it does nothing because connection is established foe each command + In this case, it does nothing because connection is established for each command :return: True :rtype: bool @@ -103,7 +103,7 @@ class SgePlatform(ParamikoPlatform): self.connected = True def test_connection(self): """ - In this case, it does nothing because connection is established foe each command + In this case, it does nothing because connection is established for each command :return: True :rtype: bool diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 30df13c289ea34a21bb0c1a3c96bcd1c6cc95c61..c0b553de316c7817a1c566abacec9faa8b7cd776 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -79,11 +79,11 @@ class SlurmPlatform(ParamikoPlatform): jobs_id = self.get_submitted_job_id(self.get_ssh_output()) return jobs_id else: - raise AutosubmitError("Jobs couldn't be submitted, retry again in next iteration",6000) + raise AutosubmitError("Jobs couldn't be submitted, retry again in next iteration",6008) except IOError as e: - raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6000, e.message) + raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, e.message) except BaseException as e: - raise AutosubmitError("Job couldn't be submitted, retry again in next AS iteration", 6000, e.message) + raise AutosubmitError("Job couldn't be submitted, retry again in next AS iteration", 6008, e.message) def update_cmds(self): 
""" @@ -133,7 +133,7 @@ class SlurmPlatform(ParamikoPlatform): jobs_id.append(int(output.split(' ')[3])) return jobs_id except IndexError: - raise AutosubmitCritical("Submission failed, There are issues on your config file",7000) + raise AutosubmitCritical("Submission failed. There are issues on your config file",7014) def jobs_in_queue(self): dom = parseString('') jobs_xml = dom.getElementsByTagName("JB_job_number") diff --git a/bin/autosubmit b/bin/autosubmit index 55c651ef944298e47eab1faf1e5c54f0316bb2e9..64b1be42d2a993b312271859c91e9b4aa6b07a90 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -21,7 +21,7 @@ import os import sys -from log.log import Log,AutosubmitError,AutosubmitCritical +from log.log import Log,AutosubmitCritical scriptdir = os.path.abspath(os.path.dirname(sys.argv[0])) assert sys.path[0] == scriptdir @@ -41,13 +41,14 @@ def main(): if e.trace is not None: Log.error("Trace: {0}", e.trace) Log.critical("{1} [eCode={0}]", e.code, e.message) + Log.info("More info at https://autosubmit.readthedocs.io/en/latest/faq.html") os._exit(1) - except BaseException as e: + except Exception as e: Log.error("Trace: {0}", e.message) - if "temporarily unavailable" in e.message: + if "temporarily unavailable" in str(e.message): Log.critical("Another instance of autosubmit is running on this experiment. 
If this is not the case, delete autosubmit.lock",7000) else: - Log.critical("Unhandled error, if you see this message report it in autosubmit git") + Log.critical("Unhandled error: If you see this message, please report it in Autosubmit's GitLab project") os._exit(1) if __name__ == "__main__": diff --git a/docs/source/faq.rst b/docs/source/faq.rst index abc03240c429e9c207f31590ae3d88b78ffdd34b..b0822d6a04d1772e49c73c2911309ddf6a384fc3 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -1,135 +1,135 @@ -############ -FAQ - Frequently Asked Questions -############ - -[CRITICAL] Unhandled exception on Autosubmit: [Errno 11] Resource temporarily unavailable -==================== - -.. code-block:: python - - [CRITICAL] Unhandled exception on Autosubmit: [Errno 11] Resource temporarily unavailable - Traceback (most recent call last): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 402, in parse_args - args.group_by, args.expand, args.expand_status) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 2093, in set_status - with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/portalocker-1.2.0-py2.7.egg/portalocker/utils.py", line 195, in __enter__ - return self.acquire() - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/portalocker-1.2.0-py2.7.egg/portalocker/utils.py", line 155, in acquire - raise exceptions.LockException(exception) - LockException: [Errno 11] Resource temporarily unavailable - - -Solution ---------------- -Make sure the experiment is not still running. If it's not, delete the autosubmit.lock in the /tmp folder inside your experiment directory. 
+################################## +Error codes and solutions +################################## + +Database Issues - Critical Error codes [7001-7005] +=================================================== + ++------------------------------------------------------------------------------------------------------------------------+ +| Code | Details | Solution | ++======+===============================================+=================================================================+ +| 7001 | Connection to the db could not be established | Check if database exists | ++------+-----------------------------------------------+-----------------------------------------------------------------+ +| 7002 | Wrong version | Check system sqlite version | ++------+-----------------------------------------------+-----------------------------------------------------------------+ +| 7003 | DB doesn't exist | Check if database exists | ++------+-----------------------------------------------+-----------------------------------------------------------------+ +| 7004 | Can't create a new database | Check your user permissions | ++------+-----------------------------------------------+-----------------------------------------------------------------+ +| 7005 | AS database is corrupted or locked | Please, open a new issue ASAP. (If you are on BSC environment) | ++------+-----------------------------------------------+-----------------------------------------------------------------+ + +Default Solution +---------------- +These issues usually originate on the server side. Unless you have a custom installation, please ask first in Autosubmit's GitLab project. ---- -[CRITICAL] Unhandled exception on Autosubmit: attempt to write a readonly database -==================== - -.. 
code-block:: python - - [CRITICAL] Unhandled exception on Autosubmit: attempt to write a readonly database - Traceback (most recent call last): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit - .py", line 389, in parse_args - return Autosubmit.create(args.expid, args.noplot, args.hide, args.output, args.group_by, args.expand, args.expand_status) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit - .py", line 1953, in create - "job_packages_" + expid).reset_table() - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/job/job_pa - ckage_persistence.py", line 65, in reset_table - self.db_manager.drop_table(self.JOB_PACKAGES_TABLE) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/database/d - b_manager.py", line 65, in drop_table - cursor.execute(drop_command) - OperationalError: attempt to write a readonly database - -Solution ---------------- -This usually happens when trying to run `autosubmit create` with an expid of another user, please double check the expid you are using. - ----- +Wrong User Input - Critical Error codes [7010-7030] +==================================================== -[ERROR] Command sbatch -D ... 
failed with error message: sbatch: error: Batch job submission failed: Invalid account or account/partition combination specified -==================== ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| Code | Details | Solution | ++======+======================================================+================================================================================================+ +| 7010 | Experiment has been halted in a manual way | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 7011 | Wrong arguments for a specific command | Check the command section for more info | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 7012 | Insufficient permissions for a specific experiment. | Check if you have enough permissions, experiment exists or specified expid has a typo | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 7013 | Pending commits | You must commit/synchronize pending changes in the experiment proj folder. 
| ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 7014 | Wrong configuration | Check your experiment/conf files, also take a look at the ASLOG/command.log detailed output | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ -Solution ---------------- -This can be due to an invalid configuration in your ~/.ssh/config file, so check if you are able to run a ssh command using the account displayed in the error message. -If so, once you are in the remote platform, type bsc_acct and see if the information for your username/account is displayed: +Default Solution +---------------- -.. code-block:: ini +These issues are usually mistakes in the user input; check the available logs and the resolved Git issues. Alternatively, you can ask the Autosubmit team for help. - USER CONSUMED CPU: - - User:                                             Machine:          Used [khours]: +---- -If not, contact support referring to the problem and specifying your account. +Platform issues - Critical Error codes. 
Local [7040-7050] and remote [7050-7060] +================================================================================= ----- ++------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| Code | Details | Solution | ++======+=================================================================+=========================================================================================================================================+ +| 7040 | Invalid experiment pkl/db likely due to a local platform failure | Should be recovered automatically; if not, check whether there is a backup file and restore it manually | ++------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| 7041 | Weird job status | Weird job status, try to recover the experiment (check the recovery how-to for more info). If this issue persists, please report it to GitLab | ++------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| 7050 | Connection can't be established. | Check your experiment platform configuration | ++------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| 7050 | Failure after a restart, connection can't be restored. 
| Check or ask (manually) if the remote platforms have any known issue | ++------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| 7051 | Invalid ssh configuration. | Check .ssh/config file. Additionally, check if you can perform a passwordless connection to that platform. | ++------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -[ERROR] Cannot send file to remote platform -=================================== +Default Solution +---------------- -.. code-block:: python - - [ERROR] marenostrum4 submission failed - [CRITICAL] Unhandled exception on Autosubmit: size mismatch in put! 0 != 38998 - Traceback (most recent call last): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 368, in parse_args - return Autosubmit.run_experiment(args.expid) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 776, in run_experiment - if Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 819, in submit_ready_jobs - package.submit(as_conf, job_list.parameters) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/job/job_packages.py", line 87, in submit - self._send_files() - File 
"/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/job/job_packages.py", line 115, in _send_files - self.platform.send_file(self._job_scripts[job.name]) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/platforms/paramiko_platform.py", line 129, in send_file - ftp.put(os.path.join(self.tmp_path, filename), os.path.join(self.get_files_path(), filename)) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/paramiko-1.15.0-py2.7.egg/paramiko/sftp_client.py", line 669, in put - return self.putfo(fl, remotepath, file_size, callback, confirm) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/paramiko-1.15.0-py2.7.egg/paramiko/sftp_client.py", line 635, in putfo - raise IOError('size mismatch in put! %d != %d' % (s.st_size, size)) - IOError: size mismatch in put! 0 != 38998 - -This happens when the quota has been reached and the machine is full +Check autosubmit log for detailed information, there will be additional error codes. ---- -[CRITICAL] Unhandled exception on Autosubmit: database is locked -=================================== - -.. 
code-block:: python - - [CRITICAL] Unhandled exception on Autosubmit: database is locked - Traceback (most recent call last): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 377, in parse_args - args.operational) != '' - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/autosubmit.py", line 532, in expid - exp_id = copy_experiment(copy_id, description, Autosubmit.autosubmit_version, test, operational) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/experiment/experiment_common.py", line 93, in copy_experiment - new_name = new_experiment(description, version, test, operational) - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/experiment/experiment_common.py", line 68, in new_experiment - if not db_common.save_experiment(new_name, description, version): - File "/shared/earth/software/autosubmit/3.11.0b-foss-2015a-Python-2.7.9/lib/python2.7/site-packages/autosubmit-3.10.0-py2.7.egg/autosubmit/database/db_common.py", line 151, in save_experiment - {'name': name, 'description': description, 'version': version}) - OperationalError: database is locked - -Solution ---------------- -If you were trying to copy an experiment, make sure you put the -y immediately after expid: `autosubmit expid -y` +Uncatalogued codes - Critical Error codes [7060+] +=================================================== + ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| Code | Details | Solution | 
++======+=====================================+========================================================================================================================+ +| 7060 | Display issues during monitoring | Try to use a different output or txt | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| 7061 | Stat command failed | Check Aslogs command output, open a git issue | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| 7062 | Svn issues | Check, in expdef, if url exists | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| 7063 | cp/rsync issues | Check if destination path exists | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| 7064 | Git issues | Check that the proj folder is a well configured git folder. Also, check [GIT] expdef config | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| 7065 | Wrong git configuration | Invalid git url. Check [GIT] expdef config. If issue persists, check if proj folder is a well configured git folder. | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ +| 7066 | Presubmission feature issues | New feature, this message should not be prompted. 
Please report it to Git | ++------+-------------------------------------+------------------------------------------------------------------------------------------------------------------------+ + +Default Solution +---------------- + +Check autosubmit log for detailed information, there will be additional error codes. ---- -bash: sbatch: command not found +Minor errors - Error codes [6000+] =================================== -Solution ---------------- -First, check your jobs_expid.conf and platforms_expid.conf files and make sure the platform assigned to the running job is defined correctly and is a SLURM platform. -If this is ok, check that the hostname of the platform you are using is also correctly defined in your ~/.ssh/config file. ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| Code | Details | Solution | ++======+======================================================+================================================================================================+ +| 6001 | Failed to retrieve log files | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6002 | Failed reconnection | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6003 | Failed connection, wrong configuration | Check your platform.conf file | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6004 | Input/output issues | Automatically, if there aren't bigger issues | 
++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6005 | Unable to execute the command | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6006 | Failed command | Check err output for more info, command worked but some issue was detected | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6007 | Broken sFTP connection | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6008 | Inconsistent/unexpected job status | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6009 | Failed job checker | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6010 | Corrupted job_list using backup | Automatically, if it fails, Perform mv /pkl/job_list_backup.pkl /pkl/job_list.pkl | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6011 | Incorrect mail notifier configuration | Double check your mail configuration on job.conf (job status) and autosubmit.conf (email) | 
++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6012 | Migrate, archive/unarchive I/O issues | Check migrate how-to configuration | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6013 | Configuration issues | Check log output for more info | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/setup.py b/setup.py index 425e34608ae0c2f9d1e9376c26b1e68684d416bf..ed9e285dab6de725c9f4ef9bc9dc5f260379c8cd 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( keywords=['climate', 'weather', 'workflow', 'HPC'], install_requires=['argparse>=1.2,<2', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', 'numpy', 'matplotlib', 'paramiko==1.15', - 'mock>=1.3.0', 'portalocker>=0.5.7', 'networkx', 'bscearth.utils'], + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] },